diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 08240184f2934..eb796368e7820 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -4,3 +4,5 @@ pathlib backports.lzma py PyCrypto +mock +ipython diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index b07ce611c79a2..43e6814ed6c8e 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -18,3 +18,4 @@ pymysql psycopg2 s3fs beautifulsoup4 +ipython diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index 5d9cb05a7b402..9a6c1c7edbc5e 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -18,3 +18,4 @@ pymysql beautifulsoup4 s3fs xarray +ipython diff --git a/doc/source/api.rst b/doc/source/api.rst index 6c4a3cff5b4cf..33ac5fde651d4 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -60,6 +60,7 @@ JSON :toctree: generated/ json_normalize + build_table_schema .. currentmodule:: pandas diff --git a/doc/source/io.rst b/doc/source/io.rst index b36ae8c2ed450..c34cc1ec17512 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2033,6 +2033,126 @@ using Hadoop or Spark. df df.to_json(orient='records', lines=True) + +.. _io.table_schema: + +Table Schema +'''''''''''' + +.. versionadded:: 0.20.0 + +`Table Schema`_ is a spec for describing tabular datasets as a JSON +object. The JSON includes information on the field names, types, and +other attributes. You can use the orient ``table`` to build +a JSON string with two fields, ``schema`` and ``data``. + +.. ipython:: python + + df = pd.DataFrame( + {'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3), + }, index=pd.Index(range(3), name='idx')) + df + df.to_json(orient='table', date_format="iso") + +The ``schema`` field contains the ``fields`` key, which itself contains +a list of column name to type pairs, including the ``Index`` or ``MultiIndex`` +(see below for a list of types). 
+The ``schema`` field also contains a ``primaryKey`` field if the (Multi)index +is unique. + +The second field, ``data``, contains the serialized data with the ``records`` +orient. +The index is included, and any datetimes are ISO 8601 formatted, as required +by the Table Schema spec. + +The full list of supported types is described in the Table Schema +spec. This table shows the mapping from pandas types: + +=============== ================= +Pandas type Table Schema type +=============== ================= +int64 integer +float64 number +bool boolean +datetime64[ns] datetime +timedelta64[ns] duration +categorical any +object string +=============== ================= + +A few notes on the generated table schema: + +- The ``schema`` object contains a ``pandas_version`` field. This contains + the version of pandas' dialect of the schema, and will be incremented + with each revision. +- All dates are converted to UTC when serializing, even timezone naïve values, + which are treated as UTC with an offset of 0. + + .. ipython:: python + + from pandas.io.json import build_table_schema + s = pd.Series(pd.date_range('2016', periods=4)) + build_table_schema(s) + +- Datetimes with a timezone (before serializing) include an additional field + ``tz`` with the time zone name (e.g. ``'US/Central'``). + + .. ipython:: python + + s_tz = pd.Series(pd.date_range('2016', periods=12, + tz='US/Central')) + build_table_schema(s_tz) + +- Periods are converted to timestamps before serialization, and so have the + same behavior of being converted to UTC. In addition, periods will contain + an additional field ``freq`` with the period's frequency, e.g. ``'A-DEC'``. + + .. ipython:: python + + s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC', + periods=4)) + build_table_schema(s_per) + +- Categoricals use the ``any`` type and an ``enum`` constraint listing + the set of possible values. Additionally, an ``ordered`` field is included + + .. 
ipython:: python + + s_cat = pd.Series(pd.Categorical(['a', 'b', 'a'])) + build_table_schema(s_cat) + +- A ``primaryKey`` field, containing an array of labels, is included + *if the index is unique*: + + .. ipython:: python + + s_dupe = pd.Series([1, 2], index=[1, 1]) + build_table_schema(s_dupe) + +- The ``primaryKey`` behavior is the same with MultiIndexes, but in this + case the ``primaryKey`` is an array: + + .. ipython:: python + + s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'), + (0, 1)])) + build_table_schema(s_multi) + +- The default naming roughly follows these rules: + + + For series, the ``object.name`` is used. If that's ``None``, then the + name is ``values`` + + For DataFrames, the stringified version of the column name is used + + For ``Index`` (not ``MultiIndex``), ``index.name`` is used, with a + fallback to ``index`` if that is None. + + For ``MultiIndex``, ``mi.names`` is used. If any level has no name, + then ``level_<i>`` is used. + + +.. _Table Schema: http://specs.frictionlessdata.io/json-table-schema/ + HTML ---- diff --git a/doc/source/options.rst b/doc/source/options.rst index 10a13ed36df8d..1a0e5cf6b7235 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -397,6 +397,9 @@ display.width 80 Width of the display in charact IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to correctly detect the width. +display.html.table_schema False Whether to publish a Table Schema + representation for frontends that + support it. html.border 1 A ``border=value`` attribute is inserted in the ``<table>`` tag for the DataFrame HTML repr. @@ -424,6 +427,7 @@ mode.use_inf_as_null False True means treat None, NaN, -IN are not null (new way). =================================== ============ ================================== + .. 
_basics.console_output: Number Formatting @@ -512,3 +516,20 @@ Enabling ``display.unicode.ambiguous_as_wide`` lets pandas to figure these chara pd.set_option('display.unicode.east_asian_width', False) pd.set_option('display.unicode.ambiguous_as_wide', False) + +.. _options.table_schema: + +Table Schema Display +-------------------- + +.. versionadded:: 0.20.0 + +``DataFrame`` and ``Series`` can publish a Table Schema representation +of their data. This is disabled by default, and can be enabled globally with the +``display.html.table_schema`` option: + +.. ipython:: python + + pd.set_option('display.html.table_schema', True) + +Only the first ``'display.max_rows'`` rows are serialized and published. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8b6c53a159ad8..7b4538bd181d2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -12,6 +12,7 @@ Highlights include: - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) - The ``.ix`` indexer has been deprecated, see :ref:`here ` - Switched the test framework to `pytest`_ (:issue:`13097`) +- A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here <io.table_schema>` .. _pytest: http://doc.pytest.org/en/latest/ @@ -154,6 +155,40 @@ New Behavior: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() +.. _whatsnew_0200.enhancements.table_schema: + +Table Schema Output +^^^^^^^^^^^^^^^^^^^ + +The new orient ``'table'`` for :meth:`DataFrame.to_json` +will generate a `Table Schema`_ compatible string representation of +the data. + +.. ipython:: python + + df = pd.DataFrame( + {'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3), + }, index=pd.Index(range(3), name='idx')) + df + df.to_json(orient='table') + + +See :ref:`IO: Table Schema for more <io.table_schema>`. 
def _ipython_display_(self):
    """
    Hand every available representation of this object to IPython.

    Builds a MIME bundle (plain text, plus HTML / LaTeX / Table Schema
    when they are available and non-empty) and passes it to
    ``IPython.display.display`` with ``raw=True``. Returns ``None``
    without doing anything when IPython cannot be imported.
    """
    try:
        from IPython.display import display
    except ImportError:
        return None

    # Series doesn't define _repr_html_ or _repr_latex_
    latex = None
    if hasattr(self, '_repr_latex_'):
        latex = self._repr_latex_()
    html = None
    if hasattr(self, '_repr_html_'):
        html = self._repr_html_()
    schema = self._repr_table_schema_()
    # We need the inital newline since we aren't going through the
    # usual __repr__. See
    # https://github.com/pandas-dev/pandas/pull/14904#issuecomment-277829277
    plain = "\n" + repr(self)

    candidates = (
        ("text/plain", plain),
        ("text/html", html),
        ("text/latex", latex),
        ("application/vnd.dataresource+json", schema),
    )
    # Drop every representation that is missing or empty.
    bundle = {mime: rendered for mime, rendered in candidates if rendered}
    display(bundle, raw=True)

def _repr_table_schema_(self):
    """
    Not a real Jupyter special repr method, but we use the same
    naming convention.

    Returns the parsed ``orient='table'`` JSON of the first
    ``display.max_rows`` rows when ``display.html.table_schema`` is
    enabled, and ``None`` otherwise.
    """
    if not config.get_option("display.html.table_schema"):
        return None

    row_limit = config.get_option('display.max_rows')
    serialized = self.head(row_limit).to_json(orient='table')
    # OrderedDict keeps the keys in the order the writer emitted them.
    return json.loads(serialized,
                      object_pairs_hook=collections.OrderedDict)
The default depends on the `orient`. For + `orient='table'`, the default is `'iso'`. For all other orients, + the default is `'epoch'`. double_precision : The number of decimal places to use when encoding floating point values, default 10. force_ascii : force encoded string to be ASCII, default True. @@ -1151,14 +1190,53 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', .. versionadded:: 0.19.0 - Returns ------- same type as input object with filtered info axis + See Also + -------- + pd.read_json + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. 
+ + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + + Encoding with Table Schema + + >>> df.to_json(orient='table') + '{"schema": {"fields": [{"name": "index", "type": "string"}, + {"name": "col 1", "type": "string"}, + {"name": "col 2", "type": "string"}], + "primaryKey": "index", + "pandas_version": "0.20.0"}, + "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, + {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ from pandas.io import json + if date_format is None and orient == 'table': + date_format = 'iso' + elif date_format is None: + date_format = 'epoch' return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, date_format=date_format, double_precision=double_precision, diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index a9390a04cc2cd..32d110b3404a9 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -1,4 +1,5 @@ from .json import to_json, read_json, loads, dumps # noqa from .normalize import json_normalize # noqa +from .table_schema import build_table_schema # noqa -del json, normalize # noqa +del json, normalize, table_schema # noqa diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 6fc766081eefe..a00d3492e8a37 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -1,5 +1,4 @@ # pylint: disable-msg=E1101,W0613,W0603 - import os import numpy as np @@ -12,10 +11,14 @@ from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits +from .table_schema import build_table_schema +from pandas.types.common import is_period_dtype loads = _json.loads dumps = _json.dumps +TABLE_SCHEMA_VERSION = '0.20.0' + # interface to/from def to_json(path_or_buf, obj, orient=None, date_format='epoch', @@ -26,19 +29,22 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', raise ValueError( "'lines' keyword only valid when 'orient' is records") 
class JSONTableWriter(FrameWriter):
    """
    Writer for ``orient='table'``.

    Produces a JSON object with two members: ``schema`` (the Table Schema
    built by ``build_table_schema``) and ``data`` (the frame, index
    included, serialized with the ``records`` orient).
    """
    _default_orient = 'records'

    def __init__(self, obj, orient, date_format, double_precision,
                 ensure_ascii, date_unit, default_handler=None):
        """
        Adds a `schema` attribute with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is), and forces orient to records.

        Raises
        ------
        ValueError
            If ``date_format`` is not ``'iso'``, or if an index level
            name is also used as a column name (``reset_index`` would
            then produce duplicate labels).
        """
        super(JSONTableWriter, self).__init__(
            obj, orient, date_format, double_precision, ensure_ascii,
            date_unit, default_handler=default_handler)

        if date_format != 'iso':
            msg = ("Trying to write with `orient='table'` and "
                   "`date_format='%s'`. Table Schema requires dates "
                   "to be formatted with `date_format='iso'`" % date_format)
            raise ValueError(msg)

        # The schema must be inferred before the index is reset below.
        self.schema = build_table_schema(obj)

        # Only look at ``columns`` for 2-D input; the previous
        # ``a and b or c`` form fell through to ``obj.columns`` for a 1-D
        # object whose name did not clash.
        if obj.ndim == 1:
            overlapping = obj.name in set(obj.index.names)
        else:
            overlapping = len(obj.columns.intersection(obj.index.names)) > 0
        if overlapping:
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        # TODO: Do this timedelta properly in objToJSON.c See GH #15137
        timedeltas = obj.select_dtypes(include=['timedelta']).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(
                lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index):
            obj.index = obj.index.to_timestamp()

        self.obj = obj.reset_index()
        self.date_format = 'iso'
        self.orient = 'records'

    def write(self):
        """Return the ``{"schema": ..., "data": ...}`` JSON string."""
        data = super(JSONTableWriter, self).write()
        serialized = '{{"schema": {}, "data": {}}}'.format(
            dumps(self.schema), data)
        return serialized
# NOTE: the dtype predicates used below (is_integer_dtype, is_bool_dtype, ...)
# come from this module's existing top-of-file import, which must be kept.


def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    =============== =================
    Pandas type     Table Schema type
    =============== =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          string
    categorical     any
    =============== =================

    Anything that matches none of the rows above is reported as ``any``.
    """
    # Order matters: the integer and boolean checks run before the
    # generic numeric check so they get their more specific names.
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
          is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'


def set_default_names(data):
    """
    Sets index names to 'index' for regular, or 'level_x' for Multi.

    Returns ``data`` itself when every index level is already named,
    otherwise a copy whose index carries the default names.
    """
    if all(name is not None for name in data.index.names):
        return data

    data = data.copy()
    if data.index.nlevels > 1:
        names = [name if name is not None else 'level_{}'.format(i)
                 for i, name in enumerate(data.index.names)]
        data.index.names = names
    else:
        data.index.name = data.index.name or 'index'
    return data


def _tz_name(tz):
    """Return a time zone's name for pytz (``zone``) and other tzinfo."""
    return getattr(tz, 'zone', None) or getattr(tz, 'key', None) or str(tz)


def make_field(arr, dtype=None):
    """
    Build the Table Schema field descriptor for one Series or Index.

    Parameters
    ----------
    arr : Series or Index
    dtype : dtype, optional
        Type to describe instead of ``arr.dtype``.

    Returns
    -------
    field : dict
        Always has ``name`` and ``type``. Categoricals add
        ``constraints`` and ``ordered``, periods add ``freq``, and
        timezone-aware datetimes add ``tz``.
    """
    # Compare against None explicitly: a falsy-but-valid label such as 0
    # or a falsy dtype object must not be replaced by the default.
    if dtype is None:
        dtype = arr.dtype
    name = 'values' if arr.name is None else arr.name
    field = {'name': name,
             'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_period_dtype(arr):
        field['freq'] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, 'dt'):
            field['tz'] = _tz_name(arr.dt.tz)
        else:
            field['tz'] = _tz_name(arr.tz)
    return field


def build_table_schema(data, index=True, primary_key=None, version=True):
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : list of column names or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that generated the schema.

    Returns
    -------
    schema : dict

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...     }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
    {'name': 'A', 'type': 'integer'},
    {'name': 'B', 'type': 'string'},
    {'name': 'C', 'type': 'datetime'}],
    'pandas_version': '0.20.0',
    'primaryKey': ['idx']}

    Notes
    -----
    See `as_json_table_type` for conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.
    """
    if index is True:
        data = set_default_names(data)

    schema = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            for level, name in zip(data.index.levels, data.index.names):
                field = make_field(level)
                if name is not None:
                    # Use the name recorded on the MultiIndex itself.
                    field['name'] = name
                fields.append(field)
        else:
            fields.append(make_field(data.index))

    if data.ndim > 1:
        # Positional access works with duplicate column labels and does
        # not depend on ``iteritems``.
        for position in range(data.shape[1]):
            fields.append(make_field(data.iloc[:, position]))
    else:
        fields.append(make_field(data))

    schema['fields'] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema['primaryKey'] = [data.index.name]
        else:
            schema['primaryKey'] = list(data.index.names)
    elif primary_key is not None:
        schema['primaryKey'] = primary_key

    if version:
        schema['pandas_version'] = '0.20.0'
    return schema
class TestBuildSchema(tm.TestCase):
    """Checks the dict produced by ``build_table_schema``."""

    def setUp(self):
        # One column per Table Schema type exercised below:
        # integer, string, datetime and duration.
        self.df = DataFrame(
            {'A': [1, 2, 3, 4],
             'B': ['a', 'b', 'c', 'c'],
             'C': pd.date_range('2016-01-01', freq='d', periods=4),
             'D': pd.timedelta_range('1H', periods=4, freq='T'),
             },
            index=pd.Index(range(4), name='idx'))

    def test_build_table_schema(self):
        result = build_table_schema(self.df, version=False)
        expected = {
            'fields': [{'name': 'idx', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['idx']
        }
        self.assertEqual(result, expected)
        result = build_table_schema(self.df)
        self.assertIn("pandas_version", result)

    def test_series(self):
        s = pd.Series([1, 2, 3], name='foo')
        result = build_table_schema(s, version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'foo', 'type': 'integer'}],
                    'primaryKey': ['index']}
        self.assertEqual(result, expected)
        result = build_table_schema(s)
        self.assertIn('pandas_version', result)

    # Was spelled ``tets_series_unnamed``, so it was never collected.
    def test_series_unnamed(self):
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'values', 'type': 'integer'}],
                    'primaryKey': ['index']}
        self.assertEqual(result, expected)

    def test_multiindex(self):
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)])
        df.index = idx

        # Unnamed levels fall back to ``level_<i>``.
        result = build_table_schema(df, version=False)
        expected = {
            'fields': [{'name': 'level_0', 'type': 'string'},
                       {'name': 'level_1', 'type': 'integer'},
                       {'name': 'A', 'type': 'integer'},
                       {'name': 'B', 'type': 'string'},
                       {'name': 'C', 'type': 'datetime'},
                       {'name': 'D', 'type': 'duration'},
                       ],
            'primaryKey': ['level_0', 'level_1']
        }
        self.assertEqual(result, expected)

        # A partially named MultiIndex keeps the given name and only
        # defaults the missing one.
        df.index.names = ['idx0', None]
        expected['fields'][0]['name'] = 'idx0'
        expected['primaryKey'] = ['idx0', 'level_1']
        result = build_table_schema(df, version=False)
        self.assertEqual(result, expected)
[np.int, np.int16, np.int32, np.int64] + for t in int_types: + self.assertEqual(as_json_table_type(np.array(int_data, dtype=t)), + 'integer') + + def test_as_json_table_type_float_data(self): + float_data = [1., 2., 3.] + float_types = [np.float, np.float16, np.float32, np.float64] + for t in float_types: + self.assertEqual(as_json_table_type(np.array(float_data, + dtype=t)), + 'number') + + def test_as_json_table_type_bool_data(self): + bool_data = [True, False] + bool_types = [bool, np.bool] + for t in bool_types: + self.assertEqual(as_json_table_type(np.array(bool_data, dtype=t)), + 'boolean') + + def test_as_json_table_type_date_data(self): + date_data = [pd.to_datetime(['2016']), + pd.to_datetime(['2016'], utc=True), + pd.Series(pd.to_datetime(['2016'])), + pd.Series(pd.to_datetime(['2016'], utc=True)), + pd.period_range('2016', freq='A', periods=3)] + for arr in date_data: + self.assertEqual(as_json_table_type(arr), 'datetime') + + def test_as_json_table_type_string_data(self): + strings = [pd.Series(['a', 'b']), pd.Index(['a', 'b'])] + for t in strings: + self.assertEqual(as_json_table_type(t), 'string') + + def test_as_json_table_type_categorical_data(self): + self.assertEqual(as_json_table_type(pd.Categorical(['a'])), 'any') + self.assertEqual(as_json_table_type(pd.Categorical([1])), 'any') + self.assertEqual(as_json_table_type( + pd.Series(pd.Categorical([1]))), 'any') + self.assertEqual(as_json_table_type(pd.CategoricalIndex([1])), 'any') + self.assertEqual(as_json_table_type(pd.Categorical([1])), 'any') + + # ------ + # dtypes + # ------ + def test_as_json_table_type_int_dtypes(self): + integers = [np.int, np.int16, np.int32, np.int64] + for t in integers: + self.assertEqual(as_json_table_type(t), 'integer') + + def test_as_json_table_type_float_dtypes(self): + floats = [np.float, np.float16, np.float32, np.float64] + for t in floats: + self.assertEqual(as_json_table_type(t), 'number') + + def test_as_json_table_type_bool_dtypes(self): + bools = [bool, 
np.bool] + for t in bools: + self.assertEqual(as_json_table_type(t), 'boolean') + + def test_as_json_table_type_date_dtypes(self): + # TODO: datedate.date? datetime.time? + dates = [np.datetime64, np.dtype("