From 4cd506e1bf371fce15359f736e8ed4199d408deb Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 24 Jul 2017 11:53:46 -0400 Subject: [PATCH 01/62] Add chunksize param to read_json when lines=True Previous behavior: read the whole file into memory, then split it into lines. New behavior: if lines=True and chunksize is passed, read `chunksize` lines at a time and concatenate the results. This only covers some kinds of input to read_json (file paths and file-like objects); a raw JSON string is still read in one pass. When chunksize is passed, read_json becomes slower but more memory-efficient. --- pandas/io/json/json.py | 52 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 5dae6099446d0..690f2b6a8da69 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -1,4 +1,6 @@ # pylint: disable-msg=E1101,W0613,W0603 +from itertools import islice +from pandas import concat import os import numpy as np @@ -175,7 +177,7 @@ def write(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False): + lines=False, chunksize=None): """ Convert a JSON string to pandas object @@ -264,6 +266,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 + chunksize: integer, default None + If `lines=True`, how many lines to read into memory at a time. + If this is None, the file will be read into memory all at once. + Passing a chunksize helps with memory usage, but is slower. + Also note this is different from the `chunksize` parameter in + `read_csv`, which returns a TextFileReader. + If the JSON input is a string, this argument has no effect. + Returns ------- result : Series or DataFrame, depending on the value of `typ`.
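A minimal sketch of the usage this patch enables (illustrative only: `data.jsonl` is a hypothetical line-delimited file, and at this point in the series the call returns a single concatenated DataFrame rather than an iterator):

    import pandas as pd
    # reads `chunksize` lines at a time and concatenates the chunks internally
    df = pd.read_json('data.jsonl', lines=True, chunksize=10000)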
@@ -323,6 +333,27 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + + def _read_json_as_lines(fh, chunksize): + return_val = None + while True: + lines = list(islice(fh, chunksize)) + + if lines: + lines_json = '[' + ','.join(lines) + ']' + obj = _get_obj(typ, lines_json, orient, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, + precise_float, date_unit) + if return_val is None: + return_val = obj + else: + return_val = concat([return_val, obj]) + + else: + break + fh.close() + return return_val + if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -335,12 +366,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if exists: fh, handles = _get_handle(filepath_or_buffer, 'r', encoding=encoding) - json = fh.read() - fh.close() + if lines and chunksize: + return _read_json_as_lines(fh, chunksize) + else: + json = fh.read() + fh.close() else: json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): - json = filepath_or_buffer.read() + if lines and chunksize: + return _read_json_as_lines(fh, chunksize) + else: + json = filepath_or_buffer.read() else: json = filepath_or_buffer @@ -350,6 +387,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, lines = list(StringIO(json.strip())) json = '[' + ','.join(lines) + ']' + return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, + keep_default_dates, numpy, precise_float, date_unit) + + +def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, + keep_default_dates, numpy, precise_float, + date_unit): obj = None if typ == 'frame': obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, From 9fe44f1c190959559e442451554643506f9505a3 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 17:55:34 -0700 Subject: [PATCH 02/62] Add read_json chunksize change to whatsnew --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 50f11c38bae23..ff736f5d4995d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -471,6 +471,7 @@ Other API Changes - :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) +- :func:`read_json` now accepts a ``chunksize`` parameter that can reduce memory usage when ``lines=True``. (:issue:`17048`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards.
Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) From 6de3a2726f2e7aced1ae761caaa0fa2f3d444db1 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 17:57:30 -0700 Subject: [PATCH 03/62] Add versionadded to docstring --- pandas/io/json/json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 690f2b6a8da69..3f1a35d763a0e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -274,6 +274,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, `read_csv`, which returns a FileTextReader. If the JSON input is a string, this argument has no effect. + .. versionadded:: 0.21.0 + Returns ------- result : Series or DataFrame, depending on the value of `typ`. From e235c702b0f5a9161cd330c23bc3c5eef4ced699 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:12:32 -0700 Subject: [PATCH 04/62] add docstring for _read_json_as_lines --- pandas/io/json/json.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 3f1a35d763a0e..39c30a11e82af 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -337,6 +337,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, encoding=encoding) def _read_json_as_lines(fh, chunksize): + """ + Read json lines from fh in chunks, then concatenate the resulting + pandas objects. + + Parameters + ---------- + fh : a file-like object + chunksize : integer + """ return_val = None while True: lines = list(islice(fh, chunksize)) From 0a5a8f99ac8980c0cbe4589828744e07a20baa1a Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:29:39 -0700 Subject: [PATCH 05/62] add basic read_json chunksize test --- pandas/tests/io/json/test_pandas.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 671d4248818e4..3b7055c8a4c99 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1032,6 +1032,15 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) + def test_read_jsonchunks(self): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + strio = df.to_json(lines=True, orient="records") + + unchunked = pd.read_json(strio, lines=True) + chunked = pd.read_json(strio, lines=True, chunksize=1) + + assert_frame_equal(chunked, unchunked) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From ce234447fd0fb1c58dd78fd9295b13efdf3bf379 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:33:49 -0700 Subject: [PATCH 06/62] validate read_json chunksize is an integer and >=1 --- pandas/io/json/json.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 39c30a11e82af..386e79ebeb239 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -11,6 +11,7 @@ from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, _stringify_path) +from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits @@ 
-336,6 +337,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + if chunksize is not None: + _validate_integer("chunksize", chunksize, 1) + def _read_json_as_lines(fh, chunksize): """ Read json lines from fh in chunks, then concatenate the resulting From a97ca0bec62062a04e5230f76820f7edf4571d71 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:41:14 -0700 Subject: [PATCH 07/62] Add more tests to read_json chunksize - Errors correctly when non-integer is passed as chunksize - Accepts a float that is close to an integer as chunksize --- pandas/tests/io/json/test_pandas.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3b7055c8a4c99..a9644a0e49fb5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1041,6 +1041,23 @@ def test_read_jsonchunks(self): assert_frame_equal(chunked, unchunked) + chunked_float = pd.read_json(strio, lines=True, chunksize=1.0) + assert_frame_equal(chunked_float, unchunked) + + msg = r"'chunksize' must be an integer >=1" + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize=0) + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize=-1) + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize=2.2) + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize='foo') + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 2861d0e611e676bf1842f03776f32da935cf0f17 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 15:44:56 -0700 Subject: [PATCH 08/62] Return JsonLineReader from read_json When chunksize is passed and lines=True, read_json now returns a JsonLineReader, which inherits from BaseIterator. Also, internally, wrap up read_json kwargs into a dictionary and pass them down opaquely to the SeriesParser or FrameParser. --- pandas/io/json/json.py | 84 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 386e79ebeb239..4dd8c03b53be3 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -10,7 +10,7 @@ from pandas import compat, isna from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, - _stringify_path) + _stringify_path, BaseIterator) from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError from pandas.io.formats.printing import pprint_thing @@ -337,38 +337,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + # These kwargs are only needed by the Parsers, so we just wrap them up and + # pass them down. + kwargs = {"typ": typ, "orient": orient, "dtype": dtype, + "convert_axes": convert_axes, "convert_dates": convert_dates, + "keep_default_dates": keep_default_dates, "numpy": numpy, + "precise_float": precise_float, "date_unit": date_unit} + if chunksize is not None: _validate_integer("chunksize", chunksize, 1) - def _read_json_as_lines(fh, chunksize): - """ - Read json lines from fh in chunks, then concatenate the resulting - pandas objects. 
- - Parameters - ---------- - fh : a file-like object - chunksize : integer - """ - return_val = None - while True: - lines = list(islice(fh, chunksize)) - - if lines: - lines_json = '[' + ','.join(lines) + ']' - obj = _get_obj(typ, lines_json, orient, dtype, convert_axes, - convert_dates, keep_default_dates, numpy, - precise_float, date_unit) - if return_val is None: - return_val = obj - else: - return_val = concat([return_val, obj]) - - else: - break - fh.close() - return return_val - if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -382,7 +360,7 @@ def _read_json_as_lines(fh, chunksize): fh, handles = _get_handle(filepath_or_buffer, 'r', encoding=encoding) if lines and chunksize: - return _read_json_as_lines(fh, chunksize) + return JsonLineReader(fh, chunksize, **kwargs) else: json = fh.read() fh.close() @@ -390,7 +368,7 @@ def _read_json_as_lines(fh, chunksize): json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return _read_json_as_lines(fh, chunksize) + return JsonLineReader(fh, chunksize, **kwargs) else: json = filepath_or_buffer.read() else: @@ -402,29 +380,49 @@ def _read_json_as_lines(fh, chunksize): lines = list(StringIO(json.strip())) json = '[' + ','.join(lines) + ']' - return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, date_unit) + return _get_obj(json, **kwargs) -def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit): +def _get_obj(json, **kwargs): + typ = kwargs['typ'] + dtype = kwargs['dtype'] + kwargs = {k: v for k, v in kwargs.items() if k != 'typ'} obj = None if typ == 'frame': - obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() + obj = FrameParser(json, **kwargs).parse() if typ == 'series' or obj is None: if not isinstance(dtype, bool): dtype = dict(data=dtype) - obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() + obj = SeriesParser(json, **kwargs).parse() return obj +class JsonLineReader(BaseIterator): + """ + Iterates over a JSON document that is formatted with one JSON record per + line. The `chunksize` initialization parameter controls how many lines are + read per iteration.
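+ + For example, iterating with `chunksize=2` over six input lines yields three two-row pandas objects.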
+ """ + def __init__(self, fh, chunksize, **kwargs): + self.fh = fh + self.chunksize = chunksize + self.kwargs = kwargs + + def __next__(self): + lines = list(islice(self.fh, self.chunksize)) + if lines: + lines_json = '[' + ','.join(lines) + ']' + return _get_obj(json=lines_json, **self.kwargs) + + else: + try: + self.fh.close() + except: + pass + return StopIteration + class Parser(object): _STAMP_UNITS = ('s', 'ms', 'us', 'ns') From da59b4a6e04533d832810107b3a80e452e987e70 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 15:49:09 -0700 Subject: [PATCH 09/62] Raise ValueError if chunksize is not None, but not lines --- pandas/io/json/json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4dd8c03b53be3..7e0d340a26c68 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -346,6 +346,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize is not None: _validate_integer("chunksize", chunksize, 1) + if not lines: + raise ValueError("chunksize should only be passed if lines=True") if isinstance(filepath_or_buffer, compat.string_types): try: From 4544f82439c16f9c235af537ecbea2b1af65d38c Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 15:56:43 -0700 Subject: [PATCH 10/62] Add issue number to test docstring and add test. Test that read_json raises exception if chunksize is passed and lines != True. --- pandas/tests/io/json/test_pandas.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a9644a0e49fb5..8229fc70080c4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1033,6 +1033,7 @@ def test_to_jsonl(self): assert_frame_equal(pd.read_json(result, lines=True), df) def test_read_jsonchunks(self): + # GH17048: memory usage when lines=True df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) strio = df.to_json(lines=True, orient="records") @@ -1058,6 +1059,11 @@ def test_read_jsonchunks(self): with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio, lines=True, chunksize='foo') + + msg = "chunksize should only be passed if lines=True" + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=False, chunksize=2) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From dad7f117cb66a5d30af582cf6f8cb953d18030fc Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 16:42:20 -0700 Subject: [PATCH 11/62] bugfix: raise StopIteration, dont return it --- pandas/io/json/json.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7e0d340a26c68..7e94c5d8ed251 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -1,6 +1,5 @@ # pylint: disable-msg=E1101,W0613,W0603 from itertools import islice -from pandas import concat import os import numpy as np @@ -423,7 +422,7 @@ def __next__(self): self.fh.close() except: pass - return StopIteration + raise StopIteration class Parser(object): From bb8b1b66d5cea9039ddb9d54c5635d4b1606db56 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 16:45:21 -0700 Subject: [PATCH 12/62] PEP8 cleanup --- pandas/io/json/json.py | 2 +- pandas/tests/io/json/test_pandas.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7e94c5d8ed251..d83441dddd7af 100644 --- a/pandas/io/json/json.py +++ 
b/pandas/io/json/json.py @@ -420,7 +420,7 @@ def __next__(self): else: try: self.fh.close() - except: + except IOError: pass raise StopIteration diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 8229fc70080c4..b3eea4081c02b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1059,7 +1059,6 @@ def test_read_jsonchunks(self): with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio, lines=True, chunksize='foo') - msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio, lines=False, chunksize=2) From 3e81bba44713dc457c9d16b497a016dd1035faa0 Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 08:31:42 -0700 Subject: [PATCH 13/62] Bugfixes for chunksize * use _validate_integer to cast chunksize to integer, instead of just checking * pass filepath_or_buffer to JsonLineReader where appropriate * JsonLineReader uses integer index * fix read_json chunksize tests to actually use StringIO and test Series --- pandas/io/json/json.py | 19 ++++++++++++--- pandas/tests/io/json/test_pandas.py | 37 ++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index d83441dddd7af..beb4a0dc3650e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -344,7 +344,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, "precise_float": precise_float, "date_unit": date_unit} if chunksize is not None: - _validate_integer("chunksize", chunksize, 1) + chunksize = _validate_integer("chunksize", chunksize, 1) if not lines: raise ValueError("chunksize should only be passed if lines=True") @@ -369,7 +369,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return JsonLineReader(fh, chunksize, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, **kwargs) else: json = filepath_or_buffer.read() else: @@ -405,17 +405,30 @@ class JsonLineReader(BaseIterator): Iterates over a JSON document that is formatted with one JSON record per line. The `chunksize` initialization parameter controls how many lines are read per iteration. + + We explicitly override the index on the return value so that the index of + the resulting object will be like `range(len(obj))`. If we didn't do this, + it would have index like `range(chunksize) * number_chunks.` + This is so that `read_json(lines=True)` will return an identical object to + `read_json(lines=True, chunksize=n)`. 
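+    For example, five records read with `chunksize=2` come back with index [0, 1], [2, 3] and [4], rather than [0, 1], [0, 1] and [0].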
""" def __init__(self, fh, chunksize, **kwargs): self.fh = fh self.chunksize = chunksize self.kwargs = kwargs + self.nrows_seen = 0 def __next__(self): lines = list(islice(self.fh, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' - return _get_obj(json=lines_json, **self.kwargs) + obj = _get_obj(json=lines_json, **self.kwargs) + + # Make sure that the returned objects have the right index + obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) + self.nrows_seen += len(obj) + + return obj else: try: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b3eea4081c02b..5bf512ec020e8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1035,33 +1035,52 @@ def test_to_jsonl(self): def test_read_jsonchunks(self): # GH17048: memory usage when lines=True df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - strio = df.to_json(lines=True, orient="records") - unchunked = pd.read_json(strio, lines=True) - chunked = pd.read_json(strio, lines=True, chunksize=1) + def get_strio(): + return StringIO(df.to_json(lines=True, orient="records")) + + def test_with_chunksize(c): + return pd.concat(pd.read_json(get_strio(), lines=True, chunksize=c)) + + unchunked = pd.read_json(get_strio(), lines=True) + + chunked = test_with_chunksize(1) assert_frame_equal(chunked, unchunked) - chunked_float = pd.read_json(strio, lines=True, chunksize=1.0) + chunked_float = test_with_chunksize(1.0) assert_frame_equal(chunked_float, unchunked) msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize=0) + test_with_chunksize(0) with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize=-1) + test_with_chunksize(-1) with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize=2.2) + test_with_chunksize(-2.2) with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize='foo') + test_with_chunksize('foo') msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=False, chunksize=2) + pd.read_json(get_strio(), lines=False, chunksize=2) + + # Test that reading in Series also works + s = pd.Series({'A': 1, 'B': 2}) + + strio = StringIO(s.to_json(lines=True, orient="records")) + unchunked = pd.read_json(strio, lines=True, typ='Series') + + strio = StringIO(s.to_json(lines=True, orient="records")) + chunked = pd.concat(pd.read_json( + strio, lines=True, typ='Series', chunksize=1 + )) + + assert_series_equal(chunked, unchunked) def test_latin_encoding(self): if compat.PY2: From e049d2911d32840aaca8e7c8ff90cdc6617dccf4 Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 08:43:59 -0700 Subject: [PATCH 14/62] add chunksize test for reading from file --- pandas/tests/io/json/test_pandas.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5bf512ec020e8..a8b4ad48786e2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1082,6 +1082,16 @@ def test_with_chunksize(c): assert_series_equal(chunked, unchunked) + chunks = list(pd.read_json(get_strio(), lines=True, chunksize=2)) + assert chunks[0].shape == (2, 2) + assert chunks[1].shape == (1, 2) + + with ensure_clean('test.json') as path: + df.to_json(path, lines=True, orient="records") + unchunked = 
pd.concat(pd.read_json(path, lines=True, chunksize=1)) + chunked = pd.read_json(path, lines=True) + assert_frame_equal(unchunked, chunked) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 400d313b8e548985f210ab73021d0802afec7b68 Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 09:26:15 -0700 Subject: [PATCH 15/62] pep8 cleanup --- pandas/io/json/json.py | 1 + pandas/tests/io/json/test_pandas.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index beb4a0dc3650e..d4de41b8553f9 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -437,6 +437,7 @@ def __next__(self): pass raise StopIteration + class Parser(object): _STAMP_UNITS = ('s', 'ms', 'us', 'ns') diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a8b4ad48786e2..f10250c734fe9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1040,7 +1040,8 @@ def get_strio(): return StringIO(df.to_json(lines=True, orient="records")) def test_with_chunksize(c): - return pd.concat(pd.read_json(get_strio(), lines=True, chunksize=c)) + iterator = pd.read_json(get_strio(), lines=True, chunksize=c) + return pd.concat(iterator) unchunked = pd.read_json(get_strio(), lines=True) From b756c90e45fee3cb30065940a537ec53cb797d9f Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 11:30:08 -0700 Subject: [PATCH 16/62] Run chunksize checks before file is opened Also, fix badly named vars in test. --- pandas/io/json/json.py | 10 +++++----- pandas/tests/io/json/test_pandas.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index d4de41b8553f9..1bb3a0b61f6a6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -333,6 +333,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ + if chunksize is not None: + chunksize = _validate_integer("chunksize", chunksize, 1) + if not lines: + raise ValueError("chunksize should only be passed if lines=True") + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) @@ -343,11 +348,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, "keep_default_dates": keep_default_dates, "numpy": numpy, "precise_float": precise_float, "date_unit": date_unit} - if chunksize is not None: - chunksize = _validate_integer("chunksize", chunksize, 1) - if not lines: - raise ValueError("chunksize should only be passed if lines=True") - if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f10250c734fe9..233fd8f939ff8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1089,8 +1089,8 @@ def test_with_chunksize(c): with ensure_clean('test.json') as path: df.to_json(path, lines=True, orient="records") - unchunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) - chunked = pd.read_json(path, lines=True) + chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) + unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) def test_latin_encoding(self): From b71f65b41ba64f9b84d18f5e79d5b5b4ca738034 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 16:52:23 -0400 Subject: [PATCH 17/62] move strio df 
in test to fixture --- pandas/tests/io/json/test_pandas.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 233fd8f939ff8..07360661e0227 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,6 +35,12 @@ _mixed_frame = _frame.copy() +@pytest.fixture +def strio_lines_json_df(): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + return StringIO(df.to_json(lines=True, orient="records")) + + class TestPandasContainer(object): def setup_method(self, method): @@ -1032,18 +1038,14 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_read_jsonchunks(self): + def test_read_jsonchunks(self, strio_lines_json_df): # GH17048: memory usage when lines=True - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - - def get_strio(): - return StringIO(df.to_json(lines=True, orient="records")) def test_with_chunksize(c): - iterator = pd.read_json(get_strio(), lines=True, chunksize=c) + iterator = pd.read_json(strio_lines_json_df, lines=True, chunksize=c) return pd.concat(iterator) - unchunked = pd.read_json(get_strio(), lines=True) + unchunked = pd.read_json(strio_lines_json_df, lines=True) chunked = test_with_chunksize(1) @@ -1068,7 +1070,7 @@ def test_with_chunksize(c): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(get_strio(), lines=False, chunksize=2) + pd.read_json(strio_lines_json_df, lines=False, chunksize=2) # Test that reading in Series also works s = pd.Series({'A': 1, 'B': 2}) @@ -1083,11 +1085,12 @@ def test_with_chunksize(c): assert_series_equal(chunked, unchunked) - chunks = list(pd.read_json(get_strio(), lines=True, chunksize=2)) + chunks = list(pd.read_json(strio_lines_json_df, lines=True, chunksize=2)) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df.to_json(path, lines=True, orient="records") chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) unchunked = pd.read_json(path, lines=True) From d6e86af38bb07f842f5d5731f5af1de183fac185 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 17:19:33 -0400 Subject: [PATCH 18/62] Improve read_json chunking tests * split into multiple tests * move helper functions out * add comments --- pandas/tests/io/json/test_pandas.py | 61 +++++++++++++++++------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 07360661e0227..4df7d74829c9e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -41,6 +41,9 @@ def strio_lines_json_df(): return StringIO(df.to_json(lines=True, orient="records")) +def json_lines_to_df_chunked(jlines, chunksize): + return pd.concat(pd.read_json(jlines, lines=True, chunksize=chunksize)) + class TestPandasContainer(object): def setup_method(self, method): @@ -1038,41 +1041,26 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_read_jsonchunks(self, strio_lines_json_df): + def test_readjson_chunks(self): + """Basic test that read_json(chunks=True) gives the same result as + read_json(chunks=False)""" # GH17048: memory usage when lines=True - def test_with_chunksize(c): - iterator = 
pd.read_json(strio_lines_json_df, lines=True, chunksize=c) - return pd.concat(iterator) - - unchunked = pd.read_json(strio_lines_json_df, lines=True) - - chunked = test_with_chunksize(1) + unchunked = pd.read_json(strio_lines_json_df(), lines=True) + chunked = json_lines_to_df_chunked(strio_lines_json_df(), 1) assert_frame_equal(chunked, unchunked) - chunked_float = test_with_chunksize(1.0) + chunked_float = json_lines_to_df_chunked(strio_lines_json_df(), 1.0) assert_frame_equal(chunked_float, unchunked) - msg = r"'chunksize' must be an integer >=1" - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize(0) - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize(-1) - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize(-2.2) - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize('foo') - + def test_readjson_chunksize_requires_lines(): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio_lines_json_df, lines=False, chunksize=2) + pd.read_json(strio_lines_json_df(), lines=False, chunksize=2) - # Test that reading in Series also works + def test_readjson_chunks_series(self): + """Test reading line-format JSON to Series with chunksize param""" s = pd.Series({'A': 1, 'B': 2}) strio = StringIO(s.to_json(lines=True, orient="records")) @@ -1085,10 +1073,15 @@ def test_with_chunksize(c): assert_series_equal(chunked, unchunked) - chunks = list(pd.read_json(strio_lines_json_df, lines=True, chunksize=2)) + def test_readjson_each_chunk(self): + """Other tests check that the final result of read_json(chunksize=True) + is correct. This checks that the intermediate chunks read in are correct. + """ + chunks = list(pd.read_json(strio_lines_json_df(), lines=True, chunksize=2)) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) + def test_readjson_chunks_from_file(self): with ensure_clean('test.json') as path: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df.to_json(path, lines=True, orient="records") @@ -1096,6 +1089,22 @@ def test_with_chunksize(c): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) + def test_readjson_invalid_chunksize(self): + msg = r"'chunksize' must be an integer >=1" + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), 0) + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), -1) + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), -2.2) + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), 'foo') + + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 4d912801615c899a770fbdd7115aa8bab7e56b87 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 17:27:15 -0400 Subject: [PATCH 19/62] bugfix in read_json tests, remove fixture --- pandas/tests/io/json/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4df7d74829c9e..582d5dfdb5b81 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,7 +35,6 @@ _mixed_frame = _frame.copy() -@pytest.fixture def strio_lines_json_df(): df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) return StringIO(df.to_json(lines=True, orient="records")) @@ -44,6 +43,7 @@ def strio_lines_json_df(): def 
json_lines_to_df_chunked(jlines, chunksize): return pd.concat(pd.read_json(jlines, lines=True, chunksize=chunksize)) + class TestPandasContainer(object): def setup_method(self, method): @@ -1054,7 +1054,7 @@ def test_readjson_chunks(self): chunked_float = json_lines_to_df_chunked(strio_lines_json_df(), 1.0) assert_frame_equal(chunked_float, unchunked) - def test_readjson_chunksize_requires_lines(): + def test_readjson_chunksize_requires_lines(self): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio_lines_json_df(), lines=False, chunksize=2) From 24744290bc2f6acc6e5dae3f63311809945c92e6 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 19:02:13 -0400 Subject: [PATCH 20/62] JsonLineReader opens and closes filepaths --- pandas/io/json/json.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 1bb3a0b61f6a6..0211e9accb9e7 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -358,18 +358,19 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, exists = False if exists: - fh, handles = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) if lines and chunksize: - return JsonLineReader(fh, chunksize, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, + encoding, **kwargs) else: + fh, handles = _get_handle(filepath_or_buffer, 'r', + encoding=encoding) json = fh.read() fh.close() else: json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, encoding, **kwargs) else: json = filepath_or_buffer.read() else: @@ -412,14 +413,22 @@ class JsonLineReader(BaseIterator): This is so that `read_json(lines=True)` will return an identical object to `read_json(lines=True, chunksize=n)`. 
""" - def __init__(self, fh, chunksize, **kwargs): - self.fh = fh + def __init__(self, filepath_or_buffer, chunksize, encoding, **kwargs): + + try: + self.iterator, _ = _get_handle(filepath_or_buffer, 'r', + encoding=encoding) + except: + if hasattr(filepath_or_buffer, 'read'): + self.iterator = filepath_or_buffer + else: + raise ValueError("cannot read json from given input") self.chunksize = chunksize self.kwargs = kwargs self.nrows_seen = 0 def __next__(self): - lines = list(islice(self.fh, self.chunksize)) + lines = list(islice(self.iterator, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' obj = _get_obj(json=lines_json, **self.kwargs) @@ -432,7 +441,7 @@ def __next__(self): else: try: - self.fh.close() + self.iterator.close() except IOError: pass raise StopIteration From b18b3df4ca20cae05e94bc0e5ab0a402f357d6de Mon Sep 17 00:00:00 2001 From: louispotok Date: Tue, 12 Sep 2017 11:55:06 -0400 Subject: [PATCH 21/62] pep8 cleanup --- pandas/io/json/json.py | 3 ++- pandas/tests/io/json/test_pandas.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 0211e9accb9e7..63c196178adbe 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -370,7 +370,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, encoding, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, encoding, + **kwargs) else: json = filepath_or_buffer.read() else: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 582d5dfdb5b81..40375c1745415 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1074,10 +1074,13 @@ def test_readjson_chunks_series(self): assert_series_equal(chunked, unchunked) def test_readjson_each_chunk(self): - """Other tests check that the final result of read_json(chunksize=True) - is correct. This checks that the intermediate chunks read in are correct. """ - chunks = list(pd.read_json(strio_lines_json_df(), lines=True, chunksize=2)) + Other tests check that the final result of read_json(chunksize=True) is + correct. This checks that the intermediate chunks read in are correct. 
+ """ + chunks = list( + pd.read_json(strio_lines_json_df(), lines=True, chunksize=2) + ) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -1104,7 +1107,6 @@ def test_readjson_invalid_chunksize(self): with tm.assert_raises_regex(ValueError, msg): json_lines_to_df_chunked(strio_lines_json_df(), 'foo') - def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 4c1d6a6ded14c0854a14f43eee1ec8d87c8bd1e9 Mon Sep 17 00:00:00 2001 From: louispotok Date: Wed, 13 Sep 2017 09:30:34 -0700 Subject: [PATCH 22/62] update whatsnew --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ff736f5d4995d..693531e35e88f 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -471,7 +471,7 @@ Other API Changes - :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) -- :func:`read_json` now accepts a ``chunksize`` parameter that can reduce memory usage when ``lines=True``. (:issue:`17048`) +- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) From d589b0ba33c5d5073f54da279dec7dd045263859 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 09:33:23 -0700 Subject: [PATCH 23/62] update docs on read_json chunksize --- doc/source/io.rst | 11 +++++++++++ pandas/io/json/json.py | 9 +++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d6abed6e9d1ad..77f631550eab7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1845,6 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` seconds, milliseconds, microseconds or nanoseconds respectively. - ``lines`` : reads file as one json object per line. - ``encoding`` : The encoding to use to decode py3 bytes. +- ``chunksize`` : when used in combination with ``lines=True``, return a JsonLineReader which reads in ``chunksize`` lines per iteration. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -2049,6 +2050,10 @@ Line delimited json pandas is able to read and write line-delimited json files that are common in data processing pipelines using Hadoop or Spark. +.. versionadded:: 0.21.0 + +For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream. + .. ipython:: python jsonl = ''' @@ -2059,6 +2064,12 @@ using Hadoop or Spark. df df.to_json(orient='records', lines=True) + # chunksize has no effect when reading a string. 
+ import io + reader = pd.read_json(io.StringIO(jsonl), lines=True, chunksize=1) + reader + for chunk in reader: + print(chunk) .. _io.table_schema: diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 63c196178adbe..c8c1b001bb8e4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -267,11 +267,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 chunksize: integer, default None - If `lines=True`, how many lines to read into memory at a time. + Return JsonLineReader object for iteration. + See the `line-delimited json docs + `_ + for more information on ``chunksize``. + This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - Passing a chunksize helps with memory usage, but is slower. - Also note this is different from the `chunksize` parameter in - `read_csv`, which returns a TextFileReader. If the JSON input is a string, this argument has no effect. From eba45a2524b03bb567782b6ce166b74fe97f0335 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 11:50:19 -0700 Subject: [PATCH 24/62] Always use JsonReader in read_json Either read it all before return, or return the JsonReader if chunksize is passed. --- doc/source/io.rst | 2 +- pandas/io/json/json.py | 180 +++++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 80 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 77f631550eab7..156e6649e8ebd 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1845,7 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` seconds, milliseconds, microseconds or nanoseconds respectively. - ``lines`` : reads file as one json object per line. - ``encoding`` : The encoding to use to decode py3 bytes. -- ``chunksize`` : when used in combination with ``lines=True``, return a JsonLineReader which reads in ``chunksize`` lines per iteration. +- ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index c8c1b001bb8e4..5dd419669c643 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -12,6 +12,7 @@ _stringify_path, BaseIterator) from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError +from pandas.core.reshape import concat from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits from .table_schema import build_table_schema @@ -267,7 +268,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 chunksize: integer, default None - Return JsonLineReader object for iteration. + Return JsonReader object for iteration. See the `line-delimited json docs `_ for more information on ``chunksize``. @@ -342,98 +343,119 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) - # These kwargs are only needed by the Parsers, so we just wrap them up and - # pass them down.
- kwargs = {"typ": typ, "orient": orient, "dtype": dtype, - "convert_axes": convert_axes, "convert_dates": convert_dates, - "keep_default_dates": keep_default_dates, "numpy": numpy, - "precise_float": precise_float, "date_unit": date_unit} - - if isinstance(filepath_or_buffer, compat.string_types): - try: - exists = os.path.exists(filepath_or_buffer) - - # if the filepath is too long will raise here - # 5874 - except (TypeError, ValueError): - exists = False - - if exists: - if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, - encoding, **kwargs) - else: - fh, handles = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) - json = fh.read() - fh.close() - else: - json = filepath_or_buffer - elif hasattr(filepath_or_buffer, 'read'): - if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, encoding, - **kwargs) - else: - json = filepath_or_buffer.read() - else: - json = filepath_or_buffer + json_reader = JsonReader( + filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, + convert_axes=convert_axes, convert_dates=convert_dates, + keep_default_dates=keep_default_dates, numpy=numpy, + precise_float=precise_float, date_unit=date_unit, encoding=encoding, + lines=lines, chunksize=chunksize + ) - if lines: - # If given a json lines file, we break the string into lines, add - # commas and put it in a json list to make a valid json object. - lines = list(StringIO(json.strip())) - json = '[' + ','.join(lines) + ']' + if chunksize: + return json_reader - return _get_obj(json, **kwargs) + else: + return json_reader.read() -def _get_obj(json, **kwargs): - typ = kwargs['typ'] - dtype = kwargs['dtype'] - kwargs = {k: v for k, v in kwargs.items() if k != 'typ'} - obj = None - if typ == 'frame': - obj = FrameParser(json, **kwargs).parse() +class JsonReader(BaseIterator): + """ + Reads a JSON document to a pandas object. - if typ == 'series' or obj is None: - if not isinstance(dtype, bool): - dtype = dict(data=dtype) - obj = SeriesParser(json, **kwargs).parse() + If initialized with ``lines=True`` and ``chunksize``, can be iterated over + ``chunksize`` lines at a time. + """ + def __init__( + self, filepath_or_buffer, orient, typ, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, precise_float, date_unit, + encoding, lines, chunksize, raw_json=False + ): - return obj + self.path_or_buf = filepath_or_buffer + self.orient = orient + self.typ = typ + self.dtype = dtype + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.keep_default_dates = keep_default_dates + self.numpy = numpy + self.precise_float = precise_float + self.date_unit = date_unit + self.encoding = encoding + self.lines = lines + self.chunksize = chunksize + self.nrows_seen = 0 + self.raw_json = False + if isinstance(filepath_or_buffer, compat.string_types): + try: + exists = os.path.exists(filepath_or_buffer) -class JsonLineReader(BaseIterator): - """ - Iterates over a JSON document that is formatted with one JSON record per - line. The `chunksize` initialization parameter controls how many lines are - read per iteration. - - We explicitly override the index on the return value so that the index of - the resulting object will be like `range(len(obj))`. If we didn't do this, - it would have index like `range(chunksize) * number_chunks.` - This is so that `read_json(lines=True)` will return an identical object to - `read_json(lines=True, chunksize=n)`. 
- """ - def __init__(self, filepath_or_buffer, chunksize, encoding, **kwargs): + # if the filepath is too long will raise here + # 5874 + except (TypeError, ValueError): + exists = False - try: - self.iterator, _ = _get_handle(filepath_or_buffer, 'r', + if exists: + self.data, _ = _get_handle(filepath_or_buffer, 'r', encoding=encoding) - except: - if hasattr(filepath_or_buffer, 'read'): - self.iterator = filepath_or_buffer else: - raise ValueError("cannot read json from given input") - self.chunksize = chunksize - self.kwargs = kwargs - self.nrows_seen = 0 + self.raw_json = True + self.data = filepath_or_buffer + elif hasattr(filepath_or_buffer, 'read'): + self.data = filepath_or_buffer + else: + self.raw_json = True + self.data = filepath_or_buffer + + if self.raw_json and lines: + self.data = self.combine_lines(self.data) + + def combine_lines(self, data): + """Combines a multi-line JSON document into a single document""" + # If given a json lines file, we break the string into lines, add + # commas and put it in a json list to make a valid json object. + lines = list(StringIO(data.strip())) + return '[' + ','.join(lines) + ']' + + def read(self): + """Read the whole JSON input into a pandas object""" + if self.raw_json: + return self._get_obj(self.data) + elif self.lines and self.chunksize: + return concat(self) + else: + if self.lines: + return self._get_obj(self.combine_lines(self.data.read())) + else: + return self._get_obj(self.data.read()) + + def _get_obj(self, json): + typ = self.typ + dtype = self.dtype + kwargs = { + "orient": self.orient, "dtype": self.dtype, + "convert_axes": self.convert_axes, + "convert_dates": self.convert_dates, + "keep_default_dates": self.keep_default_dates, "numpy": self.numpy, + "precise_float": self.precise_float, "date_unit": self.date_unit + } + obj = None + if typ == 'frame': + obj = FrameParser(json, **kwargs).parse() + + if typ == 'series' or obj is None: + if not isinstance(dtype, bool): + dtype = dict(data=dtype) + obj = SeriesParser(json, **kwargs).parse() + + return obj def __next__(self): - lines = list(islice(self.iterator, self.chunksize)) + lines = list(islice(self.data, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' - obj = _get_obj(json=lines_json, **self.kwargs) + obj = self._get_obj(lines_json) # Make sure that the returned objects have the right index obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) @@ -443,7 +465,7 @@ def __next__(self): else: try: - self.iterator.close() + self.data.close() except IOError: pass raise StopIteration From d0ea295996d62dbecb1357303d94535cd7bff1c9 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 12:04:31 -0700 Subject: [PATCH 25/62] make lines_json_df a fixture --- pandas/tests/io/json/test_pandas.py | 35 +++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 40375c1745415..a1f39ab78a25c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,13 +35,10 @@ _mixed_frame = _frame.copy() -def strio_lines_json_df(): +@pytest.fixture +def lines_json_df(): df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - return StringIO(df.to_json(lines=True, orient="records")) - - -def json_lines_to_df_chunked(jlines, chunksize): - return pd.concat(pd.read_json(jlines, lines=True, chunksize=chunksize)) + return df.to_json(lines=True, orient="records") class TestPandasContainer(object): @@ -1041,23 +1038,23 @@ def 
test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_readjson_chunks(self): + def test_readjson_chunks(self, lines_json_df): """Basic test that read_json(chunks=True) gives the same result as read_json(chunks=False)""" # GH17048: memory usage when lines=True - unchunked = pd.read_json(strio_lines_json_df(), lines=True) - chunked = json_lines_to_df_chunked(strio_lines_json_df(), 1) + unchunked = pd.read_json(StringIO(lines_json_df), lines=True) + chunked = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1)) assert_frame_equal(chunked, unchunked) - chunked_float = json_lines_to_df_chunked(strio_lines_json_df(), 1.0) + chunked_float = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0)) assert_frame_equal(chunked_float, unchunked) - def test_readjson_chunksize_requires_lines(self): + def test_readjson_chunksize_requires_lines(self, lines_json_df): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio_lines_json_df(), lines=False, chunksize=2) + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) def test_readjson_chunks_series(self): """Test reading line-format JSON to Series with chunksize param""" @@ -1073,13 +1070,13 @@ def test_readjson_chunks_series(self): assert_series_equal(chunked, unchunked) - def test_readjson_each_chunk(self): + def test_readjson_each_chunk(self, lines_json_df): """ Other tests check that the final result of read_json(chunksize=True) is correct. This checks that the intermediate chunks read in are correct. """ chunks = list( - pd.read_json(strio_lines_json_df(), lines=True, chunksize=2) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) ) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -1092,20 +1089,20 @@ def test_readjson_chunks_from_file(self): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) - def test_readjson_invalid_chunksize(self): + def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), 0) + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0)) with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), -1) + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1)) with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), -2.2) + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2)) with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), 'foo') + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo')) def test_latin_encoding(self): if compat.PY2: From bb3182d2e9d89210ec8b10a443db07dcd07141d1 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 12:11:51 -0700 Subject: [PATCH 26/62] remove unneeded concats in tests --- pandas/tests/io/json/test_pandas.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a1f39ab78a25c..c06e2659fb300 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1044,11 +1044,15 @@ def test_readjson_chunks(self, lines_json_df): # GH17048: memory usage when lines=True unchunked = 
pd.read_json(StringIO(lines_json_df), lines=True) - chunked = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1)) + chunked = pd.concat( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1) + ) assert_frame_equal(chunked, unchunked) - chunked_float = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0)) + chunked_float = pd.concat( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0) + ) assert_frame_equal(chunked_float, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): @@ -1093,16 +1097,16 @@ def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0)) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0) with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1)) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1) with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2)) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2) with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo')) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo') def test_latin_encoding(self): if compat.PY2: From 8cc43ff786232a11c344acbc82429561a56b52f6 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 12:14:50 -0700 Subject: [PATCH 27/62] parametrize some tests --- pandas/tests/io/json/test_pandas.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c06e2659fb300..5ffc9249f6899 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1043,17 +1043,14 @@ def test_readjson_chunks(self, lines_json_df): read_json(chunks=False)""" # GH17048: memory usage when lines=True - unchunked = pd.read_json(StringIO(lines_json_df), lines=True) - chunked = pd.concat( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1) - ) + for cs in [1, 1.0]: - assert_frame_equal(chunked, unchunked) + unchunked = pd.read_json(StringIO(lines_json_df), lines=True) + chunked = pd.concat( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) + ) - chunked_float = pd.concat( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0) - ) - assert_frame_equal(chunked_float, unchunked) + assert_frame_equal(chunked, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): msg = "chunksize should only be passed if lines=True" @@ -1096,17 +1093,9 @@ def test_readjson_chunks_from_file(self): def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0) - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1) - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2) - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo') + for cs in [0, -1, 2.2, 'foo']: + with tm.assert_raises_regex(ValueError, msg): + 
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) def test_latin_encoding(self): if compat.PY2: From de03462d6cd53663c498052a617f531f0de53a2b Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:30:38 -0700 Subject: [PATCH 28/62] add __close__ method to JsonReader and use it --- pandas/io/json/json.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 5dd419669c643..6bca67a28862d 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -363,7 +363,8 @@ class JsonReader(BaseIterator): Reads a JSON document to a pandas object. If initialized with ``lines=True`` and ``chunksize``, can be iterated over - ``chunksize`` lines at a time. + ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the + whole document. """ def __init__( self, filepath_or_buffer, orient, typ, dtype, convert_axes, @@ -426,9 +427,11 @@ def read(self): return concat(self) else: if self.lines: - return self._get_obj(self.combine_lines(self.data.read())) + to_return = self._get_obj(self.combine_lines(self.data.read())) else: - return self._get_obj(self.data.read()) + to_return = self._get_obj(self.data.read()) + self.__close__() + return to_return def _get_obj(self, json): typ = self.typ @@ -451,6 +454,12 @@ def _get_obj(self, json): return obj + def __close__(self): + try: + self.data.close() + except IOError: + pass + def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: @@ -464,10 +473,7 @@ def __next__(self): return obj else: - try: - self.data.close() - except IOError: - pass + self.__close__() raise StopIteration From 07b31c7f8c8c23c9b3900bf5b6aafbe7d1cae2b9 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:38:40 -0700 Subject: [PATCH 29/62] remove import io in docs --- doc/source/io.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 156e6649e8ebd..2bbe3f0738b92 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2065,8 +2065,7 @@ For line-delimited json files, pandas can also return an iterator which reads in df.to_json(orient='records', lines=True) # chunksize has no effect when reading a string. - import io - reader = pd.read_json(io.StringIO(jsonl), lines=True, chunksize=1) + reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: print(chunk) From 7d0642fd4ea9c5240382f4ea2f037ccc928c4c17 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:42:27 -0700 Subject: [PATCH 30/62] move read_json in whatsnew to Other Enhancements --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 693531e35e88f..d5d508d02cb73 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -162,6 +162,7 @@ Other Enhancements - :func:`MultiIndex.is_monotonic_decreasing` has been implemented. Previously returned ``False`` in all cases. (:issue:`16554`) - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`) - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) +- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. 
If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names @@ -471,7 +472,6 @@ Other API Changes - :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) -- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) From 398961b96630feca2fd06372409328bced65b198 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:57:01 -0700 Subject: [PATCH 31/62] move chunksize and lines validation into JsonReader --- pandas/io/json/json.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 6bca67a28862d..843039392dcb2 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -335,11 +335,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if chunksize is not None: - chunksize = _validate_integer("chunksize", chunksize, 1) - if not lines: - raise ValueError("chunksize should only be passed if lines=True") - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) @@ -388,6 +383,11 @@ def __init__( self.nrows_seen = 0 self.raw_json = False + if self.chunksize is not None: + self.chunksize = _validate_integer("chunksize", self.chunksize, 1) + if not self.lines: + raise ValueError("chunksize should only be passed if lines=True") + if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) From dfa29671e6a1b49c44dac2eedb4eb3a273cc5254 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:57:58 -0700 Subject: [PATCH 32/62] remove extraneous else --- pandas/io/json/json.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 843039392dcb2..47e3474ac6a54 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -349,8 +349,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize: return json_reader - else: - return json_reader.read() + return json_reader.read() class JsonReader(BaseIterator): From b0e4bb010298f9acd28bc235834b3c7a7376ac82 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:05:54 -0700 Subject: [PATCH 33/62] remove unnecessary cast to list --- pandas/io/json/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index
47e3474ac6a54..0d12ccdd4ade8 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -415,7 +415,7 @@ def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. - lines = list(StringIO(data.strip())) + lines = StringIO(data.strip()) return '[' + ','.join(lines) + ']' def read(self): From e3197c5546310035ed3dd74ffeebfc4bd36f29b7 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:13:12 -0700 Subject: [PATCH 34/62] move combine_lines call into read --- pandas/io/json/json.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 0d12ccdd4ade8..bf2f1426be8bc 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -408,9 +408,6 @@ def __init__( self.raw_json = True self.data = filepath_or_buffer - if self.raw_json and lines: - self.data = self.combine_lines(self.data) - def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" # If given a json lines file, we break the string into lines, add @@ -421,7 +418,10 @@ def combine_lines(self, data): def read(self): """Read the whole JSON input into a pandas object""" if self.raw_json: - return self._get_obj(self.data) + if self.lines: + return self._get_obj(self.combine_lines(self.data)) + else: + return self._get_obj(self.data) elif self.lines and self.chunksize: return concat(self) else: From 39f9881f08108a382c2fb1f750ecffd8cc4f7c69 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:15:00 -0700 Subject: [PATCH 35/62] remove another extraneous else --- pandas/io/json/json.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index bf2f1426be8bc..6fe5447258982 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -403,7 +403,7 @@ def __init__( self.raw_json = True self.data = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): - self.data = filepath_or_buffer + self.data = filepath_or_buffer else: self.raw_json = True self.data = filepath_or_buffer @@ -422,15 +422,15 @@ def read(self): return self._get_obj(self.combine_lines(self.data)) else: return self._get_obj(self.data) - elif self.lines and self.chunksize: + if self.lines and self.chunksize: return concat(self) + + if self.lines: + to_return = self._get_obj(self.combine_lines(self.data.read())) else: - if self.lines: - to_return = self._get_obj(self.combine_lines(self.data.read())) - else: - to_return = self._get_obj(self.data.read()) - self.__close__() - return to_return + to_return = self._get_obj(self.data.read()) + self.__close__() + return to_return def _get_obj(self, json): typ = self.typ From c2247c314e53058460290d3ab5fd66ca0087efed Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:26:53 -0700 Subject: [PATCH 36/62] always close JsonReader --- pandas/io/json/json.py | 36 ++++++++++++++++------------- pandas/tests/io/json/test_pandas.py | 2 +- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 6fe5447258982..29f38bb6753c4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -385,7 +385,7 @@ def __init__( if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: - raise ValueError("chunksize should only 
be passed if lines=True") + raise ValueError("chunksize can only be passed if lines=True") if isinstance(filepath_or_buffer, compat.string_types): try: @@ -419,20 +419,24 @@ def read(self): """Read the whole JSON input into a pandas object""" if self.raw_json: if self.lines: - return self._get_obj(self.combine_lines(self.data)) + obj = self._get_object_parser(self.combine_lines(self.data)) else: - return self._get_obj(self.data) - if self.lines and self.chunksize: - return concat(self) - - if self.lines: - to_return = self._get_obj(self.combine_lines(self.data.read())) + obj = self._get_object_parser(self.data) + elif self.lines and self.chunksize: + obj = concat(self) else: - to_return = self._get_obj(self.data.read()) - self.__close__() - return to_return - def _get_obj(self, json): + if self.lines: + obj = self._get_object_parser( + self.combine_lines(self.data.read()) + ) + else: + obj = self._get_object_parser(self.data.read()) + self.close() + return obj + + def _get_object_parser(self, json): + """parses a json document into a pandas object""" typ = self.typ dtype = self.dtype kwargs = { @@ -453,17 +457,17 @@ def _get_obj(self, json): return obj - def __close__(self): + def close(self): try: self.data.close() - except IOError: + except (IOError, AttributeError): pass def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' - obj = self._get_obj(lines_json) + obj = self._get_object_parser(lines_json) # Make sure that the returned objects have the right index obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) @@ -472,7 +476,7 @@ def __next__(self): return obj else: - self.__close__() + self.close() raise StopIteration diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5ffc9249f6899..b4ad9264e4dd2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1053,7 +1053,7 @@ def test_readjson_chunks(self, lines_json_df): assert_frame_equal(chunked, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): - msg = "chunksize should only be passed if lines=True" + msg = "chunksize can only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) From 46d8a685c1967bdfd302031112bd4cac0d9e77e3 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:43:34 -0700 Subject: [PATCH 37/62] add test that read_json closes file correctly --- pandas/tests/io/json/test_pandas.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b4ad9264e4dd2..fb71a293fef8e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1090,6 +1090,19 @@ def test_readjson_chunks_from_file(self): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) + def test_readjson_chunks_closes(self): + for chunksize in [None, 1]: + with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + f = open(path, 'r') + if chunksize is not None: + pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) + else: + pd.read_json(f, lines=True) + assert f.closed, \ + "didn't close file with chunksize = %s" % chunksize + def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" From 
066e26dc5617e2a49f0365299b4faa09e428bca3 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:45:16 -0700 Subject: [PATCH 38/62] minor formatting fixups --- pandas/io/json/json.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 29f38bb6753c4..9b5f2f67921f9 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -360,11 +360,9 @@ class JsonReader(BaseIterator): ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the whole document. """ - def __init__( - self, filepath_or_buffer, orient, typ, dtype, convert_axes, - convert_dates, keep_default_dates, numpy, precise_float, date_unit, - encoding, lines, chunksize, raw_json=False - ): + def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, precise_float, + date_unit, encoding, lines, chunksize, raw_json=False): self.path_or_buf = filepath_or_buffer self.orient = orient @@ -391,8 +389,7 @@ def __init__( try: exists = os.path.exists(filepath_or_buffer) - # if the filepath is too long will raise here - # 5874 + # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): exists = False @@ -469,7 +466,7 @@ def __next__(self): lines_json = '[' + ','.join(lines) + ']' obj = self._get_object_parser(lines_json) - # Make sure that the returned objects have the right index + # Make sure that the returned objects have the right index. obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) self.nrows_seen += len(obj) From 08e8b6cd06f779d766367bf666fadc5e12e404c5 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:53:02 -0700 Subject: [PATCH 39/62] remove extraneous else --- pandas/io/json/json.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 9b5f2f67921f9..762729d34e51a 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -472,9 +472,8 @@ def __next__(self): return obj - else: - self.close() - raise StopIteration + self.close() + raise StopIteration class Parser(object): From 1ac6953bd380b790b42887c8df329a790da801af Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 12:43:14 -0700 Subject: [PATCH 40/62] add benchmarks for read_json --- asv_bench/benchmarks/packers.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 24f80cc836dd4..743e1756e26fd 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -85,6 +85,25 @@ def time_packers_read_json(self): pd.read_json(self.f, orient='split') +class packers_read_json_lines(_Packers): + + def setup(self): + self._setup() + self.df.to_json(self.f, orient="records", lines=True) + + def time_packers_read_json_lines(self): + pd.read_json(self.f, lines=True) + +class packers_read_json_lines_chunks(_Packers): + def setup(self): + self._setup() + self.df.to_json('abc.json', orient="records", lines=True) + self.df.index = np.arange(self.N) + + def time_packers_read_json_lines_chunks(self): + chunksize = int(self.C / 5.0) + pd.read_json('abc.json', lines=True, chunksize=chunksize) + class packers_read_json_date_index(_Packers): def setup(self): From 0782df968b7d34818a535536e2adc4fe0fd6549d Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:09:41 -0700 Subject: [PATCH 41/62] update benchmarks --- asv_bench/benchmarks/packers.py | 5 ++--- 1 file 
changed, 2 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 743e1756e26fd..f3b298041147a 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -97,12 +97,11 @@ def time_packers_read_json_lines(self): class packers_read_json_lines_chunks(_Packers): def setup(self): self._setup() - self.df.to_json('abc.json', orient="records", lines=True) - self.df.index = np.arange(self.N) + self.df.to_json(self.f, orient="records", lines=True) def time_packers_read_json_lines_chunks(self): chunksize = int(self.C / 5.0) - pd.read_json('abc.json', lines=True, chunksize=chunksize) + next(iter(pd.read_json(self.f, lines=True, chunksize=chunksize))) class packers_read_json_date_index(_Packers): From 014d493b51c0d3d4442c2ef4138c5824864bf5b0 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:20:31 -0700 Subject: [PATCH 42/62] move json_lines tests to io_bench --- asv_bench/benchmarks/io_bench.py | 15 +++++++++++++++ asv_bench/benchmarks/packers.py | 18 ------------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 52064d2cdb8a2..12d1182cafc4c 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -192,3 +192,18 @@ def time_read_nrows(self, compression, engine): ext = ".bz2" pd.read_csv(self.big_fname + ext, nrows=10, compression=compression, engine=engine) + +class read_json_lines(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.C = 5 + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) + self.df.to_json("__test__.json",orient="records",lines=True) + + def time_read_json_lines(self): + pd.read_json("__test__.json", lines=True) + + def time_read_json_lines_chunk(self): + pd.read_json("__test__.json", lines=True, chunksize=self.N/4) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index f3b298041147a..24f80cc836dd4 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -85,24 +85,6 @@ def time_packers_read_json(self): pd.read_json(self.f, orient='split') -class packers_read_json_lines(_Packers): - - def setup(self): - self._setup() - self.df.to_json(self.f, orient="records", lines=True) - - def time_packers_read_json_lines(self): - pd.read_json(self.f, lines=True) - -class packers_read_json_lines_chunks(_Packers): - def setup(self): - self._setup() - self.df.to_json(self.f, orient="records", lines=True) - - def time_packers_read_json_lines_chunks(self): - chunksize = int(self.C / 5.0) - next(iter(pd.read_json(self.f, lines=True, chunksize=chunksize))) class packers_read_json_date_index(_Packers): def setup(self): From a913d8e011be316f9b87134c45fa3e87d3bd44a2 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:27:56 -0700 Subject: [PATCH 43/62] add peakmem for jsonlines --- asv_bench/benchmarks/io_bench.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 12d1182cafc4c..b5b9de3d85f6c 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -193,6 +193,7 @@ def time_read_nrows(self, compression, engine): pd.read_csv(self.big_fname + ext, nrows=10, compression=compression, engine=engine) + + class read_json_lines(object): goal_time = 0.2 @@ -206,4 +207,10 @@ def time_read_json_lines(self):
pd.read_json("__test__.json", lines=True) def time_read_json_lines_chunk(self): - pd.read_json("__test__.json", lines=True, chunksize=self.N/4) + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) + + def peakmem_read_json_lines(self): + pd.read_json("__test__.json", lines=True) + + def peakmem_read_json_lines_chunk(self): + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) From ce7aef6fe6d957f01f6f82105cf64a72134ad345 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:35:16 -0700 Subject: [PATCH 44/62] smaller benchmark --- asv_bench/benchmarks/io_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index b5b9de3d85f6c..aefced4a42a6f 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -198,7 +198,7 @@ class read_json_lines(object): goal_time = 0.2 def setup(self): - self.N = 1000000 + self.N = 100000 self.C = 5 self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) self.df.to_json("__test__.json",orient="records",lines=True) From 1dc15266ff7e68a667588967ea7c9aa3ff887d44 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 09:47:37 -0700 Subject: [PATCH 45/62] refactor JsonReader --- pandas/io/json/json.py | 88 ++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 762729d34e51a..58037d993c1c6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -12,7 +12,7 @@ _stringify_path, BaseIterator) from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError -from pandas.core.reshape import concat +from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits from .table_schema import build_table_schema @@ -378,13 +378,48 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 - self.raw_json = False if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") + self.fp_or_buf = filepath_or_buffer + data = self._get_data_from_filepath(filepath_or_buffer) + self.data = self._preprocess_data(data) + + def _preprocess_data(self, data): + """ + At this point, the data either has a `read` attribute (e.g. a file + object or a StringIO) or is a string that is a JSON document. + """ + if hasattr(data, 'read'): + if self.chunksize: + data = data + else: + data = data.read() + + else: + if self.chunksize: + data = StringIO(data) + else: + data = data + + return data + + def _get_data_from_filepath(self, filepath_or_buffer): + """ + read_json accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. JSON string + + This function turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. 
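+
+        For example (an illustrative sketch; 'data.json' is a stand-in
+        path, not something defined in this patch):
+
+            read_json('data.json', lines=True)             # 1. filepath
+            read_json(open('data.json'), lines=True)       # 2. file-like object
+            read_json('{"a": 1}\n{"a": 2}\n', lines=True)  # 3. JSON string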
+ """ + + data = None + if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -394,16 +429,13 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, exists = False if exists: - self.data, _ = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) - else: - self.raw_json = True - self.data = filepath_or_buffer - elif hasattr(filepath_or_buffer, 'read'): - self.data = filepath_or_buffer - else: - self.raw_json = True - self.data = filepath_or_buffer + data, _ = _get_handle(filepath_or_buffer, 'r', + encoding=self.encoding) + + if not data: + data = filepath_or_buffer + + return data def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" @@ -414,21 +446,12 @@ def combine_lines(self, data): def read(self): """Read the whole JSON input into a pandas object""" - if self.raw_json: - if self.lines: - obj = self._get_object_parser(self.combine_lines(self.data)) - else: - obj = self._get_object_parser(self.data) - elif self.lines and self.chunksize: + if self.lines and self.chunksize: obj = concat(self) + elif self.lines: + obj = self._get_object_parser(self.combine_lines(self.data)) else: - - if self.lines: - obj = self._get_object_parser( - self.combine_lines(self.data.read()) - ) - else: - obj = self._get_object_parser(self.data.read()) + obj = self._get_object_parser(self.data) self.close() return obj @@ -455,14 +478,29 @@ def _get_object_parser(self, json): return obj def close(self): + """ + If self.chunksize, self.data may need closing. + If not, self.fp_or_buff may need closing. + """ try: self.data.close() except (IOError, AttributeError): pass + try: + self.fp_or_buf.close() + except(IOError, AttributeError): + pass + def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: + + # _get_object_parser can't handle multiple empty lines, so we just + # pass it one and it will correctly return an empty object + if all(line=="\n" for line in lines): + lines = lines[0] + lines_json = '[' + ','.join(lines) + ']' obj = self._get_object_parser(lines_json) From 03b6069c8222a3c40655eb4624e9e440aa98eb6b Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:07:14 -0700 Subject: [PATCH 46/62] add test for reading with multiple empty lines --- pandas/tests/io/json/test_pandas.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fb71a293fef8e..fe9950485c7fa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1110,6 +1110,28 @@ def test_readjson_invalid_chunksize(self, lines_json_df): with tm.assert_raises_regex(ValueError, msg): pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) + def test_readjson_chunks_multiple_empty_lines(self): + j = """ + + {"A":1,"B":4} + + + + {"A":2,"B":5} + + + + + + + + {"A":3,"B":6} + """ + for chunksize in [None, 1, 2]: + orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + test = pd.concat(pd.read_json(j, lines=True, chunksize=chunksize)) + tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From aef6bbc500605f4fad97ec2fe997cf0c2c21432c Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:50:11 -0700 Subject: [PATCH 47/62] add support for JSON docs with multiple consecutive newlines --- pandas/io/json/json.py | 9 +++------ pandas/tests/io/json/test_pandas.py | 4 
+++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 58037d993c1c6..f46f4d87a30f4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -441,7 +441,8 @@ def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. - lines = StringIO(data.strip()) + + lines = filter(None, data.strip().split('\n')) return '[' + ','.join(lines) + ']' def read(self): @@ -496,11 +497,7 @@ def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: - # _get_object_parser can't handle multiple empty lines, so we just - # pass it one and it will correctly return an empty object - if all(line=="\n" for line in lines): - lines = lines[0] - + lines = filter(None, map(lambda x: x.strip(), lines)) lines_json = '[' + ','.join(lines) + ']' obj = self._get_object_parser(lines_json) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fe9950485c7fa..0733eb8e9526f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1129,7 +1129,9 @@ def test_readjson_chunks_multiple_empty_lines(self): """ for chunksize in [None, 1, 2]: orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - test = pd.concat(pd.read_json(j, lines=True, chunksize=chunksize)) + test = pd.read_json(j, lines=True, chunksize=chunksize) + if chunksize is not None: + test = pd.concat(test) tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) def test_latin_encoding(self): From 30e40436a353376c81c208734a6798ac39e7b356 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:51:04 -0700 Subject: [PATCH 48/62] remove raw_json init param --- pandas/io/json/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index f46f4d87a30f4..efe4fc9010a64 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -362,7 +362,7 @@ class JsonReader(BaseIterator): """ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, - date_unit, encoding, lines, chunksize, raw_json=False): + date_unit, encoding, lines, chunksize): self.path_or_buf = filepath_or_buffer self.orient = orient From 7dae78a9bf938479fa9e7bf57adcedeaaca8bf33 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:55:31 -0700 Subject: [PATCH 49/62] DRY for combining lines --- pandas/io/json/json.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index efe4fc9010a64..7083d82dd9d63 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -437,20 +437,20 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def combine_lines(self, data): - """Combines a multi-line JSON document into a single document""" - # If given a json lines file, we break the string into lines, add - # commas and put it in a json list to make a valid json object. 
- - lines = filter(None, data.strip().split('\n')) + def combine_lines(self, lines): + """Combines a list of JSON objects into one JSON object""" + lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']' + def read(self): """Read the whole JSON input into a pandas object""" if self.lines and self.chunksize: obj = concat(self) elif self.lines: obj = self._get_object_parser( - self.combine_lines(self.data) + self.combine_lines(self.data.split('\n')) ) else: obj = self._get_object_parser(self.data) self.close() @@ -496,9 +496,7 @@ def close(self): def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: - - lines = filter(None, map(lambda x: x.strip(), lines)) - lines_json = '[' + ','.join(lines) + ']' + lines_json = self.combine_lines(lines) obj = self._get_object_parser(lines_json) From fe95445eb337c82da8c95ef6a50f607fccd05f0b Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:59:27 -0700 Subject: [PATCH 50/62] use floor division in asv bench --- asv_bench/benchmarks/io_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index aefced4a42a6f..a735c7bf88a81 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -207,10 +207,10 @@ def time_read_json_lines(self): pd.read_json("__test__.json", lines=True) def time_read_json_lines_chunk(self): - pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N//4)) def peakmem_read_json_lines(self): pd.read_json("__test__.json", lines=True) def peakmem_read_json_lines_chunk(self): - pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N//4)) From e41124af9f442ebd84c89d8f4b00676d70ba39da Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 11:03:04 -0700 Subject: [PATCH 51/62] add teardown to asv bench --- asv_bench/benchmarks/io_bench.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index a735c7bf88a81..93273955a29b9 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -1,3 +1,4 @@ +import os from .pandas_vb_common import * from pandas import concat, Timestamp, compat try: @@ -196,21 +197,28 @@ def time_read_nrows(self, compression, engine): class read_json_lines(object): goal_time = 0.2 + fname = "__test__.json" def setup(self): self.N = 100000 self.C = 5 self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) - self.df.to_json("__test__.json",orient="records",lines=True) + self.df.to_json(self.fname,orient="records",lines=True) + + def teardown(self): + try: + os.remove(self.fname) + except OSError: + pass def time_read_json_lines(self): - pd.read_json("__test__.json", lines=True) + pd.read_json(self.fname, lines=True) def time_read_json_lines_chunk(self): - pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N//4)) + pd.concat(pd.read_json(self.fname, lines=True,
chunksize=self.N//4)) + pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4)) From 9cfd012ee393d08634c0c7ed159da2b149e442e5 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 11:09:20 -0700 Subject: [PATCH 52/62] add docs --- pandas/io/json/json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7083d82dd9d63..58f9d1070d6c4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -392,6 +392,9 @@ def _preprocess_data(self, data): """ At this point, the data either has a `read` attribute (e.g. a file object or a StringIO) or is a string that is a JSON document. + + If self.chunksize, we want to prepare the data for the `__next__` + method. Otherwise, we want to read it into memory for the `read` method. """ if hasattr(data, 'read'): if self.chunksize: @@ -442,7 +445,6 @@ def combine_lines(self, lines): lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']' - def read(self): """Read the whole JSON input into a pandas object""" if self.lines and self.chunksize: From 035ca84b1296eb181268a366560209351ff007cf Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 13:59:48 -0700 Subject: [PATCH 53/62] pep fixup --- pandas/io/json/json.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 58f9d1070d6c4..84fba8d751abe 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -393,8 +393,8 @@ def _preprocess_data(self, data): At this point, the data either has a `read` attribute (e.g. a file object or a StringIO) or is a string that is a JSON document. - If self.chunksize, we want to prepare the data for the `__next__` - method. Otherwise, we want to read it into memory for the `read` method. + If self.chunksize, we prepare the data for the `__next__` method. + Otherwise, we read it into memory for the `read` method. """ if hasattr(data, 'read'): if self.chunksize: @@ -417,7 +417,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): 2. file-like object (e.g. open file object, StringIO) 3. JSON string - This function turns (1) into (2) to simplify the rest of the processing. + This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. """ From 4c92287e04b2d183dfc52b024ed37aadfb4251b8 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 17:50:08 -0700 Subject: [PATCH 54/62] update documentation --- doc/source/io.rst | 1 - pandas/io/json/json.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 2bbe3f0738b92..55ca43b0d03c3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2064,7 +2064,6 @@ For line-delimited json files, pandas can also return an iterator which reads in df df.to_json(orient='records', lines=True) - # chunksize has no effect when reading a string. reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 84fba8d751abe..382e40e081437 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -274,7 +274,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - If the JSON input is a string, this argument has no effect. .. 
versionadded:: 0.21.0 @@ -354,7 +353,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class JsonReader(BaseIterator): """ - Reads a JSON document to a pandas object. + JsonReader provides an interface for reading in a JSON file. If initialized with ``lines=True`` and ``chunksize``, can be iterated over ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the From 61178be8aee8475537669fbc95307b6bf2ba72d8 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 17:54:44 -0700 Subject: [PATCH 55/62] simplify JsonReader._preprocess_data --- pandas/io/json/json.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 382e40e081437..fe79f596cec7c 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -395,17 +395,10 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, 'read'): - if self.chunksize: - data = data - else: - data = data.read() - - else: - if self.chunksize: - data = StringIO(data) - else: - data = data + if hasattr(data, 'read') and not self.chunksize: + data = data.read() + if not hasattr(data, 'read') and self.chunksize: + data = StringIO(data) return data From a2841875de1e7210cf4a3d65062447221837692e Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 17:58:27 -0700 Subject: [PATCH 56/62] simplify _get_data_from_filepath --- pandas/io/json/json.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index fe79f596cec7c..4feea78d037e6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -413,22 +413,20 @@ def _get_data_from_filepath(self, filepath_or_buffer): It returns input types (2) and (3) unchanged. """ - data = None + data = filepath_or_buffer - if isinstance(filepath_or_buffer, compat.string_types): + if isinstance(data, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): - exists = False - - if exists: - data, _ = _get_handle(filepath_or_buffer, 'r', - encoding=self.encoding) + pass - if not data: - data = filepath_or_buffer + else: + if exists: + data, _ = _get_handle(filepath_or_buffer, 'r', + encoding=self.encoding) return data From 55170ddb08b90d4a22c787f76e9e2e7f76317d59 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 18:13:41 -0700 Subject: [PATCH 57/62] Update read_json tests Split out tests with lines=True into separate test class Parametrize tests Replace """ comments with #. 
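
For reference, the parametrized shape of the invalid-chunksize test (a sketch; names exactly as in the diff below) is:

    @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
    def test_readjson_invalid_chunksize(self, lines_json_df, chunksize):
        msg = r"'chunksize' must be an integer >=1"
        with tm.assert_raises_regex(ValueError, msg):
            pd.read_json(StringIO(lines_json_df), lines=True,
                         chunksize=chunksize)

pytest then reports each chunksize value as its own test case instead of stopping at the first failing value inside a hand-written loop.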
--- pandas/tests/io/json/test_pandas.py | 183 ++++++++++++++-------------- 1 file changed, 92 insertions(+), 91 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0733eb8e9526f..6d2a1c0a01706 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -991,6 +991,62 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp + def test_latin_encoding(self): + if compat.PY2: + tm.assert_raises_regex( + TypeError, r'\[unicode\] is not implemented as a table column') + return + + # GH 13774 + pytest.skip("encoding not implemented in .to_json(), " + "xref #13774") + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + + # not sure how to remove latin-1 from code in python 2 and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(Series(val, dtype=dtype)) + + def roundtrip(s, encoding='latin-1'): + with ensure_clean('test.json') as path: + s.to_json(path, encoding=encoding) + retr = read_json(path, encoding=encoding) + assert_series_equal(s, retr, check_categorical=False) + + for s in examples: + roundtrip(s) + + def test_data_frame_size_after_to_json(self): + # GH15344 + df = DataFrame({'a': [str(1)]}) + + size_before = df.memory_usage(index=True, deep=True).sum() + df.to_json() + size_after = df.memory_usage(index=True, deep=True).sum() + + assert size_before == size_after + + +class TestPandasJsonLines(object): + def test_read_jsonl(self): # GH9180 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) @@ -1038,19 +1094,18 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_readjson_chunks(self, lines_json_df): - """Basic test that read_json(chunks=True) gives the same result as - read_json(chunks=False)""" + @pytest.mark.parametrize("chunksize", [1, 1.0]) + def test_readjson_chunks(self, lines_json_df, chunksize): + # Basic test that read_json(chunks=True) gives the same result as + # read_json(chunks=False) # GH17048: memory usage when lines=True - for cs in [1, 1.0]: + unchunked = pd.read_json(StringIO(lines_json_df), lines=True) + reader = pd.read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) + chunked = pd.concat(reader) - unchunked = pd.read_json(StringIO(lines_json_df), lines=True) - chunked = pd.concat( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) - ) - - assert_frame_equal(chunked, unchunked) + assert_frame_equal(chunked, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): msg = "chunksize can only be passed if lines=True" @@ -1058,7 +1113,7 @@ def test_readjson_chunksize_requires_lines(self, lines_json_df): pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) def test_readjson_chunks_series(self): - """Test reading line-format JSON to Series with chunksize param""" + # Test reading line-format JSON to Series with chunksize param s = pd.Series({'A': 1, 'B': 2}) strio = 
StringIO(s.to_json(lines=True, orient="records")) @@ -1072,10 +1127,8 @@ def test_readjson_chunks_series(self): assert_series_equal(chunked, unchunked) def test_readjson_each_chunk(self, lines_json_df): - """ - Other tests check that the final result of read_json(chunksize=True) is - correct. This checks that the intermediate chunks read in are correct. - """ + # Other tests check that the final result of read_json(chunksize=True) + # is correct. This checks the intermediate chunks. chunks = list( pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) ) @@ -1090,27 +1143,29 @@ def test_readjson_chunks_from_file(self): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) - def test_readjson_chunks_closes(self): - for chunksize in [None, 1]: - with ensure_clean('test.json') as path: - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - df.to_json(path, lines=True, orient="records") - f = open(path, 'r') - if chunksize is not None: - pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) - else: - pd.read_json(f, lines=True) - assert f.closed, \ - "didn't close file with chunksize = %s" % chunksize + @pytest.mark.parametrize("chunksize", [None, 1]) + def test_readjson_chunks_closes(self, chunksize): + with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + f = open(path, 'r') + if chunksize is not None: + pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) + else: + pd.read_json(f, lines=True) + assert f.closed, \ + "didn't close file with chunksize = %s" % chunksize - def test_readjson_invalid_chunksize(self, lines_json_df): + @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) + def test_readjson_invalid_chunksize(self, lines_json_df, chunksize): msg = r"'chunksize' must be an integer >=1" - for cs in [0, -1, 2.2, 'foo']: - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) - def test_readjson_chunks_multiple_empty_lines(self): + @pytest.mark.parametrize("chunksize", [None, 1, 2]) + def test_readjson_chunks_multiple_empty_lines(self, chunksize): j = """ {"A":1,"B":4} @@ -1127,62 +1182,8 @@ def test_readjson_chunks_multiple_empty_lines(self): {"A":3,"B":6} """ - for chunksize in [None, 1, 2]: - orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - test = pd.read_json(j, lines=True, chunksize=chunksize) - if chunksize is not None: - test = pd.concat(test) - tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) - - def test_latin_encoding(self): - if compat.PY2: - tm.assert_raises_regex( - TypeError, r'\[unicode\] is not implemented as a table column') - return - - # GH 13774 - pytest.skip("encoding not implemented in .to_json(), " - "xref #13774") - - values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'a', b'b', b'c'], - [b'EE, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], - [b'', b'a', b'b', b'c'], - [b'\xf8\xfc', b'a', b'b', b'c'], - [b'A\xf8\xfc', b'', b'a', b'b', b'c'], - [np.nan, b'', b'b', b'c'], - [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] - - def _try_decode(x, encoding='latin-1'): - try: - return x.decode(encoding) - except AttributeError: - return x - - # not sure how to remove latin-1 from code in python 2 and 3 - values = [[_try_decode(x) for x in y] for y in values] - - examples = [] - for 
dtype in ['category', object]: - for val in values: - examples.append(Series(val, dtype=dtype)) - - def roundtrip(s, encoding='latin-1'): - with ensure_clean('test.json') as path: - s.to_json(path, encoding=encoding) - retr = read_json(path, encoding=encoding) - assert_series_equal(s, retr, check_categorical=False) - - for s in examples: - roundtrip(s) - - def test_data_frame_size_after_to_json(self): - # GH15344 - df = DataFrame({'a': [str(1)]}) - - size_before = df.memory_usage(index=True, deep=True).sum() - df.to_json() - size_after = df.memory_usage(index=True, deep=True).sum() - - assert size_before == size_after + orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + test = pd.read_json(j, lines=True, chunksize=chunksize) + if chunksize is not None: + test = pd.concat(test) + tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) From 1d7087dc53872b2c9852f75ca4d88c44a2511dee Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 18:50:02 -0700 Subject: [PATCH 58/62] JsonReader should only close if it opened --- pandas/io/json/json.py | 22 ++++++++++------------ pandas/tests/io/json/test_pandas.py | 16 +++++++++------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4feea78d037e6..9f0ad24639b7c 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -377,13 +377,13 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 + self.should_close = False if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") - self.fp_or_buf = filepath_or_buffer data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) @@ -427,6 +427,8 @@ def _get_data_from_filepath(self, filepath_or_buffer): if exists: data, _ = _get_handle(filepath_or_buffer, 'r', encoding=self.encoding) + self.should_close = True + self.open_stream = data return data @@ -472,18 +474,14 @@ def _get_object_parser(self, json): def close(self): """ - If self.chunksize, self.data may need closing. - If not, self.fp_or_buff may need closing. + If we opened a stream earlier, in _get_data_from_filepath, we should + close it. If an open stream or file was passed, we leave it open. 
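+
+        For example (sketch): read_json('data.json', lines=True, chunksize=1)
+        opens and later closes its own handle, while
+        read_json(open('data.json'), lines=True, chunksize=1) leaves the
+        caller's handle open.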
""" - try: - self.data.close() - except (IOError, AttributeError): - pass - - try: - self.fp_or_buf.close() - except(IOError, AttributeError): - pass + if self.should_close: + try: + self.open_stream.close() + except (IOError, AttributeError): + pass def __next__(self): lines = list(islice(self.data, self.chunksize)) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6d2a1c0a01706..49dd71ac0e961 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -10,6 +10,7 @@ read_json, compat) from datetime import timedelta import pandas as pd +from pandas.io.json.json import JsonReader from pandas.util.testing import (assert_almost_equal, assert_frame_equal, assert_series_equal, network, @@ -1148,13 +1149,14 @@ def test_readjson_chunks_closes(self, chunksize): with ensure_clean('test.json') as path: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - f = open(path, 'r') - if chunksize is not None: - pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) - else: - pd.read_json(f, lines=True) - assert f.closed, \ - "didn't close file with chunksize = %s" % chunksize + reader = JsonReader( + path, orient=None, typ="frame", dtype=True, convert_axes=True, + convert_dates=True, keep_default_dates=True, numpy=False, + precise_float=False, date_unit=None, encoding=None, + lines=True, chunksize=chunksize) + reader.read() + assert reader.open_stream.closed, "didn't close stream with \ + chunksize = %s" % chunksize @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) def test_readjson_invalid_chunksize(self, lines_json_df, chunksize): From 6a76c557939d0ad155ff1db5cc77b8cbe3d897ec Mon Sep 17 00:00:00 2001 From: louispotok Date: Tue, 26 Sep 2017 13:18:37 -0700 Subject: [PATCH 59/62] split out json readlines to sep test class --- pandas/tests/io/json/test_pandas.py | 150 ---------------------- pandas/tests/io/json/test_readlines.py | 167 +++++++++++++++++++++++++ 2 files changed, 167 insertions(+), 150 deletions(-) create mode 100644 pandas/tests/io/json/test_readlines.py diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 49dd71ac0e961..b46e3b3033a53 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -10,7 +10,6 @@ read_json, compat) from datetime import timedelta import pandas as pd -from pandas.io.json.json import JsonReader from pandas.util.testing import (assert_almost_equal, assert_frame_equal, assert_series_equal, network, @@ -36,12 +35,6 @@ _mixed_frame = _frame.copy() -@pytest.fixture -def lines_json_df(): - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - return df.to_json(lines=True, orient="records") - - class TestPandasContainer(object): def setup_method(self, method): @@ -1046,146 +1039,3 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after -class TestPandasJsonLines(object): - - def test_read_jsonl(self): - # GH9180 - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - assert_frame_equal(result, expected) - - def test_read_jsonl_unicode_chars(self): - # GH15132: non-ascii unicode characters - # \u201d == RIGHT DOUBLE QUOTATION MARK - - # simulate file handle - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - json = StringIO(json) - result = read_json(json, lines=True) - expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], - columns=['a', 
'b']) - assert_frame_equal(result, expected) - - # simulate string - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True) - expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], - columns=['a', 'b']) - assert_frame_equal(result, expected) - - def test_to_jsonl(self): - # GH9180 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.to_json(orient="records", lines=True) - expected = '{"a":1,"b":2}\n{"a":1,"b":2}' - assert result == expected - - df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) - result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' - assert result == expected - assert_frame_equal(pd.read_json(result, lines=True), df) - - # GH15096: escaped characters in columns and data - df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], - columns=["a\\", 'b']) - result = df.to_json(orient="records", lines=True) - expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' - '{"a\\\\":"foo\\"","b":"bar"}') - assert result == expected - assert_frame_equal(pd.read_json(result, lines=True), df) - - @pytest.mark.parametrize("chunksize", [1, 1.0]) - def test_readjson_chunks(self, lines_json_df, chunksize): - # Basic test that read_json(chunks=True) gives the same result as - # read_json(chunks=False) - # GH17048: memory usage when lines=True - - unchunked = pd.read_json(StringIO(lines_json_df), lines=True) - reader = pd.read_json(StringIO(lines_json_df), lines=True, - chunksize=chunksize) - chunked = pd.concat(reader) - - assert_frame_equal(chunked, unchunked) - - def test_readjson_chunksize_requires_lines(self, lines_json_df): - msg = "chunksize can only be passed if lines=True" - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) - - def test_readjson_chunks_series(self): - # Test reading line-format JSON to Series with chunksize param - s = pd.Series({'A': 1, 'B': 2}) - - strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = pd.read_json(strio, lines=True, typ='Series') - - strio = StringIO(s.to_json(lines=True, orient="records")) - chunked = pd.concat(pd.read_json( - strio, lines=True, typ='Series', chunksize=1 - )) - - assert_series_equal(chunked, unchunked) - - def test_readjson_each_chunk(self, lines_json_df): - # Other tests check that the final result of read_json(chunksize=True) - # is correct. This checks the intermediate chunks. 
- chunks = list( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) - ) - assert chunks[0].shape == (2, 2) - assert chunks[1].shape == (1, 2) - - def test_readjson_chunks_from_file(self): - with ensure_clean('test.json') as path: - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - df.to_json(path, lines=True, orient="records") - chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) - unchunked = pd.read_json(path, lines=True) - assert_frame_equal(unchunked, chunked) - - @pytest.mark.parametrize("chunksize", [None, 1]) - def test_readjson_chunks_closes(self, chunksize): - with ensure_clean('test.json') as path: - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - df.to_json(path, lines=True, orient="records") - reader = JsonReader( - path, orient=None, typ="frame", dtype=True, convert_axes=True, - convert_dates=True, keep_default_dates=True, numpy=False, - precise_float=False, date_unit=None, encoding=None, - lines=True, chunksize=chunksize) - reader.read() - assert reader.open_stream.closed, "didn't close stream with \ - chunksize = %s" % chunksize - - @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) - def test_readjson_invalid_chunksize(self, lines_json_df, chunksize): - msg = r"'chunksize' must be an integer >=1" - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, - chunksize=chunksize) - - @pytest.mark.parametrize("chunksize", [None, 1, 2]) - def test_readjson_chunks_multiple_empty_lines(self, chunksize): - j = """ - - {"A":1,"B":4} - - - - {"A":2,"B":5} - - - - - - - - {"A":3,"B":6} - """ - orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - test = pd.read_json(j, lines=True, chunksize=chunksize) - if chunksize is not None: - test = pd.concat(test) - tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py new file mode 100644 index 0000000000000..711cf89de29e3 --- /dev/null +++ b/pandas/tests/io/json/test_readlines.py @@ -0,0 +1,167 @@ +import pytest +import pandas as pd +from pandas import DataFrame, read_json +from pandas.compat import StringIO +from pandas.io.json.json import JsonReader +import pandas.util.testing as tm +from pandas.util.testing import (assert_frame_equal, assert_series_equal, + ensure_clean) + + +@pytest.fixture +def lines_json_df(): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + return df.to_json(lines=True, orient="records") + + +def test_read_jsonl(): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + +def test_read_jsonl_unicode_chars(): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + +def test_to_jsonl(): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = 
'{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + assert_frame_equal(read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + assert result == expected + assert_frame_equal(read_json(result, lines=True), df) + + +@pytest.mark.parametrize("chunksize", [1, 1.0]) +def test_readjson_chunks(lines_json_df, chunksize): + # Basic test that read_json(chunks=True) gives the same result as + # read_json(chunks=False) + # GH17048: memory usage when lines=True + + unchunked = read_json(StringIO(lines_json_df), lines=True) + reader = read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) + chunked = pd.concat(reader) + + assert_frame_equal(chunked, unchunked) + + +def test_readjson_chunksize_requires_lines(lines_json_df): + msg = "chunksize can only be passed if lines=True" + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) + + +def test_readjson_chunks_series(): + # Test reading line-format JSON to Series with chunksize param + s = pd.Series({'A': 1, 'B': 2}) + + strio = StringIO(s.to_json(lines=True, orient="records")) + unchunked = pd.read_json(strio, lines=True, typ='Series') + + strio = StringIO(s.to_json(lines=True, orient="records")) + chunked = pd.concat(pd.read_json( + strio, lines=True, typ='Series', chunksize=1 + )) + + assert_series_equal(chunked, unchunked) + + +def test_readjson_each_chunk(lines_json_df): + # Other tests check that the final result of read_json(chunksize=True) + # is correct. This checks the intermediate chunks. 
+    chunks = list(
+        pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
+    )
+    assert chunks[0].shape == (2, 2)
+    assert chunks[1].shape == (1, 2)
+
+
+def test_readjson_chunks_from_file():
+    with ensure_clean('test.json') as path:
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        df.to_json(path, lines=True, orient="records")
+        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
+        unchunked = pd.read_json(path, lines=True)
+        assert_frame_equal(unchunked, chunked)
+
+
+@pytest.mark.parametrize("chunksize", [None, 1])
+def test_readjson_chunks_closes(chunksize):
+    with ensure_clean('test.json') as path:
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        df.to_json(path, lines=True, orient="records")
+        reader = JsonReader(
+            path, orient=None, typ="frame", dtype=True, convert_axes=True,
+            convert_dates=True, keep_default_dates=True, numpy=False,
+            precise_float=False, date_unit=None, encoding=None,
+            lines=True, chunksize=chunksize)
+        reader.read()
+        assert reader.open_stream.closed, (
+            "didn't close stream with chunksize = %s" % chunksize)
+
+
+@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
+def test_readjson_invalid_chunksize(lines_json_df, chunksize):
+    msg = r"'chunksize' must be an integer >=1"
+
+    with tm.assert_raises_regex(ValueError, msg):
+        pd.read_json(StringIO(lines_json_df), lines=True,
+                     chunksize=chunksize)
+
+
+@pytest.mark.parametrize("chunksize", [None, 1, 2])
+def test_readjson_chunks_multiple_empty_lines(chunksize):
+    j = """
+
+    {"A":1,"B":4}
+
+
+
+    {"A":2,"B":5}
+
+
+
+
+
+
+
+    {"A":3,"B":6}
+    """
+    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+    test = pd.read_json(j, lines=True, chunksize=chunksize)
+    if chunksize is not None:
+        test = pd.concat(test)
+    tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize)

From a72411fded15efa29acf842c59c8dd1ba97cbf8e Mon Sep 17 00:00:00 2001
From: louispotok
Date: Tue, 26 Sep 2017 16:25:09 -0700
Subject: [PATCH 60/62] add encoding to test_readlines

---
 pandas/tests/io/json/test_readlines.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index 711cf89de29e3..d14355b07cf20 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import pytest
 import pandas as pd
 from pandas import DataFrame, read_json

From 56129344586c84d32359dcb07cd033d448b7c90b Mon Sep 17 00:00:00 2001
From: louispotok
Date: Tue, 26 Sep 2017 18:53:41 -0700
Subject: [PATCH 61/62] pep8 cleanup

---
 pandas/tests/io/json/test_pandas.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index b46e3b3033a53..de4afec883efd 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1037,5 +1037,3 @@ def test_data_frame_size_after_to_json(self):
         size_after = df.memory_usage(index=True, deep=True).sum()
 
         assert size_before == size_after
-
-

From 28d1cbe6cefa4561d79c8cf6245f3448b4f5b422 Mon Sep 17 00:00:00 2001
From: louispotok
Date: Wed, 27 Sep 2017 09:39:02 -0700
Subject: [PATCH 62/62] minor fixups

---
 doc/source/io.rst      | 1 +
 pandas/io/json/json.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 55ca43b0d03c3..4eba9687efc58 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2064,6 +2064,7 @@ For line-delimited json files, pandas can also return an iterator which reads
in df df.to_json(orient='records', lines=True) + # reader is an iterator that returns `chunksize` lines each iteration reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 9f0ad24639b7c..ab74b265b6a06 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -432,7 +432,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def combine_lines(self, lines): + def _combine_lines(self, lines): """Combines a list of JSON objects into one JSON object""" lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']' @@ -443,7 +443,7 @@ def read(self): obj = concat(self) elif self.lines: obj = self._get_object_parser( - self.combine_lines(self.data.split('\n')) + self._combine_lines(self.data.split('\n')) ) else: obj = self._get_object_parser(self.data) @@ -486,7 +486,7 @@ def close(self): def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: - lines_json = self.combine_lines(lines) + lines_json = self._combine_lines(lines) obj = self._get_object_parser(lines_json) # Make sure that the returned objects have the right index.
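
A minimal usage sketch of the behavior these patches implement and test (illustrative only, not part of the patch series; it assumes a pandas build that includes this chunksize support, and it uses the same pandas.compat.StringIO shim the tests import):

    import pandas as pd
    from pandas.compat import StringIO

    jsonl = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n{"a": 5, "b": 6}\n'

    # Without chunksize, the whole input is parsed into one DataFrame.
    unchunked = pd.read_json(StringIO(jsonl), lines=True)

    # With lines=True and a chunksize, read_json returns an iterator
    # (a JsonReader) that parses at most `chunksize` lines per step.
    reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=2)
    for chunk in reader:
        print(chunk.shape)  # (2, 2), then (1, 2)

    # Concatenating the chunks of a fresh reader reproduces the
    # unchunked result, including the row index.
    chunked = pd.concat(pd.read_json(StringIO(jsonl), lines=True,
                                     chunksize=2))
    assert chunked.equals(unchunked)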