From 4cd506e1bf371fce15359f736e8ed4199d408deb Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 24 Jul 2017 11:53:46 -0400 Subject: [PATCH 01/62] Add chunksize param to read_json when lines=True Previous behavior: read the whole file into memory, then split it into lines. New behavior: if lines=True and chunksize is passed, read `chunksize` lines at a time and concatenate the results. This only covers some kinds of input to read_json (file paths and file-like objects); a raw JSON string is still read in one pass. When chunksize is passed, read_json becomes slower but more memory-efficient. --- pandas/io/json/json.py | 52 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 5dae6099446d0..690f2b6a8da69 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -1,4 +1,6 @@ # pylint: disable-msg=E1101,W0613,W0603 +from itertools import islice +from pandas import concat import os import numpy as np @@ -175,7 +177,7 @@ def write(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False): + lines=False, chunksize=None): """ Convert a JSON string to pandas object @@ -264,6 +266,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 + chunksize: integer, default None + If `lines=True`, how many lines to read into memory at a time. + If this is None, the file will be read into memory all at once. + Passing a chunksize helps with memory usage, but is slower. + Also note this is different from the `chunksize` parameter in + `read_csv`, which returns a TextFileReader. + If the JSON input is a string, this argument has no effect. + Returns ------- result : Series or DataFrame, depending on the value of `typ`.
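A minimal sketch of the usage this patch enables (illustrative only: `data.jsonl` is a hypothetical line-delimited file, and at this point in the series the call returns a single concatenated DataFrame rather than an iterator):

    import pandas as pd
    # reads `chunksize` lines at a time and concatenates the chunks internally
    df = pd.read_json('data.jsonl', lines=True, chunksize=10000)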
@@ -323,6 +333,27 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + + def _read_json_as_lines(fh, chunksize): + return_val = None + while True: + lines = list(islice(fh, chunksize)) + + if lines: + lines_json = '[' + ','.join(lines) + ']' + obj = _get_obj(typ, lines_json, orient, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, + precise_float, date_unit) + if return_val is None: + return_val = obj + else: + return_val = concat([return_val, obj]) + + else: + break + fh.close() + return return_val + if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -335,12 +366,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if exists: fh, handles = _get_handle(filepath_or_buffer, 'r', encoding=encoding) - json = fh.read() - fh.close() + if lines and chunksize: + return _read_json_as_lines(fh, chunksize) + else: + json = fh.read() + fh.close() else: json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): - json = filepath_or_buffer.read() + if lines and chunksize: + return _read_json_as_lines(fh, chunksize) + else: + json = filepath_or_buffer.read() else: json = filepath_or_buffer @@ -350,6 +387,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, lines = list(StringIO(json.strip())) json = '[' + ','.join(lines) + ']' + return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, + keep_default_dates, numpy, precise_float, date_unit) + + +def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, + keep_default_dates, numpy, precise_float, + date_unit): obj = None if typ == 'frame': obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, From 9fe44f1c190959559e442451554643506f9505a3 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 17:55:34 -0700 Subject: [PATCH 02/62] Add read_json chunksize change to whatsnew --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 50f11c38bae23..ff736f5d4995d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -471,6 +471,7 @@ Other API Changes - :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) +- :func:`read_json` now accepts a ``chunksize`` parameter that can reduce memory usage when ``lines=True``. (:issue:`17048`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards.
Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) From 6de3a2726f2e7aced1ae761caaa0fa2f3d444db1 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 17:57:30 -0700 Subject: [PATCH 03/62] Add versionadded to docstring --- pandas/io/json/json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 690f2b6a8da69..3f1a35d763a0e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -274,6 +274,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, `read_csv`, which returns a FileTextReader. If the JSON input is a string, this argument has no effect. + .. versionadded:: 0.21.0 + Returns ------- result : Series or DataFrame, depending on the value of `typ`. From e235c702b0f5a9161cd330c23bc3c5eef4ced699 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:12:32 -0700 Subject: [PATCH 04/62] add docstring for _read_json_as_lines --- pandas/io/json/json.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 3f1a35d763a0e..39c30a11e82af 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -337,6 +337,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, encoding=encoding) def _read_json_as_lines(fh, chunksize): + """ + Read json lines from fh in chunks, then concatenate the resulting + pandas objects. + + Parameters + ---------- + fh : a file-like object + chunksize : integer + """ return_val = None while True: lines = list(islice(fh, chunksize)) From 0a5a8f99ac8980c0cbe4589828744e07a20baa1a Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:29:39 -0700 Subject: [PATCH 05/62] add basic read_json chunksize test --- pandas/tests/io/json/test_pandas.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 671d4248818e4..3b7055c8a4c99 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1032,6 +1032,15 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) + def test_read_jsonchunks(self): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + strio = df.to_json(lines=True, orient="records") + + unchunked = pd.read_json(strio, lines=True) + chunked = pd.read_json(strio, lines=True, chunksize=1) + + assert_frame_equal(chunked, unchunked) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From ce234447fd0fb1c58dd78fd9295b13efdf3bf379 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:33:49 -0700 Subject: [PATCH 06/62] validate read_json chunksize is an integer and >=1 --- pandas/io/json/json.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 39c30a11e82af..386e79ebeb239 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -11,6 +11,7 @@ from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, _stringify_path) +from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits @@ 
-336,6 +337,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + if chunksize is not None: + _validate_integer("chunksize", chunksize, 1) + def _read_json_as_lines(fh, chunksize): """ Read json lines from fh in chunks, then concatenate the resulting From a97ca0bec62062a04e5230f76820f7edf4571d71 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 3 Aug 2017 18:41:14 -0700 Subject: [PATCH 07/62] Add more tests to read_json chunksize - Errors correctly when non-integer is passed as chunksize - Accepts a float that is close to an integer as chunksize --- pandas/tests/io/json/test_pandas.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3b7055c8a4c99..a9644a0e49fb5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1041,6 +1041,23 @@ def test_read_jsonchunks(self): assert_frame_equal(chunked, unchunked) + chunked_float = pd.read_json(strio, lines=True, chunksize=1.0) + assert_frame_equal(chunked_float, unchunked) + + msg = r"'chunksize' must be an integer >=1" + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize=0) + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize=-1) + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize=2.2) + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=True, chunksize='foo') + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 2861d0e611e676bf1842f03776f32da935cf0f17 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 15:44:56 -0700 Subject: [PATCH 08/62] Return JsonLineReader from read_json When chunksize is passed and lines=True, read_json now returns a JsonLineReader, which inherits from BaseIterator. Also, internally, wrap up read_json kwargs into a dictionary and pass them down opaquely to the SeriesParser or FrameParser. --- pandas/io/json/json.py | 84 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 386e79ebeb239..4dd8c03b53be3 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -10,7 +10,7 @@ from pandas import compat, isna from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, - _stringify_path) + _stringify_path, BaseIterator) from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError from pandas.io.formats.printing import pprint_thing @@ -337,38 +337,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) + # These kwargs are only needed by the Parsers, so we just wrap them up and + # pass them down. + kwargs = {"typ": typ, "orient": orient, "dtype": dtype, + "convert_axes": convert_axes, "convert_dates": convert_dates, + "keep_default_dates": keep_default_dates, "numpy": numpy, + "precise_float": precise_float, "date_unit": date_unit} + if chunksize is not None: _validate_integer("chunksize", chunksize, 1) - def _read_json_as_lines(fh, chunksize): - """ - Read json lines from fh in chunks, then concatenate the resulting - pandas objects. 
- - Parameters - ---------- - fh : a file-like object - chunksize : integer - """ - return_val = None - while True: - lines = list(islice(fh, chunksize)) - - if lines: - lines_json = '[' + ','.join(lines) + ']' - obj = _get_obj(typ, lines_json, orient, dtype, convert_axes, - convert_dates, keep_default_dates, numpy, - precise_float, date_unit) - if return_val is None: - return_val = obj - else: - return_val = concat([return_val, obj]) - - else: - break - fh.close() - return return_val - if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -382,7 +360,7 @@ def _read_json_as_lines(fh, chunksize): fh, handles = _get_handle(filepath_or_buffer, 'r', encoding=encoding) if lines and chunksize: - return _read_json_as_lines(fh, chunksize) + return JsonLineReader(fh, chunksize, **kwargs) else: json = fh.read() fh.close() @@ -390,7 +368,7 @@ def _read_json_as_lines(fh, chunksize): json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return _read_json_as_lines(fh, chunksize) + return JsonLineReader(fh, chunksize, **kwargs) else: json = filepath_or_buffer.read() else: @@ -402,29 +380,49 @@ def _read_json_as_lines(fh, chunksize): lines = list(StringIO(json.strip())) json = '[' + ','.join(lines) + ']' - return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, date_unit) + return _get_obj(json, **kwargs) -def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit): +def _get_obj(json, **kwargs): + typ = kwargs['typ'] + dtype = kwargs['dtype'] + kwargs = {k: v for k, v in kwargs.items() if k != 'typ'} obj = None if typ == 'frame': - obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() + obj = FrameParser(json, **kwargs).parse() if typ == 'series' or obj is None: if not isinstance(dtype, bool): dtype = dict(data=dtype) - obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() + obj = SeriesParser(json, **kwargs).parse() return obj +class JsonLineReader(BaseIterator): + """ + Iterates over a JSON document that is formatted with one JSON record per + line. The `chunksize` initialization parameter controls how many lines are + read per iteration.
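+ + For example, iterating with `chunksize=2` over six input lines yields three two-row pandas objects.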
+ """ + def __init__(self, fh, chunksize, **kwargs): + self.fh = fh + self.chunksize = chunksize + self.kwargs = kwargs + + def __next__(self): + lines = list(islice(self.fh, self.chunksize)) + if lines: + lines_json = '[' + ','.join(lines) + ']' + return _get_obj(json=lines_json, **self.kwargs) + + else: + try: + self.fh.close() + except: + pass + return StopIteration + class Parser(object): _STAMP_UNITS = ('s', 'ms', 'us', 'ns') From da59b4a6e04533d832810107b3a80e452e987e70 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 15:49:09 -0700 Subject: [PATCH 09/62] Raise ValueError if chunksize is not None, but not lines --- pandas/io/json/json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4dd8c03b53be3..7e0d340a26c68 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -346,6 +346,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize is not None: _validate_integer("chunksize", chunksize, 1) + if not lines: + raise ValueError("chunksize should only be passed if lines=True") if isinstance(filepath_or_buffer, compat.string_types): try: From 4544f82439c16f9c235af537ecbea2b1af65d38c Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 15:56:43 -0700 Subject: [PATCH 10/62] Add issue number to test docstring and add test. Test that read_json raises exception if chunksize is passed and lines != True. --- pandas/tests/io/json/test_pandas.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a9644a0e49fb5..8229fc70080c4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1033,6 +1033,7 @@ def test_to_jsonl(self): assert_frame_equal(pd.read_json(result, lines=True), df) def test_read_jsonchunks(self): + # GH17048: memory usage when lines=True df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) strio = df.to_json(lines=True, orient="records") @@ -1058,6 +1059,11 @@ def test_read_jsonchunks(self): with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio, lines=True, chunksize='foo') + + msg = "chunksize should only be passed if lines=True" + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(strio, lines=False, chunksize=2) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From dad7f117cb66a5d30af582cf6f8cb953d18030fc Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 16:42:20 -0700 Subject: [PATCH 11/62] bugfix: raise StopIteration, dont return it --- pandas/io/json/json.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7e0d340a26c68..7e94c5d8ed251 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -1,6 +1,5 @@ # pylint: disable-msg=E1101,W0613,W0603 from itertools import islice -from pandas import concat import os import numpy as np @@ -423,7 +422,7 @@ def __next__(self): self.fh.close() except: pass - return StopIteration + raise StopIteration class Parser(object): From bb8b1b66d5cea9039ddb9d54c5635d4b1606db56 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 13 Aug 2017 16:45:21 -0700 Subject: [PATCH 12/62] PEP8 cleanup --- pandas/io/json/json.py | 2 +- pandas/tests/io/json/test_pandas.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7e94c5d8ed251..d83441dddd7af 100644 --- a/pandas/io/json/json.py +++ 
b/pandas/io/json/json.py @@ -420,7 +420,7 @@ def __next__(self): else: try: self.fh.close() - except: + except IOError: pass raise StopIteration diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 8229fc70080c4..b3eea4081c02b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1059,7 +1059,6 @@ def test_read_jsonchunks(self): with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio, lines=True, chunksize='foo') - msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio, lines=False, chunksize=2) From 3e81bba44713dc457c9d16b497a016dd1035faa0 Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 08:31:42 -0700 Subject: [PATCH 13/62] Bugfixes for chunksize * use _validate_integer to cast chunksize to integer, instead of just checking * pass filepath_or_buffer to JsonLineReader where appropriate * JsonLineReader uses integer index * fix read_json chunksize tests to actually use StringIO and test Series --- pandas/io/json/json.py | 19 ++++++++++++--- pandas/tests/io/json/test_pandas.py | 37 ++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index d83441dddd7af..beb4a0dc3650e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -344,7 +344,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, "precise_float": precise_float, "date_unit": date_unit} if chunksize is not None: - _validate_integer("chunksize", chunksize, 1) + chunksize = _validate_integer("chunksize", chunksize, 1) if not lines: raise ValueError("chunksize should only be passed if lines=True") @@ -369,7 +369,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return JsonLineReader(fh, chunksize, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, **kwargs) else: json = filepath_or_buffer.read() else: @@ -405,17 +405,30 @@ class JsonLineReader(BaseIterator): Iterates over a JSON document that is formatted with one JSON record per line. The `chunksize` initialization parameter controls how many lines are read per iteration. + + We explicitly override the index on the return value so that the index of + the resulting object will be like `range(len(obj))`. If we didn't do this, + it would have index like `range(chunksize) * number_chunks.` + This is so that `read_json(lines=True)` will return an identical object to + `read_json(lines=True, chunksize=n)`. 
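+    For example, five records read with `chunksize=2` come back with index [0, 1], [2, 3] and [4], rather than [0, 1], [0, 1] and [0].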
""" def __init__(self, fh, chunksize, **kwargs): self.fh = fh self.chunksize = chunksize self.kwargs = kwargs + self.nrows_seen = 0 def __next__(self): lines = list(islice(self.fh, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' - return _get_obj(json=lines_json, **self.kwargs) + obj = _get_obj(json=lines_json, **self.kwargs) + + # Make sure that the returned objects have the right index + obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) + self.nrows_seen += len(obj) + + return obj else: try: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b3eea4081c02b..5bf512ec020e8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1035,33 +1035,52 @@ def test_to_jsonl(self): def test_read_jsonchunks(self): # GH17048: memory usage when lines=True df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - strio = df.to_json(lines=True, orient="records") - unchunked = pd.read_json(strio, lines=True) - chunked = pd.read_json(strio, lines=True, chunksize=1) + def get_strio(): + return StringIO(df.to_json(lines=True, orient="records")) + + def test_with_chunksize(c): + return pd.concat(pd.read_json(get_strio(), lines=True, chunksize=c)) + + unchunked = pd.read_json(get_strio(), lines=True) + + chunked = test_with_chunksize(1) assert_frame_equal(chunked, unchunked) - chunked_float = pd.read_json(strio, lines=True, chunksize=1.0) + chunked_float = test_with_chunksize(1.0) assert_frame_equal(chunked_float, unchunked) msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize=0) + test_with_chunksize(0) with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize=-1) + test_with_chunksize(-1) with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize=2.2) + test_with_chunksize(-2.2) with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=True, chunksize='foo') + test_with_chunksize('foo') msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio, lines=False, chunksize=2) + pd.read_json(get_strio(), lines=False, chunksize=2) + + # Test that reading in Series also works + s = pd.Series({'A': 1, 'B': 2}) + + strio = StringIO(s.to_json(lines=True, orient="records")) + unchunked = pd.read_json(strio, lines=True, typ='Series') + + strio = StringIO(s.to_json(lines=True, orient="records")) + chunked = pd.concat(pd.read_json( + strio, lines=True, typ='Series', chunksize=1 + )) + + assert_series_equal(chunked, unchunked) def test_latin_encoding(self): if compat.PY2: From e049d2911d32840aaca8e7c8ff90cdc6617dccf4 Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 08:43:59 -0700 Subject: [PATCH 14/62] add chunksize test for reading from file --- pandas/tests/io/json/test_pandas.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5bf512ec020e8..a8b4ad48786e2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1082,6 +1082,16 @@ def test_with_chunksize(c): assert_series_equal(chunked, unchunked) + chunks = list(pd.read_json(get_strio(), lines=True, chunksize=2)) + assert chunks[0].shape == (2, 2) + assert chunks[1].shape == (1, 2) + + with ensure_clean('test.json') as path: + df.to_json(path, lines=True, orient="records") + unchunked = 
pd.concat(pd.read_json(path, lines=True, chunksize=1)) + chunked = pd.read_json(path, lines=True) + assert_frame_equal(unchunked, chunked) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 400d313b8e548985f210ab73021d0802afec7b68 Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 09:26:15 -0700 Subject: [PATCH 15/62] pep8 cleanup --- pandas/io/json/json.py | 1 + pandas/tests/io/json/test_pandas.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index beb4a0dc3650e..d4de41b8553f9 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -437,6 +437,7 @@ def __next__(self): pass raise StopIteration + class Parser(object): _STAMP_UNITS = ('s', 'ms', 'us', 'ns') diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a8b4ad48786e2..f10250c734fe9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1040,7 +1040,8 @@ def get_strio(): return StringIO(df.to_json(lines=True, orient="records")) def test_with_chunksize(c): - return pd.concat(pd.read_json(get_strio(), lines=True, chunksize=c)) + iterator = pd.read_json(get_strio(), lines=True, chunksize=c) + return pd.concat(iterator) unchunked = pd.read_json(get_strio(), lines=True) From b756c90e45fee3cb30065940a537ec53cb797d9f Mon Sep 17 00:00:00 2001 From: louispotok Date: Mon, 14 Aug 2017 11:30:08 -0700 Subject: [PATCH 16/62] Run chunksize checks before file is opened Also, fix badly named vars in test. --- pandas/io/json/json.py | 10 +++++----- pandas/tests/io/json/test_pandas.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index d4de41b8553f9..1bb3a0b61f6a6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -333,6 +333,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ + if chunksize is not None: + chunksize = _validate_integer("chunksize", chunksize, 1) + if not lines: + raise ValueError("chunksize should only be passed if lines=True") + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) @@ -343,11 +348,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, "keep_default_dates": keep_default_dates, "numpy": numpy, "precise_float": precise_float, "date_unit": date_unit} - if chunksize is not None: - chunksize = _validate_integer("chunksize", chunksize, 1) - if not lines: - raise ValueError("chunksize should only be passed if lines=True") - if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f10250c734fe9..233fd8f939ff8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1089,8 +1089,8 @@ def test_with_chunksize(c): with ensure_clean('test.json') as path: df.to_json(path, lines=True, orient="records") - unchunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) - chunked = pd.read_json(path, lines=True) + chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) + unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) def test_latin_encoding(self): From b71f65b41ba64f9b84d18f5e79d5b5b4ca738034 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 16:52:23 -0400 Subject: [PATCH 17/62] move strio df 
in test to fixture --- pandas/tests/io/json/test_pandas.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 233fd8f939ff8..07360661e0227 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,6 +35,12 @@ _mixed_frame = _frame.copy() +@pytest.fixture +def strio_lines_json_df(): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + return StringIO(df.to_json(lines=True, orient="records")) + + class TestPandasContainer(object): def setup_method(self, method): @@ -1032,18 +1038,14 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_read_jsonchunks(self): + def test_read_jsonchunks(self, strio_lines_json_df): # GH17048: memory usage when lines=True - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - - def get_strio(): - return StringIO(df.to_json(lines=True, orient="records")) def test_with_chunksize(c): - iterator = pd.read_json(get_strio(), lines=True, chunksize=c) + iterator = pd.read_json(strio_lines_json_df, lines=True, chunksize=c) return pd.concat(iterator) - unchunked = pd.read_json(get_strio(), lines=True) + unchunked = pd.read_json(strio_lines_json_df, lines=True) chunked = test_with_chunksize(1) @@ -1068,7 +1070,7 @@ def test_with_chunksize(c): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(get_strio(), lines=False, chunksize=2) + pd.read_json(strio_lines_json_df, lines=False, chunksize=2) # Test that reading in Series also works s = pd.Series({'A': 1, 'B': 2}) @@ -1083,11 +1085,12 @@ def test_with_chunksize(c): assert_series_equal(chunked, unchunked) - chunks = list(pd.read_json(get_strio(), lines=True, chunksize=2)) + chunks = list(pd.read_json(strio_lines_json_df, lines=True, chunksize=2)) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df.to_json(path, lines=True, orient="records") chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) unchunked = pd.read_json(path, lines=True) From d6e86af38bb07f842f5d5731f5af1de183fac185 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 17:19:33 -0400 Subject: [PATCH 18/62] Improve read_json chunking tests * split into multiple tests * move helper functions out * add comments --- pandas/tests/io/json/test_pandas.py | 61 +++++++++++++++++------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 07360661e0227..4df7d74829c9e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -41,6 +41,9 @@ def strio_lines_json_df(): return StringIO(df.to_json(lines=True, orient="records")) +def json_lines_to_df_chunked(jlines, chunksize): + return pd.concat(pd.read_json(jlines, lines=True, chunksize=chunksize)) + class TestPandasContainer(object): def setup_method(self, method): @@ -1038,41 +1041,26 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_read_jsonchunks(self, strio_lines_json_df): + def test_readjson_chunks(self): + """Basic test that read_json(chunks=True) gives the same result as + read_json(chunks=False)""" # GH17048: memory usage when lines=True - def test_with_chunksize(c): - iterator = 
pd.read_json(strio_lines_json_df, lines=True, chunksize=c) - return pd.concat(iterator) - - unchunked = pd.read_json(strio_lines_json_df, lines=True) - - chunked = test_with_chunksize(1) + unchunked = pd.read_json(strio_lines_json_df(), lines=True) + chunked = json_lines_to_df_chunked(strio_lines_json_df(), 1) assert_frame_equal(chunked, unchunked) - chunked_float = test_with_chunksize(1.0) + chunked_float = json_lines_to_df_chunked(strio_lines_json_df(), 1.0) assert_frame_equal(chunked_float, unchunked) - msg = r"'chunksize' must be an integer >=1" - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize(0) - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize(-1) - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize(-2.2) - - with tm.assert_raises_regex(ValueError, msg): - test_with_chunksize('foo') - + def test_readjson_chunksize_requires_lines(): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio_lines_json_df, lines=False, chunksize=2) + pd.read_json(strio_lines_json_df(), lines=False, chunksize=2) - # Test that reading in Series also works + def test_readjson_chunks_series(self): + """Test reading line-format JSON to Series with chunksize param""" s = pd.Series({'A': 1, 'B': 2}) strio = StringIO(s.to_json(lines=True, orient="records")) @@ -1085,10 +1073,15 @@ def test_with_chunksize(c): assert_series_equal(chunked, unchunked) - chunks = list(pd.read_json(strio_lines_json_df, lines=True, chunksize=2)) + def test_readjson_each_chunk(self): + """Other tests check that the final result of read_json(chunksize=True) + is correct. This checks that the intermediate chunks read in are correct. + """ + chunks = list(pd.read_json(strio_lines_json_df(), lines=True, chunksize=2)) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) + def test_readjson_chunks_from_file(self): with ensure_clean('test.json') as path: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df.to_json(path, lines=True, orient="records") @@ -1096,6 +1089,22 @@ def test_with_chunksize(c): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) + def test_readjson_invalid_chunksize(self): + msg = r"'chunksize' must be an integer >=1" + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), 0) + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), -1) + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), -2.2) + + with tm.assert_raises_regex(ValueError, msg): + json_lines_to_df_chunked(strio_lines_json_df(), 'foo') + + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 4d912801615c899a770fbdd7115aa8bab7e56b87 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 17:27:15 -0400 Subject: [PATCH 19/62] bugfix in read_json tests, remove fixture --- pandas/tests/io/json/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4df7d74829c9e..582d5dfdb5b81 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,7 +35,6 @@ _mixed_frame = _frame.copy() -@pytest.fixture def strio_lines_json_df(): df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) return StringIO(df.to_json(lines=True, orient="records")) @@ -44,6 +43,7 @@ def strio_lines_json_df(): def 
json_lines_to_df_chunked(jlines, chunksize): return pd.concat(pd.read_json(jlines, lines=True, chunksize=chunksize)) + class TestPandasContainer(object): def setup_method(self, method): @@ -1054,7 +1054,7 @@ def test_readjson_chunks(self): chunked_float = json_lines_to_df_chunked(strio_lines_json_df(), 1.0) assert_frame_equal(chunked_float, unchunked) - def test_readjson_chunksize_requires_lines(): + def test_readjson_chunksize_requires_lines(self): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): pd.read_json(strio_lines_json_df(), lines=False, chunksize=2) From 24744290bc2f6acc6e5dae3f63311809945c92e6 Mon Sep 17 00:00:00 2001 From: louispotok Date: Sun, 10 Sep 2017 19:02:13 -0400 Subject: [PATCH 20/62] JsonLineReader opens and closes filepaths --- pandas/io/json/json.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 1bb3a0b61f6a6..0211e9accb9e7 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -358,18 +358,19 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, exists = False if exists: - fh, handles = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) if lines and chunksize: - return JsonLineReader(fh, chunksize, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, + encoding, **kwargs) else: + fh, handles = _get_handle(filepath_or_buffer, 'r', + encoding=encoding) json = fh.read() fh.close() else: json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, encoding, **kwargs) else: json = filepath_or_buffer.read() else: @@ -412,14 +413,22 @@ class JsonLineReader(BaseIterator): This is so that `read_json(lines=True)` will return an identical object to `read_json(lines=True, chunksize=n)`. 
""" - def __init__(self, fh, chunksize, **kwargs): - self.fh = fh + def __init__(self, filepath_or_buffer, chunksize, encoding, **kwargs): + + try: + self.iterator, _ = _get_handle(filepath_or_buffer, 'r', + encoding=encoding) + except: + if hasattr(filepath_or_buffer, 'read'): + self.iterator = filepath_or_buffer + else: + raise ValueError("cannot read json from given input") self.chunksize = chunksize self.kwargs = kwargs self.nrows_seen = 0 def __next__(self): - lines = list(islice(self.fh, self.chunksize)) + lines = list(islice(self.iterator, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' obj = _get_obj(json=lines_json, **self.kwargs) @@ -432,7 +441,7 @@ def __next__(self): else: try: - self.fh.close() + self.iterator.close() except IOError: pass raise StopIteration From b18b3df4ca20cae05e94bc0e5ab0a402f357d6de Mon Sep 17 00:00:00 2001 From: louispotok Date: Tue, 12 Sep 2017 11:55:06 -0400 Subject: [PATCH 21/62] pep8 cleanup --- pandas/io/json/json.py | 3 ++- pandas/tests/io/json/test_pandas.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 0211e9accb9e7..63c196178adbe 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -370,7 +370,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, encoding, **kwargs) + return JsonLineReader(filepath_or_buffer, chunksize, encoding, + **kwargs) else: json = filepath_or_buffer.read() else: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 582d5dfdb5b81..40375c1745415 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1074,10 +1074,13 @@ def test_readjson_chunks_series(self): assert_series_equal(chunked, unchunked) def test_readjson_each_chunk(self): - """Other tests check that the final result of read_json(chunksize=True) - is correct. This checks that the intermediate chunks read in are correct. """ - chunks = list(pd.read_json(strio_lines_json_df(), lines=True, chunksize=2)) + Other tests check that the final result of read_json(chunksize=True) is + correct. This checks that the intermediate chunks read in are correct. 
+ """ + chunks = list( + pd.read_json(strio_lines_json_df(), lines=True, chunksize=2) + ) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -1104,7 +1107,6 @@ def test_readjson_invalid_chunksize(self): with tm.assert_raises_regex(ValueError, msg): json_lines_to_df_chunked(strio_lines_json_df(), 'foo') - def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From 4c1d6a6ded14c0854a14f43eee1ec8d87c8bd1e9 Mon Sep 17 00:00:00 2001 From: louispotok Date: Wed, 13 Sep 2017 09:30:34 -0700 Subject: [PATCH 22/62] update whatsnew --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ff736f5d4995d..693531e35e88f 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -471,7 +471,7 @@ Other API Changes - :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) -- :func:`read_json` now accepts a ``chunksize`` parameter that can reduce memory usage when ``lines=True``. (:issue:`17048`) +- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) From d589b0ba33c5d5073f54da279dec7dd045263859 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 09:33:23 -0700 Subject: [PATCH 23/62] update docs on read_json chunksize --- doc/source/io.rst | 11 +++++++++++ pandas/io/json/json.py | 9 +++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d6abed6e9d1ad..77f631550eab7 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1845,6 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` seconds, milliseconds, microseconds or nanoseconds respectively. - ``lines`` : reads file as one json object per line. - ``encoding`` : The encoding to use to decode py3 bytes. +- ``chunksize`` : when used in combination with ``lines=True``, return a JsonLineReader which reads in ``chunksize`` lines per iteration. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -2049,6 +2050,10 @@ Line delimited json pandas is able to read and write line-delimited json files that are common in data processing pipelines using Hadoop or Spark. +.. versionadded:: 0.21.0 + +For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream. + .. ipython:: python jsonl = ''' @@ -2059,6 +2064,12 @@ using Hadoop or Spark. df df.to_json(orient='records', lines=True) + # chunksize has no effect when reading a string. 
+ import io + reader = pd.read_json(io.StringIO(jsonl), lines=True, chunksize=1) + reader + for chunk in reader: + print(chunk) .. _io.table_schema: diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 63c196178adbe..c8c1b001bb8e4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -267,11 +267,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 chunksize: integer, default None - If `lines=True`, how many lines to read into memory at a time. + Return JsonLineReader object for iteration. + See the `line-delimited json docs + `_ + for more information on ``chunksize``. + This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - Passing a chunksize helps with memory usage, but is slower. - Also note this is different from the `chunksize` parameter in - `read_csv`, which returns a TextFileReader. If the JSON input is a string, this argument has no effect. From eba45a2524b03bb567782b6ce166b74fe97f0335 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 11:50:19 -0700 Subject: [PATCH 24/62] Always use JsonReader in read_json Either read it all before return, or return the JsonReader if chunksize is passed. --- doc/source/io.rst | 2 +- pandas/io/json/json.py | 180 +++++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 80 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 77f631550eab7..156e6649e8ebd 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1845,7 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` seconds, milliseconds, microseconds or nanoseconds respectively. - ``lines`` : reads file as one json object per line. - ``encoding`` : The encoding to use to decode py3 bytes. -- ``chunksize`` : when used in combination with ``lines=True``, return a JsonLineReader which reads in ``chunksize`` lines per iteration. +- ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index c8c1b001bb8e4..5dd419669c643 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -12,6 +12,7 @@ _stringify_path, BaseIterator) from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError +from pandas.core.reshape import concat from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits from .table_schema import build_table_schema @@ -267,7 +268,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 chunksize: integer, default None - Return JsonLineReader object for iteration. + Return JsonReader object for iteration. See the `line-delimited json docs `_ for more information on ``chunksize``. @@ -342,98 +343,119 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) - # These kwargs are only needed by the Parsers, so we just wrap them up and - # pass them down.
- kwargs = {"typ": typ, "orient": orient, "dtype": dtype, - "convert_axes": convert_axes, "convert_dates": convert_dates, - "keep_default_dates": keep_default_dates, "numpy": numpy, - "precise_float": precise_float, "date_unit": date_unit} - - if isinstance(filepath_or_buffer, compat.string_types): - try: - exists = os.path.exists(filepath_or_buffer) - - # if the filepath is too long will raise here - # 5874 - except (TypeError, ValueError): - exists = False - - if exists: - if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, - encoding, **kwargs) - else: - fh, handles = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) - json = fh.read() - fh.close() - else: - json = filepath_or_buffer - elif hasattr(filepath_or_buffer, 'read'): - if lines and chunksize: - return JsonLineReader(filepath_or_buffer, chunksize, encoding, - **kwargs) - else: - json = filepath_or_buffer.read() - else: - json = filepath_or_buffer + json_reader = JsonReader( + filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, + convert_axes=convert_axes, convert_dates=convert_dates, + keep_default_dates=keep_default_dates, numpy=numpy, + precise_float=precise_float, date_unit=date_unit, encoding=encoding, + lines=lines, chunksize=chunksize + ) - if lines: - # If given a json lines file, we break the string into lines, add - # commas and put it in a json list to make a valid json object. - lines = list(StringIO(json.strip())) - json = '[' + ','.join(lines) + ']' + if chunksize: + return json_reader - return _get_obj(json, **kwargs) + else: + return json_reader.read() -def _get_obj(json, **kwargs): - typ = kwargs['typ'] - dtype = kwargs['dtype'] - kwargs = {k: v for k, v in kwargs.items() if k != 'typ'} - obj = None - if typ == 'frame': - obj = FrameParser(json, **kwargs).parse() +class JsonReader(BaseIterator): + """ + Reads a JSON document to a pandas object. - if typ == 'series' or obj is None: - if not isinstance(dtype, bool): - dtype = dict(data=dtype) - obj = SeriesParser(json, **kwargs).parse() + If initialized with ``lines=True`` and ``chunksize``, can be iterated over + ``chunksize`` lines at a time. + """ + def __init__( + self, filepath_or_buffer, orient, typ, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, precise_float, date_unit, + encoding, lines, chunksize, raw_json=False + ): - return obj + self.path_or_buf = filepath_or_buffer + self.orient = orient + self.typ = typ + self.dtype = dtype + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.keep_default_dates = keep_default_dates + self.numpy = numpy + self.precise_float = precise_float + self.date_unit = date_unit + self.encoding = encoding + self.lines = lines + self.chunksize = chunksize + self.nrows_seen = 0 + self.raw_json = False + if isinstance(filepath_or_buffer, compat.string_types): + try: + exists = os.path.exists(filepath_or_buffer) -class JsonLineReader(BaseIterator): - """ - Iterates over a JSON document that is formatted with one JSON record per - line. The `chunksize` initialization parameter controls how many lines are - read per iteration. - - We explicitly override the index on the return value so that the index of - the resulting object will be like `range(len(obj))`. If we didn't do this, - it would have index like `range(chunksize) * number_chunks.` - This is so that `read_json(lines=True)` will return an identical object to - `read_json(lines=True, chunksize=n)`. 
- """ - def __init__(self, filepath_or_buffer, chunksize, encoding, **kwargs): + # if the filepath is too long will raise here + # 5874 + except (TypeError, ValueError): + exists = False - try: - self.iterator, _ = _get_handle(filepath_or_buffer, 'r', + if exists: + self.data, _ = _get_handle(filepath_or_buffer, 'r', encoding=encoding) - except: - if hasattr(filepath_or_buffer, 'read'): - self.iterator = filepath_or_buffer else: - raise ValueError("cannot read json from given input") - self.chunksize = chunksize - self.kwargs = kwargs - self.nrows_seen = 0 + self.raw_json = True + self.data = filepath_or_buffer + elif hasattr(filepath_or_buffer, 'read'): + self.data = filepath_or_buffer + else: + self.raw_json = True + self.data = filepath_or_buffer + + if self.raw_json and lines: + self.data = self.combine_lines(self.data) + + def combine_lines(self, data): + """Combines a multi-line JSON document into a single document""" + # If given a json lines file, we break the string into lines, add + # commas and put it in a json list to make a valid json object. + lines = list(StringIO(data.strip())) + return '[' + ','.join(lines) + ']' + + def read(self): + """Read the whole JSON input into a pandas object""" + if self.raw_json: + return self._get_obj(self.data) + elif self.lines and self.chunksize: + return concat(self) + else: + if self.lines: + return self._get_obj(self.combine_lines(self.data.read())) + else: + return self._get_obj(self.data.read()) + + def _get_obj(self, json): + typ = self.typ + dtype = self.dtype + kwargs = { + "orient": self.orient, "dtype": self.dtype, + "convert_axes": self.convert_axes, + "convert_dates": self.convert_dates, + "keep_default_dates": self.keep_default_dates, "numpy": self.numpy, + "precise_float": self.precise_float, "date_unit": self.date_unit + } + obj = None + if typ == 'frame': + obj = FrameParser(json, **kwargs).parse() + + if typ == 'series' or obj is None: + if not isinstance(dtype, bool): + dtype = dict(data=dtype) + obj = SeriesParser(json, **kwargs).parse() + + return obj def __next__(self): - lines = list(islice(self.iterator, self.chunksize)) + lines = list(islice(self.data, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' - obj = _get_obj(json=lines_json, **self.kwargs) + obj = self._get_obj(lines_json) # Make sure that the returned objects have the right index obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) @@ -443,7 +465,7 @@ def __next__(self): else: try: - self.iterator.close() + self.data.close() except IOError: pass raise StopIteration From d0ea295996d62dbecb1357303d94535cd7bff1c9 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 12:04:31 -0700 Subject: [PATCH 25/62] make lines_json_df a fixture --- pandas/tests/io/json/test_pandas.py | 35 +++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 40375c1745415..a1f39ab78a25c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,13 +35,10 @@ _mixed_frame = _frame.copy() -def strio_lines_json_df(): +@pytest.fixture +def lines_json_df(): df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - return StringIO(df.to_json(lines=True, orient="records")) - - -def json_lines_to_df_chunked(jlines, chunksize): - return pd.concat(pd.read_json(jlines, lines=True, chunksize=chunksize)) + return df.to_json(lines=True, orient="records") class TestPandasContainer(object): @@ -1041,23 +1038,23 @@ def 
test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_readjson_chunks(self): + def test_readjson_chunks(self, lines_json_df): """Basic test that read_json(chunks=True) gives the same result as read_json(chunks=False)""" # GH17048: memory usage when lines=True - unchunked = pd.read_json(strio_lines_json_df(), lines=True) - chunked = json_lines_to_df_chunked(strio_lines_json_df(), 1) + unchunked = pd.read_json(StringIO(lines_json_df), lines=True) + chunked = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1)) assert_frame_equal(chunked, unchunked) - chunked_float = json_lines_to_df_chunked(strio_lines_json_df(), 1.0) + chunked_float = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0)) assert_frame_equal(chunked_float, unchunked) - def test_readjson_chunksize_requires_lines(self): + def test_readjson_chunksize_requires_lines(self, lines_json_df): msg = "chunksize should only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): - pd.read_json(strio_lines_json_df(), lines=False, chunksize=2) + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) def test_readjson_chunks_series(self): """Test reading line-format JSON to Series with chunksize param""" @@ -1073,13 +1070,13 @@ def test_readjson_chunks_series(self): assert_series_equal(chunked, unchunked) - def test_readjson_each_chunk(self): + def test_readjson_each_chunk(self, lines_json_df): """ Other tests check that the final result of read_json(chunksize=True) is correct. This checks that the intermediate chunks read in are correct. """ chunks = list( - pd.read_json(strio_lines_json_df(), lines=True, chunksize=2) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) ) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) @@ -1092,20 +1089,20 @@ def test_readjson_chunks_from_file(self): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) - def test_readjson_invalid_chunksize(self): + def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), 0) + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0)) with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), -1) + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1)) with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), -2.2) + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2)) with tm.assert_raises_regex(ValueError, msg): - json_lines_to_df_chunked(strio_lines_json_df(), 'foo') + pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo')) def test_latin_encoding(self): if compat.PY2: From bb3182d2e9d89210ec8b10a443db07dcd07141d1 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 12:11:51 -0700 Subject: [PATCH 26/62] remove unneeded concats in tests --- pandas/tests/io/json/test_pandas.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a1f39ab78a25c..c06e2659fb300 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1044,11 +1044,15 @@ def test_readjson_chunks(self, lines_json_df): # GH17048: memory usage when lines=True unchunked = 
pd.read_json(StringIO(lines_json_df), lines=True) - chunked = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1)) + chunked = pd.concat( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1) + ) assert_frame_equal(chunked, unchunked) - chunked_float = pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0)) + chunked_float = pd.concat( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0) + ) assert_frame_equal(chunked_float, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): @@ -1093,16 +1097,16 @@ def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0)) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0) with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1)) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1) with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2)) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2) with tm.assert_raises_regex(ValueError, msg): - pd.concat(pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo')) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo') def test_latin_encoding(self): if compat.PY2: From 8cc43ff786232a11c344acbc82429561a56b52f6 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 14 Sep 2017 12:14:50 -0700 Subject: [PATCH 27/62] parametrize some tests --- pandas/tests/io/json/test_pandas.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c06e2659fb300..5ffc9249f6899 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1043,17 +1043,14 @@ def test_readjson_chunks(self, lines_json_df): read_json(chunks=False)""" # GH17048: memory usage when lines=True - unchunked = pd.read_json(StringIO(lines_json_df), lines=True) - chunked = pd.concat( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1) - ) + for cs in [1, 1.0]: - assert_frame_equal(chunked, unchunked) + unchunked = pd.read_json(StringIO(lines_json_df), lines=True) + chunked = pd.concat( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) + ) - chunked_float = pd.concat( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=1.0) - ) - assert_frame_equal(chunked_float, unchunked) + assert_frame_equal(chunked, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): msg = "chunksize should only be passed if lines=True" @@ -1096,17 +1093,9 @@ def test_readjson_chunks_from_file(self): def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=0) - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-1) - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=-2.2) - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize='foo') + for cs in [0, -1, 2.2, 'foo']: + with tm.assert_raises_regex(ValueError, msg): + 
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) def test_latin_encoding(self): if compat.PY2: From de03462d6cd53663c498052a617f531f0de53a2b Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:30:38 -0700 Subject: [PATCH 28/62] add __close__ method to JsonReader and use it --- pandas/io/json/json.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 5dd419669c643..6bca67a28862d 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -363,7 +363,8 @@ class JsonReader(BaseIterator): Reads a JSON document to a pandas object. If initialized with ``lines=True`` and ``chunksize``, can be iterated over - ``chunksize`` lines at a time. + ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the + whole document. """ def __init__( self, filepath_or_buffer, orient, typ, dtype, convert_axes, @@ -426,9 +427,11 @@ def read(self): return concat(self) else: if self.lines: - return self._get_obj(self.combine_lines(self.data.read())) + to_return = self._get_obj(self.combine_lines(self.data.read())) else: - return self._get_obj(self.data.read()) + to_return = self._get_obj(self.data.read()) + self.__close__() + return to_return def _get_obj(self, json): typ = self.typ @@ -451,6 +454,12 @@ def _get_obj(self, json): return obj + def __close__(self): + try: + self.data.close() + except IOError: + pass + def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: @@ -464,10 +473,7 @@ def __next__(self): return obj else: - try: - self.data.close() - except IOError: - pass + self.__close__() raise StopIteration From 07b31c7f8c8c23c9b3900bf5b6aafbe7d1cae2b9 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:38:40 -0700 Subject: [PATCH 29/62] remove import io in docs --- doc/source/io.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 156e6649e8ebd..2bbe3f0738b92 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2065,8 +2065,7 @@ For line-delimited json files, pandas can also return an iterator which reads in df.to_json(orient='records', lines=True) # chunksize has no effect when reading a string. - import io - reader = pd.read_json(io.StringIO(jsonl), lines=True, chunksize=1) + reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: print(chunk) From 7d0642fd4ea9c5240382f4ea2f037ccc928c4c17 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:42:27 -0700 Subject: [PATCH 30/62] move read_json in whatsnew to Other Enhancements --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 693531e35e88f..d5d508d02cb73 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -162,6 +162,7 @@ Other Enhancements - :func:`MultiIndex.is_monotonic_decreasing` has been implemented. Previously returned ``False`` in all cases. (:issue:`16554`) - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`) - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) +- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. 
If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names @@ -471,7 +472,6 @@ Other API Changes - :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) -- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) From 398961b96630feca2fd06372409328bced65b198 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:57:01 -0700 Subject: [PATCH 31/62] move chunksize and lines validation into JsonReader --- pandas/io/json/json.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 6bca67a28862d..843039392dcb2 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -335,11 +335,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if chunksize is not None: - chunksize = _validate_integer("chunksize", chunksize, 1) - if not lines: - raise ValueError("chunksize should only be passed if lines=True") - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) @@ -388,6 +383,11 @@ def __init__( self.nrows_seen = 0 self.raw_json = False + if self.chunksize is not None: + self.chunksize = _validate_integer("chunksize", self.chunksize, 1) + if not self.lines: + raise ValueError("chunksize should only be passed if lines=True") + if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) From dfa29671e6a1b49c44dac2eedb4eb3a273cc5254 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 08:57:58 -0700 Subject: [PATCH 32/62] remove extraneous else --- pandas/io/json/json.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 843039392dcb2..47e3474ac6a54 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -349,8 +349,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize: return json_reader - else: - return json_reader.read() + return json_reader.read() class JsonReader(BaseIterator): From b0e4bb010298f9acd28bc235834b3c7a7376ac82 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:05:54 -0700 Subject: [PATCH 33/62] remove unnecessary cast to list --- pandas/io/json/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index
47e3474ac6a54..0d12ccdd4ade8 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -415,7 +415,7 @@ def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. - lines = list(StringIO(data.strip())) + lines = StringIO(data.strip()) return '[' + ','.join(lines) + ']' def read(self): From e3197c5546310035ed3dd74ffeebfc4bd36f29b7 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:13:12 -0700 Subject: [PATCH 34/62] move combine_lines call into read --- pandas/io/json/json.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 0d12ccdd4ade8..bf2f1426be8bc 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -408,9 +408,6 @@ def __init__( self.raw_json = True self.data = filepath_or_buffer - if self.raw_json and lines: - self.data = self.combine_lines(self.data) - def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" # If given a json lines file, we break the string into lines, add @@ -421,7 +418,10 @@ def combine_lines(self, data): def read(self): """Read the whole JSON input into a pandas object""" if self.raw_json: - return self._get_obj(self.data) + if self.lines: + return self._get_obj(self.combine_lines(self.data)) + else: + return self._get_obj(self.data) elif self.lines and self.chunksize: return concat(self) else: From 39f9881f08108a382c2fb1f750ecffd8cc4f7c69 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:15:00 -0700 Subject: [PATCH 35/62] remove another extraneous else --- pandas/io/json/json.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index bf2f1426be8bc..6fe5447258982 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -403,7 +403,7 @@ def __init__( self.raw_json = True self.data = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): - self.data = filepath_or_buffer + self.data = filepath_or_buffer else: self.raw_json = True self.data = filepath_or_buffer @@ -422,15 +422,15 @@ def read(self): return self._get_obj(self.combine_lines(self.data)) else: return self._get_obj(self.data) - elif self.lines and self.chunksize: + if self.lines and self.chunksize: return concat(self) + + if self.lines: + to_return = self._get_obj(self.combine_lines(self.data.read())) else: - if self.lines: - to_return = self._get_obj(self.combine_lines(self.data.read())) - else: - to_return = self._get_obj(self.data.read()) - self.__close__() - return to_return + to_return = self._get_obj(self.data.read()) + self.__close__() + return to_return def _get_obj(self, json): typ = self.typ From c2247c314e53058460290d3ab5fd66ca0087efed Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:26:53 -0700 Subject: [PATCH 36/62] always close JsonReader --- pandas/io/json/json.py | 36 ++++++++++++++++------------- pandas/tests/io/json/test_pandas.py | 2 +- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 6fe5447258982..29f38bb6753c4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -385,7 +385,7 @@ def __init__( if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: - raise ValueError("chunksize should only 
be passed if lines=True") + raise ValueError("chunksize can only be passed if lines=True") if isinstance(filepath_or_buffer, compat.string_types): try: @@ -419,20 +419,24 @@ def read(self): """Read the whole JSON input into a pandas object""" if self.raw_json: if self.lines: - return self._get_obj(self.combine_lines(self.data)) + obj = self._get_object_parser(self.combine_lines(self.data)) else: - return self._get_obj(self.data) - if self.lines and self.chunksize: - return concat(self) - - if self.lines: - to_return = self._get_obj(self.combine_lines(self.data.read())) + obj = self._get_object_parser(self.data) + elif self.lines and self.chunksize: + obj = concat(self) else: - to_return = self._get_obj(self.data.read()) - self.__close__() - return to_return - def _get_obj(self, json): + if self.lines: + obj = self._get_object_parser( + self.combine_lines(self.data.read()) + ) + else: + obj = self._get_object_parser(self.data.read()) + self.close() + return obj + + def _get_object_parser(self, json): + """parses a json document into a pandas object""" typ = self.typ dtype = self.dtype kwargs = { @@ -453,17 +457,17 @@ def _get_obj(self, json): return obj - def __close__(self): + def close(self): try: self.data.close() - except IOError: + except (IOError, AttributeError): pass def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: lines_json = '[' + ','.join(lines) + ']' - obj = self._get_obj(lines_json) + obj = self._get_object_parser(lines_json) # Make sure that the returned objects have the right index obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) @@ -472,7 +476,7 @@ def __next__(self): return obj else: - self.__close__() + self.close() raise StopIteration diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5ffc9249f6899..b4ad9264e4dd2 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1053,7 +1053,7 @@ def test_readjson_chunks(self, lines_json_df): assert_frame_equal(chunked, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): - msg = "chunksize should only be passed if lines=True" + msg = "chunksize can only be passed if lines=True" with tm.assert_raises_regex(ValueError, msg): pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) From 46d8a685c1967bdfd302031112bd4cac0d9e77e3 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:43:34 -0700 Subject: [PATCH 37/62] add test that read_json closes file correctly --- pandas/tests/io/json/test_pandas.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b4ad9264e4dd2..fb71a293fef8e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1090,6 +1090,19 @@ def test_readjson_chunks_from_file(self): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) + def test_readjson_chunks_closes(self): + for chunksize in [None, 1]: + with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + f = open(path, 'r') + if chunksize is not None: + pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) + else: + pd.read_json(f, lines=True) + assert f.closed, \ + "didn't close file with chunksize = %s" % chunksize + def test_readjson_invalid_chunksize(self, lines_json_df): msg = r"'chunksize' must be an integer >=1" From 
066e26dc5617e2a49f0365299b4faa09e428bca3 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:45:16 -0700 Subject: [PATCH 38/62] minor formatting fixups --- pandas/io/json/json.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 29f38bb6753c4..9b5f2f67921f9 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -360,11 +360,9 @@ class JsonReader(BaseIterator): ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the whole document. """ - def __init__( - self, filepath_or_buffer, orient, typ, dtype, convert_axes, - convert_dates, keep_default_dates, numpy, precise_float, date_unit, - encoding, lines, chunksize, raw_json=False - ): + def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, precise_float, + date_unit, encoding, lines, chunksize, raw_json=False): self.path_or_buf = filepath_or_buffer self.orient = orient @@ -391,8 +389,7 @@ def __init__( try: exists = os.path.exists(filepath_or_buffer) - # if the filepath is too long will raise here - # 5874 + # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): exists = False @@ -469,7 +466,7 @@ def __next__(self): lines_json = '[' + ','.join(lines) + ']' obj = self._get_object_parser(lines_json) - # Make sure that the returned objects have the right index + # Make sure that the returned objects have the right index. obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) self.nrows_seen += len(obj) From 08e8b6cd06f779d766367bf666fadc5e12e404c5 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 09:53:02 -0700 Subject: [PATCH 39/62] remove extraneous else --- pandas/io/json/json.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 9b5f2f67921f9..762729d34e51a 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -472,9 +472,8 @@ def __next__(self): return obj - else: - self.close() - raise StopIteration + self.close() + raise StopIteration class Parser(object): From 1ac6953bd380b790b42887c8df329a790da801af Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 12:43:14 -0700 Subject: [PATCH 40/62] add benchmarks for read_json --- asv_bench/benchmarks/packers.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 24f80cc836dd4..743e1756e26fd 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -85,6 +85,25 @@ def time_packers_read_json(self): pd.read_json(self.f, orient='split') +class packers_read_json_lines(_Packers): + + def setup(self): + self._setup() + self.df.to_json(self.f, orient="records", lines=True) + + def time_packers_read_json_lines(self): + pd.read_json(self.f, lines=True) + +class packers_read_json_lines_chunks(_Packers): + def setup(self): + self._setup() + self.df.to_json('abc.json', orient="records", lines=True) + self.df.index = np.arange(self.N) + + def time_packers_read_json_lines_chunks(self): + chunksize = int(self.C / 5.0) + pd.read_json('abc.json', lines=True, chunksize=chunksize) + class packers_read_json_date_index(_Packers): def setup(self): From 0782df968b7d34818a535536e2adc4fe0fd6549d Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:09:41 -0700 Subject: [PATCH 41/62] update benchmarks --- asv_bench/benchmarks/packers.py | 5 ++--- 1 file 
changed, 2 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 743e1756e26fd..f3b298041147a 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -97,12 +97,11 @@ def time_packers_read_json_lines(self): class packers_read_json_lines_chunks(_Packers): def setup(self): self._setup() - self.df.to_json('abc.json', orient="records", lines=True) - self.df.index = np.arange(self.N) + self.df.to_json(self.f, orient="records", lines=True) def time_packers_read_json_lines_chunks(self): chunksize = int(self.C / 5.0) - pd.read_json('abc.json', lines=True, chunksize=chunksize) + next(iter(pd.read_json(self.f, lines=True, chunksize=chunksize))) class packers_read_json_date_index(_Packers): From 014d493b51c0d3d4442c2ef4138c5824864bf5b0 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:20:31 -0700 Subject: [PATCH 42/62] move json_lines tests to io_bench --- asv_bench/benchmarks/io_bench.py | 15 +++++++++++++++ asv_bench/benchmarks/packers.py | 18 ------------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 52064d2cdb8a2..12d1182cafc4c 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -192,3 +192,18 @@ def time_read_nrows(self, compression, engine): ext = ".bz2" pd.read_csv(self.big_fname + ext, nrows=10, compression=compression, engine=engine) + +class read_json_lines(object): + goal_time = 0.2 + + def setup(self): + self.N = 1000000 + self.C = 5 + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) + self.df.to_json("__test__.json",orient="records",lines=True) + + def time_read_json_lines(self): + pd.read_json("__test__.json", lines=True) + + def time_read_json_lines_chunk(self): + pd.read_json("__test__.json", lines=True, chunksize=self.N/4) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index f3b298041147a..24f80cc836dd4 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -85,24 +85,6 @@ def time_packers_read_json(self): pd.read_json(self.f, orient='split') -class packers_read_json_lines(_Packers): - - def setup(self): - self._setup() - self.df.to_json(self.f, orient="records", lines=True) - - def time_packers_read_json_lines(self): - pd.read_json(self.f, lines=True) - -class packers_read_json_lines_chunks(_Packers): - def setup(self): - self._setup() - self.df.to_json(self.f, orient="records", lines=True) - - def time_packers_read_json_lines_chunks(self): - chunksize = int(self.C / 5.0) - next(iter(pd.read_json(self.f, lines=True, chunksize=chunksize))) class packers_read_json_date_index(_Packers): def setup(self): From a913d8e011be316f9b87134c45fa3e87d3bd44a2 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:27:56 -0700 Subject: [PATCH 43/62] add peakmem for jsonlines --- asv_bench/benchmarks/io_bench.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 12d1182cafc4c..b5b9de3d85f6c 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -193,6 +193,7 @@ def time_read_nrows(self, compression, engine): pd.read_csv(self.big_fname + ext, nrows=10, compression=compression, engine=engine) + + class read_json_lines(object): goal_time = 0.2 @@ -206,4 +207,10 @@ def time_read_json_lines(self):
pd.read_json("__test__.json", lines=True) def time_read_json_lines_chunk(self): - pd.read_json("__test__.json", lines=True, chunksize=self.N/4) + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) + + def peakmem_read_json_lines(self): + pd.read_json("__test__.json", lines=True) + + def peakmem_read_json_lines_chunk(self): + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) From ce7aef6fe6d957f01f6f82105cf64a72134ad345 Mon Sep 17 00:00:00 2001 From: louispotok Date: Fri, 15 Sep 2017 13:35:16 -0700 Subject: [PATCH 44/62] smaller benchmark --- asv_bench/benchmarks/io_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index b5b9de3d85f6c..aefced4a42a6f 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -198,7 +198,7 @@ class read_json_lines(object): goal_time = 0.2 def setup(self): - self.N = 1000000 + self.N = 100000 self.C = 5 self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) self.df.to_json("__test__.json",orient="records",lines=True) From 1dc15266ff7e68a667588967ea7c9aa3ff887d44 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 09:47:37 -0700 Subject: [PATCH 45/62] refactor JsonReader --- pandas/io/json/json.py | 88 ++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 762729d34e51a..58037d993c1c6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -12,7 +12,7 @@ _stringify_path, BaseIterator) from pandas.io.parsers import _validate_integer from pandas.core.common import AbstractMethodError -from pandas.core.reshape import concat +from pandas.core.reshape.concat import concat from pandas.io.formats.printing import pprint_thing from .normalize import _convert_to_line_delimits from .table_schema import build_table_schema @@ -378,13 +378,48 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 - self.raw_json = False if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") + self.fp_or_buf = filepath_or_buffer + data = self._get_data_from_filepath(filepath_or_buffer) + self.data = self._preprocess_data(data) + + def _preprocess_data(self, data): + """ + At this point, the data either has a `read` attribute (e.g. a file + object or a StringIO) or is a string that is a JSON document. + """ + if hasattr(data, 'read'): + if self.chunksize: + data = data + else: + data = data.read() + + else: + if self.chunksize: + data = StringIO(data) + else: + data = data + + return data + + def _get_data_from_filepath(self, filepath_or_buffer): + """ + read_json accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. JSON string + + This function turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. 
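+
+        For example (an illustrative sketch; 'data.json' is a stand-in
+        path, not something defined in this patch):
+
+            read_json('data.json', lines=True)             # 1. filepath
+            read_json(open('data.json'), lines=True)       # 2. file-like object
+            read_json('{"a": 1}\n{"a": 2}\n', lines=True)  # 3. JSON string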
+ """ + + data = None + if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -394,16 +429,13 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, exists = False if exists: - self.data, _ = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) - else: - self.raw_json = True - self.data = filepath_or_buffer - elif hasattr(filepath_or_buffer, 'read'): - self.data = filepath_or_buffer - else: - self.raw_json = True - self.data = filepath_or_buffer + data, _ = _get_handle(filepath_or_buffer, 'r', + encoding=self.encoding) + + if not data: + data = filepath_or_buffer + + return data def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" @@ -414,21 +446,12 @@ def combine_lines(self, data): def read(self): """Read the whole JSON input into a pandas object""" - if self.raw_json: - if self.lines: - obj = self._get_object_parser(self.combine_lines(self.data)) - else: - obj = self._get_object_parser(self.data) - elif self.lines and self.chunksize: + if self.lines and self.chunksize: obj = concat(self) + elif self.lines: + obj = self._get_object_parser(self.combine_lines(self.data)) else: - - if self.lines: - obj = self._get_object_parser( - self.combine_lines(self.data.read()) - ) - else: - obj = self._get_object_parser(self.data.read()) + obj = self._get_object_parser(self.data) self.close() return obj @@ -455,14 +478,29 @@ def _get_object_parser(self, json): return obj def close(self): + """ + If self.chunksize, self.data may need closing. + If not, self.fp_or_buff may need closing. + """ try: self.data.close() except (IOError, AttributeError): pass + try: + self.fp_or_buf.close() + except(IOError, AttributeError): + pass + def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: + + # _get_object_parser can't handle multiple empty lines, so we just + # pass it one and it will correctly return an empty object + if all(line=="\n" for line in lines): + lines = lines[0] + lines_json = '[' + ','.join(lines) + ']' obj = self._get_object_parser(lines_json) From 03b6069c8222a3c40655eb4624e9e440aa98eb6b Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:07:14 -0700 Subject: [PATCH 46/62] add test for reading with multiple empty lines --- pandas/tests/io/json/test_pandas.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fb71a293fef8e..fe9950485c7fa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1110,6 +1110,28 @@ def test_readjson_invalid_chunksize(self, lines_json_df): with tm.assert_raises_regex(ValueError, msg): pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) + def test_readjson_chunks_multiple_empty_lines(self): + j = """ + + {"A":1,"B":4} + + + + {"A":2,"B":5} + + + + + + + + {"A":3,"B":6} + """ + for chunksize in [None, 1, 2]: + orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + test = pd.concat(pd.read_json(j, lines=True, chunksize=chunksize)) + tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( From aef6bbc500605f4fad97ec2fe997cf0c2c21432c Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:50:11 -0700 Subject: [PATCH 47/62] add support for JSON docs with multiple consecutive newlines --- pandas/io/json/json.py | 9 +++------ pandas/tests/io/json/test_pandas.py | 4 
+++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 58037d993c1c6..f46f4d87a30f4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -441,7 +441,8 @@ def combine_lines(self, data): """Combines a multi-line JSON document into a single document""" # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. - lines = StringIO(data.strip()) + + lines = filter(None, data.strip().split('\n')) return '[' + ','.join(lines) + ']' def read(self): @@ -496,11 +497,7 @@ def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: - # _get_object_parser can't handle multiple empty lines, so we just - # pass it one and it will correctly return an empty object - if all(line=="\n" for line in lines): - lines = lines[0] - + lines = filter(None, map(lambda x: x.strip(), lines)) lines_json = '[' + ','.join(lines) + ']' obj = self._get_object_parser(lines_json) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fe9950485c7fa..0733eb8e9526f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1129,7 +1129,9 @@ def test_readjson_chunks_multiple_empty_lines(self): """ for chunksize in [None, 1, 2]: orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - test = pd.concat(pd.read_json(j, lines=True, chunksize=chunksize)) + test = pd.read_json(j, lines=True, chunksize=chunksize) + if chunksize is not None: + test = pd.concat(test) tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) def test_latin_encoding(self): From 30e40436a353376c81c208734a6798ac39e7b356 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:51:04 -0700 Subject: [PATCH 48/62] remove raw_json init param --- pandas/io/json/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index f46f4d87a30f4..efe4fc9010a64 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -362,7 +362,7 @@ class JsonReader(BaseIterator): """ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, - date_unit, encoding, lines, chunksize, raw_json=False): + date_unit, encoding, lines, chunksize): self.path_or_buf = filepath_or_buffer self.orient = orient From 7dae78a9bf938479fa9e7bf57adcedeaaca8bf33 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:55:31 -0700 Subject: [PATCH 49/62] DRY for combining lines --- pandas/io/json/json.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index efe4fc9010a64..7083d82dd9d63 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -437,20 +437,20 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def combine_lines(self, data): - """Combines a multi-line JSON document into a single document""" - # If given a json lines file, we break the string into lines, add - # commas and put it in a json list to make a valid json object. 
- - lines = filter(None, data.strip().split('\n')) + def combine_lines(self, lines): + """Combines a list of JSON objects into one JSON object""" + lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']' + def read(self): """Read the whole JSON input into a pandas object""" if self.lines and self.chunksize: obj = concat(self) elif self.lines: obj = self._get_object_parser( - self.combine_lines(self.data) + self.combine_lines(self.data.split('\n')) ) else: obj = self._get_object_parser(self.data) self.close() @@ -496,9 +496,7 @@ def close(self): def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: - - lines = filter(None, map(lambda x: x.strip(), lines)) - lines_json = '[' + ','.join(lines) + ']' + lines_json = self.combine_lines(lines) obj = self._get_object_parser(lines_json) From fe95445eb337c82da8c95ef6a50f607fccd05f0b Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 10:59:27 -0700 Subject: [PATCH 50/62] use floor division in asv bench --- asv_bench/benchmarks/io_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index aefced4a42a6f..a735c7bf88a81 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -207,10 +207,10 @@ def time_read_json_lines(self): pd.read_json("__test__.json", lines=True) def time_read_json_lines_chunk(self): - pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N//4)) def peakmem_read_json_lines(self): pd.read_json("__test__.json", lines=True) def peakmem_read_json_lines_chunk(self): - pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N/4)) + pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N//4)) From e41124af9f442ebd84c89d8f4b00676d70ba39da Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 11:03:04 -0700 Subject: [PATCH 51/62] add teardown to asv bench --- asv_bench/benchmarks/io_bench.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index a735c7bf88a81..93273955a29b9 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -1,3 +1,4 @@ +import os from .pandas_vb_common import * from pandas import concat, Timestamp, compat try: @@ -196,21 +197,28 @@ def time_read_nrows(self, compression, engine): class read_json_lines(object): goal_time = 0.2 + fname = "__test__.json" def setup(self): self.N = 100000 self.C = 5 self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) - self.df.to_json("__test__.json",orient="records",lines=True) + self.df.to_json(self.fname,orient="records",lines=True) + + def teardown(self): + try: + os.remove(self.fname) + except OSError: + pass def time_read_json_lines(self): - pd.read_json("__test__.json", lines=True) + pd.read_json(self.fname, lines=True) def time_read_json_lines_chunk(self): - pd.concat(pd.read_json("__test__.json", lines=True, chunksize=self.N//4)) + pd.concat(pd.read_json(self.fname, lines=True,
chunksize=self.N//4)) + pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4)) From 9cfd012ee393d08634c0c7ed159da2b149e442e5 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 11:09:20 -0700 Subject: [PATCH 52/62] add docs --- pandas/io/json/json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 7083d82dd9d63..58f9d1070d6c4 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -392,6 +392,9 @@ def _preprocess_data(self, data): """ At this point, the data either has a `read` attribute (e.g. a file object or a StringIO) or is a string that is a JSON document. + + If self.chunksize, we want to prepare the data for the `__next__` + method. Otherwise, we want to read it into memory for the `read` method. """ if hasattr(data, 'read'): if self.chunksize: @@ -442,7 +445,6 @@ def combine_lines(self, lines): lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']' - def read(self): """Read the whole JSON input into a pandas object""" if self.lines and self.chunksize: From 035ca84b1296eb181268a366560209351ff007cf Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 13:59:48 -0700 Subject: [PATCH 53/62] pep fixup --- pandas/io/json/json.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 58f9d1070d6c4..84fba8d751abe 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -393,8 +393,8 @@ def _preprocess_data(self, data): At this point, the data either has a `read` attribute (e.g. a file object or a StringIO) or is a string that is a JSON document. - If self.chunksize, we want to prepare the data for the `__next__` - method. Otherwise, we want to read it into memory for the `read` method. + If self.chunksize, we prepare the data for the `__next__` method. + Otherwise, we read it into memory for the `read` method. """ if hasattr(data, 'read'): if self.chunksize: @@ -417,7 +417,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): 2. file-like object (e.g. open file object, StringIO) 3. JSON string - This function turns (1) into (2) to simplify the rest of the processing. + This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. """ From 4c92287e04b2d183dfc52b024ed37aadfb4251b8 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 17:50:08 -0700 Subject: [PATCH 54/62] update documentation --- doc/source/io.rst | 1 - pandas/io/json/json.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 2bbe3f0738b92..55ca43b0d03c3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2064,7 +2064,6 @@ For line-delimited json files, pandas can also return an iterator which reads in df df.to_json(orient='records', lines=True) - # chunksize has no effect when reading a string. reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 84fba8d751abe..382e40e081437 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -274,7 +274,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - If the JSON input is a string, this argument has no effect. .. 
versionadded:: 0.21.0 @@ -354,7 +353,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class JsonReader(BaseIterator): """ - Reads a JSON document to a pandas object. + JsonReader provides an interface for reading in a JSON file. If initialized with ``lines=True`` and ``chunksize``, can be iterated over ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the From 61178be8aee8475537669fbc95307b6bf2ba72d8 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 17:54:44 -0700 Subject: [PATCH 55/62] simplify JsonReader._preprocess_data --- pandas/io/json/json.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 382e40e081437..fe79f596cec7c 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -395,17 +395,10 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, 'read'): - if self.chunksize: - data = data - else: - data = data.read() - - else: - if self.chunksize: - data = StringIO(data) - else: - data = data + if hasattr(data, 'read') and not self.chunksize: + data = data.read() + if not hasattr(data, 'read') and self.chunksize: + data = StringIO(data) return data From a2841875de1e7210cf4a3d65062447221837692e Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 17:58:27 -0700 Subject: [PATCH 56/62] simplify _get_data_from_filepath --- pandas/io/json/json.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index fe79f596cec7c..4feea78d037e6 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -413,22 +413,20 @@ def _get_data_from_filepath(self, filepath_or_buffer): It returns input types (2) and (3) unchanged. """ - data = None + data = filepath_or_buffer - if isinstance(filepath_or_buffer, compat.string_types): + if isinstance(data, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) # gh-5874: if the filepath is too long will raise here except (TypeError, ValueError): - exists = False - - if exists: - data, _ = _get_handle(filepath_or_buffer, 'r', - encoding=self.encoding) + pass - if not data: - data = filepath_or_buffer + else: + if exists: + data, _ = _get_handle(filepath_or_buffer, 'r', + encoding=self.encoding) return data From 55170ddb08b90d4a22c787f76e9e2e7f76317d59 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 18:13:41 -0700 Subject: [PATCH 57/62] Update read_json tests Split out tests with lines=True into separate test class Parametrize tests Replace """ comments with #. 
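
For reference, the parametrized shape of the invalid-chunksize test (a sketch; names exactly as in the diff below) is:

    @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
    def test_readjson_invalid_chunksize(self, lines_json_df, chunksize):
        msg = r"'chunksize' must be an integer >=1"
        with tm.assert_raises_regex(ValueError, msg):
            pd.read_json(StringIO(lines_json_df), lines=True,
                         chunksize=chunksize)

pytest then reports each chunksize value as its own test case instead of stopping at the first failing value inside a hand-written loop.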
--- pandas/tests/io/json/test_pandas.py | 183 ++++++++++++++-------------- 1 file changed, 92 insertions(+), 91 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0733eb8e9526f..6d2a1c0a01706 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -991,6 +991,62 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp + def test_latin_encoding(self): + if compat.PY2: + tm.assert_raises_regex( + TypeError, r'\[unicode\] is not implemented as a table column') + return + + # GH 13774 + pytest.skip("encoding not implemented in .to_json(), " + "xref #13774") + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + + # not sure how to remove latin-1 from code in python 2 and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(Series(val, dtype=dtype)) + + def roundtrip(s, encoding='latin-1'): + with ensure_clean('test.json') as path: + s.to_json(path, encoding=encoding) + retr = read_json(path, encoding=encoding) + assert_series_equal(s, retr, check_categorical=False) + + for s in examples: + roundtrip(s) + + def test_data_frame_size_after_to_json(self): + # GH15344 + df = DataFrame({'a': [str(1)]}) + + size_before = df.memory_usage(index=True, deep=True).sum() + df.to_json() + size_after = df.memory_usage(index=True, deep=True).sum() + + assert size_before == size_after + + +class TestPandasJsonLines(object): + def test_read_jsonl(self): # GH9180 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) @@ -1038,19 +1094,18 @@ def test_to_jsonl(self): assert result == expected assert_frame_equal(pd.read_json(result, lines=True), df) - def test_readjson_chunks(self, lines_json_df): - """Basic test that read_json(chunks=True) gives the same result as - read_json(chunks=False)""" + @pytest.mark.parametrize("chunksize", [1, 1.0]) + def test_readjson_chunks(self, lines_json_df, chunksize): + # Basic test that read_json(chunks=True) gives the same result as + # read_json(chunks=False) # GH17048: memory usage when lines=True - for cs in [1, 1.0]: + unchunked = pd.read_json(StringIO(lines_json_df), lines=True) + reader = pd.read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) + chunked = pd.concat(reader) - unchunked = pd.read_json(StringIO(lines_json_df), lines=True) - chunked = pd.concat( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) - ) - - assert_frame_equal(chunked, unchunked) + assert_frame_equal(chunked, unchunked) def test_readjson_chunksize_requires_lines(self, lines_json_df): msg = "chunksize can only be passed if lines=True" @@ -1058,7 +1113,7 @@ def test_readjson_chunksize_requires_lines(self, lines_json_df): pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) def test_readjson_chunks_series(self): - """Test reading line-format JSON to Series with chunksize param""" + # Test reading line-format JSON to Series with chunksize param s = pd.Series({'A': 1, 'B': 2}) strio = 
StringIO(s.to_json(lines=True, orient="records")) @@ -1072,10 +1127,8 @@ def test_readjson_chunks_series(self): assert_series_equal(chunked, unchunked) def test_readjson_each_chunk(self, lines_json_df): - """ - Other tests check that the final result of read_json(chunksize=True) is - correct. This checks that the intermediate chunks read in are correct. - """ + # Other tests check that the final result of read_json(chunksize=True) + # is correct. This checks the intermediate chunks. chunks = list( pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) ) @@ -1090,27 +1143,29 @@ def test_readjson_chunks_from_file(self): unchunked = pd.read_json(path, lines=True) assert_frame_equal(unchunked, chunked) - def test_readjson_chunks_closes(self): - for chunksize in [None, 1]: - with ensure_clean('test.json') as path: - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - df.to_json(path, lines=True, orient="records") - f = open(path, 'r') - if chunksize is not None: - pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) - else: - pd.read_json(f, lines=True) - assert f.closed, \ - "didn't close file with chunksize = %s" % chunksize + @pytest.mark.parametrize("chunksize", [None, 1]) + def test_readjson_chunks_closes(self, chunksize): + with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + f = open(path, 'r') + if chunksize is not None: + pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) + else: + pd.read_json(f, lines=True) + assert f.closed, \ + "didn't close file with chunksize = %s" % chunksize - def test_readjson_invalid_chunksize(self, lines_json_df): + @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) + def test_readjson_invalid_chunksize(self, lines_json_df, chunksize): msg = r"'chunksize' must be an integer >=1" - for cs in [0, -1, 2.2, 'foo']: - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs) + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) - def test_readjson_chunks_multiple_empty_lines(self): + @pytest.mark.parametrize("chunksize", [None, 1, 2]) + def test_readjson_chunks_multiple_empty_lines(self, chunksize): j = """ {"A":1,"B":4} @@ -1127,62 +1182,8 @@ def test_readjson_chunks_multiple_empty_lines(self): {"A":3,"B":6} """ - for chunksize in [None, 1, 2]: - orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - test = pd.read_json(j, lines=True, chunksize=chunksize) - if chunksize is not None: - test = pd.concat(test) - tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) - - def test_latin_encoding(self): - if compat.PY2: - tm.assert_raises_regex( - TypeError, r'\[unicode\] is not implemented as a table column') - return - - # GH 13774 - pytest.skip("encoding not implemented in .to_json(), " - "xref #13774") - - values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'a', b'b', b'c'], - [b'EE, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], - [b'', b'a', b'b', b'c'], - [b'\xf8\xfc', b'a', b'b', b'c'], - [b'A\xf8\xfc', b'', b'a', b'b', b'c'], - [np.nan, b'', b'b', b'c'], - [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] - - def _try_decode(x, encoding='latin-1'): - try: - return x.decode(encoding) - except AttributeError: - return x - - # not sure how to remove latin-1 from code in python 2 and 3 - values = [[_try_decode(x) for x in y] for y in values] - - examples = [] - for 
dtype in ['category', object]: - for val in values: - examples.append(Series(val, dtype=dtype)) - - def roundtrip(s, encoding='latin-1'): - with ensure_clean('test.json') as path: - s.to_json(path, encoding=encoding) - retr = read_json(path, encoding=encoding) - assert_series_equal(s, retr, check_categorical=False) - - for s in examples: - roundtrip(s) - - def test_data_frame_size_after_to_json(self): - # GH15344 - df = DataFrame({'a': [str(1)]}) - - size_before = df.memory_usage(index=True, deep=True).sum() - df.to_json() - size_after = df.memory_usage(index=True, deep=True).sum() - - assert size_before == size_after + orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + test = pd.read_json(j, lines=True, chunksize=chunksize) + if chunksize is not None: + test = pd.concat(test) + tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) From 1d7087dc53872b2c9852f75ca4d88c44a2511dee Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 21 Sep 2017 18:50:02 -0700 Subject: [PATCH 58/62] JsonReader should only close if it opened --- pandas/io/json/json.py | 22 ++++++++++------------ pandas/tests/io/json/test_pandas.py | 16 +++++++++------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4feea78d037e6..9f0ad24639b7c 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -377,13 +377,13 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 + self.should_close = False if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") - self.fp_or_buf = filepath_or_buffer data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) @@ -427,6 +427,8 @@ def _get_data_from_filepath(self, filepath_or_buffer): if exists: data, _ = _get_handle(filepath_or_buffer, 'r', encoding=self.encoding) + self.should_close = True + self.open_stream = data return data @@ -472,18 +474,14 @@ def _get_object_parser(self, json): def close(self): """ - If self.chunksize, self.data may need closing. - If not, self.fp_or_buff may need closing. + If we opened a stream earlier, in _get_data_from_filepath, we should + close it. If an open stream or file was passed, we leave it open. 
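+
+        For example (sketch): read_json('data.json', lines=True, chunksize=1)
+        opens and later closes its own handle, while
+        read_json(open('data.json'), lines=True, chunksize=1) leaves the
+        caller's handle open.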
""" - try: - self.data.close() - except (IOError, AttributeError): - pass - - try: - self.fp_or_buf.close() - except(IOError, AttributeError): - pass + if self.should_close: + try: + self.open_stream.close() + except (IOError, AttributeError): + pass def __next__(self): lines = list(islice(self.data, self.chunksize)) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6d2a1c0a01706..49dd71ac0e961 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -10,6 +10,7 @@ read_json, compat) from datetime import timedelta import pandas as pd +from pandas.io.json.json import JsonReader from pandas.util.testing import (assert_almost_equal, assert_frame_equal, assert_series_equal, network, @@ -1148,13 +1149,14 @@ def test_readjson_chunks_closes(self, chunksize): with ensure_clean('test.json') as path: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - f = open(path, 'r') - if chunksize is not None: - pd.concat(pd.read_json(f, lines=True, chunksize=chunksize)) - else: - pd.read_json(f, lines=True) - assert f.closed, \ - "didn't close file with chunksize = %s" % chunksize + reader = JsonReader( + path, orient=None, typ="frame", dtype=True, convert_axes=True, + convert_dates=True, keep_default_dates=True, numpy=False, + precise_float=False, date_unit=None, encoding=None, + lines=True, chunksize=chunksize) + reader.read() + assert reader.open_stream.closed, "didn't close stream with \ + chunksize = %s" % chunksize @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) def test_readjson_invalid_chunksize(self, lines_json_df, chunksize): From 6a76c557939d0ad155ff1db5cc77b8cbe3d897ec Mon Sep 17 00:00:00 2001 From: louispotok Date: Tue, 26 Sep 2017 13:18:37 -0700 Subject: [PATCH 59/62] split out json readlines to sep test class --- pandas/tests/io/json/test_pandas.py | 150 ---------------------- pandas/tests/io/json/test_readlines.py | 167 +++++++++++++++++++++++++ 2 files changed, 167 insertions(+), 150 deletions(-) create mode 100644 pandas/tests/io/json/test_readlines.py diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 49dd71ac0e961..b46e3b3033a53 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -10,7 +10,6 @@ read_json, compat) from datetime import timedelta import pandas as pd -from pandas.io.json.json import JsonReader from pandas.util.testing import (assert_almost_equal, assert_frame_equal, assert_series_equal, network, @@ -36,12 +35,6 @@ _mixed_frame = _frame.copy() -@pytest.fixture -def lines_json_df(): - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - return df.to_json(lines=True, orient="records") - - class TestPandasContainer(object): def setup_method(self, method): @@ -1046,146 +1039,3 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after -class TestPandasJsonLines(object): - - def test_read_jsonl(self): - # GH9180 - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - assert_frame_equal(result, expected) - - def test_read_jsonl_unicode_chars(self): - # GH15132: non-ascii unicode characters - # \u201d == RIGHT DOUBLE QUOTATION MARK - - # simulate file handle - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - json = StringIO(json) - result = read_json(json, lines=True) - expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], - columns=['a', 
'b']) - assert_frame_equal(result, expected) - - # simulate string - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True) - expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], - columns=['a', 'b']) - assert_frame_equal(result, expected) - - def test_to_jsonl(self): - # GH9180 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.to_json(orient="records", lines=True) - expected = '{"a":1,"b":2}\n{"a":1,"b":2}' - assert result == expected - - df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) - result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' - assert result == expected - assert_frame_equal(pd.read_json(result, lines=True), df) - - # GH15096: escaped characters in columns and data - df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], - columns=["a\\", 'b']) - result = df.to_json(orient="records", lines=True) - expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' - '{"a\\\\":"foo\\"","b":"bar"}') - assert result == expected - assert_frame_equal(pd.read_json(result, lines=True), df) - - @pytest.mark.parametrize("chunksize", [1, 1.0]) - def test_readjson_chunks(self, lines_json_df, chunksize): - # Basic test that read_json(chunks=True) gives the same result as - # read_json(chunks=False) - # GH17048: memory usage when lines=True - - unchunked = pd.read_json(StringIO(lines_json_df), lines=True) - reader = pd.read_json(StringIO(lines_json_df), lines=True, - chunksize=chunksize) - chunked = pd.concat(reader) - - assert_frame_equal(chunked, unchunked) - - def test_readjson_chunksize_requires_lines(self, lines_json_df): - msg = "chunksize can only be passed if lines=True" - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) - - def test_readjson_chunks_series(self): - # Test reading line-format JSON to Series with chunksize param - s = pd.Series({'A': 1, 'B': 2}) - - strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = pd.read_json(strio, lines=True, typ='Series') - - strio = StringIO(s.to_json(lines=True, orient="records")) - chunked = pd.concat(pd.read_json( - strio, lines=True, typ='Series', chunksize=1 - )) - - assert_series_equal(chunked, unchunked) - - def test_readjson_each_chunk(self, lines_json_df): - # Other tests check that the final result of read_json(chunksize=True) - # is correct. This checks the intermediate chunks. 
- chunks = list( - pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) - ) - assert chunks[0].shape == (2, 2) - assert chunks[1].shape == (1, 2) - - def test_readjson_chunks_from_file(self): - with ensure_clean('test.json') as path: - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - df.to_json(path, lines=True, orient="records") - chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) - unchunked = pd.read_json(path, lines=True) - assert_frame_equal(unchunked, chunked) - - @pytest.mark.parametrize("chunksize", [None, 1]) - def test_readjson_chunks_closes(self, chunksize): - with ensure_clean('test.json') as path: - df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - df.to_json(path, lines=True, orient="records") - reader = JsonReader( - path, orient=None, typ="frame", dtype=True, convert_axes=True, - convert_dates=True, keep_default_dates=True, numpy=False, - precise_float=False, date_unit=None, encoding=None, - lines=True, chunksize=chunksize) - reader.read() - assert reader.open_stream.closed, "didn't close stream with \ - chunksize = %s" % chunksize - - @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) - def test_readjson_invalid_chunksize(self, lines_json_df, chunksize): - msg = r"'chunksize' must be an integer >=1" - - with tm.assert_raises_regex(ValueError, msg): - pd.read_json(StringIO(lines_json_df), lines=True, - chunksize=chunksize) - - @pytest.mark.parametrize("chunksize", [None, 1, 2]) - def test_readjson_chunks_multiple_empty_lines(self, chunksize): - j = """ - - {"A":1,"B":4} - - - - {"A":2,"B":5} - - - - - - - - {"A":3,"B":6} - """ - orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - test = pd.read_json(j, lines=True, chunksize=chunksize) - if chunksize is not None: - test = pd.concat(test) - tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py new file mode 100644 index 0000000000000..711cf89de29e3 --- /dev/null +++ b/pandas/tests/io/json/test_readlines.py @@ -0,0 +1,167 @@ +import pytest +import pandas as pd +from pandas import DataFrame, read_json +from pandas.compat import StringIO +from pandas.io.json.json import JsonReader +import pandas.util.testing as tm +from pandas.util.testing import (assert_frame_equal, assert_series_equal, + ensure_clean) + + +@pytest.fixture +def lines_json_df(): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + return df.to_json(lines=True, orient="records") + + +def test_read_jsonl(): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + +def test_read_jsonl_unicode_chars(): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + +def test_to_jsonl(): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = 
'{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + assert_frame_equal(read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + assert result == expected + assert_frame_equal(read_json(result, lines=True), df) + + +@pytest.mark.parametrize("chunksize", [1, 1.0]) +def test_readjson_chunks(lines_json_df, chunksize): + # Basic test that read_json(chunks=True) gives the same result as + # read_json(chunks=False) + # GH17048: memory usage when lines=True + + unchunked = read_json(StringIO(lines_json_df), lines=True) + reader = read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) + chunked = pd.concat(reader) + + assert_frame_equal(chunked, unchunked) + + +def test_readjson_chunksize_requires_lines(lines_json_df): + msg = "chunksize can only be passed if lines=True" + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) + + +def test_readjson_chunks_series(): + # Test reading line-format JSON to Series with chunksize param + s = pd.Series({'A': 1, 'B': 2}) + + strio = StringIO(s.to_json(lines=True, orient="records")) + unchunked = pd.read_json(strio, lines=True, typ='Series') + + strio = StringIO(s.to_json(lines=True, orient="records")) + chunked = pd.concat(pd.read_json( + strio, lines=True, typ='Series', chunksize=1 + )) + + assert_series_equal(chunked, unchunked) + + +def test_readjson_each_chunk(lines_json_df): + # Other tests check that the final result of read_json(chunksize=True) + # is correct. This checks the intermediate chunks. 
+    chunks = list(
+        pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
+    )
+    assert chunks[0].shape == (2, 2)
+    assert chunks[1].shape == (1, 2)
+
+
+def test_readjson_chunks_from_file():
+    with ensure_clean('test.json') as path:
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        df.to_json(path, lines=True, orient="records")
+        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
+        unchunked = pd.read_json(path, lines=True)
+        assert_frame_equal(unchunked, chunked)
+
+
+@pytest.mark.parametrize("chunksize", [None, 1])
+def test_readjson_chunks_closes(chunksize):
+    with ensure_clean('test.json') as path:
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        df.to_json(path, lines=True, orient="records")
+        reader = JsonReader(
+            path, orient=None, typ="frame", dtype=True, convert_axes=True,
+            convert_dates=True, keep_default_dates=True, numpy=False,
+            precise_float=False, date_unit=None, encoding=None,
+            lines=True, chunksize=chunksize)
+        reader.read()
+        assert reader.open_stream.closed, (
+            "didn't close stream with chunksize = %s" % chunksize)
+
+
+@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
+def test_readjson_invalid_chunksize(lines_json_df, chunksize):
+    msg = r"'chunksize' must be an integer >=1"
+
+    with tm.assert_raises_regex(ValueError, msg):
+        pd.read_json(StringIO(lines_json_df), lines=True,
+                     chunksize=chunksize)
+
+
+@pytest.mark.parametrize("chunksize", [None, 1, 2])
+def test_readjson_chunks_multiple_empty_lines(chunksize):
+    j = """
+
+    {"A":1,"B":4}
+
+
+
+    {"A":2,"B":5}
+
+
+
+
+
+
+
+    {"A":3,"B":6}
+    """
+    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+    test = pd.read_json(j, lines=True, chunksize=chunksize)
+    if chunksize is not None:
+        test = pd.concat(test)
+    tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize)

From a72411fded15efa29acf842c59c8dd1ba97cbf8e Mon Sep 17 00:00:00 2001
From: louispotok
Date: Tue, 26 Sep 2017 16:25:09 -0700
Subject: [PATCH 60/62] add encoding to test_readlines

---
 pandas/tests/io/json/test_readlines.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index 711cf89de29e3..d14355b07cf20 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import pytest
 import pandas as pd
 from pandas import DataFrame, read_json

From 56129344586c84d32359dcb07cd033d448b7c90b Mon Sep 17 00:00:00 2001
From: louispotok
Date: Tue, 26 Sep 2017 18:53:41 -0700
Subject: [PATCH 61/62] pep8 cleanup

---
 pandas/tests/io/json/test_pandas.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index b46e3b3033a53..de4afec883efd 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1037,5 +1037,3 @@ def test_data_frame_size_after_to_json(self):
         size_after = df.memory_usage(index=True, deep=True).sum()
 
         assert size_before == size_after
-
-

From 28d1cbe6cefa4561d79c8cf6245f3448b4f5b422 Mon Sep 17 00:00:00 2001
From: louispotok
Date: Wed, 27 Sep 2017 09:39:02 -0700
Subject: [PATCH 62/62] minor fixups

---
 doc/source/io.rst      | 1 +
 pandas/io/json/json.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 55ca43b0d03c3..4eba9687efc58 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2064,6 +2064,7 @@ For line-delimited json files, pandas can also return an iterator which reads
in df df.to_json(orient='records', lines=True) + # reader is an iterator that returns `chunksize` lines each iteration reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 9f0ad24639b7c..ab74b265b6a06 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -432,7 +432,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def combine_lines(self, lines): + def _combine_lines(self, lines): """Combines a list of JSON objects into one JSON object""" lines = filter(None, map(lambda x: x.strip(), lines)) return '[' + ','.join(lines) + ']' @@ -443,7 +443,7 @@ def read(self): obj = concat(self) elif self.lines: obj = self._get_object_parser( - self.combine_lines(self.data.split('\n')) + self._combine_lines(self.data.split('\n')) ) else: obj = self._get_object_parser(self.data) @@ -486,7 +486,7 @@ def close(self): def __next__(self): lines = list(islice(self.data, self.chunksize)) if lines: - lines_json = self.combine_lines(lines) + lines_json = self._combine_lines(lines) obj = self._get_object_parser(lines_json) # Make sure that the returned objects have the right index.
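
A minimal usage sketch of the behavior these patches implement and test (illustrative only, not part of the patch series; it assumes a pandas build that includes this chunksize support, and it uses the same pandas.compat.StringIO shim the tests import):

    import pandas as pd
    from pandas.compat import StringIO

    jsonl = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n{"a": 5, "b": 6}\n'

    # Without chunksize, the whole input is parsed into one DataFrame.
    unchunked = pd.read_json(StringIO(jsonl), lines=True)

    # With lines=True and a chunksize, read_json returns an iterator
    # (a JsonReader) that parses at most `chunksize` lines per step.
    reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=2)
    for chunk in reader:
        print(chunk.shape)  # (2, 2), then (1, 2)

    # Concatenating the chunks of a fresh reader reproduces the
    # unchunked result, including the row index.
    chunked = pd.concat(pd.read_json(StringIO(jsonl), lines=True,
                                     chunksize=2))
    assert chunked.equals(unchunked)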