Update read_json tests

louispotok · louispotok · commit 55170ddb08b9 · 2017-09-28T14:47:32.000-07:00
Split the tests that use lines=True out into a separate test class (TestPandasJsonLines)
Parametrize tests that previously looped over chunksize values (pytest.mark.parametrize)
Replace triple-quoted (""") docstring comments in tests with # comments.
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -991,6 +991,62 @@ def test_tz_range_is_utc(self):
         df = DataFrame({'DT': dti})
         assert dumps(df, iso_dates=True) == dfexp
 
+    def test_latin_encoding(self):
+        if compat.PY2:
+            tm.assert_raises_regex(
+                TypeError, r'\[unicode\] is not implemented as a table column')
+            return
+
+        # GH 13774
+        pytest.skip("encoding not implemented in .to_json(), "
+                    "xref #13774")
+
+        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
+                  [b'E\xc9, 17', b'a', b'b', b'c'],
+                  [b'EE, 17', b'', b'a', b'b', b'c'],
+                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
+                  [b'', b'a', b'b', b'c'],
+                  [b'\xf8\xfc', b'a', b'b', b'c'],
+                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
+                  [np.nan, b'', b'b', b'c'],
+                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
+
+        def _try_decode(x, encoding='latin-1'):
+            try:
+                return x.decode(encoding)
+            except AttributeError:
+                return x
+
+        # not sure how to remove latin-1 from code in python 2 and 3
+        values = [[_try_decode(x) for x in y] for y in values]
+
+        examples = []
+        for dtype in ['category', object]:
+            for val in values:
+                examples.append(Series(val, dtype=dtype))
+
+        def roundtrip(s, encoding='latin-1'):
+            with ensure_clean('test.json') as path:
+                s.to_json(path, encoding=encoding)
+                retr = read_json(path, encoding=encoding)
+                assert_series_equal(s, retr, check_categorical=False)
+
+        for s in examples:
+            roundtrip(s)
+
+    def test_data_frame_size_after_to_json(self):
+        # GH15344
+        df = DataFrame({'a': [str(1)]})
+
+        size_before = df.memory_usage(index=True, deep=True).sum()
+        df.to_json()
+        size_after = df.memory_usage(index=True, deep=True).sum()
+
+        assert size_before == size_after
+
+
+class TestPandasJsonLines(object):
+
     def test_read_jsonl(self):
         # GH9180
         result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
@@ -1038,27 +1094,26 @@ def test_to_jsonl(self):
         assert result == expected
         assert_frame_equal(pd.read_json(result, lines=True), df)
 
-    def test_readjson_chunks(self, lines_json_df):
-        """Basic test that read_json(chunks=True) gives the same result as
-        read_json(chunks=False)"""
+    @pytest.mark.parametrize("chunksize", [1, 1.0])
+    def test_readjson_chunks(self, lines_json_df, chunksize):
+        # Basic test that read_json(chunks=True) gives the same result as
+        # read_json(chunks=False)
         # GH17048: memory usage when lines=True
 
-        for cs in [1, 1.0]:
+        unchunked = pd.read_json(StringIO(lines_json_df), lines=True)
+        reader = pd.read_json(StringIO(lines_json_df), lines=True,
+                              chunksize=chunksize)
+        chunked = pd.concat(reader)
 
-            unchunked = pd.read_json(StringIO(lines_json_df), lines=True)
-            chunked = pd.concat(
-                pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs)
-            )
-
-            assert_frame_equal(chunked, unchunked)
+        assert_frame_equal(chunked, unchunked)
 
     def test_readjson_chunksize_requires_lines(self, lines_json_df):
         msg = "chunksize can only be passed if lines=True"
         with tm.assert_raises_regex(ValueError, msg):
             pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
 
     def test_readjson_chunks_series(self):
-        """Test reading line-format JSON to Series with chunksize param"""
+        # Test reading line-format JSON to Series with chunksize param
         s = pd.Series({'A': 1, 'B': 2})
 
         strio = StringIO(s.to_json(lines=True, orient="records"))
@@ -1072,10 +1127,8 @@ def test_readjson_chunks_series(self):
         assert_series_equal(chunked, unchunked)
 
     def test_readjson_each_chunk(self, lines_json_df):
-        """
-        Other tests check that the final result of read_json(chunksize=True) is
-        correct. This checks that the intermediate chunks read in are correct.
-        """
+        # Other tests check that the final result of read_json(chunksize=True)
+        # is correct. This checks the intermediate chunks.
         chunks = list(
             pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
         )
@@ -1090,27 +1143,29 @@ def test_readjson_chunks_from_file(self):
             unchunked = pd.read_json(path, lines=True)
             assert_frame_equal(unchunked, chunked)
 
-    def test_readjson_chunks_closes(self):
-        for chunksize in [None, 1]:
-            with ensure_clean('test.json') as path:
-                df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
-                df.to_json(path, lines=True, orient="records")
-                f = open(path, 'r')
-                if chunksize is not None:
-                    pd.concat(pd.read_json(f, lines=True, chunksize=chunksize))
-                else:
-                    pd.read_json(f, lines=True)
-                assert f.closed, \
-                    "didn't close file with chunksize = %s" % chunksize
+    @pytest.mark.parametrize("chunksize", [None, 1])
+    def test_readjson_chunks_closes(self, chunksize):
+        with ensure_clean('test.json') as path:
+            df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+            df.to_json(path, lines=True, orient="records")
+            f = open(path, 'r')
+            if chunksize is not None:
+                pd.concat(pd.read_json(f, lines=True, chunksize=chunksize))
+            else:
+                pd.read_json(f, lines=True)
+            assert f.closed, \
+                "didn't close file with chunksize = %s" % chunksize
 
-    def test_readjson_invalid_chunksize(self, lines_json_df):
+    @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
+    def test_readjson_invalid_chunksize(self, lines_json_df, chunksize):
         msg = r"'chunksize' must be an integer >=1"
 
-        for cs in [0, -1, 2.2, 'foo']:
-            with tm.assert_raises_regex(ValueError, msg):
-                pd.read_json(StringIO(lines_json_df), lines=True, chunksize=cs)
+        with tm.assert_raises_regex(ValueError, msg):
+            pd.read_json(StringIO(lines_json_df), lines=True,
+                         chunksize=chunksize)
 
-    def test_readjson_chunks_multiple_empty_lines(self):
+    @pytest.mark.parametrize("chunksize", [None, 1, 2])
+    def test_readjson_chunks_multiple_empty_lines(self, chunksize):
         j = """
 
         {"A":1,"B":4}
@@ -1127,62 +1182,8 @@ def test_readjson_chunks_multiple_empty_lines(self):
 
         {"A":3,"B":6}
         """
-        for chunksize in [None, 1, 2]:
-            orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
-            test = pd.read_json(j, lines=True, chunksize=chunksize)
-            if chunksize is not None:
-                test = pd.concat(test)
-            tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize)
-
-    def test_latin_encoding(self):
-        if compat.PY2:
-            tm.assert_raises_regex(
-                TypeError, r'\[unicode\] is not implemented as a table column')
-            return
-
-        # GH 13774
-        pytest.skip("encoding not implemented in .to_json(), "
-                    "xref #13774")
-
-        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
-                  [b'E\xc9, 17', b'a', b'b', b'c'],
-                  [b'EE, 17', b'', b'a', b'b', b'c'],
-                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
-                  [b'', b'a', b'b', b'c'],
-                  [b'\xf8\xfc', b'a', b'b', b'c'],
-                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
-                  [np.nan, b'', b'b', b'c'],
-                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
-
-        def _try_decode(x, encoding='latin-1'):
-            try:
-                return x.decode(encoding)
-            except AttributeError:
-                return x
-
-        # not sure how to remove latin-1 from code in python 2 and 3
-        values = [[_try_decode(x) for x in y] for y in values]
-
-        examples = []
-        for dtype in ['category', object]:
-            for val in values:
-                examples.append(Series(val, dtype=dtype))
-
-        def roundtrip(s, encoding='latin-1'):
-            with ensure_clean('test.json') as path:
-                s.to_json(path, encoding=encoding)
-                retr = read_json(path, encoding=encoding)
-                assert_series_equal(s, retr, check_categorical=False)
-
-        for s in examples:
-            roundtrip(s)
-
-    def test_data_frame_size_after_to_json(self):
-        # GH15344
-        df = DataFrame({'a': [str(1)]})
-
-        size_before = df.memory_usage(index=True, deep=True).sum()
-        df.to_json()
-        size_after = df.memory_usage(index=True, deep=True).sum()
-
-        assert size_before == size_after
+        orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        test = pd.read_json(j, lines=True, chunksize=chunksize)
+        if chunksize is not None:
+            test = pd.concat(test)
+        tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize)