@@ -991,6 +991,62 @@ def test_tz_range_is_utc(self):
991
991
df = DataFrame ({'DT' : dti })
992
992
assert dumps (df , iso_dates = True ) == dfexp
993
993
994
+ def test_latin_encoding (self ):
995
+ if compat .PY2 :
996
+ tm .assert_raises_regex (
997
+ TypeError , r'\[unicode\] is not implemented as a table column' )
998
+ return
999
+
1000
+ # GH 13774
1001
+ pytest .skip ("encoding not implemented in .to_json(), "
1002
+ "xref #13774" )
1003
+
1004
+ values = [[b'E\xc9 , 17' , b'' , b'a' , b'b' , b'c' ],
1005
+ [b'E\xc9 , 17' , b'a' , b'b' , b'c' ],
1006
+ [b'EE, 17' , b'' , b'a' , b'b' , b'c' ],
1007
+ [b'E\xc9 , 17' , b'\xf8 \xfc ' , b'a' , b'b' , b'c' ],
1008
+ [b'' , b'a' , b'b' , b'c' ],
1009
+ [b'\xf8 \xfc ' , b'a' , b'b' , b'c' ],
1010
+ [b'A\xf8 \xfc ' , b'' , b'a' , b'b' , b'c' ],
1011
+ [np .nan , b'' , b'b' , b'c' ],
1012
+ [b'A\xf8 \xfc ' , np .nan , b'' , b'b' , b'c' ]]
1013
+
1014
+ def _try_decode (x , encoding = 'latin-1' ):
1015
+ try :
1016
+ return x .decode (encoding )
1017
+ except AttributeError :
1018
+ return x
1019
+
1020
+ # not sure how to remove latin-1 from code in python 2 and 3
1021
+ values = [[_try_decode (x ) for x in y ] for y in values ]
1022
+
1023
+ examples = []
1024
+ for dtype in ['category' , object ]:
1025
+ for val in values :
1026
+ examples .append (Series (val , dtype = dtype ))
1027
+
1028
+ def roundtrip (s , encoding = 'latin-1' ):
1029
+ with ensure_clean ('test.json' ) as path :
1030
+ s .to_json (path , encoding = encoding )
1031
+ retr = read_json (path , encoding = encoding )
1032
+ assert_series_equal (s , retr , check_categorical = False )
1033
+
1034
+ for s in examples :
1035
+ roundtrip (s )
1036
+
1037
+ def test_data_frame_size_after_to_json (self ):
1038
+ # GH15344
1039
+ df = DataFrame ({'a' : [str (1 )]})
1040
+
1041
+ size_before = df .memory_usage (index = True , deep = True ).sum ()
1042
+ df .to_json ()
1043
+ size_after = df .memory_usage (index = True , deep = True ).sum ()
1044
+
1045
+ assert size_before == size_after
1046
+
1047
+
1048
+ class TestPandasJsonLines (object ):
1049
+
994
1050
def test_read_jsonl (self ):
995
1051
# GH9180
996
1052
result = read_json ('{"a": 1, "b": 2}\n {"b":2, "a" :1}\n ' , lines = True )
@@ -1038,27 +1094,26 @@ def test_to_jsonl(self):
1038
1094
assert result == expected
1039
1095
assert_frame_equal (pd .read_json (result , lines = True ), df )
1040
1096
1041
- def test_readjson_chunks (self , lines_json_df ):
1042
- """Basic test that read_json(chunks=True) gives the same result as
1043
- read_json(chunks=False)"""
1097
+ @pytest .mark .parametrize ("chunksize" , [1 , 1.0 ])
1098
+ def test_readjson_chunks (self , lines_json_df , chunksize ):
1099
+ # Basic test that read_json(chunks=True) gives the same result as
1100
+ # read_json(chunks=False)
1044
1101
# GH17048: memory usage when lines=True
1045
1102
1046
- for cs in [1 , 1.0 ]:
1103
+ unchunked = pd .read_json (StringIO (lines_json_df ), lines = True )
1104
+ reader = pd .read_json (StringIO (lines_json_df ), lines = True ,
1105
+ chunksize = chunksize )
1106
+ chunked = pd .concat (reader )
1047
1107
1048
- unchunked = pd .read_json (StringIO (lines_json_df ), lines = True )
1049
- chunked = pd .concat (
1050
- pd .read_json (StringIO (lines_json_df ), lines = True , chunksize = cs )
1051
- )
1052
-
1053
- assert_frame_equal (chunked , unchunked )
1108
+ assert_frame_equal (chunked , unchunked )
1054
1109
1055
1110
def test_readjson_chunksize_requires_lines (self , lines_json_df ):
1056
1111
msg = "chunksize can only be passed if lines=True"
1057
1112
with tm .assert_raises_regex (ValueError , msg ):
1058
1113
pd .read_json (StringIO (lines_json_df ), lines = False , chunksize = 2 )
1059
1114
1060
1115
def test_readjson_chunks_series (self ):
1061
- """ Test reading line-format JSON to Series with chunksize param"""
1116
+ # Test reading line-format JSON to Series with chunksize param
1062
1117
s = pd .Series ({'A' : 1 , 'B' : 2 })
1063
1118
1064
1119
strio = StringIO (s .to_json (lines = True , orient = "records" ))
@@ -1072,10 +1127,8 @@ def test_readjson_chunks_series(self):
1072
1127
assert_series_equal (chunked , unchunked )
1073
1128
1074
1129
def test_readjson_each_chunk (self , lines_json_df ):
1075
- """
1076
- Other tests check that the final result of read_json(chunksize=True) is
1077
- correct. This checks that the intermediate chunks read in are correct.
1078
- """
1130
+ # Other tests check that the final result of read_json(chunksize=True)
1131
+ # is correct. This checks the intermediate chunks.
1079
1132
chunks = list (
1080
1133
pd .read_json (StringIO (lines_json_df ), lines = True , chunksize = 2 )
1081
1134
)
@@ -1090,27 +1143,29 @@ def test_readjson_chunks_from_file(self):
1090
1143
unchunked = pd .read_json (path , lines = True )
1091
1144
assert_frame_equal (unchunked , chunked )
1092
1145
1093
- def test_readjson_chunks_closes ( self ):
1094
- for chunksize in [ None , 1 ] :
1095
- with ensure_clean ('test.json' ) as path :
1096
- df = pd .DataFrame ({'A' : [1 , 2 , 3 ], 'B' : [4 , 5 , 6 ]})
1097
- df .to_json (path , lines = True , orient = "records" )
1098
- f = open (path , 'r' )
1099
- if chunksize is not None :
1100
- pd .concat (pd .read_json (f , lines = True , chunksize = chunksize ))
1101
- else :
1102
- pd .read_json (f , lines = True )
1103
- assert f .closed , \
1104
- "didn't close file with chunksize = %s" % chunksize
1146
+ @ pytest . mark . parametrize ( "chunksize" , [ None , 1 ])
1147
+ def test_readjson_chunks_closes ( self , chunksize ) :
1148
+ with ensure_clean ('test.json' ) as path :
1149
+ df = pd .DataFrame ({'A' : [1 , 2 , 3 ], 'B' : [4 , 5 , 6 ]})
1150
+ df .to_json (path , lines = True , orient = "records" )
1151
+ f = open (path , 'r' )
1152
+ if chunksize is not None :
1153
+ pd .concat (pd .read_json (f , lines = True , chunksize = chunksize ))
1154
+ else :
1155
+ pd .read_json (f , lines = True )
1156
+ assert f .closed , \
1157
+ "didn't close file with chunksize = %s" % chunksize
1105
1158
1106
- def test_readjson_invalid_chunksize (self , lines_json_df ):
1159
+ @pytest .mark .parametrize ("chunksize" , [0 , - 1 , 2.2 , "foo" ])
1160
+ def test_readjson_invalid_chunksize (self , lines_json_df , chunksize ):
1107
1161
msg = r"'chunksize' must be an integer >=1"
1108
1162
1109
- for cs in [ 0 , - 1 , 2.2 , 'foo' ] :
1110
- with tm . assert_raises_regex ( ValueError , msg ):
1111
- pd . read_json ( StringIO ( lines_json_df ), lines = True , chunksize = cs )
1163
+ with tm . assert_raises_regex ( ValueError , msg ) :
1164
+ pd . read_json ( StringIO ( lines_json_df ), lines = True ,
1165
+ chunksize = chunksize )
1112
1166
1113
- def test_readjson_chunks_multiple_empty_lines (self ):
1167
+ @pytest .mark .parametrize ("chunksize" , [None , 1 , 2 ])
1168
+ def test_readjson_chunks_multiple_empty_lines (self , chunksize ):
1114
1169
j = """
1115
1170
1116
1171
{"A":1,"B":4}
@@ -1127,62 +1182,8 @@ def test_readjson_chunks_multiple_empty_lines(self):
1127
1182
1128
1183
{"A":3,"B":6}
1129
1184
"""
1130
- for chunksize in [None , 1 , 2 ]:
1131
- orig = pd .DataFrame ({'A' : [1 , 2 , 3 ], 'B' : [4 , 5 , 6 ]})
1132
- test = pd .read_json (j , lines = True , chunksize = chunksize )
1133
- if chunksize is not None :
1134
- test = pd .concat (test )
1135
- tm .assert_frame_equal (orig , test , obj = "chunksize: %s" % chunksize )
1136
-
1137
- def test_latin_encoding (self ):
1138
- if compat .PY2 :
1139
- tm .assert_raises_regex (
1140
- TypeError , r'\[unicode\] is not implemented as a table column' )
1141
- return
1142
-
1143
- # GH 13774
1144
- pytest .skip ("encoding not implemented in .to_json(), "
1145
- "xref #13774" )
1146
-
1147
- values = [[b'E\xc9 , 17' , b'' , b'a' , b'b' , b'c' ],
1148
- [b'E\xc9 , 17' , b'a' , b'b' , b'c' ],
1149
- [b'EE, 17' , b'' , b'a' , b'b' , b'c' ],
1150
- [b'E\xc9 , 17' , b'\xf8 \xfc ' , b'a' , b'b' , b'c' ],
1151
- [b'' , b'a' , b'b' , b'c' ],
1152
- [b'\xf8 \xfc ' , b'a' , b'b' , b'c' ],
1153
- [b'A\xf8 \xfc ' , b'' , b'a' , b'b' , b'c' ],
1154
- [np .nan , b'' , b'b' , b'c' ],
1155
- [b'A\xf8 \xfc ' , np .nan , b'' , b'b' , b'c' ]]
1156
-
1157
- def _try_decode (x , encoding = 'latin-1' ):
1158
- try :
1159
- return x .decode (encoding )
1160
- except AttributeError :
1161
- return x
1162
-
1163
- # not sure how to remove latin-1 from code in python 2 and 3
1164
- values = [[_try_decode (x ) for x in y ] for y in values ]
1165
-
1166
- examples = []
1167
- for dtype in ['category' , object ]:
1168
- for val in values :
1169
- examples .append (Series (val , dtype = dtype ))
1170
-
1171
- def roundtrip (s , encoding = 'latin-1' ):
1172
- with ensure_clean ('test.json' ) as path :
1173
- s .to_json (path , encoding = encoding )
1174
- retr = read_json (path , encoding = encoding )
1175
- assert_series_equal (s , retr , check_categorical = False )
1176
-
1177
- for s in examples :
1178
- roundtrip (s )
1179
-
1180
- def test_data_frame_size_after_to_json (self ):
1181
- # GH15344
1182
- df = DataFrame ({'a' : [str (1 )]})
1183
-
1184
- size_before = df .memory_usage (index = True , deep = True ).sum ()
1185
- df .to_json ()
1186
- size_after = df .memory_usage (index = True , deep = True ).sum ()
1187
-
1188
- assert size_before == size_after
1185
+ orig = pd .DataFrame ({'A' : [1 , 2 , 3 ], 'B' : [4 , 5 , 6 ]})
1186
+ test = pd .read_json (j , lines = True , chunksize = chunksize )
1187
+ if chunksize is not None :
1188
+ test = pd .concat (test )
1189
+ tm .assert_frame_equal (orig , test , obj = "chunksize: %s" % chunksize )
0 commit comments