Skip to content

Commit fe8509b

Browse files
committed
Merge pull request #8977 from bashtage/stata-writer-string-length
BUG: StataWriter uses incorrect string length
2 parents 2ed182e + 3872a6e commit fe8509b

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

doc/source/whatsnew/v0.15.2.txt

+7
Original file line numberDiff line numberDiff line change
@@ -167,3 +167,10 @@ Bug Fixes
167167
not lexically sorted or unique (:issue:`7724`)
168168
- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`)
169169
- Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`)
170+
171+
172+
173+
174+
175+
- Bug in ``StataWriter`` that writes strings with 244 characters irrespective of actual size (:issue:`8969`)
176+

pandas/io/stata.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1409,7 +1409,7 @@ def _maybe_convert_to_int_keys(convert_dates, varlist):
14091409
return new_dict
14101410

14111411

1412-
def _dtype_to_stata_type(dtype):
1412+
def _dtype_to_stata_type(dtype, column):
14131413
"""
14141414
Converts dtype types to stata types. Returns the byte of the given ordinal.
14151415
See TYPE_MAP and comments for an explanation. This is also explained in
@@ -1425,13 +1425,14 @@ def _dtype_to_stata_type(dtype):
14251425
If there are dates to convert, then dtype will already have the correct
14261426
type inserted.
14271427
"""
1428-
#TODO: expand to handle datetime to integer conversion
1428+
# TODO: expand to handle datetime to integer conversion
14291429
if dtype.type == np.string_:
14301430
return chr(dtype.itemsize)
14311431
elif dtype.type == np.object_: # try to coerce it to the biggest string
14321432
# not memory efficient, what else could we
14331433
# do?
1434-
return chr(244)
1434+
itemsize = max_len_string_array(column.values)
1435+
return chr(max(itemsize, 1))
14351436
elif dtype == np.float64:
14361437
return chr(255)
14371438
elif dtype == np.float32:
@@ -1461,6 +1462,7 @@ def _dtype_to_default_stata_fmt(dtype, column):
14611462
int16 -> "%8.0g"
14621463
int8 -> "%8.0g"
14631464
"""
1465+
# TODO: Refactor to combine type with format
14641466
# TODO: expand this to handle a default datetime format?
14651467
if dtype.type == np.object_:
14661468
inferred_dtype = infer_dtype(column.dropna())
@@ -1470,8 +1472,7 @@ def _dtype_to_default_stata_fmt(dtype, column):
14701472
itemsize = max_len_string_array(column.values)
14711473
if itemsize > 244:
14721474
raise ValueError(excessive_string_length_error % column.name)
1473-
1474-
return "%" + str(itemsize) + "s"
1475+
return "%" + str(max(itemsize, 1)) + "s"
14751476
elif dtype == np.float64:
14761477
return "%10.0g"
14771478
elif dtype == np.float32:
@@ -1718,10 +1719,11 @@ def _prepare_pandas(self, data):
17181719
self._convert_dates[key]
17191720
)
17201721
dtypes[key] = np.dtype(new_type)
1721-
self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
1722+
self.typlist = []
17221723
self.fmtlist = []
17231724
for col, dtype in dtypes.iteritems():
17241725
self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
1726+
self.typlist.append(_dtype_to_stata_type(dtype, data[col]))
17251727

17261728
# set the given format for the datetime cols
17271729
if self._convert_dates is not None:

pandas/io/tests/test_stata.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -593,10 +593,12 @@ def test_minimal_size_col(self):
593593
with tm.ensure_clean() as path:
594594
original.to_stata(path, write_index=False)
595595
sr = StataReader(path)
596+
typlist = sr.typlist
596597
variables = sr.varlist
597598
formats = sr.fmtlist
598-
for variable, fmt in zip(variables, formats):
599+
for variable, fmt, typ in zip(variables, formats, typlist):
599600
self.assertTrue(int(variable[1:]) == int(fmt[1:-1]))
601+
self.assertTrue(int(variable[1:]) == typ)
600602

601603
def test_excessively_long_string(self):
602604
str_lens = (1, 244, 500)
@@ -850,7 +852,6 @@ def test_categorical_order(self):
850852
# Check identity of codes
851853
for col in expected:
852854
if is_categorical_dtype(expected[col]):
853-
print(col)
854855
tm.assert_series_equal(expected[col].cat.codes,
855856
parsed_115[col].cat.codes)
856857
tm.assert_index_equal(expected[col].cat.categories,

0 commit comments

Comments
 (0)