Skip to content

BUG: Fixed tm.set_locale context manager, it could error and leak when category LC_ALL was used #47570

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 8, 2022
3 changes: 2 additions & 1 deletion pandas/_config/localization.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def set_locale(
particular locale, without globally setting the locale. This probably isn't
thread-safe.
"""
current_locale = locale.getlocale()
# getlocale is not always compliant with setlocale, use setlocale. GH#46595
current_locale = locale.setlocale(lc_var)

try:
locale.setlocale(lc_var, new_locale)
Expand Down
67 changes: 50 additions & 17 deletions pandas/tests/config/test_localization.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,67 @@
set_locale,
)

from pandas.compat import is_platform_windows

import pandas as pd

_all_locales = get_locales() or []
_current_locale = locale.getlocale()
_current_locale = locale.setlocale(locale.LC_ALL) # getlocale() is wrong, see GH#46595

# Don't run any of these tests if we are on Windows or have no locales.
pytestmark = pytest.mark.skipif(
is_platform_windows() or not _all_locales, reason="Need non-Windows and locales"
)
# Don't run any of these tests if we have no locales.
pytestmark = pytest.mark.skipif(not _all_locales, reason="Need locales")

_skip_if_only_one_locale = pytest.mark.skipif(
len(_all_locales) <= 1, reason="Need multiple locales for meaningful test"
)


def test_can_set_locale_valid_set():
def _get_current_locale(lc_var: int = locale.LC_ALL) -> str:
# getlocale is not always compliant with setlocale, use setlocale. GH#46595
return locale.setlocale(lc_var)


@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
def test_can_set_current_locale(lc_var):
# Can set the current locale
before_locale = _get_current_locale(lc_var)
assert can_set_locale(before_locale, lc_var=lc_var)
after_locale = _get_current_locale(lc_var)
assert before_locale == after_locale


@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
def test_can_set_locale_valid_set(lc_var):
# Can set the default locale.
assert can_set_locale("")
before_locale = _get_current_locale(lc_var)
assert can_set_locale("", lc_var=lc_var)
after_locale = _get_current_locale(lc_var)
assert before_locale == after_locale


def test_can_set_locale_invalid_set():
@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
def test_can_set_locale_invalid_set(lc_var):
# Cannot set an invalid locale.
assert not can_set_locale("non-existent_locale")
before_locale = _get_current_locale(lc_var)
assert not can_set_locale("non-existent_locale", lc_var=lc_var)
after_locale = _get_current_locale(lc_var)
assert before_locale == after_locale


@pytest.mark.parametrize(
"lang,enc",
[
("it_CH", "UTF-8"),
("en_US", "ascii"),
("zh_CN", "GB2312"),
("it_IT", "ISO-8859-1"),
],
)
@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
def test_can_set_locale_no_leak(lang, enc, lc_var):
# Test that can_set_locale does not leak even when returning False. See GH#46595
before_locale = _get_current_locale(lc_var)
can_set_locale((lang, enc), locale.LC_ALL)
after_locale = _get_current_locale(lc_var)
assert before_locale == after_locale


def test_can_set_locale_invalid_get(monkeypatch):
Expand Down Expand Up @@ -72,10 +108,7 @@ def test_get_locales_prefix():
],
)
def test_set_locale(lang, enc):
if all(x is None for x in _current_locale):
# Not sure why, but on some Travis runs with pytest,
# getlocale() returned (None, None).
pytest.skip("Current locale is not set.")
before_locale = _get_current_locale()

enc = codecs.lookup(enc).name
new_locale = lang, enc
Expand All @@ -95,8 +128,8 @@ def test_set_locale(lang, enc):
assert normalized_locale == new_locale

# Once we exit the "with" statement, locale should be back to what it was.
current_locale = locale.getlocale()
assert current_locale == _current_locale
after_locale = _get_current_locale()
assert before_locale == after_locale


def test_encoding_detected():
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,13 +284,16 @@ def test_to_datetime_format_microsecond(self, cache):
"%m/%d/%Y %H:%M:%S",
Timestamp("2010-01-10 13:56:01"),
],
# The 3 tests below are locale-dependent.
# They pass, except when the machine locale is zh_CN or it_IT.
pytest.param(
"01/10/2010 08:14 PM",
"%m/%d/%Y %I:%M %p",
Timestamp("2010-01-10 20:14"),
marks=pytest.mark.xfail(
locale.getlocale()[0] in ("zh_CN", "it_IT"),
reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
strict=False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just remove these xfails entirely?

Copy link
Contributor Author

@smarie smarie Jul 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, these tests are executed on other workers, and they pass. So they are important :) We should actually try to solve the issue (datetime parsing, separate topic) if we wish to remove the xfail mark.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we make the xfail condition more specific? strict=False is really undesirable.

Copy link
Contributor Author

@smarie smarie Jul 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is already quite specific: the test will be marked as xfail only if locale.getlocale()[0] in ("zh_CN", "it_IT"). So for all CI workers except the 2 workers zh_CN and it_IT, the tests are actually correctly executed (and they pass)

Note that I did not write this code; it was already there. But since there was a locale leakage bug, the zh_CN and it_IT locales were actually never active (even on the 2 corresponding CI workers) when hitting the code. This is why no one ever needed to add the strict=False flag.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear to me why strict=False is needed i.e. why does it sometimes pass? Is it because the locale setting isn't threadsafe? If so, these tests can be decorated with @pytest.mark.single_cpu

Copy link
Contributor Author

@smarie smarie Jul 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is less specific IMO. See below: in test_to_time you use a fails_on_non_english mark (for readability), but still with a check for zh_CN and it_IT (because for other locales we actually do not know whether it would pass or fail, so we do not want to apply the xfail mark prematurely). Let me know.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm okay with strict=False for now then especially if there's a path to know how we can remove these in the future.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm okay with strict=False for now then especially if there's a path to know how we can remove these in the future.

I'm not clear on what this path is. Can you elaborate on this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solve the issue (datetime parsing, separate topic)

From your investigation, @smarie, would fixing the datetime parsing allow us to remove the xfail?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, fixing the parsing for these two locales would remove the xfail.

),
),
pytest.param(
Expand All @@ -300,6 +303,7 @@ def test_to_datetime_format_microsecond(self, cache):
marks=pytest.mark.xfail(
locale.getlocale()[0] in ("zh_CN", "it_IT"),
reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
strict=False,
),
),
pytest.param(
Expand All @@ -309,6 +313,7 @@ def test_to_datetime_format_microsecond(self, cache):
marks=pytest.mark.xfail(
locale.getlocale()[0] in ("zh_CN", "it_IT"),
reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
strict=False,
),
),
],
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/tools/test_to_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
from pandas.core.tools.datetimes import to_time as to_time_alias
from pandas.core.tools.times import to_time

# The tests marked with this are locale-dependent.
# They pass, except when the machine locale is zh_CN or it_IT.
fails_on_non_english = pytest.mark.xfail(
locale.getlocale()[0] in ("zh_CN", "it_IT"),
reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
strict=False,
)


Expand Down