python · ambv · Nov 4, 2023 · Sep 20, 2022 · Sep 20, 2022 · Sep 20, 2022
diff --git a/Lib/test/test_tools/test_makeunicodedata.py b/Lib/test/test_tools/test_makeunicodedata.py
@@ -0,0 +1,121 @@
+import unittest
+from test.test_tools import toolsdir, imports_under_tool
+from test import support
+from test.support.hypothesis_helper import hypothesis
+
+st = hypothesis.strategies
+given = hypothesis.given
+example = hypothesis.example
+
+
+with imports_under_tool("unicode"):
+    from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup
+
+
+@st.composite
+def char_name_db(draw, min_length=1, max_length=30):
+    """Draw a list of (name, character) pairs with unique names and chars."""
+    size = draw(st.integers(min_value=min_length, max_value=max_length))
+    name_strategy = st.text("abcd", min_size=1, max_size=10)
+    names = draw(st.sets(name_strategy, min_size=size, max_size=size))
+    characters = draw(st.sets(st.characters(), min_size=size, max_size=size))
+    return list(zip(names, characters))
+
+
+class TestDawg(unittest.TestCase):
+    """Tests for the directed acyclic word graph (DAWG) data structure that
+    stores the Unicode character names in unicodedata. Ported from PyPy.
+    """
+
+    def test_dawg_direct_simple(self):
+        dawg = Dawg()
+        dawg.insert("a", -4)
+        dawg.insert("c", -2)
+        dawg.insert("cat", -1)
+        dawg.insert("catarr", 0)
+        dawg.insert("catnip", 1)
+        dawg.insert("zcatnip", 5)
+        packed, data, inverse = dawg.finish()
+        # name -> value: inserted names are found, anything else is a KeyError
+        self.assertEqual(lookup(packed, data, b"a"), -4)
+        self.assertEqual(lookup(packed, data, b"c"), -2)
+        self.assertEqual(lookup(packed, data, b"cat"), -1)
+        self.assertEqual(lookup(packed, data, b"catarr"), 0)
+        self.assertEqual(lookup(packed, data, b"catnip"), 1)
+        self.assertEqual(lookup(packed, data, b"zcatnip"), 5)
+        self.assertRaises(KeyError, lookup, packed, data, b"b")
+        self.assertRaises(KeyError, lookup, packed, data, b"catni")
+        self.assertRaises(KeyError, lookup, packed, data, b"catnipp")
+        # value -> name: the name comes back as bytes, unknown values fail
+        self.assertEqual(inverse_lookup(packed, inverse, -4), b"a")
+        self.assertEqual(inverse_lookup(packed, inverse, -2), b"c")
+        self.assertEqual(inverse_lookup(packed, inverse, -1), b"cat")
+        self.assertEqual(inverse_lookup(packed, inverse, 0), b"catarr")
+        self.assertEqual(inverse_lookup(packed, inverse, 1), b"catnip")
+        self.assertEqual(inverse_lookup(packed, inverse, 5), b"zcatnip")
+        self.assertRaises(KeyError, inverse_lookup, packed, inverse, 12)
+
+    def test_forbid_empty_dawg(self):
+        dawg = Dawg()
+        self.assertRaises(ValueError, dawg.finish)  # nothing was inserted
+
+    @given(char_name_db())
+    @example([("abc", "a"), ("abd", "b")])
+    @example(
+        [
+            ("bab", "1"),
+            ("a", ":"),
+            ("ad", "@"),
+            ("b", "<"),
+            ("aacc", "?"),
+            ("dab", "D"),
+            ("aa", "0"),
+            ("ab", "F"),
+            ("aaa", "7"),
+            ("cbd", "="),
+            ("abad", ";"),
+            ("ac", "B"),
+            ("abb", "4"),
+            ("bb", "2"),
+            ("aab", "9"),
+            ("caaaaba", "E"),
+            ("ca", ">"),
+            ("bbaaa", "5"),
+            ("d", "3"),
+            ("baac", "8"),
+            ("c", "6"),
+            ("ba", "A"),
+        ]
+    )
+    @example(
+        [
+            ("bcdac", "9"),
+            ("acc", "g"),
+            ("d", "d"),
+            ("daabdda", "0"),
+            ("aba", ";"),
+            ("c", "6"),
+            ("aa", "7"),
+            ("abbd", "c"),
+            ("badbd", "?"),
+            ("bbd", "f"),
+            ("cc", "@"),
+            ("bb", "8"),
+            ("daca", ">"),
+            ("ba", ":"),
+            ("baac", "3"),
+            ("dbdddac", "a"),
+            ("a", "2"),
+            ("cabd", "b"),
+            ("b", "="),
+            ("abd", "4"),
+            ("adcbd", "5"),
+            ("abc", "e"),
+            ("ab", "1"),
+        ]
+    )
+    def test_dawg(self, data):
+        # Capture stdout to suppress the builder's debug prints.  Building is
+        # all that is needed here: building will also check the result.
+        with support.captured_stdout():
+            build_compression_dawg(data)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -104,6 +104,26 @@ def test_name_inverse_lookup(self):
             if looked_name := self.db.name(char, None):
                 self.assertEqual(self.db.lookup(looked_name), char)
 
+    def test_no_names_in_pua(self):
+        # No code point in the three Private Use Areas may have a name.
+        puas = [*range(0xe000, 0xf8ff + 1),        # range() stop is exclusive,
+                *range(0xf0000, 0xffffd + 1),      # so add 1 to include the
+                *range(0x100000, 0x10fffd + 1)]    # last code point of each
+        for i in puas:
+            self.assertRaises(ValueError, self.db.name, chr(i))
+
+    def test_lookup_nonexistant(self):
+        # Smoke test for the failure path of lookup(): none of these strings
+        # is expected to be a character name, so every one of them has to be
+        # rejected with a KeyError.
+        bogus_names = [
+            "LATIN SMLL LETR A", "OPEN HANDS SIGHS", "DREGS", "HANDBUG",
+            "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK", "???",
+        ]
+        for bogus in bogus_names:
+            with self.assertRaises(KeyError):
+                self.db.lookup(bogus)
+
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)

@@ -1328,6 +1328,14 @@ check-abidump: all
 regen-limited-abi: all
 	$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/build/stable_abi.py --generate-all $(srcdir)/Misc/stable_abi.toml
 
+############################################################################
+# Regenerate Unicode Data
+# NOTE(review): script path lacks the $(srcdir)/ prefix used by regen-limited-abi; confirm.
+.PHONY: regen-unicodedata
+regen-unicodedata:
+	$(PYTHON_FOR_REGEN) Tools/unicode/makeunicodedata.py
+
+
 ############################################################################
 # Regenerate all generated files
 
@@ -1336,7 +1344,7 @@ regen-limited-abi: all
 regen-all: regen-cases regen-typeslots \
 	regen-token regen-ast regen-keyword regen-sre regen-frozen \
 	regen-pegen-metaparser regen-pegen regen-test-frozenmain \
-	regen-test-levenshtein regen-global-objects
+	regen-test-levenshtein regen-global-objects regen-unicodedata
 	@echo
 	@echo "Note: make regen-stdlib-module-names, make regen-limited-abi"
 	@echo "and make regen-configure should be run manually"

diff --git a/Misc/NEWS.d/next/Library/2022-10-05-15-01-36.gh-issue-96954.ezwkrU.rst b/Misc/NEWS.d/next/Library/2022-10-05-15-01-36.gh-issue-96954.ezwkrU.rst
@@ -0,0 +1,5 @@
+Switch the storage of the Unicode code point names to use a different
+data structure, a `directed acyclic word graph
+<https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton>`_.
+This makes the unicodedata shared library about 440 KiB smaller. Contributed by
+Carl Friedrich Bolz-Tereick using code from the PyPy project.