Skip to content

gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names #97906

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 42 commits into from
Nov 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
0fb902f
intermediate commit: start working on porting pypy dawg to store unic…
cfbolz Sep 20, 2022
80a8470
start porting lookup to the compact DAWG
cfbolz Sep 20, 2022
8e22650
implement inverse lookup
cfbolz Sep 20, 2022
ab8ee48
cleanup script
cfbolz Sep 20, 2022
d72faa3
refactor a bit
cfbolz Sep 20, 2022
4478542
add comments, some cleanup
cfbolz Sep 21, 2022
c307224
fix inverse lookup
cfbolz Sep 21, 2022
2f663b2
check that names are actually shorter than NAME_MAXLEN
cfbolz Sep 21, 2022
1102b6a
explain why we can't use topological sorting
cfbolz Oct 4, 2022
fba25e3
use topological sorting, found a way to do it
cfbolz Oct 5, 2022
330ef08
Merge remote-tracking branch 'origin/main' into unicodenames-dawg
cfbolz Oct 5, 2022
7bccc32
fix: make sure that looking up lower case unicode names works
cfbolz Oct 5, 2022
4e4eb7a
update comment
cfbolz Oct 5, 2022
b7ad39e
3.2.0 doesn't have aliases or named sequences
cfbolz Oct 5, 2022
a577528
move the handling of from _getcode into the two callers
cfbolz Oct 5, 2022
489b1d5
blurb
cfbolz Oct 5, 2022
999c106
maybe fix ReST
cfbolz Oct 5, 2022
a7742e3
fix whitespace, don't know what happened here
cfbolz Oct 5, 2022
28102ad
consistently use unsigned ints everywhere
cfbolz Oct 5, 2022
2ec1438
small simplification
cfbolz Oct 5, 2022
c5039c5
do a cleanup pass after Łukasz feedback
cfbolz Oct 6, 2022
bd8d80b
use Py_SAFE_DOWNCAST, nicer str calls
cfbolz Oct 10, 2022
a87d561
use a cached_property for num_reachable_linear
cfbolz Oct 10, 2022
d9b45ae
add tests for unicodedata.lookup and .name which actually fail
cfbolz Oct 10, 2022
827100a
Merge branch 'main' into unicodenames-dawg
cfbolz Oct 10, 2022
4443289
Merge branch 'main' into unicodenames-dawg
cfbolz May 29, 2023
75c8550
Merge branch 'main' into unicodenames-dawg
ambv Oct 18, 2023
3c3a9db
Update unicodename_db.h after upgrade to Unicode 15 in GH-96809
ambv Oct 18, 2023
a627d7b
update space saving numbers after unicode 15.1
cfbolz Oct 18, 2023
6735494
actually update Objects/unicodetype_db.h as well
cfbolz Oct 18, 2023
a3a8b58
no, I was wrong about the size, the effect isn't that huge
cfbolz Oct 18, 2023
8fb106a
add makefile target for regenerating the unicodedata
cfbolz Oct 18, 2023
8b52621
always check the result of dawg generation at construction time
cfbolz Oct 22, 2023
57af105
more sensible __hash__ and __eq__
cfbolz Oct 22, 2023
3148c85
improve comments and docstrings
cfbolz Oct 22, 2023
a6deecf
rename confusingly named argument
cfbolz Oct 22, 2023
cc3600f
add unittests for the dawg code
cfbolz Oct 23, 2023
cccf356
fix fixpoint bug
cfbolz Oct 23, 2023
776bd1a
fix comment
cfbolz Oct 23, 2023
1cb4a69
fix typo
cfbolz Nov 3, 2023
d49e871
clearer names
cfbolz Nov 3, 2023
dfb0580
check for empty dawg
cfbolz Nov 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions Lib/test/test_tools/test_makeunicodedata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import unittest
from test.test_tools import toolsdir, imports_under_tool
from test import support
from test.support.hypothesis_helper import hypothesis

# Short module-level aliases for the hypothesis APIs used by the strategy
# and the property-based test in this file.
st = hypothesis.strategies
given = hypothesis.given
example = hypothesis.example


with imports_under_tool("unicode"):
from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup


@st.composite
def char_name_db(draw, min_length=1, max_length=30):
    """Hypothesis strategy that builds a small name/character database.

    A size between *min_length* and *max_length* (inclusive) is drawn first.
    Then that many distinct names (1 to 10 letters taken from "abcd") and
    that many distinct characters are drawn, and the two sets are paired up.

    Returns a list of ``(name, character)`` tuples.
    """
    size = draw(st.integers(min_value=min_length, max_value=max_length))

    # Both collections are sets of exactly `size` elements, so every name and
    # every character occurs at most once in the resulting database.
    name_sets = st.sets(
        st.text("abcd", min_size=1, max_size=10),
        min_size=size,
        max_size=size,
    )
    names = draw(name_sets)
    char_sets = st.sets(st.characters(), min_size=size, max_size=size)
    characters = draw(char_sets)

    pairs = zip(names, characters)
    return list(pairs)


class TestDawg(unittest.TestCase):
    """Exercise the directed acyclic word graph (DAWG) data structure.

    The DAWG is what unicodedata uses to store the unicode character names.
    These tests were ported from PyPy.
    """

    def test_dawg_direct_simple(self):
        # Hand-built DAWG: insert a few words, then query in both directions.
        entries = [
            ("a", -4),
            ("c", -2),
            ("cat", -1),
            ("catarr", 0),
            ("catnip", 1),
            ("zcatnip", 5),
        ]
        graph = Dawg()
        for word, value in entries:
            graph.insert(word, value)
        packed, data, inverse = graph.finish()

        # Forward lookup: every inserted word yields the value stored for it.
        for word, value in entries:
            self.assertEqual(lookup(packed, data, word.encode("ascii")), value)
        # An unknown word, a strict prefix of a stored word and a stored word
        # with an extra trailing letter must all be rejected.
        for missing in (b"b", b"catni", b"catnipp"):
            with self.assertRaises(KeyError):
                lookup(packed, data, missing)

        # Inverse lookup: every stored value maps back to its word.
        for word, value in entries:
            self.assertEqual(
                inverse_lookup(packed, inverse, value), word.encode("ascii")
            )
        with self.assertRaises(KeyError):
            inverse_lookup(packed, inverse, 12)

    def test_forbid_empty_dawg(self):
        # Finishing a DAWG that never received an entry is an error.
        empty = Dawg()
        with self.assertRaises(ValueError):
            empty.finish()

    @given(char_name_db())
    @example([("abc", "a"), ("abd", "b")])
    @example(
        [
            ("bab", "1"),
            ("a", ":"),
            ("ad", "@"),
            ("b", "<"),
            ("aacc", "?"),
            ("dab", "D"),
            ("aa", "0"),
            ("ab", "F"),
            ("aaa", "7"),
            ("cbd", "="),
            ("abad", ";"),
            ("ac", "B"),
            ("abb", "4"),
            ("bb", "2"),
            ("aab", "9"),
            ("caaaaba", "E"),
            ("ca", ">"),
            ("bbaaa", "5"),
            ("d", "3"),
            ("baac", "8"),
            ("c", "6"),
            ("ba", "A"),
        ]
    )
    @example(
        [
            ("bcdac", "9"),
            ("acc", "g"),
            ("d", "d"),
            ("daabdda", "0"),
            ("aba", ";"),
            ("c", "6"),
            ("aa", "7"),
            ("abbd", "c"),
            ("badbd", "?"),
            ("bbd", "f"),
            ("cc", "@"),
            ("bb", "8"),
            ("daca", ">"),
            ("ba", ":"),
            ("baac", "3"),
            ("dbdddac", "a"),
            ("a", "2"),
            ("cabd", "b"),
            ("b", "="),
            ("abd", "4"),
            ("adcbd", "5"),
            ("abc", "e"),
            ("ab", "1"),
        ]
    )
    def test_dawg(self, data):
        # Building prints debug output, so stdout is captured and discarded.
        # No explicit assertions are needed: building the compression DAWG
        # also checks the result.
        with support.captured_stdout():
            build_compression_dawg(data)
20 changes: 20 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,26 @@ def test_name_inverse_lookup(self):
if looked_name := self.db.name(char, None):
self.assertEqual(self.db.lookup(looked_name), char)

def test_no_names_in_pua(self):
    # Code points in the private use areas must not have a name, i.e.
    # name() without a default has to raise ValueError for each of them.
    #
    # range() excludes its stop value, so every stop is written as
    # "last code point + 1".  Without the "+ 1" the final code point of each
    # block (notably U+F8FF, the last private-use code point of the BMP)
    # would silently be left out of the check.
    #
    # Planes 15 and 16 are covered up to their very end, which sweeps in the
    # two trailing noncharacters of each plane; those are unnamed as well.
    pua_blocks = (
        range(0xe000, 0xf8ff + 1),
        range(0xf0000, 0xfffff + 1),
        range(0x100000, 0x10ffff + 1),
    )
    # Iterate the ranges directly instead of unpacking them into one big
    # list of roughly 137,000 integers first.
    for block in pua_blocks:
        for i in block:
            char = chr(i)
            self.assertRaises(ValueError, self.db.name, char)

def test_lookup_nonexistant(self):
    # Smoke test: lookup() has to be able to fail.  Each of these strings is
    # expected to be rejected with a KeyError.
    bogus_names = (
        "LATIN SMLL LETR A",
        "OPEN HANDS SIGHS",
        "DREGS",
        "HANDBUG",
        "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
        "???",
    )
    for bogus in bogus_names:
        with self.assertRaises(KeyError):
            self.db.lookup(bogus)

def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)
Expand Down
10 changes: 9 additions & 1 deletion Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1328,6 +1328,14 @@ check-abidump: all
# Runs Tools/build/stable_abi.py with --generate-all on Misc/stable_abi.toml.
# The script is executed with ./$(BUILDPYTHON); the rule depends on `all`.
regen-limited-abi: all
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/build/stable_abi.py --generate-all $(srcdir)/Misc/stable_abi.toml

############################################################################
# Regenerate Unicode Data

# Runs Tools/unicode/makeunicodedata.py with $(PYTHON_FOR_REGEN).
# NOTE(review): the script path is relative to the current directory, whereas
# regen-limited-abi prefixes its script with $(srcdir) -- confirm that this
# works for out-of-tree builds.
.PHONY: regen-unicodedata
regen-unicodedata:
$(PYTHON_FOR_REGEN) Tools/unicode/makeunicodedata.py


############################################################################
# Regenerate all generated files

Expand All @@ -1336,7 +1344,7 @@ regen-limited-abi: all
regen-all: regen-cases regen-typeslots \
regen-token regen-ast regen-keyword regen-sre regen-frozen \
regen-pegen-metaparser regen-pegen regen-test-frozenmain \
regen-test-levenshtein regen-global-objects
regen-test-levenshtein regen-global-objects regen-unicodedata
@echo
@echo "Note: make regen-stdlib-module-names, make regen-limited-abi"
@echo "and make regen-configure should be run manually"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Switch the storage of the Unicode codepoint names to a different
data structure, a `directed acyclic word graph
<https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton>`_.
This makes the unicodedata shared library about 440 KiB smaller. Contributed by
Carl Friedrich Bolz-Tereick using code from the PyPy project.
Loading