toc: Do not remove diacritical marks when slugify_unicode is used

mitya57 · waylan · commit a11431539d08 · 2021-03-24T14:35:59.000-04:00
Update the existing test and add a new one to make sure that the behavior of the default slugify function has not changed. Fixes #1118.
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
 Python-Markdown Change Log
 =========================
 
+Under development: version 3.3.5 (a bug-fix release).
+
+* Make the `slugify_unicode` function not remove diacritical marks (#1118).
+
 Feb 24, 2021: version 3.3.4 (a bug-fix release).
 
 * Properly parse unclosed tags in code spans (#1066).
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
@@ -23,16 +23,19 @@
 import xml.etree.ElementTree as etree
 
 
-def slugify(value, separator, encoding='ascii'):
+def slugify(value, separator, unicode=False):
     """ Slugify a string, to make it URL friendly. """
-    value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
-    value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
+    if not unicode:
+        # Replace Extended Latin characters with ASCII, e.g. žlutý → zluty
+        value = unicodedata.normalize('NFKD', value)
+        value = value.encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
     return re.sub(r'[{}\s]+'.format(separator), separator, value)
 
 
 def slugify_unicode(value, separator):
     """ Slugify a string, to make it URL friendly while preserving Unicode characters. """
-    return slugify(value, separator, 'utf-8')
+    return slugify(value, separator, unicode=True)
 
 
 IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
@@ -534,9 +534,9 @@ def testPermalinkWithUnicodeInID(self):
         from markdown.extensions.toc import slugify_unicode
         self.assertMarkdownRenders(
             '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'                                                            # noqa
+            '<h1 id="unicode-ヘッダー">'                                                            # noqa
                 'Unicode ヘッダー'                                                                  # noqa
-                '<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">&para;</a>'  # noqa
+                '<a class="headerlink" href="#unicode-ヘッダー" title="Permanent link">&para;</a>'  # noqa
             '</h1>',                                                                               # noqa
             extensions=[TocExtension(permalink=True, slugify=slugify_unicode)]
         )
@@ -545,9 +545,19 @@ def testPermalinkWithUnicodeTitle(self):
         from markdown.extensions.toc import slugify_unicode
         self.assertMarkdownRenders(
             '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'                                                        # noqa
+            '<h1 id="unicode-ヘッダー">'                                                        # noqa
                 'Unicode ヘッダー'                                                              # noqa
-                '<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">&para;</a>'  # noqa
+                '<a class="headerlink" href="#unicode-ヘッダー" title="パーマリンク">&para;</a>'  # noqa
             '</h1>',                                                                           # noqa
             extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)]
         )
+
+    def testPermalinkWithExtendedLatinInID(self):
+        self.assertMarkdownRenders(
+            '# Théâtre',
+            '<h1 id="theatre">'                                                            # noqa
+                'Théâtre'                                                                  # noqa
+                '<a class="headerlink" href="#theatre" title="Permanent link">&para;</a>'  # noqa
+            '</h1>',                                                                       # noqa
+            extensions=[TocExtension(permalink=True)]
+        )