Skip to content

Commit a114315

Browse files
mitya57 authored and waylan committed
toc: Do not remove diacritical marks when slugify_unicode is used
Update the existing test and add a new one to make sure that the behavior of the default slugify function has not changed. Fixes #1118.
1 parent 14c2fa9 commit a114315

File tree

3 files changed

+25
-8
lines changed

3 files changed

+25
-8
lines changed

docs/change_log/index.md

+4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ title: Change Log
33
Python-Markdown Change Log
44
=========================
55

6+
Under development: version 3.3.5 (a bug-fix release).
7+
8+
* Make the `slugify_unicode` function not remove diacritical marks (#1118).
9+
610
Feb 24, 2021: version 3.3.4 (a bug-fix release).
711

812
* Properly parse unclosed tags in code spans (#1066).

markdown/extensions/toc.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,19 @@
2323
import xml.etree.ElementTree as etree
2424

2525

26-
def slugify(value, separator, encoding='ascii'):
26+
def slugify(value, separator, unicode=False):
2727
""" Slugify a string, to make it URL friendly. """
28-
value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
29-
value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
28+
if not unicode:
29+
# Replace Extended Latin characters with ASCII, e.g. žlutý → zluty
30+
value = unicodedata.normalize('NFKD', value)
31+
value = value.encode('ascii', 'ignore').decode('ascii')
32+
value = re.sub(r'[^\w\s-]', '', value).strip().lower()
3033
return re.sub(r'[{}\s]+'.format(separator), separator, value)
3134

3235

3336
def slugify_unicode(value, separator):
3437
""" Slugify a string, to make it URL friendly while preserving Unicode characters. """
35-
return slugify(value, separator, 'utf-8')
38+
return slugify(value, separator, unicode=True)
3639

3740

3841
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')

tests/test_syntax/extensions/test_toc.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -534,9 +534,9 @@ def testPermalinkWithUnicodeInID(self):
534534
from markdown.extensions.toc import slugify_unicode
535535
self.assertMarkdownRenders(
536536
'# Unicode ヘッダー',
537-
'<h1 id="unicode-ヘッター">' # noqa
537+
'<h1 id="unicode-ヘッダー">' # noqa
538538
'Unicode ヘッダー' # noqa
539-
'<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">&para;</a>' # noqa
539+
'<a class="headerlink" href="#unicode-ヘッダー" title="Permanent link">&para;</a>' # noqa
540540
'</h1>', # noqa
541541
extensions=[TocExtension(permalink=True, slugify=slugify_unicode)]
542542
)
@@ -545,9 +545,19 @@ def testPermalinkWithUnicodeTitle(self):
545545
from markdown.extensions.toc import slugify_unicode
546546
self.assertMarkdownRenders(
547547
'# Unicode ヘッダー',
548-
'<h1 id="unicode-ヘッター">' # noqa
548+
'<h1 id="unicode-ヘッダー">' # noqa
549549
'Unicode ヘッダー' # noqa
550-
'<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">&para;</a>' # noqa
550+
'<a class="headerlink" href="#unicode-ヘッダー" title="パーマリンク">&para;</a>' # noqa
551551
'</h1>', # noqa
552552
extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)]
553553
)
554+
555+
def testPermalinkWithExtendedLatinInID(self):
556+
self.assertMarkdownRenders(
557+
'# Théâtre',
558+
'<h1 id="theatre">' # noqa
559+
'Théâtre' # noqa
560+
'<a class="headerlink" href="#theatre" title="Permanent link">&para;</a>' # noqa
561+
'</h1>', # noqa
562+
extensions=[TocExtension(permalink=True)]
563+
)

0 commit comments

Comments
 (0)