Skip to content

Commit 8ddc204

Browse files
committed
rustdoc: check parsing diffs between pulldown-cmark 0.9.6 and 0.10
This commit is not meant to be merged as-is. It's meant to run in Crater, so that we can estimate the impact of bumping to the new version of the markdown parser.
1 parent 829308e commit 8ddc204

File tree

7 files changed

+431
-0
lines changed

7 files changed

+431
-0
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4704,6 +4704,7 @@ dependencies = [
47044704
"itertools",
47054705
"minifier",
47064706
"once_cell",
4707+
"pulldown-cmark 0.10.0",
47074708
"regex",
47084709
"rustdoc-json-types",
47094710
"serde",

src/librustdoc/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ itertools = "0.11"
1313
indexmap = "2"
1414
minifier = "0.3.0"
1515
once_cell = "1.10.0"
16+
pulldown-cmark-new = { version = "0.10", package = "pulldown-cmark", default-features = false }
1617
regex = "1"
1718
rustdoc-json-types = { path = "../rustdoc-json-types" }
1819
serde_json = "1.0"

src/librustdoc/lint.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ declare_rustdoc_lint! {
196196
"detects redundant explicit links in doc comments"
197197
}
198198

199+
declare_rustdoc_lint! {
200+
/// This compatibility lint checks for Markdown syntax that works in the old engine but not
201+
/// the new one.
202+
UNPORTABLE_MARKDOWN,
203+
Deny,
204+
"detects markdown that is interpreted differently in different parser"
205+
}
206+
199207
pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
200208
vec![
201209
BROKEN_INTRA_DOC_LINKS,
@@ -209,6 +217,7 @@ pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
209217
MISSING_CRATE_LEVEL_DOCS,
210218
UNESCAPED_BACKTICKS,
211219
REDUNDANT_EXPLICIT_LINKS,
220+
UNPORTABLE_MARKDOWN,
212221
]
213222
});
214223

src/librustdoc/passes/lint.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod check_code_block_syntax;
66
mod html_tags;
77
mod redundant_explicit_links;
88
mod unescaped_backticks;
9+
mod unportable_markdown;
910

1011
use super::Pass;
1112
use crate::clean::*;
@@ -31,6 +32,7 @@ impl<'a, 'tcx> DocVisitor for Linter<'a, 'tcx> {
3132
html_tags::visit_item(self.cx, item);
3233
unescaped_backticks::visit_item(self.cx, item);
3334
redundant_explicit_links::visit_item(self.cx, item);
35+
unportable_markdown::visit_item(self.cx, item);
3436

3537
self.visit_item_recur(item)
3638
}
Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
//! Detects markdown syntax that's different between pulldown-cmark
2+
//! 0.9 and 0.10.
3+
4+
use crate::clean::Item;
5+
use crate::core::DocContext;
6+
use crate::html::markdown::main_body_opts;
7+
use pulldown_cmark as cmarko;
8+
use pulldown_cmark_new as cmarkn;
9+
use rustc_resolve::rustdoc::source_span_for_markdown_range;
10+
11+
pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item) {
12+
let tcx = cx.tcx;
13+
let Some(hir_id) = DocContext::as_local_hir_id(tcx, item.item_id) else {
14+
// If non-local, no need to check anything.
15+
return;
16+
};
17+
18+
let dox = item.doc_value();
19+
if dox.is_empty() {
20+
return;
21+
}
22+
23+
let link_names = item.link_names(&cx.cache);
24+
let mut replacer_old = |broken_link: cmarko::BrokenLink<'_>| {
25+
link_names
26+
.iter()
27+
.find(|link| *link.original_text == *broken_link.reference)
28+
.map(|link| ((*link.href).into(), (*link.new_text).into()))
29+
};
30+
let parser_old = cmarko::Parser::new_with_broken_link_callback(
31+
&dox,
32+
main_body_opts(),
33+
Some(&mut replacer_old),
34+
)
35+
.into_offset_iter()
36+
// Not worth cleaning up minor "distinctions without difference" in the AST.
37+
// Text events get chopped up differently between versions.
38+
// <html> and `code` mistakes are usually covered by unescaped_backticks and html_tags lints.
39+
.filter(|(event, _event_range)| {
40+
!matches!(
41+
event,
42+
cmarko::Event::Code(_)
43+
| cmarko::Event::Text(_)
44+
| cmarko::Event::Html(_)
45+
| cmarko::Event::SoftBreak
46+
)
47+
});
48+
49+
pub fn main_body_opts_new() -> cmarkn::Options {
50+
cmarkn::Options::ENABLE_TABLES
51+
| cmarkn::Options::ENABLE_FOOTNOTES
52+
| cmarkn::Options::ENABLE_STRIKETHROUGH
53+
| cmarkn::Options::ENABLE_TASKLISTS
54+
| cmarkn::Options::ENABLE_SMART_PUNCTUATION
55+
}
56+
let mut replacer_new = |broken_link: cmarkn::BrokenLink<'_>| {
57+
link_names
58+
.iter()
59+
.find(|link| *link.original_text.trim() == *broken_link.reference.trim())
60+
.map(|link| ((*link.href).into(), (*link.new_text).into()))
61+
};
62+
let parser_new = cmarkn::Parser::new_with_broken_link_callback(
63+
&dox,
64+
main_body_opts_new(),
65+
Some(&mut replacer_new),
66+
)
67+
.into_offset_iter()
68+
.filter(|(event, _event_range)| {
69+
!matches!(
70+
event,
71+
cmarkn::Event::Code(_)
72+
| cmarkn::Event::Text(_)
73+
| cmarkn::Event::Html(_)
74+
| cmarkn::Event::InlineHtml(_)
75+
| cmarkn::Event::SoftBreak
76+
)
77+
});
78+
79+
let mut reported_an_error = false;
80+
for ((event_old, event_range_old), (event_new, event_range_new)) in parser_old.zip(parser_new) {
81+
match (event_old, event_new) {
82+
(
83+
cmarko::Event::Start(cmarko::Tag::Emphasis),
84+
cmarkn::Event::Start(cmarkn::Tag::Emphasis),
85+
)
86+
| (
87+
cmarko::Event::Start(cmarko::Tag::Strong),
88+
cmarkn::Event::Start(cmarkn::Tag::Strong),
89+
)
90+
| (
91+
cmarko::Event::Start(cmarko::Tag::Strikethrough),
92+
cmarkn::Event::Start(cmarkn::Tag::Strikethrough),
93+
)
94+
| (
95+
cmarko::Event::End(cmarko::Tag::Emphasis),
96+
cmarkn::Event::End(cmarkn::TagEnd::Emphasis),
97+
)
98+
| (
99+
cmarko::Event::End(cmarko::Tag::Strong),
100+
cmarkn::Event::End(cmarkn::TagEnd::Strong),
101+
)
102+
| (
103+
cmarko::Event::End(cmarko::Tag::Strikethrough),
104+
cmarkn::Event::End(cmarkn::TagEnd::Strikethrough),
105+
)
106+
| (
107+
cmarko::Event::End(cmarko::Tag::Link(..)),
108+
cmarkn::Event::End(cmarkn::TagEnd::Link),
109+
)
110+
| (
111+
cmarko::Event::End(cmarko::Tag::Image(..)),
112+
cmarkn::Event::End(cmarkn::TagEnd::Image),
113+
)
114+
| (cmarko::Event::FootnoteReference(..), cmarkn::Event::FootnoteReference(..))
115+
| (cmarko::Event::TaskListMarker(false), cmarkn::Event::TaskListMarker(false))
116+
| (cmarko::Event::TaskListMarker(true), cmarkn::Event::TaskListMarker(true))
117+
if event_range_old == event_range_new =>
118+
{
119+
// Matching tags. Do nothing.
120+
}
121+
(
122+
cmarko::Event::Start(cmarko::Tag::Link(_, old_dest_url, old_title)),
123+
cmarkn::Event::Start(cmarkn::Tag::Link { dest_url, title, .. }),
124+
)
125+
| (
126+
cmarko::Event::Start(cmarko::Tag::Image(_, old_dest_url, old_title)),
127+
cmarkn::Event::Start(cmarkn::Tag::Image { dest_url, title, .. }),
128+
) if event_range_old == event_range_new
129+
&& &old_dest_url[..] == &dest_url[..]
130+
&& &old_title[..] == &title[..] =>
131+
{
132+
// Matching tags. Do nothing.
133+
}
134+
(cmarko::Event::SoftBreak, cmarkn::Event::SoftBreak)
135+
| (cmarko::Event::HardBreak, cmarkn::Event::HardBreak)
136+
| (cmarko::Event::Rule, cmarkn::Event::Rule)
137+
| (
138+
cmarko::Event::Start(cmarko::Tag::Paragraph),
139+
cmarkn::Event::Start(cmarkn::Tag::Paragraph),
140+
)
141+
| (
142+
cmarko::Event::Start(cmarko::Tag::Heading(..)),
143+
cmarkn::Event::Start(cmarkn::Tag::Heading { .. }),
144+
)
145+
| (
146+
cmarko::Event::Start(cmarko::Tag::BlockQuote),
147+
cmarkn::Event::Start(cmarkn::Tag::BlockQuote),
148+
)
149+
| (
150+
cmarko::Event::Start(cmarko::Tag::CodeBlock(..)),
151+
cmarkn::Event::Start(cmarkn::Tag::CodeBlock(..)),
152+
)
153+
| (
154+
cmarko::Event::Start(cmarko::Tag::List(..)),
155+
cmarkn::Event::Start(cmarkn::Tag::List(..)),
156+
)
157+
| (cmarko::Event::Start(cmarko::Tag::Item), cmarkn::Event::Start(cmarkn::Tag::Item))
158+
| (
159+
cmarko::Event::Start(cmarko::Tag::FootnoteDefinition(..)),
160+
cmarkn::Event::Start(cmarkn::Tag::FootnoteDefinition(..)),
161+
)
162+
| (
163+
cmarko::Event::Start(cmarko::Tag::Table(..)),
164+
cmarkn::Event::Start(cmarkn::Tag::Table(..)),
165+
)
166+
| (
167+
cmarko::Event::Start(cmarko::Tag::TableHead),
168+
cmarkn::Event::Start(cmarkn::Tag::TableHead),
169+
)
170+
| (
171+
cmarko::Event::Start(cmarko::Tag::TableRow),
172+
cmarkn::Event::Start(cmarkn::Tag::TableRow),
173+
)
174+
| (
175+
cmarko::Event::Start(cmarko::Tag::TableCell),
176+
cmarkn::Event::Start(cmarkn::Tag::TableCell),
177+
)
178+
| (
179+
cmarko::Event::End(cmarko::Tag::Paragraph),
180+
cmarkn::Event::End(cmarkn::TagEnd::Paragraph),
181+
)
182+
| (
183+
cmarko::Event::End(cmarko::Tag::Heading(..)),
184+
cmarkn::Event::End(cmarkn::TagEnd::Heading(_)),
185+
)
186+
| (
187+
cmarko::Event::End(cmarko::Tag::BlockQuote),
188+
cmarkn::Event::End(cmarkn::TagEnd::BlockQuote),
189+
)
190+
| (
191+
cmarko::Event::End(cmarko::Tag::CodeBlock(..)),
192+
cmarkn::Event::End(cmarkn::TagEnd::CodeBlock),
193+
)
194+
| (
195+
cmarko::Event::End(cmarko::Tag::List(..)),
196+
cmarkn::Event::End(cmarkn::TagEnd::List(_)),
197+
)
198+
| (cmarko::Event::End(cmarko::Tag::Item), cmarkn::Event::End(cmarkn::TagEnd::Item))
199+
| (
200+
cmarko::Event::End(cmarko::Tag::FootnoteDefinition(..)),
201+
cmarkn::Event::End(cmarkn::TagEnd::FootnoteDefinition),
202+
)
203+
| (
204+
cmarko::Event::End(cmarko::Tag::Table(..)),
205+
cmarkn::Event::End(cmarkn::TagEnd::Table),
206+
)
207+
| (
208+
cmarko::Event::End(cmarko::Tag::TableHead),
209+
cmarkn::Event::End(cmarkn::TagEnd::TableHead),
210+
)
211+
| (
212+
cmarko::Event::End(cmarko::Tag::TableRow),
213+
cmarkn::Event::End(cmarkn::TagEnd::TableRow),
214+
)
215+
| (
216+
cmarko::Event::End(cmarko::Tag::TableCell),
217+
cmarkn::Event::End(cmarkn::TagEnd::TableCell),
218+
) => {
219+
// Matching tags. Do nothing.
220+
//
221+
// Parsers sometimes differ in what they consider the "range of an event,"
222+
// even though the event is really the same. Inlines are pretty consistent,
223+
// but stuff like list items? Not really.
224+
//
225+
// Mismatched block elements will usually nest differently, so ignoring it
226+
// works good enough.
227+
}
228+
// If we've already reported an error on the start tag, don't bother on the end tag.
229+
(cmarko::Event::End(_), _) | (_, cmarkn::Event::End(_)) if reported_an_error => {}
230+
// Non-matching inline.
231+
(cmarko::Event::Start(cmarko::Tag::Link(..)), cmarkn::Event::FootnoteReference(..))
232+
| (
233+
cmarko::Event::Start(cmarko::Tag::Image(..)),
234+
cmarkn::Event::FootnoteReference(..),
235+
)
236+
| (
237+
cmarko::Event::FootnoteReference(..),
238+
cmarkn::Event::Start(cmarkn::Tag::Link { .. }),
239+
)
240+
| (
241+
cmarko::Event::FootnoteReference(..),
242+
cmarkn::Event::Start(cmarkn::Tag::Image { .. }),
243+
) if event_range_old == event_range_new => {
244+
reported_an_error = true;
245+
// If we can't get a span of the backtick, because it is in a `#[doc = ""]` attribute,
246+
// use the span of the entire attribute as a fallback.
247+
let span = source_span_for_markdown_range(
248+
tcx,
249+
&dox,
250+
&event_range_old,
251+
&item.attrs.doc_strings,
252+
)
253+
.unwrap_or_else(|| item.attr_span(tcx));
254+
tcx.node_span_lint(
255+
crate::lint::UNPORTABLE_MARKDOWN,
256+
hir_id,
257+
span,
258+
"unportable markdown",
259+
|lint| {
260+
lint.help(format!("syntax ambiguous between footnote and link"));
261+
},
262+
);
263+
}
264+
// Non-matching results.
265+
(event_old, event_new) => {
266+
reported_an_error = true;
267+
let (range, range_other, desc, desc_other, tag, tag_other) = if event_range_old.end
268+
- event_range_old.start
269+
< event_range_new.end - event_range_new.start
270+
{
271+
(
272+
event_range_old,
273+
event_range_new,
274+
"old",
275+
"new",
276+
format!("{event_old:?}"),
277+
format!("{event_new:?}"),
278+
)
279+
} else {
280+
(
281+
event_range_new,
282+
event_range_old,
283+
"new",
284+
"old",
285+
format!("{event_new:?}"),
286+
format!("{event_old:?}"),
287+
)
288+
};
289+
let (range, tag_other) =
290+
if range_other.start <= range.start && range_other.end <= range.end {
291+
(range_other.start..range.end, tag_other)
292+
} else {
293+
(range, format!("nothing"))
294+
};
295+
// If we can't get a span of the backtick, because it is in a `#[doc = ""]` attribute,
296+
// use the span of the entire attribute as a fallback.
297+
let span =
298+
source_span_for_markdown_range(tcx, &dox, &range, &item.attrs.doc_strings)
299+
.unwrap_or_else(|| item.attr_span(tcx));
300+
tcx.node_span_lint(
301+
crate::lint::UNPORTABLE_MARKDOWN,
302+
hir_id,
303+
span,
304+
"unportable markdown",
305+
|lint| {
306+
lint.help(format!(
307+
"{desc} parser sees {tag}, {desc_other} sees {tag_other}"
308+
));
309+
},
310+
);
311+
}
312+
}
313+
}
314+
}

0 commit comments

Comments
 (0)