Skip to content

Commit c861e61

Browse files
committed
rustdoc: check parsing diffs between pulldown-cmark 0.9.6 and 0.10
This commit is not meant to be merged as-is. It's meant to run in Crater, so that we can estimate the impact of bumping to the new version of the markdown parser.
1 parent 829308e commit c861e61

File tree

7 files changed

+430
-0
lines changed

7 files changed

+430
-0
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4704,6 +4704,7 @@ dependencies = [
47044704
"itertools",
47054705
"minifier",
47064706
"once_cell",
4707+
"pulldown-cmark 0.10.0",
47074708
"regex",
47084709
"rustdoc-json-types",
47094710
"serde",

src/librustdoc/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ itertools = "0.11"
1313
indexmap = "2"
1414
minifier = "0.3.0"
1515
once_cell = "1.10.0"
16+
pulldown-cmark-new = { version = "0.10", package = "pulldown-cmark", default-features = false }
1617
regex = "1"
1718
rustdoc-json-types = { path = "../rustdoc-json-types" }
1819
serde_json = "1.0"

src/librustdoc/lint.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ declare_rustdoc_lint! {
196196
"detects redundant explicit links in doc comments"
197197
}
198198

199+
declare_rustdoc_lint! {
200+
/// This compatibility lint checks for Markdown syntax that works in the old engine but not
201+
/// the new one.
202+
UNPORTABLE_MARKDOWN,
203+
Deny,
204+
"detects markdown that is interpreted differently in different parser"
205+
}
206+
199207
pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
200208
vec![
201209
BROKEN_INTRA_DOC_LINKS,
@@ -209,6 +217,7 @@ pub(crate) static RUSTDOC_LINTS: Lazy<Vec<&'static Lint>> = Lazy::new(|| {
209217
MISSING_CRATE_LEVEL_DOCS,
210218
UNESCAPED_BACKTICKS,
211219
REDUNDANT_EXPLICIT_LINKS,
220+
UNPORTABLE_MARKDOWN,
212221
]
213222
});
214223

src/librustdoc/passes/lint.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod check_code_block_syntax;
66
mod html_tags;
77
mod redundant_explicit_links;
88
mod unescaped_backticks;
9+
mod unportable_markdown;
910

1011
use super::Pass;
1112
use crate::clean::*;
@@ -31,6 +32,7 @@ impl<'a, 'tcx> DocVisitor for Linter<'a, 'tcx> {
3132
html_tags::visit_item(self.cx, item);
3233
unescaped_backticks::visit_item(self.cx, item);
3334
redundant_explicit_links::visit_item(self.cx, item);
35+
unportable_markdown::visit_item(self.cx, item);
3436

3537
self.visit_item_recur(item)
3638
}
Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
//! Detects markdown syntax that's different between pulldown-cmark
2+
//! 0.9 and 0.10.
3+
4+
use crate::clean::Item;
5+
use crate::core::DocContext;
6+
use crate::html::markdown::main_body_opts;
7+
use pulldown_cmark as cmarko;
8+
use pulldown_cmark_new as cmarkn;
9+
use rustc_resolve::rustdoc::source_span_for_markdown_range;
10+
11+
pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item) {
12+
let tcx = cx.tcx;
13+
let Some(hir_id) = DocContext::as_local_hir_id(tcx, item.item_id) else {
14+
// If non-local, no need to check anything.
15+
return;
16+
};
17+
18+
let dox = item.doc_value();
19+
if dox.is_empty() {
20+
return;
21+
}
22+
23+
let link_names = item.link_names(&cx.cache);
24+
let mut replacer_old = |broken_link: cmarko::BrokenLink<'_>| {
25+
link_names
26+
.iter()
27+
.find(|link| *link.original_text == *broken_link.reference)
28+
.map(|link| ((*link.href).into(), (*link.new_text).into()))
29+
};
30+
let parser_old = cmarko::Parser::new_with_broken_link_callback(
31+
&dox,
32+
main_body_opts(),
33+
Some(&mut replacer_old),
34+
)
35+
.into_offset_iter()
36+
// Not worth cleaning up minor "distinctions without difference" in the AST.
37+
// Text events get chopped up differently between versions.
38+
// <html> and `code` mistakes are usually covered by unescaped_backticks and html_tags lints.
39+
.filter(|(event, _event_range)| {
40+
!matches!(
41+
event,
42+
cmarko::Event::Code(_)
43+
| cmarko::Event::Text(_)
44+
| cmarko::Event::Html(_)
45+
| cmarko::Event::SoftBreak
46+
)
47+
});
48+
49+
pub fn main_body_opts_new() -> cmarkn::Options {
50+
cmarkn::Options::ENABLE_TABLES
51+
| cmarkn::Options::ENABLE_FOOTNOTES
52+
| cmarkn::Options::ENABLE_STRIKETHROUGH
53+
| cmarkn::Options::ENABLE_TASKLISTS
54+
| cmarkn::Options::ENABLE_SMART_PUNCTUATION
55+
}
56+
let mut replacer_new = |broken_link: cmarkn::BrokenLink<'_>| {
57+
link_names
58+
.iter()
59+
.find(|link| *link.original_text.trim() == *broken_link.reference.trim())
60+
.map(|link| ((*link.href).into(), (*link.new_text).into()))
61+
};
62+
let parser_new = cmarkn::Parser::new_with_broken_link_callback(
63+
&dox,
64+
main_body_opts_new(),
65+
Some(&mut replacer_new),
66+
)
67+
.into_offset_iter()
68+
.filter(|(event, _event_range)| {
69+
!matches!(
70+
event,
71+
cmarkn::Event::Code(_)
72+
| cmarkn::Event::Text(_)
73+
| cmarkn::Event::Html(_)
74+
| cmarkn::Event::InlineHtml(_)
75+
| cmarkn::Event::SoftBreak
76+
)
77+
});
78+
79+
let mut reported_an_error = false;
80+
for ((event_old, event_range_old), (event_new, event_range_new)) in parser_old.zip(parser_new) {
81+
match (event_old, event_new) {
82+
(
83+
cmarko::Event::Start(cmarko::Tag::Emphasis),
84+
cmarkn::Event::Start(cmarkn::Tag::Emphasis),
85+
)
86+
| (
87+
cmarko::Event::Start(cmarko::Tag::Strong),
88+
cmarkn::Event::Start(cmarkn::Tag::Strong),
89+
)
90+
| (
91+
cmarko::Event::Start(cmarko::Tag::Strikethrough),
92+
cmarkn::Event::Start(cmarkn::Tag::Strikethrough),
93+
)
94+
| (
95+
cmarko::Event::End(cmarko::Tag::Emphasis),
96+
cmarkn::Event::End(cmarkn::TagEnd::Emphasis),
97+
)
98+
| (
99+
cmarko::Event::End(cmarko::Tag::Strong),
100+
cmarkn::Event::End(cmarkn::TagEnd::Strong),
101+
)
102+
| (
103+
cmarko::Event::End(cmarko::Tag::Strikethrough),
104+
cmarkn::Event::End(cmarkn::TagEnd::Strikethrough),
105+
)
106+
| (
107+
cmarko::Event::End(cmarko::Tag::Link(..)),
108+
cmarkn::Event::End(cmarkn::TagEnd::Link),
109+
)
110+
| (
111+
cmarko::Event::End(cmarko::Tag::Image(..)),
112+
cmarkn::Event::End(cmarkn::TagEnd::Image),
113+
)
114+
| (cmarko::Event::FootnoteReference(..), cmarkn::Event::FootnoteReference(..))
115+
| (cmarko::Event::SoftBreak, cmarkn::Event::SoftBreak)
116+
| (cmarko::Event::HardBreak, cmarkn::Event::HardBreak)
117+
| (cmarko::Event::TaskListMarker(false), cmarkn::Event::TaskListMarker(false))
118+
| (cmarko::Event::TaskListMarker(true), cmarkn::Event::TaskListMarker(true))
119+
if event_range_old == event_range_new =>
120+
{
121+
// Matching tags. Do nothing.
122+
}
123+
(
124+
cmarko::Event::Start(cmarko::Tag::Link(_, old_dest_url, old_title)),
125+
cmarkn::Event::Start(cmarkn::Tag::Link { dest_url, title, .. }),
126+
)
127+
| (
128+
cmarko::Event::Start(cmarko::Tag::Image(_, old_dest_url, old_title)),
129+
cmarkn::Event::Start(cmarkn::Tag::Image { dest_url, title, .. }),
130+
) if event_range_old == event_range_new
131+
&& &old_dest_url[..] == &dest_url[..]
132+
&& &old_title[..] == &title[..] =>
133+
{
134+
// Matching tags. Do nothing.
135+
}
136+
(
137+
cmarko::Event::Start(cmarko::Tag::Paragraph),
138+
cmarkn::Event::Start(cmarkn::Tag::Paragraph),
139+
)
140+
| (
141+
cmarko::Event::Start(cmarko::Tag::Heading(..)),
142+
cmarkn::Event::Start(cmarkn::Tag::Heading { .. }),
143+
)
144+
| (
145+
cmarko::Event::Start(cmarko::Tag::BlockQuote),
146+
cmarkn::Event::Start(cmarkn::Tag::BlockQuote),
147+
)
148+
| (
149+
cmarko::Event::Start(cmarko::Tag::CodeBlock(..)),
150+
cmarkn::Event::Start(cmarkn::Tag::CodeBlock(..)),
151+
)
152+
| (
153+
cmarko::Event::Start(cmarko::Tag::List(..)),
154+
cmarkn::Event::Start(cmarkn::Tag::List(..)),
155+
)
156+
| (cmarko::Event::Start(cmarko::Tag::Item), cmarkn::Event::Start(cmarkn::Tag::Item))
157+
| (
158+
cmarko::Event::Start(cmarko::Tag::FootnoteDefinition(..)),
159+
cmarkn::Event::Start(cmarkn::Tag::FootnoteDefinition(..)),
160+
)
161+
| (
162+
cmarko::Event::Start(cmarko::Tag::Table(..)),
163+
cmarkn::Event::Start(cmarkn::Tag::Table(..)),
164+
)
165+
| (
166+
cmarko::Event::Start(cmarko::Tag::TableHead),
167+
cmarkn::Event::Start(cmarkn::Tag::TableHead),
168+
)
169+
| (
170+
cmarko::Event::Start(cmarko::Tag::TableRow),
171+
cmarkn::Event::Start(cmarkn::Tag::TableRow),
172+
)
173+
| (
174+
cmarko::Event::Start(cmarko::Tag::TableCell),
175+
cmarkn::Event::Start(cmarkn::Tag::TableCell),
176+
)
177+
| (
178+
cmarko::Event::End(cmarko::Tag::Paragraph),
179+
cmarkn::Event::End(cmarkn::TagEnd::Paragraph),
180+
)
181+
| (
182+
cmarko::Event::End(cmarko::Tag::Heading(..)),
183+
cmarkn::Event::End(cmarkn::TagEnd::Heading(_)),
184+
)
185+
| (
186+
cmarko::Event::End(cmarko::Tag::BlockQuote),
187+
cmarkn::Event::End(cmarkn::TagEnd::BlockQuote),
188+
)
189+
| (
190+
cmarko::Event::End(cmarko::Tag::CodeBlock(..)),
191+
cmarkn::Event::End(cmarkn::TagEnd::CodeBlock),
192+
)
193+
| (
194+
cmarko::Event::End(cmarko::Tag::List(..)),
195+
cmarkn::Event::End(cmarkn::TagEnd::List(_)),
196+
)
197+
| (cmarko::Event::End(cmarko::Tag::Item), cmarkn::Event::End(cmarkn::TagEnd::Item))
198+
| (
199+
cmarko::Event::End(cmarko::Tag::FootnoteDefinition(..)),
200+
cmarkn::Event::End(cmarkn::TagEnd::FootnoteDefinition),
201+
)
202+
| (
203+
cmarko::Event::End(cmarko::Tag::Table(..)),
204+
cmarkn::Event::End(cmarkn::TagEnd::Table),
205+
)
206+
| (
207+
cmarko::Event::End(cmarko::Tag::TableHead),
208+
cmarkn::Event::End(cmarkn::TagEnd::TableHead),
209+
)
210+
| (
211+
cmarko::Event::End(cmarko::Tag::TableRow),
212+
cmarkn::Event::End(cmarkn::TagEnd::TableRow),
213+
)
214+
| (
215+
cmarko::Event::End(cmarko::Tag::TableCell),
216+
cmarkn::Event::End(cmarkn::TagEnd::TableCell),
217+
) => {
218+
// Matching tags. Do nothing.
219+
//
220+
// Parsers sometimes differ in what they consider the "range of an event,"
221+
// even though the event is really the same. Inlines are pretty consistent,
222+
// but stuff like list items? Not really.
223+
//
224+
// Mismatched block elements will usually nest differently, so ignoring it
225+
// works good enough.
226+
}
227+
// If we've already reported an error on the start tag, don't bother on the end tag.
228+
(cmarko::Event::End(_), _) | (_, cmarkn::Event::End(_)) if reported_an_error => {}
229+
// Non-matching inline.
230+
(cmarko::Event::Start(cmarko::Tag::Link(..)), cmarkn::Event::FootnoteReference(..))
231+
| (
232+
cmarko::Event::Start(cmarko::Tag::Image(..)),
233+
cmarkn::Event::FootnoteReference(..),
234+
)
235+
| (
236+
cmarko::Event::FootnoteReference(..),
237+
cmarkn::Event::Start(cmarkn::Tag::Link { .. }),
238+
)
239+
| (
240+
cmarko::Event::FootnoteReference(..),
241+
cmarkn::Event::Start(cmarkn::Tag::Image { .. }),
242+
) if event_range_old == event_range_new => {
243+
reported_an_error = true;
244+
// If we can't get a span of the backtick, because it is in a `#[doc = ""]` attribute,
245+
// use the span of the entire attribute as a fallback.
246+
let span = source_span_for_markdown_range(
247+
tcx,
248+
&dox,
249+
&event_range_old,
250+
&item.attrs.doc_strings,
251+
)
252+
.unwrap_or_else(|| item.attr_span(tcx));
253+
tcx.node_span_lint(
254+
crate::lint::UNPORTABLE_MARKDOWN,
255+
hir_id,
256+
span,
257+
"unportable markdown",
258+
|lint| {
259+
lint.help(format!("syntax ambiguous between footnote and link"));
260+
},
261+
);
262+
}
263+
// Non-matching results.
264+
(event_old, event_new) => {
265+
reported_an_error = true;
266+
let (range, range_other, desc, desc_other, tag, tag_other) = if event_range_old.end
267+
- event_range_old.start
268+
< event_range_new.end - event_range_new.start
269+
{
270+
(
271+
event_range_old,
272+
event_range_new,
273+
"old",
274+
"new",
275+
format!("{event_old:?}"),
276+
format!("{event_new:?}"),
277+
)
278+
} else {
279+
(
280+
event_range_new,
281+
event_range_old,
282+
"new",
283+
"old",
284+
format!("{event_new:?}"),
285+
format!("{event_old:?}"),
286+
)
287+
};
288+
let (range, tag_other) =
289+
if range_other.start <= range.start && range_other.end <= range.end {
290+
(range_other.start..range.end, tag_other)
291+
} else {
292+
(range, format!("nothing"))
293+
};
294+
// If we can't get a span of the backtick, because it is in a `#[doc = ""]` attribute,
295+
// use the span of the entire attribute as a fallback.
296+
let span =
297+
source_span_for_markdown_range(tcx, &dox, &range, &item.attrs.doc_strings)
298+
.unwrap_or_else(|| item.attr_span(tcx));
299+
tcx.node_span_lint(
300+
crate::lint::UNPORTABLE_MARKDOWN,
301+
hir_id,
302+
span,
303+
"unportable markdown",
304+
|lint| {
305+
lint.help(format!(
306+
"{desc} parser sees {tag}, {desc_other} sees {tag_other}"
307+
));
308+
},
309+
);
310+
}
311+
}
312+
}
313+
}

0 commit comments

Comments
 (0)