Skip to content

Commit 922a7c9

Browse files
committed
Auto merge of #1751 - smarnach:clean-up-render, r=sgrif
Clean up README rendering code. When browsing the code, I noticed that the HTML sanitization code looked a bit messy, so I attempted to clean it up. This patch should not change the functionality in any way, but the new code may be easier to read and understand – not sure whether that's enough for you to spend time reviewing this. I also wonder whether there is an explicit list of allowed tags in the code. The [defaults in the `ammonia` crate](https://docs.rs/ammonia/2.1.1/src/ammonia/lib.rs.html#260-269) look quite reasonable to me (we'd only need to add `input` for check boxes).
2 parents b20b7b4 + 41af3e1 commit 922a7c9

File tree

1 file changed

+99
-108
lines changed

1 file changed

+99
-108
lines changed

src/render.rs

Lines changed: 99 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
use ammonia::{Builder, UrlRelative};
1+
//! Render README files to HTML.
2+
3+
use ammonia::{Builder, UrlRelative, UrlRelativeEvaluate};
24
use htmlescape::encode_minimal;
35
use std::borrow::Cow;
46
use std::path::Path;
@@ -20,7 +22,7 @@ impl<'a> MarkdownRenderer<'a> {
2022
/// Per `readme_to_html`, `base_url` is the base URL prepended to any
2123
/// relative links in the input document. See that function for more detail.
2224
fn new(base_url: Option<&'a str>) -> MarkdownRenderer<'a> {
23-
let tags = [
25+
let tags = hashset(&[
2426
"a",
2527
"b",
2628
"blockquote",
@@ -60,30 +62,15 @@ impl<'a> MarkdownRenderer<'a> {
6062
"ul",
6163
"hr",
6264
"span",
63-
]
64-
.iter()
65-
.cloned()
66-
.collect();
67-
let tag_attributes = [
68-
("a", ["href", "id", "target"].iter().cloned().collect()),
69-
(
70-
"img",
71-
["width", "height", "src", "alt", "align"]
72-
.iter()
73-
.cloned()
74-
.collect(),
75-
),
76-
(
77-
"input",
78-
["checked", "disabled", "type"].iter().cloned().collect(),
79-
),
80-
]
81-
.iter()
82-
.cloned()
83-
.collect();
84-
let allowed_classes = [(
65+
]);
66+
let tag_attributes = hashmap(&[
67+
("a", hashset(&["href", "id", "target"])),
68+
("img", hashset(&["width", "height", "src", "alt", "align"])),
69+
("input", hashset(&["checked", "disabled", "type"])),
70+
]);
71+
let allowed_classes = hashmap(&[(
8572
"code",
86-
[
73+
hashset(&[
8774
"language-bash",
8875
"language-clike",
8976
"language-glsl",
@@ -98,97 +85,18 @@ impl<'a> MarkdownRenderer<'a> {
9885
"language-scss",
9986
"language-sql",
10087
"yaml",
101-
]
102-
.iter()
103-
.cloned()
104-
.collect(),
105-
)]
106-
.iter()
107-
.cloned()
108-
.collect();
109-
110-
let sanitizer_base_url = base_url.map(ToString::to_string);
111-
112-
// Constrain the type of the closures given to the HTML sanitizer.
113-
fn constrain_closure<F>(f: F) -> F
114-
where
115-
F: for<'a> Fn(&'a str) -> Option<Cow<'a, str>> + Send + Sync,
116-
{
117-
f
118-
}
119-
120-
let unrelative_url_sanitizer = constrain_closure(|url| {
121-
// We have no base URL; allow fragment links only.
122-
if url.starts_with('#') {
123-
return Some(Cow::Borrowed(url));
124-
}
125-
126-
None
127-
});
128-
129-
fn is_media_url(url: &str) -> bool {
130-
Path::new(url)
131-
.extension()
132-
.and_then(std::ffi::OsStr::to_str)
133-
.map_or(false, |e| match e {
134-
"png" | "svg" | "jpg" | "jpeg" | "gif" | "mp4" | "webm" | "ogg" => true,
135-
_ => false,
136-
})
137-
}
138-
139-
let relative_url_sanitizer = constrain_closure(move |url| {
140-
// sanitizer_base_url is Some(String); use it to fix the relative URL.
141-
if url.starts_with('#') {
142-
return Some(Cow::Borrowed(url));
143-
}
144-
145-
let mut new_url = sanitizer_base_url.clone().unwrap();
146-
if !new_url.ends_with('/') {
147-
new_url.push('/');
148-
}
149-
if new_url.ends_with(".git/") {
150-
let offset = new_url.len() - 5;
151-
new_url.drain(offset..offset + 4);
152-
}
153-
// Assumes GitHub’s URL scheme. GitHub renders text and markdown
154-
// better in the "blob" view, but images need to be served raw.
155-
new_url += if is_media_url(url) {
156-
"raw/master"
157-
} else {
158-
"blob/master"
159-
};
160-
if !url.starts_with('/') {
161-
new_url.push('/');
162-
}
163-
new_url += url;
164-
Some(Cow::Owned(new_url))
165-
});
166-
167-
let use_relative = if let Some(base_url) = base_url {
168-
if let Ok(url) = Url::parse(base_url) {
169-
url.host_str() == Some("github.com")
170-
|| url.host_str() == Some("gitlab.com")
171-
|| url.host_str() == Some("bitbucket.org")
172-
} else {
173-
false
174-
}
175-
} else {
176-
false
177-
};
88+
]),
89+
)]);
90+
let sanitize_url = UrlRelative::Custom(Box::new(SanitizeUrl::new(base_url)));
17891

17992
let mut html_sanitizer = Builder::new();
18093
html_sanitizer
18194
.link_rel(Some("nofollow noopener noreferrer"))
18295
.tags(tags)
18396
.tag_attributes(tag_attributes)
18497
.allowed_classes(allowed_classes)
185-
.url_relative(if use_relative {
186-
UrlRelative::Custom(Box::new(relative_url_sanitizer))
187-
} else {
188-
UrlRelative::Custom(Box::new(unrelative_url_sanitizer))
189-
})
98+
.url_relative(sanitize_url)
19099
.id_prefix(Some("user-content-"));
191-
192100
MarkdownRenderer { html_sanitizer }
193101
}
194102

@@ -209,6 +117,72 @@ impl<'a> MarkdownRenderer<'a> {
209117
}
210118
}
211119

120+
/// Add trailing slash and remove `.git` suffix of base URL.
121+
fn canon_base_url(mut base_url: String) -> String {
122+
if !base_url.ends_with('/') {
123+
base_url.push('/');
124+
}
125+
if base_url.ends_with(".git/") {
126+
let offset = base_url.len() - 5;
127+
base_url.drain(offset..offset + 4);
128+
}
129+
base_url
130+
}
131+
132+
/// Sanitize relative URLs in README files.
133+
struct SanitizeUrl {
134+
base_url: Option<String>,
135+
}
136+
137+
impl SanitizeUrl {
138+
fn new(base_url: Option<&str>) -> Self {
139+
let base_url = base_url
140+
.and_then(|base_url| Url::parse(base_url).ok())
141+
.and_then(|url| match url.host_str() {
142+
Some("github.com") | Some("gitlab.com") | Some("bitbucket.org") => {
143+
Some(canon_base_url(url.into_string()))
144+
}
145+
_ => None,
146+
});
147+
Self { base_url }
148+
}
149+
}
150+
151+
/// Determine whether the given URL has a media file extension.
152+
fn is_media_url(url: &str) -> bool {
153+
Path::new(url)
154+
.extension()
155+
.and_then(std::ffi::OsStr::to_str)
156+
.map_or(false, |e| match e {
157+
"png" | "svg" | "jpg" | "jpeg" | "gif" | "mp4" | "webm" | "ogg" => true,
158+
_ => false,
159+
})
160+
}
161+
162+
impl UrlRelativeEvaluate for SanitizeUrl {
163+
fn evaluate<'a>(&self, url: &'a str) -> Option<Cow<'a, str>> {
164+
if url.starts_with('#') {
165+
// Always allow fragment URLs.
166+
return Some(Cow::Borrowed(url));
167+
}
168+
self.base_url.as_ref().map(|base_url| {
169+
let mut new_url = base_url.clone();
170+
// Assumes GitHub’s URL scheme. GitHub renders text and markdown
171+
// better in the "blob" view, but images need to be served raw.
172+
new_url += if is_media_url(url) {
173+
"raw/master"
174+
} else {
175+
"blob/master"
176+
};
177+
if !url.starts_with('/') {
178+
new_url.push('/');
179+
}
180+
new_url += url;
181+
Cow::Owned(new_url)
182+
})
183+
}
184+
}
185+
212186
/// Renders Markdown text to sanitized HTML with a given `base_url`.
213187
/// See `readme_to_html` for the interpretation of `base_url`.
214188
fn markdown_to_html(text: &str, base_url: Option<&str>) -> String {
@@ -286,6 +260,23 @@ pub fn render_and_upload_readme(
286260
})
287261
}
288262

263+
/// Helper function to build a new `HashSet` from the items slice.
264+
fn hashset<T>(items: &[T]) -> std::collections::HashSet<T>
265+
where
266+
T: Clone + Eq + std::hash::Hash,
267+
{
268+
items.iter().cloned().collect()
269+
}
270+
271+
/// Helper function to build a new `HashMap` from a slice of key-value pairs.
272+
fn hashmap<K, V>(items: &[(K, V)]) -> std::collections::HashMap<K, V>
273+
where
274+
K: Clone + Eq + std::hash::Hash,
275+
V: Clone,
276+
{
277+
items.iter().cloned().collect()
278+
}
279+
289280
#[cfg(test)]
290281
mod tests {
291282
use super::*;

0 commit comments

Comments
 (0)