From 62759156a10203afb12b8432d4688750e0812ff3 Mon Sep 17 00:00:00 2001 From: Hollow Man Date: Fri, 15 Apr 2022 14:47:13 +0800 Subject: [PATCH 1/2] Add Songlin as a contributor Signed-off-by: Hollow Man --- guide/src/misc/contributors.md | 1 + 1 file changed, 1 insertion(+) diff --git a/guide/src/misc/contributors.md b/guide/src/misc/contributors.md index 362a21fe4f..ff3549091f 100644 --- a/guide/src/misc/contributors.md +++ b/guide/src/misc/contributors.md @@ -20,5 +20,6 @@ shout-out to them! - Vivek Akupatni ([apatniv](https://github.com/apatniv)) - Eric Huss ([ehuss](https://github.com/ehuss)) - Josh Rotenberg ([joshrotenberg](https://github.com/joshrotenberg)) +- Songlin Jiang ([HollowMan6](https://github.com/HollowMan6)) If you feel you're missing from this list, feel free to add yourself in a PR. From 095de4a0565774800dc7cab3534ef5972a06360f Mon Sep 17 00:00:00 2001 From: Hollow Man Date: Fri, 15 Apr 2022 14:43:20 +0800 Subject: [PATCH 2/2] Make print page (print.html) links link to anchors on the print page Give every anchor id on the print page a path id prefix so that it can be located, e.g. bar/foo.md#abc -> #bar-foo-abc Also insert a dummy div at the start of each original page so that links to a page without an anchor can also be located. 
Fix to remove all the `./` in the normalized path id so that for "./foo/bar.html#abc" we still get "#foo-bar-abc" Add support for redirect link anchors in print page so that anchors can also be redirected, also handle URL redirect links on print page Handle all the elements id to add a path prefix, also make path id to all be the lower case Fix for print page footnote links by adding the path id prefix Signed-off-by: Hollow Man --- src/renderer/html_handlebars/hbs_renderer.rs | 88 ++++- src/utils/mod.rs | 352 ++++++++++++++++--- tests/testsuite/print.rs | 14 +- 3 files changed, 385 insertions(+), 69 deletions(-) diff --git a/src/renderer/html_handlebars/hbs_renderer.rs b/src/renderer/html_handlebars/hbs_renderer.rs index a144b32b57..12677860ee 100644 --- a/src/renderer/html_handlebars/hbs_renderer.rs +++ b/src/renderer/html_handlebars/hbs_renderer.rs @@ -59,10 +59,11 @@ impl HtmlHandlebars { let content = utils::render_markdown(&ch.content, ctx.html_config.smart_punctuation()); - let fixed_content = utils::render_markdown_with_path( + let printed_item = utils::render_markdown_with_path_and_redirects( &ch.content, ctx.html_config.smart_punctuation(), Some(path), + &ctx.html_config.redirect, ); if !ctx.is_index && ctx.html_config.print.page_break { // Add page break between chapters @@ -71,7 +72,25 @@ impl HtmlHandlebars { print_content .push_str(r#"
"#); } - print_content.push_str(&fixed_content); + let print_page_id = { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + &base + .replace("/", "-") + .replace("\\", "-") + .to_ascii_lowercase() + }; + + // We have to build header links in advance so that we can know the ranges + // for the headers in one page. + // Insert a dummy div to make sure that we can locate the specific page. + print_content.push_str(&(format!(r#"
"#))); + print_content.push_str(&build_header_links( + &build_print_element_id(&printed_item, &print_page_id), + Some(print_page_id), + )); // Update the context with data for this file let ctx_path = path @@ -216,7 +235,23 @@ impl HtmlHandlebars { code_config: &Code, edition: Option, ) -> String { - let rendered = build_header_links(&rendered); + let rendered = build_header_links(&rendered, None); + let rendered = self.post_process_common(rendered, &playground_config, code_config, edition); + + rendered + } + + /// Applies some post-processing to the HTML to apply some adjustments. + /// + /// This common function is used for both normal chapters (via + /// `post_process`) and the combined print page. + fn post_process_common( + &self, + rendered: String, + playground_config: &Playground, + code_config: &Code, + edition: Option, + ) -> String { let rendered = fix_code_blocks(&rendered); let rendered = add_playground_pre(&rendered, playground_config, edition); let rendered = hide_lines(&rendered, code_config); @@ -465,7 +500,7 @@ impl Renderer for HtmlHandlebars { debug!("Render template"); let rendered = handlebars.render("index", &data)?; - let rendered = self.post_process( + let rendered = self.post_process_common( rendered, &html_config.playground, &html_config.code, @@ -661,9 +696,35 @@ fn make_data( Ok(data) } +/// Go through the rendered print page HTML, +/// add path id prefix to all the elements id as well as footnote links. 
+fn build_print_element_id(html: &str, print_page_id: &str) -> String { + static ALL_ID: LazyLock = + LazyLock::new(|| Regex::new(r#"(<[^>]*?id=")([^"]+?)""#).unwrap()); + static FOOTNOTE_ID: LazyLock = LazyLock::new(|| { + Regex::new( + r##"(]*?class="footnote-reference"[^>]*?>[^<]*?]*?href="#)([^"]+?)""##, + ) + .unwrap() + }); + + let temp_html = ALL_ID.replace_all(html, |caps: &Captures<'_>| { + format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2]) + }); + + FOOTNOTE_ID + .replace_all(&temp_html, |caps: &Captures<'_>| { + format!("{}{}-{}\"", &caps[1], print_page_id, &caps[2]) + }) + .into_owned() +} + /// Goes through the rendered HTML, making sure all header tags have /// an anchor respectively so people can link to sections directly. -fn build_header_links(html: &str) -> String { +/// +/// `print_page_id` should be set to the print page ID prefix when adjusting the +/// print page. +fn build_header_links(html: &str, print_page_id: Option<&str>) -> String { static BUILD_HEADER_LINKS: LazyLock = LazyLock::new(|| { Regex::new(r#"(.*?)"#).unwrap() }); @@ -692,6 +753,7 @@ fn build_header_links(html: &str) -> String { caps.get(2).map(|x| x.as_str().to_string()), caps.get(3).map(|x| x.as_str().to_string()), &mut id_counter, + print_page_id, ) }) .into_owned() @@ -699,14 +761,26 @@ fn build_header_links(html: &str) -> String { /// Insert a single link into a header, making sure each link gets its own /// unique ID by appending an auto-incremented number (if necessary). +/// +/// For `print.html`, we will add a path id prefix. 
fn insert_link_into_header( level: usize, content: &str, id: Option, classes: Option, id_counter: &mut HashMap, + print_page_id: Option<&str>, ) -> String { - let id = id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)); + let id = if let Some(print_page_id) = print_page_id { + let content_id = { + #[allow(deprecated)] + utils::id_from_content(content) + }; + let with_prefix = format!("{} {}", print_page_id, content_id); + id.unwrap_or_else(|| utils::unique_id_from_content(&with_prefix, id_counter)) + } else { + id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)) + }; let classes = classes .map(|s| format!(" class=\"{s}\"")) .unwrap_or_default(); @@ -986,7 +1060,7 @@ mod tests { ]; for (src, should_be) in inputs { - let got = build_header_links(src); + let got = build_header_links(src, None); assert_eq!(got, should_be); } } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 597f0ea400..e6dcd77c7e 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -5,13 +5,13 @@ mod string; pub(crate) mod toml_ext; use crate::errors::Error; use log::error; -use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd}; +use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, LinkType, Options, Parser, Tag, TagEnd}; use regex::Regex; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Write; -use std::path::Path; +use std::path::{Component, Path, PathBuf}; use std::sync::LazyLock; pub use self::string::{ @@ -83,64 +83,243 @@ pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap +fn normalize_path>(path: P) -> String { + let ends_with_slash = path.as_ref().to_str().map_or(false, |s| s.ends_with('/')); + let mut normalized = PathBuf::new(); + for component in path.as_ref().components() { + match &component { + Component::ParentDir => { + if !normalized.pop() { + normalized.push(component); + } + } + Component::CurDir => {} + _ => { + normalized.push(component); + } + } + } + if 
ends_with_slash { + normalized.push(""); + } + normalized + .to_str() + .unwrap() + .replace("\\", "/") + .trim_start_matches('/') + .to_string() +} + +/// Converts a relative URL path to a reference ID for the print page. +fn normalize_print_page_id(mut path: String) -> String { + path = path + .replace("/", "-") + .replace(".html#", "-") + .replace("#", "-") + .to_ascii_lowercase(); + if path.ends_with(".html") { + path.truncate(path.len() - 5); + } + path +} + /// Fix links to the correct location. /// /// This adjusts links, such as turning `.md` extensions to `.html`. /// -/// `path` is the path to the page being rendered relative to the root of the -/// book. This is used for the `print.html` page so that links on the print -/// page go to the original location. Normal page rendering sets `path` to -/// None. Ideally, print page links would link to anchors on the print page, -/// but that is very difficult. -fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { +/// See [`render_markdown_with_path_and_redirects`] for a description of +/// `path` and `redirects`. +fn adjust_links<'a>( + event: Event<'a>, + path: Option<&Path>, + redirects: &HashMap, +) -> Event<'a> { static SCHEME_LINK: LazyLock = LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap()); - static MD_LINK: LazyLock = - LazyLock::new(|| Regex::new(r"(?P.*)\.md(?P#.*)?").unwrap()); - - fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { - if dest.starts_with('#') { - // Fragment-only link. 
- if let Some(path) = path { - let mut base = path.display().to_string(); - if base.ends_with(".md") { - base.replace_range(base.len() - 3.., ".html"); - } - return format!("{base}{dest}").into(); - } else { - return dest; + static HTML_MD_LINK: LazyLock = + LazyLock::new(|| Regex::new(r"(?P.*)\.(html|md)(?P#.*)?").unwrap()); + + fn add_base(path: Option<&Path>) -> String { + let mut fixed_link = String::new(); + if let Some(path) = path { + let base = path + .parent() + .expect("path can't be empty") + .to_str() + .expect("utf-8 paths only"); + if !base.is_empty() { + write!(fixed_link, "{base}/").unwrap(); } } - // Don't modify links with schemes like `https`. - if !SCHEME_LINK.is_match(&dest) { - // This is a relative link, adjust it as necessary. - let mut fixed_link = String::new(); - if let Some(path) = path { - let base = path + fixed_link.to_string() + } + + fn fix_print_page_link<'a>( + mut normalized_path: String, + redirects: &HashMap, + ) -> CowStr<'a> { + // Fix redirect links + let (path_no_fragment, fragment) = match normalized_path.split_once('#') { + Some((a, b)) => (a, Some(b)), + None => (normalized_path.as_str(), None), + }; + for (original, redirect) in redirects { + if !normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&normalized_path) + && !normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&path_no_fragment) + { + continue; + } + + let mut unnormalized_path = String::new(); + if SCHEME_LINK.is_match(&redirect) { + unnormalized_path = redirect.to_string(); + } else { + let base = PathBuf::from(path_no_fragment) .parent() .expect("path can't be empty") .to_str() - .expect("utf-8 paths only"); - if !base.is_empty() { - write!(fixed_link, "{base}/").unwrap(); + .expect("utf-8 paths only") + .to_owned(); + + let normalized_base = normalize_path(base).trim_matches('/').to_owned(); + if !normalized_base.is_empty() { + write!(unnormalized_path, "{normalized_base}/{redirect}").unwrap(); + } else { + 
unnormalized_path = redirect.to_string().trim_start_matches('/').to_string(); } } - if let Some(caps) = MD_LINK.captures(&dest) { - fixed_link.push_str(&caps["link"]); - fixed_link.push_str(".html"); - if let Some(anchor) = caps.name("anchor") { - fixed_link.push_str(anchor.as_str()); + // original without anchors, need to append link anchors + if !original.contains("#") { + if let Some(fragment) = fragment { + if !unnormalized_path.contains("#") { + unnormalized_path.push('#'); + } else { + unnormalized_path.push('-'); + } + unnormalized_path.push_str(fragment); } + } + + if SCHEME_LINK.is_match(&redirect) { + return CowStr::from(unnormalized_path); } else { - fixed_link.push_str(&dest); + normalized_path = normalize_path(unnormalized_path); + } + break; + } + + // Check again to make sure anchors are the html links inside the book. + if normalized_path.starts_with("../") || normalized_path.contains("/../") { + return CowStr::from(normalized_path); + } + + let mut fixed_anchor_for_print = String::new(); + fixed_anchor_for_print.push_str("#"); + fixed_anchor_for_print.push_str(&normalize_print_page_id(normalized_path)); + CowStr::from(fixed_anchor_for_print) + } + + /// Fix resource links like img to the correct location. + fn fix_resource_links<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + // Don't modify links with schemes like `https`. + // Only fix relative links + if SCHEME_LINK.is_match(&dest) || dest.starts_with('/') { + return dest; + } + + // This is a relative link, adjust it as necessary. + let mut fixed_link = add_base(path); + fixed_link.push_str(&dest); + CowStr::from(fixed_link) + } + + fn fix_a_links_with_type<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + link_type: LinkType, + ) -> CowStr<'a> { + if link_type == LinkType::Email { + return dest; + } + fix_a_links(dest, path, redirects) + } + + /// Adjust markdown file to correct point in the html file. 
+ fn fix_a_links<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + ) -> CowStr<'a> { + if dest.starts_with('#') { + // Fragment-only link. + return match path { + Some(path) => { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + format!( + "#{}{}", + normalize_print_page_id(normalize_path(base)), + dest.replace("#", "-") + ) + .into() + } + None => dest, + }; + } + + // Don't modify links with schemes like `https`. + if SCHEME_LINK.is_match(&dest) { + return dest; + } + + let mut fixed_link = if dest.starts_with('/') { + String::new() + } else { + // This is a relative link, adjust it as necessary. + add_base(path) + }; + + if let Some(caps) = HTML_MD_LINK.captures(&dest) { + fixed_link.push_str(&caps["link"]); + fixed_link.push_str(".html"); + if let Some(anchor) = caps.name("anchor") { + fixed_link.push_str(anchor.as_str()); + } + } else { + fixed_link.push_str(&dest); + }; + + let normalized_path = normalize_path(&fixed_link); + + // Judge if the html link is inside the book. + if !normalized_path.starts_with("../") && !normalized_path.contains("/../") { + // In `print.html`, print page links would all link to anchors on the print page. + return match path { + Some(_) => fix_print_page_link(normalized_path, redirects), + None => CowStr::from(fixed_link), }; - return CowStr::from(fixed_link); } - dest + // In normal page rendering, links to anchors on another page. + CowStr::from(fixed_link) } - fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + fn fix_html<'a>( + html: CowStr<'a>, + path: Option<&Path>, + redirects: &HashMap, + ) -> CowStr<'a> { // This is a terrible hack, but should be reasonably reliable. Nobody // should ever parse a tag with a regex. 
However, there isn't anything // in Rust that I know of that is suitable for handling partial html @@ -149,12 +328,45 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { // There are dozens of HTML tags/attributes that contain paths, so // feel free to add more tags if desired; these are the only ones I // care about right now. - static HTML_LINK: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap()); + static A_LINK: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r#"(<a [^>]*?href=")([^"]+?)""#).unwrap()); + static A_NAME: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r#"(<a [^>]*?name=")([^"]+?)""#).unwrap()); + static IMG_LINK: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r#"(<img [^>]*?src=")([^"]+?)""#).unwrap()); + + let img_link_fixed_html = IMG_LINK.replace_all(&html, |caps: &regex::Captures<'_>| { + let fixed = fix_resource_links(caps[2].into(), path); + format!("{}{}\"", &caps[1], fixed) + }); - HTML_LINK - .replace_all(&html, |caps: &regex::Captures<'_>| { - let fixed = fix(caps[2].into(), path); + let a_name_fixed_html = + A_NAME.replace_all(&img_link_fixed_html, |caps: &regex::Captures<'_>| { + // This is a relative link, adjust it as necessary. 
+ let origin_name = &caps[2].to_string(); + format!( + "{}{}\"", + &caps[1], + CowStr::from(match path { + Some(path) => { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + format!( + "{}-{}", + normalize_print_page_id(normalize_path(base)), + origin_name.to_string() + ) + } + None => origin_name.to_string(), + }) + ) + }); + + A_LINK + .replace_all(&a_name_fixed_html, |caps: &regex::Captures<'_>| { + let fixed = fix_a_links(caps[2].into(), path, &redirects); format!("{}{}\"", &caps[1], fixed) }) .into_owned() @@ -169,7 +381,7 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { id, }) => Event::Start(Tag::Link { link_type, - dest_url: fix(dest_url, path), + dest_url: fix_a_links_with_type(dest_url, path, redirects, link_type), title, id, }), @@ -180,12 +392,12 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { id, }) => Event::Start(Tag::Image { link_type, - dest_url: fix(dest_url, path), + dest_url: fix_resource_links(dest_url, path), title, id, }), - Event::Html(html) => Event::Html(fix_html(html, path)), - Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path)), + Event::Html(html) => Event::Html(fix_html(html, path, redirects)), + Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path, redirects)), _ => event, } } @@ -195,6 +407,15 @@ pub fn render_markdown(text: &str, smart_punctuation: bool) -> String { render_markdown_with_path(text, smart_punctuation, None) } +/// Wrapper around [`render_markdown_with_path_and_redirects`] for API compatibility. +pub fn render_markdown_with_path( + text: &str, + smart_punctuation: bool, + path: Option<&Path>, +) -> String { + render_markdown_with_path_and_redirects(text, smart_punctuation, path, &HashMap::new()) +} + /// Creates a new pulldown-cmark parser of the given text. 
pub fn new_cmark_parser(text: &str, smart_punctuation: bool) -> Parser<'_> { let mut opts = Options::empty(); @@ -211,13 +432,18 @@ pub fn new_cmark_parser(text: &str, smart_punctuation: bool) -> Parser<'_> { /// Renders markdown to HTML. /// -/// `path` should only be set if this is being generated for the consolidated -/// print page. It should point to the page being rendered relative to the -/// root of the book. -pub fn render_markdown_with_path( +/// `path` is the path to the page being rendered relative to the root of the +/// book. This is used for the `print.html` page so that links on the print +/// page go to the anchors that has a path id prefix. Normal page rendering +/// sets `path` to None. +/// +/// `redirects` is also only for the print page. It's for adjusting links to +/// a redirected location to go to the correct spot on the `print.html` page. +pub(crate) fn render_markdown_with_path_and_redirects( text: &str, smart_punctuation: bool, path: Option<&Path>, + redirects: &HashMap, ) -> String { let mut body = String::with_capacity(text.len() * 3 / 2); @@ -250,7 +476,7 @@ pub fn render_markdown_with_path( let events = new_cmark_parser(text, smart_punctuation) .map(clean_codeblock_headers) - .map(|event| adjust_links(event, path)) + .map(|event| adjust_links(event, path, &redirects)) .flat_map(|event| { let (a, b) = wrap_tables(event); a.into_iter().chain(b) @@ -345,6 +571,22 @@ fn add_footnote_defs( } }); + let prefix = if let Some(path) = path { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.truncate(base.len() - 3); + } + base = normalize_print_page_id(normalize_path(base)); + + if base.is_empty() { + String::new() + } else { + format!("{}-", base) + } + } else { + String::new() + }; + defs.sort_by_cached_key(|(name, _)| numbers[name].0); body.push_str( @@ -367,7 +609,7 @@ fn add_footnote_defs( usage.to_string() }; let backlink = - Event::Html(format!(" ↩{nth}").into()); + Event::Html(format!(" ↩{nth}").into()); 
if matches!(fn_events.last(), Some(Event::End(TagEnd::Paragraph))) { // Put the linkback at the end of the last paragraph instead // of on a line by itself. diff --git a/tests/testsuite/print.rs b/tests/testsuite/print.rs index 0e0cdfe05c..57f02501db 100644 --- a/tests/testsuite/print.rs +++ b/tests/testsuite/print.rs @@ -8,17 +8,17 @@ fn relative_links() { BookTest::from_dir("print/relative_links") .check_main_file("book/print.html", str![[r##" -

First Chapter

-

First Nested

-

Testing relative links for the print page

-

When we link to the first section, it should work on +

First Chapter

+

First Nested

+

Testing relative links for the print page

+

When we link to the first section, it should work on both the print page and the non-print page.

-

A fragment link should work.

+

A fragment link should work.

Link outside.

Some image

-

HTML Link

+

HTML Link

raw html -

Some section

+

Some section

"##]]); }