From f1cd4af2bb7fd238e26f1e6c550ace3b095bdcce Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Wed, 13 Aug 2025 21:29:54 +0200 Subject: [PATCH 01/11] create fixture for whitespace wikilinks Feat: strip Potholes and Headings from wikilinks adjust fixture to contain Headers and Potholes add integration test for fixture split fixture into obsidian and mediawiki --- fixtures/wiki/Dash-Usage.md | 1 + fixtures/wiki/Space Usage.md | 1 + fixtures/wiki/Underscore_Usage.md | 1 + fixtures/wiki/Usage.md | 1 + fixtures/wiki/obsidian-style.md | 13 ++++ .../subdirectory/Different-Directory-Dash.md | 1 + .../wiki/subdirectory/DifferentDirectory.md | 1 + .../Different_Directory_Underscore.md | 1 + .../Space Usage DifferentDirectory.md | 1 + fixtures/wiki/wikilink-style.md | 19 ++++++ lychee-bin/tests/cli.rs | 62 +++++++++++++++++++ lychee-lib/src/extract/markdown.rs | 50 ++++++++++++++- 12 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 fixtures/wiki/Dash-Usage.md create mode 100644 fixtures/wiki/Space Usage.md create mode 100644 fixtures/wiki/Underscore_Usage.md create mode 100644 fixtures/wiki/Usage.md create mode 100644 fixtures/wiki/obsidian-style.md create mode 100644 fixtures/wiki/subdirectory/Different-Directory-Dash.md create mode 100644 fixtures/wiki/subdirectory/DifferentDirectory.md create mode 100644 fixtures/wiki/subdirectory/Different_Directory_Underscore.md create mode 100644 fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md create mode 100644 fixtures/wiki/wikilink-style.md diff --git a/fixtures/wiki/Dash-Usage.md b/fixtures/wiki/Dash-Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Dash-Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/Space Usage.md b/fixtures/wiki/Space Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Space Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/Underscore_Usage.md b/fixtures/wiki/Underscore_Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Underscore_Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/Usage.md b/fixtures/wiki/Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/obsidian-style.md b/fixtures/wiki/obsidian-style.md new file mode 100644 index 0000000000..4fb6f4a4c9 --- /dev/null +++ b/fixtures/wiki/obsidian-style.md @@ -0,0 +1,13 @@ +[[#LocalHeader]] + +[[Usage]] +[[Space Usage]] +[[Space Usage DifferentDirectory]] +[[DifferentDirectory]] + +[[Usage#Header|HeaderRenaming]] +[[Space Usage#Header|HeaderRenaming]] +[[Space Usage DifferentDirectory#Header|HeaderRenaming]] +[[DifferentDirectory#Header|HeaderRenaming]] + +# LocalHeader diff --git a/fixtures/wiki/subdirectory/Different-Directory-Dash.md b/fixtures/wiki/subdirectory/Different-Directory-Dash.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/Different-Directory-Dash.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/subdirectory/DifferentDirectory.md b/fixtures/wiki/subdirectory/DifferentDirectory.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/DifferentDirectory.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/subdirectory/Different_Directory_Underscore.md b/fixtures/wiki/subdirectory/Different_Directory_Underscore.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ 
b/fixtures/wiki/subdirectory/Different_Directory_Underscore.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md b/fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/wikilink-style.md b/fixtures/wiki/wikilink-style.md new file mode 100644 index 0000000000..61479263a2 --- /dev/null +++ b/fixtures/wiki/wikilink-style.md @@ -0,0 +1,19 @@ +[[#LocalHeader]] + +[[Usage]] +[[Space Usage]] +[[Dash Usage]] +[[Underscore Usage]] +[[DifferentDirectory]] +[[Different Directory Dash]] +[[Different Directory Underscore]] + +[[Usage#Header|HeaderRenaming]] +[[Space Usage#Header|HeaderRenaming]] +[[Dash Usage#Header|HeaderRenaming]] +[[Underscore Usage#Header|HeaderRenaming]] +[[DifferentDirectory#Header|HeaderRenaming]] +[[Different Directory Dash#Header|HeaderRenaming]] +[[Different Directory Underscore#Header|HeaderRenaming]] + +# LocalHeader diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 74a8d4aa12..a1cffdeb19 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2962,6 +2962,68 @@ mod cli { .assert() .success() .stdout(contains("https://example.org")); // Should extract the link as plaintext + + fn test_wikilink_fixture_obsidian_style() { + let input = fixtures_path().join("wiki/obsidian-style.md"); + + // testing without fragments should not yield failures + main_command() + .arg(&input) + .arg("--include-wikilinks") + .arg("--fallback-extensions") + .arg("md") + .assert() + .success(); + } + + #[test] + fn test_wikilink_fixture_with_fragments_obsidian_style() { + let input = fixtures_path().join("wiki/obsidian-style.md"); + + //fragments should resolve all headers + let dir_links_with_fragment = 2; + main_command() + .arg(&input) + .arg("--include-wikilinks") + .arg("--include-fragments") + .arg("--fallback-extensions") + .arg("md") + .assert() + .failure() + .stdout(contains("Cannot find fragment").count(dir_links_with_fragment)) + .stdout(contains("#").count(dir_links_with_fragment)); + } + + #[test] + fn test_wikilink_fixture_wikilink_style() { + let input = fixtures_path().join("wiki/wikilink-style.md"); + + // testing without fragments should not yield failures + main_command() + .arg(&input) + .arg("--include-wikilinks") + .arg("--fallback-extensions") + .arg("md") + .assert() + .success(); + } + + #[test] + fn test_wikilink_fixture_with_fragments_wikilink_style() { + let input = fixtures_path().join("wiki/wikilink-style.md"); + + //fragments should resolve all headers + let dir_links_with_fragment = 2; + main_command() + .arg(&input) + .arg("--include-wikilinks") + .arg("--include-fragments") + .arg("--fallback-extensions") + .arg("md") + .assert() + .failure() + .stdout(contains("Cannot find fragment").count(dir_links_with_fragment)) + .stdout(contains("#").count(dir_links_with_fragment)); } /// An input which matches nothing should print a warning and continue. 
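For orientation while reading the integration tests above: the obsidian-style fixture links carry no path component, so they are expected to resolve by file name alone (plus the `md` fallback extension), even when the target lives in `subdirectory/`. A rough, illustrative mapping of those fixture links to the files created at the top of this patch — not code from the patch itself:

```rust
// Illustrative only: obsidian-style fixture wikilinks and the files
// (relative to fixtures/wiki) they are expected to resolve to.
fn main() {
    let expected = [
        ("[[Usage]]", "Usage.md"),
        ("[[Space Usage]]", "Space Usage.md"),
        ("[[DifferentDirectory]]", "subdirectory/DifferentDirectory.md"),
        (
            "[[Space Usage DifferentDirectory]]",
            "subdirectory/Space Usage DifferentDirectory.md",
        ),
    ];
    for (link, target) in expected {
        println!("{link} -> {target}");
    }
}
```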
diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 52bed658a9..29429dc3e5 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -79,7 +79,7 @@ pub(crate) fn extract_markdown( Some(extract_raw_uri_from_plaintext(&dest_url, &span_provider)) } // Wiki URL (`[[http://example.com]]`) - LinkType::WikiLink { has_pothole: _ } => { + LinkType::WikiLink { has_pothole } => { // Exclude WikiLinks if not explicitly enabled if !include_wikilinks { return None; @@ -90,8 +90,20 @@ pub(crate) fn extract_markdown( return None; } + //Strip potholes (|) from wikilinks + let stripped_dest_url = if has_pothole { + pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('|').unwrap_or(dest_url.len())]) + }else { + dest_url.clone() + }; + + Some(vec![RawUri { + text: stripped_dest_url.to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), // wiki links start with `[[`, so offset the span by `2` - Some(raw_uri(&dest_url, span_provider.span(span.start + 2))) + span: span.start + 2 + }]) } } } @@ -631,5 +643,39 @@ Shortcut link: [link4] "Missing expected URI: {expected_uri:?}. Found: {uris:?}" ); } + + fn test_remove_wikilink_pothole() { + let markdown = r"[[foo|bar]]"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + assert_eq!(uris, expected); + } + + #[test] + fn test_remove_wikilink_title() { + let markdown = r"[[foo#bar]]"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + assert_eq!(uris, expected); + } + + #[test] + fn test_remove_wikilink_pothole_and_title() { + let markdown = r"[[foo#bar|baz]]"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + assert_eq!(uris, expected); } } From 43d83ea14d4ffbd9cc5a5fb119dc4a4ab6c0f45c Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Thu, 28 Aug 2025 09:15:36 +0200 Subject: [PATCH 02/11] implement directory walking for base-url --- Cargo.lock | 1 + lychee-lib/Cargo.toml | 1 + lychee-lib/src/checker/file.rs | 24 +++++++++--- lychee-lib/src/client.rs | 4 ++ lychee-lib/src/extract/markdown.rs | 34 +++------------- lychee-lib/src/utils/mod.rs | 1 + lychee-lib/src/utils/wikilink_checker.rs | 50 ++++++++++++++++++++++++ 7 files changed, 80 insertions(+), 35 deletions(-) create mode 100644 lychee-lib/src/utils/wikilink_checker.rs diff --git a/Cargo.lock b/Cargo.lock index 78b6fb7dc6..62002abec5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2602,6 +2602,7 @@ dependencies = [ "toml", "typed-builder", "url", + "walkdir", "wiremock", ] diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 04f1c2ce8e..bd4059dd89 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -61,6 +61,7 @@ tokio = { version = "1.47.1", features = ["full"] } toml = "0.9.5" typed-builder = "0.22.0" url = { version = "2.5.7", features = ["serde"] } +walkdir = "2.5.0" [dependencies.par-stream] version = "0.10.2" diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 122eadf7f8..8205c117dd 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -3,6 +3,7 @@ use log::warn; use std::borrow::Cow; use 
std::path::{Path, PathBuf}; +use crate::utils::wikilink_checker::WikilinkChecker; use crate::{ Base, ErrorKind, Status, Uri, utils::fragment_checker::{FragmentChecker, FragmentInput}, @@ -32,8 +33,12 @@ pub(crate) struct FileChecker { index_files: Option>, /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files. include_fragments: bool, + /// Whether to check for the existence of files linked to by Wikilinks + include_wikilinks: bool, /// Utility for performing fragment checks in HTML files. fragment_checker: FragmentChecker, + /// Utility for checking wikilinks, indexes files in a given directory + wikilink_checker: WikilinkChecker, } impl FileChecker { @@ -50,13 +55,16 @@ impl FileChecker { fallback_extensions: Vec, index_files: Option>, include_fragments: bool, + include_wikilinks: bool, ) -> Self { Self { - base, + base: base.clone(), fallback_extensions, index_files, include_fragments, + include_wikilinks, fragment_checker: FragmentChecker::new(), + wikilink_checker: WikilinkChecker::new(base), } } @@ -372,7 +380,7 @@ mod tests { #[tokio::test] async fn test_default() { // default behaviour accepts dir links as long as the directory exists. - let checker = FileChecker::new(None, vec![], None, true); + let checker = FileChecker::new(None, vec![], None, true, false); assert_filecheck!(&checker, "filechecker/index_dir", Status::Ok(_)); @@ -430,6 +438,7 @@ mod tests { vec![], Some(vec!["index.html".to_owned(), "index.md".to_owned()]), true, + false, ); assert_resolves!( @@ -468,6 +477,7 @@ mod tests { vec!["html".to_owned()], Some(vec!["index".to_owned()]), false, + false, ); // this test case has a subdir 'same_name' and a file 'same_name.html'. @@ -492,7 +502,7 @@ mod tests { #[tokio::test] async fn test_empty_index_list_corner() { // empty index_files list will reject all directory links - let checker_no_indexes = FileChecker::new(None, vec![], Some(vec![]), false); + let checker_no_indexes = FileChecker::new(None, vec![], Some(vec![]), false, false); assert_resolves!( &checker_no_indexes, "filechecker/index_dir", @@ -516,7 +526,7 @@ mod tests { "..".to_owned(), "/".to_owned(), ]; - let checker_dir_indexes = FileChecker::new(None, vec![], Some(dir_names), false); + let checker_dir_indexes = FileChecker::new(None, vec![], Some(dir_names), false, false); assert_resolves!( &checker_dir_indexes, "filechecker/index_dir", @@ -537,6 +547,7 @@ mod tests { vec![], Some(vec!["../index_dir/index.html".to_owned()]), true, + false, ); assert_resolves!( &checker_dotdot, @@ -550,7 +561,8 @@ mod tests { .to_str() .expect("expected utf-8 fixtures path") .to_owned(); - let checker_absolute = FileChecker::new(None, vec![], Some(vec![absolute_html]), true); + let checker_absolute = + FileChecker::new(None, vec![], Some(vec![absolute_html]), true, false); assert_resolves!( &checker_absolute, "filechecker/empty_dir#fragment", @@ -560,7 +572,7 @@ mod tests { #[tokio::test] async fn test_fallback_extensions_on_directories() { - let checker = FileChecker::new(None, vec!["html".to_owned()], None, true); + let checker = FileChecker::new(None, vec!["html".to_owned()], None, true, false); // fallback extensions should be applied when directory links are resolved // to directories (i.e., the default index_files behavior or if `.` diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 63306bb91c..c067b7fbae 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -299,6 +299,9 @@ pub struct ClientBuilder { /// Enable the checking of fragments in links. 
include_fragments: bool, + /// Enable the checking of wikilinks in markdown files + include_wikilinks: bool, + /// Requests run through this chain where each item in the chain /// can modify the request. A chained item can also decide to exit /// early and return a status, so that subsequent chain items are @@ -424,6 +427,7 @@ impl ClientBuilder { self.fallback_extensions, self.index_files, self.include_fragments, + self.include_wikilinks, ), }) } diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 29429dc3e5..64b4247645 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -100,7 +100,7 @@ pub(crate) fn extract_markdown( Some(vec![RawUri { text: stripped_dest_url.to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), // wiki links start with `[[`, so offset the span by `2` span: span.start + 2 }]) @@ -496,7 +496,7 @@ $$ let expected = vec![RawUri { text: "https://example.com/destination".to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), span: span(1, 3), }]; let uris = extract_markdown(markdown, true, true); @@ -510,13 +510,13 @@ $$ RawUri { text: "https://example.com/destination".to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), span: span(1, 3), }, RawUri { text: "https://example.com/source".to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), span: span(1, 38), }, ]; @@ -650,31 +650,7 @@ Shortcut link: [link4] let expected = vec![RawUri { text: "foo".to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), - }]; - assert_eq!(uris, expected); - } - - #[test] - fn test_remove_wikilink_title() { - let markdown = r"[[foo#bar]]"; - let uris = extract_markdown(markdown, true, true); - let expected = vec![RawUri { - text: "foo".to_string(), - element: Some("a".to_string()), - attribute: Some("href".to_string()), - }]; - assert_eq!(uris, expected); - } - - #[test] - fn test_remove_wikilink_pothole_and_title() { - let markdown = r"[[foo#bar|baz]]"; - let uris = extract_markdown(markdown, true, true); - let expected = vec![RawUri { - text: "foo".to_string(), - element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), }]; assert_eq!(uris, expected); } diff --git a/lychee-lib/src/utils/mod.rs b/lychee-lib/src/utils/mod.rs index d75d20c064..0236b9de32 100644 --- a/lychee-lib/src/utils/mod.rs +++ b/lychee-lib/src/utils/mod.rs @@ -3,3 +3,4 @@ pub(crate) mod path; pub(crate) mod request; pub(crate) mod reqwest; pub(crate) mod url; +pub(crate) mod wikilink_checker; diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs new file mode 100644 index 0000000000..1339284f64 --- /dev/null +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -0,0 +1,50 @@ +use std::{collections::HashSet, path::PathBuf, sync::Arc}; + +use std::sync::Mutex; +use walkdir::WalkDir; + +use crate::Base; + +#[derive(Clone, Debug, Default)] +/// Indexes a given directory for filenames +pub(crate) struct WikilinkChecker { + filesnames: Arc>>, + basedir: Option, +} + +impl WikilinkChecker { + pub(crate) fn new(base: Option) -> Self { + Self { + filesnames: Arc::new(Mutex::new(HashSet::with_capacity(100000000))), + basedir: base, + } + } + + 
pub(crate) fn index_files(&self) { + match self.basedir { + None => {} + Some(ref basetype) => match basetype { + Base::Local(localbasename) => { + //Start file indexing only if the Base is valid and local + + let mut filenameslock = self.filesnames.lock().unwrap(); + for entry in WalkDir::new::(localbasename.into()) + //actively ignore symlinks + .follow_links(false) + .into_iter() + .filter_map(|e| e.ok()) + { + match entry.path().file_name() { + Some(filename) => { + filenameslock.insert(filename.to_string_lossy().to_string()); + } + None => {} + } + } + } + // A remote base is of no use for the wikilink checker + Base::Remote(_remotebasename) => {} + }, + } + } +} From 607e4b54951538a47a5505f069bca6f742423726 Mon Sep 17 00:00:00 2001 From: Jakob <144204108+JayJayArr@users.noreply.github.com> Date: Sun, 31 Aug 2025 10:58:20 +0200 Subject: [PATCH 03/11] implement indexing and lookup switch to tokio mutex --- lychee-bin/src/client.rs | 1 + lychee-lib/src/checker/file.rs | 13 ++++++ lychee-lib/src/utils/wikilink_checker.rs | 50 +++++++++++++++++++----- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 4c99f6fe7c..b9a16b8946 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .include_fragments(cfg.include_fragments) .fallback_extensions(cfg.fallback_extensions.clone()) .index_files(cfg.index_files.clone()) + .include_wikilinks(cfg.include_wikilinks) .build() .client() .context("Failed to create request client") diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 8205c117dd..2b6b22b140 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -81,6 +81,9 @@ impl FileChecker { /// /// Returns a `Status` indicating the result of the check. 
pub(crate) async fn check(&self, uri: &Uri) -> Status { + if self.include_wikilinks { + self.setup_wikilinks().await; + } let Ok(path) = uri.url.to_file_path() else { return ErrorKind::InvalidFilePath(uri.clone()).into(); }; @@ -321,6 +324,16 @@ impl FileChecker { } } } + + // Initializes the Index of the wikilink checker + async fn setup_wikilinks(&self) { + self.wikilink_checker.index_files().await; + } + // Tries to resolve a link by looking up the filename in the wikilink index + // The + async fn check_wikilink(&self, path: &Path, uri: &Uri) -> Status { + self.wikilink_checker.check(path, uri).await + } } #[cfg(test)] diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs index 1339284f64..1439507245 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -1,33 +1,44 @@ +use crate::{Base, Status, Uri}; +use http::StatusCode; +use log::info; +use std::path::Path; use std::{collections::HashSet, path::PathBuf, sync::Arc}; - -use std::sync::Mutex; +use tokio::sync::Mutex; use walkdir::WalkDir; -use crate::Base; - #[derive(Clone, Debug, Default)] /// Indexes a given directory for filenames pub(crate) struct WikilinkChecker { - filesnames: Arc>>, + filenames: Arc>>, basedir: Option, } impl WikilinkChecker { pub(crate) fn new(base: Option) -> Self { Self { - filesnames: Arc::new(Mutex::new(HashSet::with_capacity(100000000))), + filenames: Arc::new(Mutex::new(HashSet::new())), basedir: base, } } - pub(crate) fn index_files(&self) { + pub(crate) async fn index_files(&self) { + //Skip the indexing step in case the filenames are already populated + if !self.filenames.lock().await.is_empty() { + return; + } match self.basedir { - None => {} + None => { + info!("File indexing for Wikilinks aborted as no base directory is specified"); + } Some(ref basetype) => match basetype { Base::Local(localbasename) => { //Start file indexing only if the Base is valid and local + info!( + "Starting file indexing for wikilinks in {}", + localbasename.display() + ); - let mut filenameslock = self.filesnames.lock().unwrap(); + let mut filenameslock = self.filenames.lock().await; for entry in WalkDir::new::(localbasename.into()) //actively ignore symlinks .follow_links(false) @@ -42,9 +53,28 @@ impl WikilinkChecker { } } } - // A remote base is of no use for the wikilink checker + // A remote base is of no use for the wikilink checker, silently skip over it Base::Remote(_remotebasename) => {} }, } } + + pub(crate) async fn check(&self, path: &Path, uri: &Uri) -> Status { + match path.file_name() { + None => Status::Error(crate::ErrorKind::InvalidFilePath(uri.clone())), + Some(filename) => { + if self + .filenames + .lock() + .await + .get(filename.to_str().unwrap()) + .is_some() + { + Status::Ok(StatusCode::OK) + } else { + Status::Error(crate::ErrorKind::InvalidFilePath(uri.clone())) + } + } + } + } } From b85b3da758884add6a3e07d1059c8086d5955eae Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Tue, 2 Sep 2025 07:28:36 +0200 Subject: [PATCH 04/11] switch to Hashmap to resolve file names to pathes --- lychee-lib/src/checker/file.rs | 2 +- lychee-lib/src/utils/wikilink_checker.rs | 35 ++++++++++++------------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 2b6b22b140..9c101cc951 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -331,7 +331,7 @@ impl FileChecker { } // Tries to resolve a link by looking up the 
filename in the wikilink index // The - async fn check_wikilink(&self, path: &Path, uri: &Uri) -> Status { + async fn apply_wikilink_check(&self, path: &Path, uri: &Uri) -> Result { self.wikilink_checker.check(path, uri).await } } diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs index 1439507245..0314de8be0 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -1,22 +1,23 @@ -use crate::{Base, Status, Uri}; -use http::StatusCode; +use crate::{Base, ErrorKind, Uri}; use log::info; +use std::collections::HashMap; +use std::ffi::OsString; use std::path::Path; -use std::{collections::HashSet, path::PathBuf, sync::Arc}; +use std::{path::PathBuf, sync::Arc}; use tokio::sync::Mutex; use walkdir::WalkDir; #[derive(Clone, Debug, Default)] -/// Indexes a given directory for filenames +// Indexes a given directory for filenames and the corresponding path pub(crate) struct WikilinkChecker { - filenames: Arc>>, + filenames: Arc>>, basedir: Option, } impl WikilinkChecker { pub(crate) fn new(base: Option) -> Self { Self { - filenames: Arc::new(Mutex::new(HashSet::new())), + filenames: Arc::new(Mutex::new(HashMap::new())), basedir: base, } } @@ -47,7 +48,7 @@ impl WikilinkChecker { { match entry.path().file_name() { Some(filename) => { - filenameslock.insert(filename.to_string_lossy().to_string()); + filenameslock.insert(filename.into(), entry.path().to_path_buf()); } None => {} } @@ -59,20 +60,18 @@ impl WikilinkChecker { } } - pub(crate) async fn check(&self, path: &Path, uri: &Uri) -> Status { + pub(crate) async fn check(&self, path: &Path, uri: &Uri) -> Result { match path.file_name() { - None => Status::Error(crate::ErrorKind::InvalidFilePath(uri.clone())), + None => Err(ErrorKind::InvalidFilePath(uri.clone())), Some(filename) => { - if self - .filenames - .lock() - .await - .get(filename.to_str().unwrap()) - .is_some() - { - Status::Ok(StatusCode::OK) + let filenamelock = self.filenames.lock().await; + if filenamelock.contains_key(filename.into()) { + Ok(filenamelock + .get(filename.into()) + .expect("Could not retrieve inserted Path for discovered Wikilink-Path")) + .cloned() } else { - Status::Error(crate::ErrorKind::InvalidFilePath(uri.clone())) + Err(ErrorKind::InvalidFilePath(uri.clone())) } } } From 1113bd4fb9885a2d642c8d51ad3e9dd70e209b61 Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Wed, 17 Sep 2025 14:14:45 +0200 Subject: [PATCH 05/11] feat: resolve Filenames through wikilink checker --- fixtures/wiki/obsidian-style-plus-headers.md | 8 ++++++++ fixtures/wiki/obsidian-style.md | 9 --------- lychee-bin/tests/cli.rs | 6 +++++- lychee-lib/src/checker/file.rs | 20 +++++++++++++++++--- lychee-lib/src/utils/wikilink_checker.rs | 14 ++++++-------- 5 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 fixtures/wiki/obsidian-style-plus-headers.md diff --git a/fixtures/wiki/obsidian-style-plus-headers.md b/fixtures/wiki/obsidian-style-plus-headers.md new file mode 100644 index 0000000000..0b892aafc7 --- /dev/null +++ b/fixtures/wiki/obsidian-style-plus-headers.md @@ -0,0 +1,8 @@ +[[#LocalHeader]] + +# LocalHeader + +[[Usage#Header|HeaderRenaming]] +[[Space Usage#Header|HeaderRenaming]] +[[Space Usage DifferentDirectory#Header|HeaderRenaming]] +[[DifferentDirectory#Header|HeaderRenaming]] diff --git a/fixtures/wiki/obsidian-style.md b/fixtures/wiki/obsidian-style.md index 4fb6f4a4c9..4911206397 100644 --- a/fixtures/wiki/obsidian-style.md +++ b/fixtures/wiki/obsidian-style.md @@ -1,13 +1,4 
@@ -[[#LocalHeader]] - [[Usage]] [[Space Usage]] [[Space Usage DifferentDirectory]] [[DifferentDirectory]] - -[[Usage#Header|HeaderRenaming]] -[[Space Usage#Header|HeaderRenaming]] -[[Space Usage DifferentDirectory#Header|HeaderRenaming]] -[[DifferentDirectory#Header|HeaderRenaming]] - -# LocalHeader diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index a1cffdeb19..884edaa06a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2972,13 +2972,17 @@ mod cli { .arg("--include-wikilinks") .arg("--fallback-extensions") .arg("md") + .arg("--base-url") + .arg(fixtures_path()) + .arg("--root-dir") + .arg(fixtures_path()) .assert() .success(); } #[test] fn test_wikilink_fixture_with_fragments_obsidian_style() { - let input = fixtures_path().join("wiki/obsidian-style.md"); + let input = fixtures_path().join("wiki/obsidian-style-plus-headers.md"); //fragments should resolve all headers let dir_links_with_fragment = 2; diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 9c101cc951..44f92a5b17 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -81,6 +81,7 @@ impl FileChecker { /// /// Returns a `Status` indicating the result of the check. pub(crate) async fn check(&self, uri: &Uri) -> Status { + //only populate the wikilink filenames if it is enabled if self.include_wikilinks { self.setup_wikilinks().await; } @@ -145,8 +146,12 @@ impl FileChecker { ) -> Result, ErrorKind> { let path = match path.metadata() { // for non-existing paths, attempt fallback extensions + // if fallback extensions don't help, try wikilinks Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - self.apply_fallback_extensions(path, uri).map(Cow::Owned) + match self.apply_fallback_extensions(path, uri).map(Cow::Owned) { + Ok(val) => Ok(val), + Err(_) => self.apply_wikilink_check(path, uri).await.map(Cow::Owned), + } } // other IO errors are unexpected and should fail the check @@ -266,7 +271,7 @@ impl FileChecker { } /// Checks a resolved file, optionally verifying fragments for HTML files. - /// + ///u /// # Arguments /// /// * `path` - The resolved path to check. 
@@ -332,7 +337,16 @@ impl FileChecker { // Tries to resolve a link by looking up the filename in the wikilink index // The async fn apply_wikilink_check(&self, path: &Path, uri: &Uri) -> Result { - self.wikilink_checker.check(path, uri).await + let mut path_buf = path.to_path_buf(); + for ext in &self.fallback_extensions { + path_buf.set_extension(ext); + match self.wikilink_checker.check(&path_buf, uri).await { + Err(_) => {} + Ok(resolved_path) => return Ok(resolved_path), + } + } + + Err(ErrorKind::InvalidFilePath(uri.clone())) } } diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs index 0314de8be0..b03dbd92e6 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -44,13 +44,11 @@ impl WikilinkChecker { //actively ignore symlinks .follow_links(false) .into_iter() - .filter_map(|e| e.ok()) + .filter_map(std::result::Result::ok) { - match entry.path().file_name() { - Some(filename) => { - filenameslock.insert(filename.into(), entry.path().to_path_buf()); - } - None => {} + if let Some(filename) = entry.path().file_name() { + filenameslock + .insert(filename.to_ascii_lowercase(), entry.path().to_path_buf()); } } } @@ -65,9 +63,9 @@ impl WikilinkChecker { None => Err(ErrorKind::InvalidFilePath(uri.clone())), Some(filename) => { let filenamelock = self.filenames.lock().await; - if filenamelock.contains_key(filename.into()) { + if filenamelock.contains_key(&filename.to_ascii_lowercase()) { Ok(filenamelock - .get(filename.into()) + .get(&filename.to_ascii_lowercase()) .expect("Could not retrieve inserted Path for discovered Wikilink-Path")) .cloned() } else { From e5b72491eccb1eafe18ff99fef5cabd02989313e Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Tue, 30 Sep 2025 11:31:50 +0200 Subject: [PATCH 06/11] fix: exclude fragments cleanup fix: merge conflicts --- lychee-bin/tests/cli.rs | 40 +++++++------------ lychee-lib/src/checker/file.rs | 12 +++--- lychee-lib/src/extract/markdown.rs | 49 ++++++++++++++++++++---- lychee-lib/src/utils/wikilink_checker.rs | 12 +++--- 4 files changed, 68 insertions(+), 45 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 884edaa06a..6fe4b5ad24 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2962,7 +2962,8 @@ mod cli { .assert() .success() .stdout(contains("https://example.org")); // Should extract the link as plaintext - + } + #[test] fn test_wikilink_fixture_obsidian_style() { let input = fixtures_path().join("wiki/obsidian-style.md"); @@ -2981,53 +2982,40 @@ mod cli { } #[test] - fn test_wikilink_fixture_with_fragments_obsidian_style() { + fn test_wikilink_fixture_with_fragments_obsidian_style_fixtures_excluded() { let input = fixtures_path().join("wiki/obsidian-style-plus-headers.md"); //fragments should resolve all headers - let dir_links_with_fragment = 2; - main_command() - .arg(&input) - .arg("--include-wikilinks") - .arg("--include-fragments") - .arg("--fallback-extensions") - .arg("md") - .assert() - .failure() - .stdout(contains("Cannot find fragment").count(dir_links_with_fragment)) - .stdout(contains("#").count(dir_links_with_fragment)); - } - - #[test] - fn test_wikilink_fixture_wikilink_style() { - let input = fixtures_path().join("wiki/wikilink-style.md"); - - // testing without fragments should not yield failures main_command() .arg(&input) .arg("--include-wikilinks") .arg("--fallback-extensions") .arg("md") + .arg("--base-url") + .arg(fixtures_path()) + .arg("--root-dir") + .arg(fixtures_path()) 
.assert() .success(); } #[test] - fn test_wikilink_fixture_with_fragments_wikilink_style() { - let input = fixtures_path().join("wiki/wikilink-style.md"); + fn test_wikilink_fixture_with_fragments_obsidian_style() { + let input = fixtures_path().join("wiki/obsidian-style-plus-headers.md"); //fragments should resolve all headers - let dir_links_with_fragment = 2; main_command() .arg(&input) .arg("--include-wikilinks") .arg("--include-fragments") .arg("--fallback-extensions") .arg("md") + .arg("--base-url") + .arg(fixtures_path()) + .arg("--root-dir") + .arg(fixtures_path()) .assert() - .failure() - .stdout(contains("Cannot find fragment").count(dir_links_with_fragment)) - .stdout(contains("#").count(dir_links_with_fragment)); + .success(); } /// An input which matches nothing should print a warning and continue. diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 44f92a5b17..9232388984 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -83,7 +83,7 @@ impl FileChecker { pub(crate) async fn check(&self, uri: &Uri) -> Status { //only populate the wikilink filenames if it is enabled if self.include_wikilinks { - self.setup_wikilinks().await; + self.setup_wikilinks(); } let Ok(path) = uri.url.to_file_path() else { return ErrorKind::InvalidFilePath(uri.clone()).into(); @@ -150,7 +150,7 @@ impl FileChecker { Err(e) if e.kind() == std::io::ErrorKind::NotFound => { match self.apply_fallback_extensions(path, uri).map(Cow::Owned) { Ok(val) => Ok(val), - Err(_) => self.apply_wikilink_check(path, uri).await.map(Cow::Owned), + Err(_) => self.apply_wikilink_check(path, uri).map(Cow::Owned), } } @@ -331,16 +331,16 @@ impl FileChecker { } // Initializes the Index of the wikilink checker - async fn setup_wikilinks(&self) { - self.wikilink_checker.index_files().await; + fn setup_wikilinks(&self) { + self.wikilink_checker.index_files(); } // Tries to resolve a link by looking up the filename in the wikilink index // The - async fn apply_wikilink_check(&self, path: &Path, uri: &Uri) -> Result { + fn apply_wikilink_check(&self, path: &Path, uri: &Uri) -> Result { let mut path_buf = path.to_path_buf(); for ext in &self.fallback_extensions { path_buf.set_extension(ext); - match self.wikilink_checker.check(&path_buf, uri).await { + match self.wikilink_checker.check(&path_buf, uri) { Err(_) => {} Ok(resolved_path) => return Ok(resolved_path), } diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 64b4247645..daebb00102 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -91,19 +91,29 @@ pub(crate) fn extract_markdown( } //Strip potholes (|) from wikilinks - let stripped_dest_url = if has_pothole { + let mut stripped_dest_url = if has_pothole { pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('|').unwrap_or(dest_url.len())]) }else { dest_url.clone() }; - Some(vec![RawUri { - text: stripped_dest_url.to_string(), - element: Some("a".to_string()), - attribute: Some("wikilink".to_string()), + //Strip fragments (#) from wikilinks, according to the obsidian spec + //fragments come before potholes + if stripped_dest_url.contains('#') { + stripped_dest_url = pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('#').unwrap_or(dest_url.len())]); + } + + if stripped_dest_url.is_empty() { + None + } else { + Some(vec![RawUri { + text: stripped_dest_url.to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), // wiki links start with `[[`, 
so offset the span by `2` span: span.start + 2 - }]) + }]) + } } } } @@ -643,7 +653,8 @@ Shortcut link: [link4] "Missing expected URI: {expected_uri:?}. Found: {uris:?}" ); } - + } + #[test] fn test_remove_wikilink_pothole() { let markdown = r"[[foo|bar]]"; let uris = extract_markdown(markdown, true, true); @@ -654,4 +665,28 @@ Shortcut link: [link4] }]; assert_eq!(uris, expected); } + + #[test] + fn test_remove_wikilink_fragment() { + let markdown = r"[[foo#bar]]"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + }]; + assert_eq!(uris, expected); + } + + #[test] + fn test_remove_wikilink_potholes_and_fragments() { + let markdown = r"[[foo#bar|baz]]"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + }]; + assert_eq!(uris, expected); + } } diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs index b03dbd92e6..4349ef505e 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -3,8 +3,8 @@ use log::info; use std::collections::HashMap; use std::ffi::OsString; use std::path::Path; +use std::sync::Mutex; use std::{path::PathBuf, sync::Arc}; -use tokio::sync::Mutex; use walkdir::WalkDir; #[derive(Clone, Debug, Default)] @@ -22,9 +22,9 @@ impl WikilinkChecker { } } - pub(crate) async fn index_files(&self) { + pub(crate) fn index_files(&self) { //Skip the indexing step in case the filenames are already populated - if !self.filenames.lock().await.is_empty() { + if !self.filenames.lock().unwrap().is_empty() { return; } match self.basedir { @@ -39,7 +39,7 @@ impl WikilinkChecker { localbasename.display() ); - let mut filenameslock = self.filenames.lock().await; + let mut filenameslock = self.filenames.lock().unwrap(); for entry in WalkDir::new::(localbasename.into()) //actively ignore symlinks .follow_links(false) @@ -58,11 +58,11 @@ impl WikilinkChecker { } } - pub(crate) async fn check(&self, path: &Path, uri: &Uri) -> Result { + pub(crate) fn check(&self, path: &Path, uri: &Uri) -> Result { match path.file_name() { None => Err(ErrorKind::InvalidFilePath(uri.clone())), Some(filename) => { - let filenamelock = self.filenames.lock().await; + let filenamelock = self.filenames.lock().unwrap(); if filenamelock.contains_key(&filename.to_ascii_lowercase()) { Ok(filenamelock .get(&filename.to_ascii_lowercase()) From a1b0bd86aeb086ae4ee7130c912f4972d6ad025c Mon Sep 17 00:00:00 2001 From: Jakob <144204108+JayJayArr@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:00:28 +0200 Subject: [PATCH 07/11] Apply suggestions from code review Co-authored-by: Matthias Endler --- lychee-lib/src/checker/file.rs | 9 ++++----- lychee-lib/src/extract/markdown.rs | 2 +- lychee-lib/src/utils/wikilink_checker.rs | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 9232388984..afcdd28a9e 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -81,7 +81,7 @@ impl FileChecker { /// /// Returns a `Status` indicating the result of the check. 
pub(crate) async fn check(&self, uri: &Uri) -> Status { - //only populate the wikilink filenames if it is enabled + //only populate the wikilink filenames if the feature is enabled if self.include_wikilinks { self.setup_wikilinks(); } @@ -271,7 +271,7 @@ impl FileChecker { } /// Checks a resolved file, optionally verifying fragments for HTML files. - ///u + /// /// # Arguments /// /// * `path` - The resolved path to check. @@ -330,18 +330,17 @@ impl FileChecker { } } - // Initializes the Index of the wikilink checker + // Initializes the index of the wikilink checker fn setup_wikilinks(&self) { self.wikilink_checker.index_files(); } // Tries to resolve a link by looking up the filename in the wikilink index - // The fn apply_wikilink_check(&self, path: &Path, uri: &Uri) -> Result { let mut path_buf = path.to_path_buf(); for ext in &self.fallback_extensions { path_buf.set_extension(ext); match self.wikilink_checker.check(&path_buf, uri) { - Err(_) => {} + Err(_) => { trace!("Tried to find wikilink at {path_buf}") } Ok(resolved_path) => return Ok(resolved_path), } } diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index daebb00102..73dd92488d 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -93,7 +93,7 @@ pub(crate) fn extract_markdown( //Strip potholes (|) from wikilinks let mut stripped_dest_url = if has_pothole { pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('|').unwrap_or(dest_url.len())]) - }else { + } else { dest_url.clone() }; diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs index 4349ef505e..4f48937972 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -17,8 +17,8 @@ pub(crate) struct WikilinkChecker { impl WikilinkChecker { pub(crate) fn new(base: Option) -> Self { Self { - filenames: Arc::new(Mutex::new(HashMap::new())), basedir: base, + ..default::Default() } } From 523adabeb9b66dac329a933c4ea410478d22ec98 Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Fri, 3 Oct 2025 18:59:47 +0200 Subject: [PATCH 08/11] tie --include-wikilinks to --base-url --- README.md | 3 ++- lychee-bin/src/options.rs | 3 ++- lychee-bin/tests/cli.rs | 22 +++++++++++----------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0fa4666a66..dc2e2a8cc8 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Available as a command-line utility, a library and a [GitHub Action](https://git + ## Table of Contents - [Development](#development) @@ -660,7 +661,7 @@ Options: and existing cookies will be updated. --include-wikilinks - Check WikiLinks in Markdown files + Check WikiLinks in Markdown files, this requires specifying --base-url -h, --help Print help (see a summary with '-h') diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 50e1732db8..d230187649 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -845,7 +845,8 @@ and existing cookies will be updated." 
pub(crate) cookie_jar: Option, #[allow(clippy::doc_markdown)] - /// Check WikiLinks in Markdown files + /// Check WikiLinks in Markdown files, this requires specifying --base-url + #[clap(requires = "base_url")] #[arg(long)] #[serde(default)] pub(crate) include_wikilinks: bool, diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 6fe4b5ad24..d56f769a8a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2501,6 +2501,8 @@ mod cli { let mut cmd = main_command(); cmd.arg("--dump") .arg("--include-wikilinks") + .arg("--base-url") + .arg(fixtures_path()) .arg(test_path) .assert() .success() @@ -2963,6 +2965,7 @@ mod cli { .success() .stdout(contains("https://example.org")); // Should extract the link as plaintext } + #[test] fn test_wikilink_fixture_obsidian_style() { let input = fixtures_path().join("wiki/obsidian-style.md"); @@ -2975,17 +2978,16 @@ mod cli { .arg("md") .arg("--base-url") .arg(fixtures_path()) - .arg("--root-dir") - .arg(fixtures_path()) .assert() - .success(); + .success() + .stdout(contains("4 OK")); } #[test] fn test_wikilink_fixture_with_fragments_obsidian_style_fixtures_excluded() { let input = fixtures_path().join("wiki/obsidian-style-plus-headers.md"); - //fragments should resolve all headers + // fragments should resolve all headers main_command() .arg(&input) .arg("--include-wikilinks") @@ -2993,17 +2995,16 @@ mod cli { .arg("md") .arg("--base-url") .arg(fixtures_path()) - .arg("--root-dir") - .arg(fixtures_path()) .assert() - .success(); + .success() + .stdout(contains("4 OK")); } #[test] fn test_wikilink_fixture_with_fragments_obsidian_style() { let input = fixtures_path().join("wiki/obsidian-style-plus-headers.md"); - //fragments should resolve all headers + // fragments should resolve all headers main_command() .arg(&input) .arg("--include-wikilinks") @@ -3012,10 +3013,9 @@ mod cli { .arg("md") .arg("--base-url") .arg(fixtures_path()) - .arg("--root-dir") - .arg(fixtures_path()) .assert() - .success(); + .success() + .stdout(contains("4 OK")); } /// An input which matches nothing should print a warning and continue. From e4af5ed840d259210de3df008a278ce0de07e76d Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Fri, 3 Oct 2025 19:05:48 +0200 Subject: [PATCH 09/11] update return values for Wikilink checker --- lychee-lib/src/checker/file.rs | 22 ++++--- lychee-lib/src/extract/markdown.rs | 6 +- lychee-lib/src/types/error.rs | 10 +++- lychee-lib/src/utils/wikilink_checker.rs | 73 +++++++++++++++--------- 4 files changed, 72 insertions(+), 39 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index afcdd28a9e..ddbff75bd6 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -1,5 +1,5 @@ use http::StatusCode; -use log::warn; +use log::{trace, warn}; use std::borrow::Cow; use std::path::{Path, PathBuf}; @@ -81,9 +81,12 @@ impl FileChecker { /// /// Returns a `Status` indicating the result of the check. 
pub(crate) async fn check(&self, uri: &Uri) -> Status { - //only populate the wikilink filenames if the feature is enabled + // only populate the wikilink filenames if the feature is enabled if self.include_wikilinks { - self.setup_wikilinks(); + match self.setup_wikilinks() { + Ok(()) => (), + Err(e) => return Status::Error(e), + } } let Ok(path) = uri.url.to_file_path() else { return ErrorKind::InvalidFilePath(uri.clone()).into(); @@ -331,17 +334,20 @@ impl FileChecker { } // Initializes the index of the wikilink checker - fn setup_wikilinks(&self) { - self.wikilink_checker.index_files(); + fn setup_wikilinks(&self) -> Result<(), ErrorKind> { + self.wikilink_checker.setup_wikilinks_index() } + // Tries to resolve a link by looking up the filename in the wikilink index fn apply_wikilink_check(&self, path: &Path, uri: &Uri) -> Result { let mut path_buf = path.to_path_buf(); for ext in &self.fallback_extensions { path_buf.set_extension(ext); - match self.wikilink_checker.check(&path_buf, uri) { - Err(_) => { trace!("Tried to find wikilink at {path_buf}") } - Ok(resolved_path) => return Ok(resolved_path), + match self.wikilink_checker.contains_path(&path_buf) { + None => { + trace!("Tried to find wikilink {} at {}", uri, path_buf.display()); + } + Some(resolved_path) => return Ok(resolved_path), } } diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 73dd92488d..4b8dd7538e 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -90,15 +90,15 @@ pub(crate) fn extract_markdown( return None; } - //Strip potholes (|) from wikilinks + // Strip potholes (|) from wikilinks let mut stripped_dest_url = if has_pothole { pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('|').unwrap_or(dest_url.len())]) } else { dest_url.clone() }; - //Strip fragments (#) from wikilinks, according to the obsidian spec - //fragments come before potholes + // Strip fragments (#) from wikilinks, according to the obsidian spec + // fragments come before potholes if stripped_dest_url.contains('#') { stripped_dest_url = pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('#').unwrap_or(dest_url.len())]); } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 66ad178d3d..5268f80f3e 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -170,6 +170,10 @@ pub enum ErrorKind { #[error("Status code range error")] StatusCodeSelectorError(#[from] StatusCodeSelectorError), + /// Error locking a Mutex + #[error("Failed to lock a Mutex")] + MutexPoisoned, + /// Test-only error variant for formatter tests /// Available in both test and debug builds to support cross-crate testing #[cfg(any(test, debug_assertions))] @@ -334,7 +338,10 @@ impl ErrorKind { [] => "No directory links are allowed because index_files is defined and empty".to_string(), [name] => format!("An index file ({name}) is required"), [init @ .., tail] => format!("An index file ({}, or {}) is required", init.join(", "), tail), - }.into() + }.into(), + ErrorKind::MutexPoisoned => Some ( + "One or more threads failed and poisoned a Mutex".to_string() + ) } } @@ -470,6 +477,7 @@ impl Hash for ErrorKind { Self::BasicAuthExtractorError(e) => e.to_string().hash(state), Self::Cookies(e) => e.to_string().hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), + Self::MutexPoisoned => "Mutex Poisoned".to_string().hash(state), } } } diff --git a/lychee-lib/src/utils/wikilink_checker.rs 
b/lychee-lib/src/utils/wikilink_checker.rs index 4f48937972..348d0561c4 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -1,5 +1,5 @@ -use crate::{Base, ErrorKind, Uri}; -use log::info; +use crate::{Base, ErrorKind, Result}; +use log::{info, warn}; use std::collections::HashMap; use std::ffi::OsString; use std::path::Path; @@ -7,8 +7,12 @@ use std::sync::Mutex; use std::{path::PathBuf, sync::Arc}; use walkdir::WalkDir; +/// Indexes a given directory mapping filenames to their corresponding path. +/// +/// The `WikilinkChecker` Recursively checks all subdirectories of the given +/// base directory mapping any found files to the path where they can be found. +/// Symlinks are ignored to prevent it from infinite loops. #[derive(Clone, Debug, Default)] -// Indexes a given directory for filenames and the corresponding path pub(crate) struct WikilinkChecker { filenames: Arc>>, basedir: Option, @@ -18,58 +22,73 @@ impl WikilinkChecker { pub(crate) fn new(base: Option) -> Self { Self { basedir: base, - ..default::Default() + ..Default::default() } } - pub(crate) fn index_files(&self) { - //Skip the indexing step in case the filenames are already populated + /// Populates the index of the `WikilinkChecker` unless it is already populated. + /// + /// Recursively walks the base directory mapping each filename to an absolute filepath. + /// Errors if no base directory is given or if it is recognized as remote + pub(crate) fn setup_wikilinks_index(&self) -> Result<()> { + // Skip the indexing step in case the filenames are already populated if !self.filenames.lock().unwrap().is_empty() { - return; + return Ok(()); } match self.basedir { None => { - info!("File indexing for Wikilinks aborted as no base directory is specified"); + warn!("File indexing for Wikilinks aborted as no base directory is specified"); + Ok(()) } - Some(ref basetype) => match basetype { - Base::Local(localbasename) => { - //Start file indexing only if the Base is valid and local + Some(ref base_type) => match base_type { + Base::Local(local_base_name) => { + // Start file indexing only if the Base is valid and local info!( "Starting file indexing for wikilinks in {}", - localbasename.display() + local_base_name.display() ); - let mut filenameslock = self.filenames.lock().unwrap(); - for entry in WalkDir::new::(localbasename.into()) - //actively ignore symlinks + let mut lock = self + .filenames + .lock() + .map_err(|_| ErrorKind::MutexPoisoned)?; + for entry in WalkDir::new::(local_base_name.into()) + // actively ignore symlinks .follow_links(false) .into_iter() .filter_map(std::result::Result::ok) { if let Some(filename) = entry.path().file_name() { - filenameslock - .insert(filename.to_ascii_lowercase(), entry.path().to_path_buf()); + lock.insert(filename.to_ascii_lowercase(), entry.path().to_path_buf()); } } + Ok(()) } + // A remote base is of no use for the wikilink checker, silently skip over it - Base::Remote(_remotebasename) => {} + Base::Remote(remote_base_name) => { + warn!("Error using remote base url for checking wililinks: {remote_base_name}"); + Ok(()) + } }, } } - - pub(crate) fn check(&self, path: &Path, uri: &Uri) -> Result { + /// Checks the index for a filename. 
Returning the absolute path if the name is found, + /// otherwise returning None + pub(crate) fn contains_path(&self, path: &Path) -> Option { match path.file_name() { - None => Err(ErrorKind::InvalidFilePath(uri.clone())), + None => None, Some(filename) => { - let filenamelock = self.filenames.lock().unwrap(); - if filenamelock.contains_key(&filename.to_ascii_lowercase()) { - Ok(filenamelock - .get(&filename.to_ascii_lowercase()) - .expect("Could not retrieve inserted Path for discovered Wikilink-Path")) + let filename_lock = self.filenames.lock().unwrap(); + if filename_lock.contains_key(&filename.to_ascii_lowercase()) { + Some( + filename_lock.get(&filename.to_ascii_lowercase()).expect( + "Could not retrieve inserted Path for discovered Wikilink-Path", + ), + ) .cloned() } else { - Err(ErrorKind::InvalidFilePath(uri.clone())) + None } } } From 840499d4fc7d2881d2efe3712ce021bf75df8c63 Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Mon, 13 Oct 2025 14:10:16 +0200 Subject: [PATCH 10/11] refactor: wikilink cleanup --- lychee-lib/src/extract/markdown.rs | 109 ++++++++++++++++++----------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 4b8dd7538e..a844e5c597 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,9 +1,11 @@ //! Extract links and fragments from markdown documents use std::collections::{HashMap, HashSet}; +use log::warn; use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeWithOffset}; use crate::{ + ErrorKind, extract::{html::html5gum::extract_html_with_span, plaintext::extract_raw_uri_from_plaintext}, types::uri::raw::{ OffsetSpanProvider, RawUri, RawUriSpan, SourceSpanProvider, SpanProvider as _, @@ -90,29 +92,17 @@ pub(crate) fn extract_markdown( return None; } - // Strip potholes (|) from wikilinks - let mut stripped_dest_url = if has_pothole { - pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('|').unwrap_or(dest_url.len())]) - } else { - dest_url.clone() - }; - - // Strip fragments (#) from wikilinks, according to the obsidian spec - // fragments come before potholes - if stripped_dest_url.contains('#') { - stripped_dest_url = pulldown_cmark::CowStr::Borrowed(&dest_url[0..dest_url.find('#').unwrap_or(dest_url.len())]); - } - - if stripped_dest_url.is_empty() { - None - } else { + if let Ok(wikilink) = clean_wikilink(&dest_url, has_pothole) { Some(vec![RawUri { - text: stripped_dest_url.to_string(), + text: wikilink.to_string(), element: Some("a".to_string()), attribute: Some("wikilink".to_string()), // wiki links start with `[[`, so offset the span by `2` span: span.start + 2 }]) + } else { + warn!("WARNING: The wikilink destination url {dest_url} could not be cleaned by removing potholes and fragments"); + None } } } @@ -287,6 +277,26 @@ pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet { out } +fn clean_wikilink(input: &str, has_pothole: bool) -> Result, ErrorKind> { + // Strip potholes (|) from wikilinks + let mut stripped_input = if has_pothole { + pulldown_cmark::CowStr::Borrowed(&input[0..input.find('|').unwrap_or(input.len())]) + } else { + pulldown_cmark::CowStr::Borrowed(input) + }; + + // Strip fragments (#) from wikilinks, according to the obsidian spec + // fragments always come before potholes + if stripped_input.contains('#') { + stripped_input = + pulldown_cmark::CowStr::Borrowed(&input[0..input.find('#').unwrap_or(input.len())]); + } + if stripped_input.is_empty() { + 
return Err(ErrorKind::EmptyUrl); + } + Ok(stripped_input) +} + #[derive(Default)] struct HeadingIdGenerator { counter: HashMap, @@ -327,6 +337,7 @@ mod tests { use crate::types::uri::raw::span; use super::*; + use rstest::rstest; const MD_INPUT: &str = r#" # A Test @@ -654,39 +665,53 @@ Shortcut link: [link4] ); } } + #[test] - fn test_remove_wikilink_pothole() { - let markdown = r"[[foo|bar]]"; + fn test_clean_wikilink() { + let markdown = r" +[[foo|bar]] +[[foo#bar]] +[[foo#bar|baz]] +"; let uris = extract_markdown(markdown, true, true); - let expected = vec![RawUri { - text: "foo".to_string(), - element: Some("a".to_string()), - attribute: Some("wikilink".to_string()), - }]; + let expected = vec![ + RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + }, + RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + }, + RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + }, + ]; assert_eq!(uris, expected); } #[test] - fn test_remove_wikilink_fragment() { - let markdown = r"[[foo#bar]]"; + fn test_wikilink_extraction_returns_none_on_empty_links() { + let markdown = r" +[[|bar]] +[[#bar]] +[[#bar|baz]] +"; + let uris = extract_markdown(markdown, true, true); - let expected = vec![RawUri { - text: "foo".to_string(), - element: Some("a".to_string()), - attribute: Some("wikilink".to_string()), - }]; - assert_eq!(uris, expected); + assert!(uris.is_empty()); } - #[test] - fn test_remove_wikilink_potholes_and_fragments() { - let markdown = r"[[foo#bar|baz]]"; - let uris = extract_markdown(markdown, true, true); - let expected = vec![RawUri { - text: "foo".to_string(), - element: Some("a".to_string()), - attribute: Some("wikilink".to_string()), - }]; - assert_eq!(uris, expected); + #[rstest] + #[case("|foo", true)] + #[case("|foo#bar", true)] + #[case("#baz", false)] + fn test_from_str(#[case] input: &str, #[case] has_pothole: bool) { + let result = clean_wikilink(input, has_pothole); + assert!(result.is_err()); } } From 589e1f301d0d3061314d6b0f1b07a4d7c75c5a2c Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Mon, 13 Oct 2025 14:10:58 +0200 Subject: [PATCH 11/11] feat: WikilinkChecker as optional include span in tests fix: allow too many lines --- lychee-lib/src/checker/file.rs | 19 +++++++++++++------ lychee-lib/src/extract/markdown.rs | 8 ++++++-- lychee-lib/src/types/error.rs | 10 +++++++++- lychee-lib/src/utils/wikilink_checker.rs | 20 +++++++++++++++----- 4 files changed, 43 insertions(+), 14 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index ddbff75bd6..0b80f79bdf 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -38,7 +38,7 @@ pub(crate) struct FileChecker { /// Utility for performing fragment checks in HTML files. 
fragment_checker: FragmentChecker, /// Utility for checking wikilinks, indexes files in a given directory - wikilink_checker: WikilinkChecker, + wikilink_checker: Option, } impl FileChecker { @@ -335,7 +335,12 @@ impl FileChecker { // Initializes the index of the wikilink checker fn setup_wikilinks(&self) -> Result<(), ErrorKind> { - self.wikilink_checker.setup_wikilinks_index() + match &self.wikilink_checker { + Some(checker) => checker.setup_wikilinks_index(), + None => Err(ErrorKind::WikilinkCheckerInit( + "Initialization failed, no checker instantiated".to_string(), + )), + } } // Tries to resolve a link by looking up the filename in the wikilink index @@ -343,11 +348,13 @@ impl FileChecker { let mut path_buf = path.to_path_buf(); for ext in &self.fallback_extensions { path_buf.set_extension(ext); - match self.wikilink_checker.contains_path(&path_buf) { - None => { - trace!("Tried to find wikilink {} at {}", uri, path_buf.display()); + if let Some(checker) = &self.wikilink_checker { + match checker.contains_path(&path_buf) { + None => { + trace!("Tried to find wikilink {} at {}", uri, path_buf.display()); + } + Some(resolved_path) => return Ok(resolved_path), } - Some(resolved_path) => return Ok(resolved_path), } } diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index a844e5c597..5edf4ebb37 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -24,6 +24,7 @@ fn md_extensions() -> Options { } /// Extract unparsed URL strings from a Markdown string. +#[allow(clippy::too_many_lines)] pub(crate) fn extract_markdown( input: &str, include_verbatim: bool, @@ -97,8 +98,8 @@ pub(crate) fn extract_markdown( text: wikilink.to_string(), element: Some("a".to_string()), attribute: Some("wikilink".to_string()), - // wiki links start with `[[`, so offset the span by `2` - span: span.start + 2 + // wiki links start with `[[`, so offset the span by `2` + span: span_provider.span(span.start + 2) }]) } else { warn!("WARNING: The wikilink destination url {dest_url} could not be cleaned by removing potholes and fragments"); @@ -679,16 +680,19 @@ Shortcut link: [link4] text: "foo".to_string(), element: Some("a".to_string()), attribute: Some("wikilink".to_string()), + span: span(2, 3), }, RawUri { text: "foo".to_string(), element: Some("a".to_string()), attribute: Some("wikilink".to_string()), + span: span(3, 3), }, RawUri { text: "foo".to_string(), element: Some("a".to_string()), attribute: Some("wikilink".to_string()), + span: span(4, 3), }, ]; assert_eq!(uris, expected); diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 5268f80f3e..cef5290b52 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -174,6 +174,10 @@ pub enum ErrorKind { #[error("Failed to lock a Mutex")] MutexPoisoned, + /// Error when initializing the Wikilink Checker + #[error("Failed to initialize Wikilink Checker")] + WikilinkCheckerInit(String), + /// Test-only error variant for formatter tests /// Available in both test and debug builds to support cross-crate testing #[cfg(any(test, debug_assertions))] @@ -341,7 +345,10 @@ impl ErrorKind { }.into(), ErrorKind::MutexPoisoned => Some ( "One or more threads failed and poisoned a Mutex".to_string() - ) + ), + ErrorKind::WikilinkCheckerInit(reason) => Some(format!( + "Error initialzing the Wikilink Checker: {reason} ", + )), } } @@ -478,6 +485,7 @@ impl Hash for ErrorKind { Self::Cookies(e) => e.to_string().hash(state), Self::StatusCodeSelectorError(e) => 
e.to_string().hash(state), Self::MutexPoisoned => "Mutex Poisoned".to_string().hash(state), + Self::WikilinkCheckerInit(e) => e.to_string().hash(state), } } } diff --git a/lychee-lib/src/utils/wikilink_checker.rs b/lychee-lib/src/utils/wikilink_checker.rs index 348d0561c4..fcd4d5f084 100644 --- a/lychee-lib/src/utils/wikilink_checker.rs +++ b/lychee-lib/src/utils/wikilink_checker.rs @@ -19,10 +19,17 @@ pub(crate) struct WikilinkChecker { } impl WikilinkChecker { - pub(crate) fn new(base: Option) -> Self { - Self { - basedir: base, - ..Default::default() + pub(crate) fn new(base: Option) -> Option { + if base.is_none() { + None + } else { + warn!( + "The Wikilink Checker could not be initialized because the base directory is missing" + ); + Some(Self { + basedir: base, + ..Default::default() + }) } } @@ -68,7 +75,10 @@ impl WikilinkChecker { // A remote base is of no use for the wikilink checker, silently skip over it Base::Remote(remote_base_name) => { warn!("Error using remote base url for checking wililinks: {remote_base_name}"); - Ok(()) + Err(ErrorKind::WikilinkCheckerInit( + "Remote Base Directory found, only local directories are allowed" + .to_string(), + )) } }, }
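
Taken together, the series boils down to two small mechanisms: the extractor reduces `[[Page#Heading|Alias]]` to the bare target `Page`, and the file checker resolves that target (with a fallback extension) against an index of lowercased file names built by walking the base directory. A compact, self-contained sketch of those two rules — function and variable names here are illustrative, not the exact helpers added by the patches:

```rust
use std::collections::HashMap;
use std::path::PathBuf;

/// Reduce a wikilink destination to its page target: the fragment (`#`)
/// and pothole (`|`) portions are dropped; an empty remainder (e.g. the
/// purely local `[[#LocalHeader]]`) yields `None`.
fn clean_destination(dest: &str) -> Option<&str> {
    let end = dest
        .find(|c: char| c == '#' || c == '|')
        .unwrap_or(dest.len());
    let target = &dest[..end];
    if target.is_empty() { None } else { Some(target) }
}

/// Resolve a cleaned target against a lowercase file-name index, mimicking
/// the map the checker populates while walking the base directory; the
/// lowercase keys make the lookup ASCII case-insensitive.
fn resolve(index: &HashMap<String, PathBuf>, target: &str, ext: &str) -> Option<PathBuf> {
    index
        .get(&format!("{}.{}", target.to_ascii_lowercase(), ext))
        .cloned()
}

fn main() {
    let mut index = HashMap::new();
    index.insert(
        "space usage differentdirectory.md".to_string(),
        PathBuf::from("fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md"),
    );

    let dest = "Space Usage DifferentDirectory#Header|HeaderRenaming";
    let target = clean_destination(dest).expect("non-empty target");
    assert_eq!(target, "Space Usage DifferentDirectory");
    assert!(resolve(&index, target, "md").is_some());
    assert!(clean_destination("#LocalHeader").is_none());
    println!("wikilink resolution sketch ok");
}
```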