Skip to main content

ssg/seo/
canonical.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Canonical URL injection plugin.
5
6use super::helpers::escape_attr;
7use crate::plugin::{Plugin, PluginContext};
8use anyhow::Result;
9use std::path::Path;
10
/// Plugin that gives every HTML page a single `<link rel="canonical">` tag.
///
/// The canonical URL of a page is the configured base URL joined with the
/// page's path relative to the site directory. Any canonical link already
/// present in the page is removed first and the computed tag is then
/// inserted immediately before `</head>`.
///
/// Because existing canonical links are stripped before the new one is
/// written, running the plugin repeatedly never produces duplicate tags.
///
/// # Example
///
/// ```rust
/// use ssg::plugin::PluginManager;
/// use ssg::seo::CanonicalPlugin;
///
/// let mut pm = PluginManager::new();
/// pm.register(CanonicalPlugin::new("https://example.com"));
/// ```
#[derive(Debug, Clone)]
pub struct CanonicalPlugin {
    /// Site root that canonical URLs are built from, stored exactly as
    /// supplied; trailing slashes are trimmed when the URL is built.
    base_url: String,
}

impl CanonicalPlugin {
    /// Builds a `CanonicalPlugin` that prefixes page paths with `base_url`.
    ///
    /// Accepts anything convertible into a `String`, e.g. `&str` or
    /// `String`.
    pub fn new(base_url: impl Into<String>) -> Self {
        let base_url = base_url.into();
        Self { base_url }
    }
}
42
43impl Plugin for CanonicalPlugin {
44    fn name(&self) -> &'static str {
45        "canonical"
46    }
47
48    fn has_transform(&self) -> bool {
49        true
50    }
51
52    fn transform_html(
53        &self,
54        html: &str,
55        path: &Path,
56        ctx: &PluginContext,
57    ) -> Result<String> {
58        let base = self.base_url.trim_end_matches('/');
59
60        let rel_path = path
61            .strip_prefix(&ctx.site_dir)
62            .unwrap_or(path)
63            .to_string_lossy()
64            .replace('\\', "/");
65
66        let tag = build_canonical_tag(base, &rel_path);
67
68        let mut result = remove_existing_canonicals(html);
69
70        // Inject the correct canonical before </head>
71        result = if let Some(pos) = result.find("</head>") {
72            format!("{}{}\n{}", &result[..pos], tag, &result[pos..])
73        } else {
74            result
75        };
76
77        Ok(result)
78    }
79
80    fn after_compile(&self, _ctx: &PluginContext) -> Result<()> {
81        Ok(())
82    }
83}
84
85/// Builds a `<link rel="canonical">` tag for the given base URL and path.
86fn build_canonical_tag(base: &str, rel_path: &str) -> String {
87    let canonical_url = format!("{base}/{rel_path}");
88    format!(
89        "<link rel=\"canonical\" href=\"{}\">",
90        escape_attr(&canonical_url)
91    )
92}
93
/// Removes every canonical `<link>` tag from `html`.
///
/// A tag is removed only when it is a terminated `<link ...>` element whose
/// `rel` attribute is written as `rel="canonical"`, `rel='canonical'` or
/// `rel=canonical`. Tag name, attribute name and value are matched ASCII
/// case-insensitively. A single `\n` directly following a removed tag is
/// removed with it so no empty line is left behind.
///
/// Everything else is returned untouched. In particular, text that merely
/// mentions `rel="canonical"`, other elements, attributes such as
/// `data-rel="canonical"`, and `<link` tags that are never closed with `>`
/// are all preserved.
fn remove_existing_canonicals(html: &str) -> String {
    const LINK_OPEN: &str = "<link";

    // ASCII lowercasing keeps every byte offset identical to `html`, so
    // positions found in `lower` can be used to slice `html` directly.
    let lower = html.to_ascii_lowercase();
    if !lower.contains("canonical") {
        return html.to_string();
    }

    let mut out = String::with_capacity(html.len());
    // Everything before `copied` has been either emitted or dropped.
    let mut copied = 0;
    // Where the search for the next `<link` resumes.
    let mut cursor = 0;

    while let Some(found) = lower[cursor..].find(LINK_OPEN) {
        let start = cursor + found;
        let end = match lower[start..].find('>') {
            Some(len) => start + len + 1,
            // Unterminated tag: leave the remainder of the document alone
            // rather than deleting through to the end of the input.
            None => break,
        };

        if is_canonical_link_tag(&lower[start..end]) {
            out.push_str(&html[copied..start]);
            copied = if html.as_bytes().get(end) == Some(&b'\n') {
                end + 1
            } else {
                end
            };
            cursor = copied;
        } else {
            cursor = start + LINK_OPEN.len();
        }
    }

    out.push_str(&html[copied..]);
    out
}

/// Reports whether `tag` is a `<link>` element with a canonical `rel`.
///
/// `tag` must be lowercased and span from `<link` through the closing `>`.
fn is_canonical_link_tag(tag: &str) -> bool {
    const PATTERNS: [&str; 3] =
        ["rel=\"canonical\"", "rel='canonical'", "rel=canonical"];

    let attrs = &tag["<link".len()..tag.len() - 1];

    // The tag name must be exactly `link`, not e.g. `linker`.
    if !attrs.starts_with(|c: char| c.is_ascii_whitespace()) {
        return false;
    }

    PATTERNS.iter().any(|pat| {
        attrs.match_indices(pat).any(|(idx, _)| {
            // `rel` must be a whole attribute name, not the tail of
            // another one such as `data-rel`.
            let whole_name =
                attrs[..idx].ends_with(|c: char| c.is_ascii_whitespace());

            // An unquoted value must end here, so `rel=canonical-x` is
            // not mistaken for `rel=canonical`.
            let tail = &attrs[idx + pat.len()..];
            let quoted = pat.ends_with(|c: char| c == '"' || c == '\'');
            let value_ends = quoted
                || tail.is_empty()
                || tail
                    .starts_with(|c: char| c.is_ascii_whitespace() || c == '/');

            whole_name && value_ends
        })
    })
}
120
#[cfg(test)]
// Tests may unwrap freely: a panic is the desired failure mode here.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use crate::plugin::PluginContext;
    use std::path::Path;
    use tempfile::tempdir;

    /// Tag expected for `page.html` served from `https://example.com`.
    const PAGE_TAG: &str =
        r#"<link rel="canonical" href="https://example.com/page.html">"#;

    /// Builds a plugin context whose site directory is `site`.
    fn ctx(site: &Path) -> PluginContext {
        PluginContext::new(
            Path::new("content"),
            Path::new("build"),
            site,
            Path::new("templates"),
        )
    }

    /// Runs `transform_html` on `html` as if it were the file `file`
    /// inside a fresh temporary site directory.
    fn transform(base_url: &str, file: &str, html: &str) -> String {
        let dir = tempdir().unwrap();
        let page_path = dir.path().join(file);
        CanonicalPlugin::new(base_url)
            .transform_html(html, &page_path, &ctx(dir.path()))
            .unwrap()
    }

    #[test]
    fn name_is_stable() {
        assert_eq!(CanonicalPlugin::new("https://x").name(), "canonical");
    }

    #[test]
    fn has_transform_is_enabled() {
        assert!(CanonicalPlugin::new("https://x").has_transform());
    }

    #[test]
    fn new_accepts_string_or_str() {
        let from_str = CanonicalPlugin::new("https://a");
        let from_string = CanonicalPlugin::new(String::from("https://b"));
        assert_eq!(from_str.base_url, "https://a");
        assert_eq!(from_string.base_url, "https://b");
    }

    #[test]
    fn no_op_when_site_dir_missing() {
        let dir = tempdir().unwrap();
        CanonicalPlugin::new("https://x")
            .after_compile(&ctx(&dir.path().join("nope")))
            .unwrap();
    }

    #[test]
    fn build_canonical_tag_joins_base_and_rel_path() {
        let tag = build_canonical_tag("https://example.com", "blog/post.html");
        assert_eq!(
            tag,
            r#"<link rel="canonical" href="https://example.com/blog/post.html">"#
        );
    }

    #[test]
    fn build_canonical_tag_escapes_href_attribute_value() {
        let tag = build_canonical_tag("https://example.com", "x?a=1&b=2");
        // & in href must be escaped to &amp; (what escape_attr does)
        assert!(
            tag.contains("&amp;"),
            "ampersand in URL must be HTML-escaped: {tag}"
        );
        assert!(
            !tag.contains("a=1&b=2"),
            "raw ampersand must not survive: {tag}"
        );
    }

    #[test]
    fn remove_existing_canonicals_no_op_when_none_present() {
        let html = "<head><title>x</title></head>";
        assert_eq!(remove_existing_canonicals(html), html);
    }

    #[test]
    fn remove_existing_canonicals_strips_double_quoted() {
        let html = r#"<head><link rel="canonical" href="/old"><title>x</title></head>"#;
        assert_eq!(
            remove_existing_canonicals(html),
            "<head><title>x</title></head>"
        );
    }

    #[test]
    fn remove_existing_canonicals_strips_single_quoted() {
        let html = "<head><link rel='canonical' href='/old'></head>";
        assert_eq!(remove_existing_canonicals(html), "<head></head>");
    }

    #[test]
    fn remove_existing_canonicals_strips_unquoted() {
        let html = "<head><link rel=canonical href=/old></head>";
        assert_eq!(remove_existing_canonicals(html), "<head></head>");
    }

    #[test]
    fn remove_existing_canonicals_strips_multiple() {
        let html = concat!(
            "<head>\n",
            "    <link rel=\"canonical\" href=\"/a\">\n",
            "    <link rel=\"canonical\" href=\"/b\">\n",
            "</head>",
        );
        let out = remove_existing_canonicals(html);
        assert!(!out.contains("canonical"), "tags remain: {out}");
        assert!(!out.contains("href"), "tag fragments remain: {out}");
        assert!(out.starts_with("<head>"), "head opener lost: {out}");
        assert!(out.ends_with("</head>"), "head closer lost: {out}");
    }

    #[test]
    fn transform_html_injects_canonical() {
        let after = transform(
            "https://example.com",
            "page.html",
            "<html><head></head><body></body></html>",
        );
        assert_eq!(
            after,
            format!("<html><head>{PAGE_TAG}\n</head><body></body></html>")
        );
    }

    #[test]
    fn transform_html_uses_forward_slashes_for_nested_pages() {
        let after = transform(
            "https://example.com",
            "blog/post.html",
            "<html><head></head></html>",
        );
        assert!(
            after.contains(r#"href="https://example.com/blog/post.html""#),
            "nested path should map onto the URL: {after}"
        );
    }

    #[test]
    fn transform_html_replaces_existing_canonical_with_correct_one() {
        let after = transform(
            "https://example.com",
            "page.html",
            r#"<html><head><link rel="canonical" href="/wrong"></head></html>"#,
        );
        assert_eq!(after, format!("<html><head>{PAGE_TAG}\n</head></html>"));
        assert!(
            !after.contains("/wrong"),
            "old canonical should be gone: {after}"
        );
    }

    #[test]
    fn transform_html_trims_trailing_slash_on_base_url() {
        let after = transform(
            "https://example.com/",
            "page.html",
            "<html><head></head></html>",
        );
        assert!(
            after.contains(PAGE_TAG),
            "canonical must be injected with a single slash: {after}"
        );
        assert!(
            !after.contains("com//page.html"),
            "no double-slash after trim: {after}"
        );
    }

    #[test]
    fn transform_html_is_idempotent() {
        let dir = tempdir().unwrap();
        let c = ctx(dir.path());
        let page_path = dir.path().join("page.html");
        let plugin = CanonicalPlugin::new("https://example.com");

        let once = plugin
            .transform_html("<html><head></head></html>", &page_path, &c)
            .unwrap();
        let twice = plugin.transform_html(&once, &page_path, &c).unwrap();

        assert_eq!(once, twice);
        assert_eq!(twice.matches("rel=\"canonical\"").count(), 1);
    }

    #[test]
    fn transform_html_handles_html_without_head_tag() {
        let raw = "<!doctype html><html><body>only</body></html>";
        let after = transform("https://example.com", "frag.html", raw);
        assert_eq!(after, raw);
    }
}