Skip to main content

ssg/postprocess/
sitemap.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Sitemap fix plugin.
5
6use super::helpers::{
7    normalise_url_in_xml_line, read_meta_sidecars, rfc2822_to_iso_date,
8};
9use crate::plugin::{Plugin, PluginContext};
10use anyhow::{Context, Result};
11use std::collections::HashMap;
12use std::fs;
13
/// Post-processing plugin that repairs the generated `sitemap.xml`.
///
/// The repair drops repeated XML declarations, normalises double-slash
/// URLs, and refreshes per-page `<lastmod>` dates.
#[derive(Clone, Copy, Debug)]
pub struct SitemapFixPlugin;
18
19impl Plugin for SitemapFixPlugin {
20    fn name(&self) -> &'static str {
21        "sitemap-fix"
22    }
23
24    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
25        let sitemap_path = ctx.site_dir.join("sitemap.xml");
26        if !sitemap_path.exists() {
27            return Ok(());
28        }
29
30        let content = fs::read_to_string(&sitemap_path).with_context(|| {
31            format!("cannot read {}", sitemap_path.display())
32        })?;
33
34        let meta_entries =
35            read_meta_sidecars(&ctx.site_dir).unwrap_or_default();
36        let date_map = collect_date_map(&meta_entries);
37
38        let result = strip_duplicate_xml_decls_and_fix_urls(&content);
39
40        // Second pass: update lastmod based on the <loc> in each <url> block
41        let updated = update_lastmod_from_loc(&result, &date_map);
42
43        fs::write(&sitemap_path, updated).with_context(|| {
44            format!("cannot write {}", sitemap_path.display())
45        })?;
46
47        log::info!("[sitemap-fix] Repaired sitemap.xml");
48        Ok(())
49    }
50}
51
52/// Collects per-page date strings from meta sidecar entries.
53fn collect_date_map(
54    meta_entries: &[(String, HashMap<String, String>)],
55) -> HashMap<String, String> {
56    let mut date_map = HashMap::new();
57    for (rel_path, meta) in meta_entries {
58        if let Some(date) = extract_best_date(meta) {
59            let _ = date_map.insert(rel_path.clone(), date);
60        }
61    }
62    date_map
63}
64
65/// Extracts the best available date from a metadata map.
66fn extract_best_date(meta: &HashMap<String, String>) -> Option<String> {
67    meta.get("item_pub_date")
68        .and_then(|d| rfc2822_to_iso_date(d))
69        .or_else(|| {
70            meta.get("last_build_date")
71                .and_then(|d| rfc2822_to_iso_date(d))
72        })
73        .or_else(|| meta.get("date").cloned())
74}
75
76/// Strips duplicate XML declarations and normalises URLs in the sitemap.
77fn strip_duplicate_xml_decls_and_fix_urls(content: &str) -> String {
78    let mut result = String::with_capacity(content.len());
79    let mut first_decl = true;
80
81    for line in content.lines() {
82        let trimmed = line.trim();
83
84        if trimmed.starts_with("<?xml") {
85            if first_decl {
86                first_decl = false;
87                result.push_str(line);
88                result.push('\n');
89            }
90            continue;
91        }
92
93        let processed = if line.contains("<loc>")
94            || line.contains("<link>")
95            || line.contains("<atom:link")
96        {
97            normalise_url_in_xml_line(line)
98        } else {
99            line.to_string()
100        };
101
102        result.push_str(&processed);
103        result.push('\n');
104    }
105
106    result
107}
108
109/// Update `<lastmod>` values based on the preceding `<loc>` URL in each
110/// `<url>` block.
111pub(super) fn update_lastmod_from_loc(
112    xml: &str,
113    date_map: &HashMap<String, String>,
114) -> String {
115    if date_map.is_empty() {
116        return xml.to_string();
117    }
118
119    let mut result = String::with_capacity(xml.len());
120    let mut current_loc = String::new();
121
122    for line in xml.lines() {
123        let trimmed = line.trim();
124
125        // Track current <loc> value
126        if trimmed.starts_with("<loc>") {
127            if let Some(url) = trimmed
128                .strip_prefix("<loc>")
129                .and_then(|s| s.strip_suffix("</loc>"))
130            {
131                current_loc = url.to_string();
132            }
133        }
134
135        // Replace <lastmod> using per-page date if available
136        if trimmed.starts_with("<lastmod>") && trimmed.ends_with("</lastmod>") {
137            let mut matched = false;
138            for (rel_path, date) in date_map {
139                if !rel_path.is_empty() && current_loc.contains(rel_path) {
140                    let indent = &line[..line.len() - line.trim_start().len()];
141                    result.push_str(&format!(
142                        "{indent}<lastmod>{date}</lastmod>\n"
143                    ));
144                    matched = true;
145                    break;
146                }
147            }
148            if !matched {
149                result.push_str(line);
150                result.push('\n');
151            }
152        } else {
153            result.push_str(line);
154            result.push('\n');
155        }
156    }
157    result
158}
159
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use crate::plugin::PluginContext;
    use std::path::Path;
    use tempfile::tempdir;

    /// Initialises the test logger and builds a context whose site
    /// directory is `site_dir`; the other paths are fixed placeholders.
    fn test_ctx(site_dir: &Path) -> PluginContext {
        crate::test_support::init_logger();
        let content = Path::new("content");
        let build = Path::new("build");
        let templates = Path::new("templates");
        PluginContext::new(content, build, site_dir, templates)
    }

    /// Writes `xml` as `sitemap.xml` in a fresh temporary directory, runs
    /// the plugin on it, and returns the rewritten file contents.
    fn run_on_sitemap(xml: &str) -> Result<String> {
        let dir = tempdir()?;
        let file = dir.path().join("sitemap.xml");
        fs::write(&file, xml)?;

        let ctx = test_ctx(dir.path());
        SitemapFixPlugin.after_compile(&ctx)?;

        Ok(fs::read_to_string(&file)?)
    }

    /// Builds an owned string map from borrowed key/value pairs.
    fn string_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    // --- Plugin end-to-end -------------------------------------------------

    #[test]
    fn test_sitemap_fix_removes_duplicate_xml_decls() -> Result<()> {
        let result = run_on_sitemap(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <?xml version="1.0" encoding="UTF-8"?>
<url>
  <loc>https://example.com/page1</loc>
  <lastmod>2025-09-01</lastmod>
</url>
    <?xml version="1.0" encoding="UTF-8"?>
<url>
  <loc>https://example.com/page2</loc>
  <lastmod>2025-09-01</lastmod>
</url>
</urlset>"#,
        )?;

        let declarations = result.matches("<?xml").count();
        assert_eq!(declarations, 1);
        Ok(())
    }

    #[test]
    fn test_sitemap_fix_normalises_double_slashes() -> Result<()> {
        let result = run_on_sitemap(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
  <loc>https://example.com//index.html</loc>
  <lastmod>2025-09-01</lastmod>
</url>
</urlset>"#,
        )?;

        assert!(result.contains("https://example.com/index.html"));
        assert!(!result.contains("com//index"));
        Ok(())
    }

    #[test]
    fn name_is_stable() {
        assert_eq!(SitemapFixPlugin.name(), "sitemap-fix");
    }

    #[test]
    fn after_compile_no_op_when_sitemap_missing() -> Result<()> {
        let dir = tempdir()?;
        SitemapFixPlugin.after_compile(&test_ctx(dir.path()))?;
        assert!(!dir.path().join("sitemap.xml").exists());
        Ok(())
    }

    // --- update_lastmod_from_loc -------------------------------------------

    #[test]
    fn test_update_lastmod_from_loc_empty_map() {
        let xml = "<url><loc>https://example.com</loc><lastmod>2025-01-01</lastmod></url>";
        let no_dates = HashMap::new();
        assert_eq!(update_lastmod_from_loc(xml, &no_dates), xml);
    }

    #[test]
    fn test_update_lastmod_from_loc_with_match() {
        let xml = "<url>\n<loc>https://example.com/blog/</loc>\n<lastmod>2025-01-01</lastmod>\n</url>";
        let dates = string_map(&[("blog", "2026-04-11")]);
        let result = update_lastmod_from_loc(xml, &dates);
        assert!(
            result.contains("<lastmod>2026-04-11</lastmod>"),
            "Should update lastmod: {result}"
        );
    }

    #[test]
    fn update_lastmod_no_match_leaves_line_unchanged() {
        let xml = "<url>\n<loc>https://example.com/other/</loc>\n<lastmod>2025-01-01</lastmod>\n</url>";
        let dates = string_map(&[("blog", "2026-04-11")]);
        let result = update_lastmod_from_loc(xml, &dates);
        assert!(
            result.contains("<lastmod>2025-01-01</lastmod>"),
            "non-matching loc should leave lastmod unchanged: {result}"
        );
    }

    #[test]
    fn update_lastmod_skips_empty_rel_path_match() {
        // Edge case: empty rel_path entries shouldn't match anything.
        let xml = "<url>\n<loc>https://example.com/x/</loc>\n<lastmod>2025-01-01</lastmod>\n</url>";
        let dates = string_map(&[("", "should-not-match")]);
        let result = update_lastmod_from_loc(xml, &dates);
        assert!(result.contains("<lastmod>2025-01-01</lastmod>"));
        assert!(!result.contains("should-not-match"));
    }

    // --- extract_best_date -------------------------------------------------

    #[test]
    fn extract_best_date_prefers_item_pub_date() {
        let meta = string_map(&[
            ("item_pub_date", "Thu, 11 Apr 2026 06:06:06 +0000"),
            ("last_build_date", "Mon, 01 Sep 2025 06:06:06 +0000"),
            ("date", "2024-01-01"),
        ]);
        let date = extract_best_date(&meta);
        assert!(
            date.as_deref().is_some_and(|d| d.contains("2026-04-11")),
            "should prefer item_pub_date, got: {date:?}"
        );
    }

    #[test]
    fn extract_best_date_falls_back_to_last_build_date() {
        let meta = string_map(&[(
            "last_build_date",
            "Mon, 01 Sep 2025 06:06:06 +0000",
        )]);
        let date = extract_best_date(&meta);
        assert!(
            date.as_deref().is_some_and(|d| d.contains("2025-09-01")),
            "should use last_build_date when item_pub_date absent: {date:?}"
        );
    }

    #[test]
    fn extract_best_date_falls_back_to_date_field() {
        let meta = string_map(&[("date", "2024-01-01")]);
        assert_eq!(extract_best_date(&meta).as_deref(), Some("2024-01-01"));
    }

    #[test]
    fn extract_best_date_returns_none_when_no_dates() {
        let meta = string_map(&[]);
        assert!(extract_best_date(&meta).is_none());
    }

    // --- collect_date_map / strip_duplicate_xml_decls_and_fix_urls ---------

    #[test]
    fn collect_date_map_includes_only_pages_with_dates() {
        let entries = vec![
            ("page-a".to_string(), string_map(&[("date", "2025-01-01")])),
            ("page-b".to_string(), string_map(&[("title", "no date here")])),
        ];
        let map = collect_date_map(&entries);
        assert_eq!(map.len(), 1);
        assert_eq!(map.get("page-a").map(String::as_str), Some("2025-01-01"));
    }

    #[test]
    fn strip_duplicate_xml_decls_preserves_first_only() {
        let input = "<?xml version=\"1.0\"?>\n<root>\n<?xml version=\"1.0\"?>\n<x/>\n</root>";
        let out = strip_duplicate_xml_decls_and_fix_urls(input);
        assert_eq!(out.matches("<?xml").count(), 1);
        assert!(out.contains("<x/>"));
    }
}