ssg/postprocess/
sitemap.rs1use super::helpers::{
7 normalise_url_in_xml_line, read_meta_sidecars, rfc2822_to_iso_date,
8};
9use crate::plugin::{Plugin, PluginContext};
10use anyhow::{Context, Result};
11use std::collections::HashMap;
12use std::fs;
13
14#[derive(Debug, Clone, Copy)]
17pub struct SitemapFixPlugin;
18
19impl Plugin for SitemapFixPlugin {
20 fn name(&self) -> &'static str {
21 "sitemap-fix"
22 }
23
24 fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
25 let sitemap_path = ctx.site_dir.join("sitemap.xml");
26 if !sitemap_path.exists() {
27 return Ok(());
28 }
29
30 let content = fs::read_to_string(&sitemap_path).with_context(|| {
31 format!("cannot read {}", sitemap_path.display())
32 })?;
33
34 let meta_entries =
35 read_meta_sidecars(&ctx.site_dir).unwrap_or_default();
36 let date_map = collect_date_map(&meta_entries);
37
38 let result = strip_duplicate_xml_decls_and_fix_urls(&content);
39
40 let updated = update_lastmod_from_loc(&result, &date_map);
42
43 fs::write(&sitemap_path, updated).with_context(|| {
44 format!("cannot write {}", sitemap_path.display())
45 })?;
46
47 log::info!("[sitemap-fix] Repaired sitemap.xml");
48 Ok(())
49 }
50}
51
52fn collect_date_map(
54 meta_entries: &[(String, HashMap<String, String>)],
55) -> HashMap<String, String> {
56 let mut date_map = HashMap::new();
57 for (rel_path, meta) in meta_entries {
58 if let Some(date) = extract_best_date(meta) {
59 let _ = date_map.insert(rel_path.clone(), date);
60 }
61 }
62 date_map
63}
64
65fn extract_best_date(meta: &HashMap<String, String>) -> Option<String> {
67 meta.get("item_pub_date")
68 .and_then(|d| rfc2822_to_iso_date(d))
69 .or_else(|| {
70 meta.get("last_build_date")
71 .and_then(|d| rfc2822_to_iso_date(d))
72 })
73 .or_else(|| meta.get("date").cloned())
74}
75
76fn strip_duplicate_xml_decls_and_fix_urls(content: &str) -> String {
78 let mut result = String::with_capacity(content.len());
79 let mut first_decl = true;
80
81 for line in content.lines() {
82 let trimmed = line.trim();
83
84 if trimmed.starts_with("<?xml") {
85 if first_decl {
86 first_decl = false;
87 result.push_str(line);
88 result.push('\n');
89 }
90 continue;
91 }
92
93 let processed = if line.contains("<loc>")
94 || line.contains("<link>")
95 || line.contains("<atom:link")
96 {
97 normalise_url_in_xml_line(line)
98 } else {
99 line.to_string()
100 };
101
102 result.push_str(&processed);
103 result.push('\n');
104 }
105
106 result
107}
108
109pub(super) fn update_lastmod_from_loc(
112 xml: &str,
113 date_map: &HashMap<String, String>,
114) -> String {
115 if date_map.is_empty() {
116 return xml.to_string();
117 }
118
119 let mut result = String::with_capacity(xml.len());
120 let mut current_loc = String::new();
121
122 for line in xml.lines() {
123 let trimmed = line.trim();
124
125 if trimmed.starts_with("<loc>") {
127 if let Some(url) = trimmed
128 .strip_prefix("<loc>")
129 .and_then(|s| s.strip_suffix("</loc>"))
130 {
131 current_loc = url.to_string();
132 }
133 }
134
135 if trimmed.starts_with("<lastmod>") && trimmed.ends_with("</lastmod>") {
137 let mut matched = false;
138 for (rel_path, date) in date_map {
139 if !rel_path.is_empty() && current_loc.contains(rel_path) {
140 let indent = &line[..line.len() - line.trim_start().len()];
141 result.push_str(&format!(
142 "{indent}<lastmod>{date}</lastmod>\n"
143 ));
144 matched = true;
145 break;
146 }
147 }
148 if !matched {
149 result.push_str(line);
150 result.push('\n');
151 }
152 } else {
153 result.push_str(line);
154 result.push('\n');
155 }
156 }
157 result
158}
159
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use crate::plugin::PluginContext;
    use std::path::Path;
    use tempfile::tempdir;

    /// Builds a plugin context rooted at `site_dir`; the remaining paths are
    /// fixed placeholder values.
    fn test_ctx(site_dir: &Path) -> PluginContext {
        crate::test_support::init_logger();
        let content_dir = Path::new("content");
        let build_dir = Path::new("build");
        let template_dir = Path::new("templates");
        PluginContext::new(content_dir, build_dir, site_dir, template_dir)
    }

    /// Writes `xml` as `sitemap.xml` into a fresh temp dir, runs the plugin
    /// and returns the resulting file content.
    fn repair(xml: &str) -> Result<String> {
        let dir = tempdir()?;
        let path = dir.path().join("sitemap.xml");
        fs::write(&path, xml)?;

        let ctx = test_ctx(dir.path());
        SitemapFixPlugin.after_compile(&ctx)?;

        Ok(fs::read_to_string(&path)?)
    }

    /// Builds an owned string map from borrowed key/value pairs.
    fn string_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|(k, v)| ((*k).to_string(), (*v).to_string()))
            .collect()
    }

    #[test]
    fn test_sitemap_fix_removes_duplicate_xml_decls() -> Result<()> {
        let repaired = repair(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 <?xml version="1.0" encoding="UTF-8"?>
<url>
 <loc>https://example.com/page1</loc>
 <lastmod>2025-09-01</lastmod>
</url>
 <?xml version="1.0" encoding="UTF-8"?>
<url>
 <loc>https://example.com/page2</loc>
 <lastmod>2025-09-01</lastmod>
</url>
</urlset>"#,
        )?;

        assert_eq!(repaired.matches("<?xml").count(), 1);
        Ok(())
    }

    #[test]
    fn test_sitemap_fix_normalises_double_slashes() -> Result<()> {
        let repaired = repair(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
 <loc>https://example.com//index.html</loc>
 <lastmod>2025-09-01</lastmod>
</url>
</urlset>"#,
        )?;

        assert!(repaired.contains("https://example.com/index.html"));
        assert!(!repaired.contains("com//index"));
        Ok(())
    }

    #[test]
    fn test_update_lastmod_from_loc_empty_map() {
        let xml = "<url><loc>https://example.com</loc><lastmod>2025-01-01</lastmod></url>";
        let no_dates = HashMap::new();
        assert_eq!(update_lastmod_from_loc(xml, &no_dates), xml);
    }

    #[test]
    fn test_update_lastmod_from_loc_with_match() {
        let xml = "<url>\n<loc>https://example.com/blog/</loc>\n<lastmod>2025-01-01</lastmod>\n</url>";
        let dates = string_map(&[("blog", "2026-04-11")]);
        let result = update_lastmod_from_loc(xml, &dates);
        assert!(
            result.contains("<lastmod>2026-04-11</lastmod>"),
            "Should update lastmod: {result}"
        );
    }

    #[test]
    fn name_is_stable() {
        assert_eq!(SitemapFixPlugin.name(), "sitemap-fix");
    }

    #[test]
    fn after_compile_no_op_when_sitemap_missing() -> Result<()> {
        let dir = tempdir()?;
        SitemapFixPlugin.after_compile(&test_ctx(dir.path()))?;
        assert!(!dir.path().join("sitemap.xml").exists());
        Ok(())
    }

    #[test]
    fn extract_best_date_prefers_item_pub_date() {
        let meta = string_map(&[
            ("item_pub_date", "Thu, 11 Apr 2026 06:06:06 +0000"),
            ("last_build_date", "Mon, 01 Sep 2025 06:06:06 +0000"),
            ("date", "2024-01-01"),
        ]);
        let date = extract_best_date(&meta);
        assert!(
            date.as_deref().is_some_and(|d| d.contains("2026-04-11")),
            "should prefer item_pub_date, got: {date:?}"
        );
    }

    #[test]
    fn extract_best_date_falls_back_to_last_build_date() {
        let meta = string_map(&[(
            "last_build_date",
            "Mon, 01 Sep 2025 06:06:06 +0000",
        )]);
        let date = extract_best_date(&meta);
        assert!(
            date.as_deref().is_some_and(|d| d.contains("2025-09-01")),
            "should use last_build_date when item_pub_date absent: {date:?}"
        );
    }

    #[test]
    fn extract_best_date_falls_back_to_date_field() {
        let meta = string_map(&[("date", "2024-01-01")]);
        assert_eq!(extract_best_date(&meta).as_deref(), Some("2024-01-01"));
    }

    #[test]
    fn extract_best_date_returns_none_when_no_dates() {
        let meta: HashMap<String, String> = HashMap::new();
        assert!(extract_best_date(&meta).is_none());
    }

    #[test]
    fn collect_date_map_includes_only_pages_with_dates() {
        let entries = vec![
            ("page-a".to_string(), string_map(&[("date", "2025-01-01")])),
            (
                "page-b".to_string(),
                string_map(&[("title", "no date here")]),
            ),
        ];
        let map = collect_date_map(&entries);
        assert_eq!(map.len(), 1);
        assert_eq!(map.get("page-a").unwrap(), "2025-01-01");
    }

    #[test]
    fn strip_duplicate_xml_decls_preserves_first_only() {
        let input = "<?xml version=\"1.0\"?>\n<root>\n<?xml version=\"1.0\"?>\n<x/>\n</root>";
        let out = strip_duplicate_xml_decls_and_fix_urls(input);
        assert_eq!(out.matches("<?xml").count(), 1);
        assert!(out.contains("<x/>"));
    }

    #[test]
    fn update_lastmod_no_match_leaves_line_unchanged() {
        let xml = "<url>\n<loc>https://example.com/other/</loc>\n<lastmod>2025-01-01</lastmod>\n</url>";
        let dates = string_map(&[("blog", "2026-04-11")]);
        let result = update_lastmod_from_loc(xml, &dates);
        assert!(
            result.contains("<lastmod>2025-01-01</lastmod>"),
            "non-matching loc should leave lastmod unchanged: {result}"
        );
    }

    #[test]
    fn update_lastmod_skips_empty_rel_path_match() {
        let xml = "<url>\n<loc>https://example.com/x/</loc>\n<lastmod>2025-01-01</lastmod>\n</url>";
        let dates = string_map(&[("", "should-not-match")]);
        let result = update_lastmod_from_loc(xml, &dates);
        assert!(result.contains("<lastmod>2025-01-01</lastmod>"));
        assert!(!result.contains("should-not-match"));
    }
}