ssg/seo/
mod.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! SEO plugins for the static site generator.
5//!
//! Provides four plugins that improve search engine optimization:
//!
//! - `SeoPlugin` — Injects missing meta tags (description, Open Graph,
//!   Twitter Card) into HTML files.
//! - `RobotsPlugin` — Generates a `robots.txt` file.
//! - `CanonicalPlugin` — Injects `<link rel="canonical">` tags.
//! - `JsonLdPlugin` — Injects JSON-LD structured data
//!   (`<script type="application/ld+json">`) into HTML files.
12
13mod canonical;
14pub mod helpers;
15mod jsonld;
16mod robots;
17mod seo_plugin;
18
19pub use canonical::CanonicalPlugin;
20pub use jsonld::{
21    validate_jsonld, JsonLdConfig, JsonLdPlugin, JsonLdValidationError,
22};
23pub use robots::RobotsPlugin;
24pub use seo_plugin::SeoPlugin;
25
26#[cfg(test)]
27#[allow(clippy::unwrap_used, clippy::expect_used)]
28mod tests {
29    use super::helpers::*;
30    use super::*;
31    use crate::plugin::{Plugin, PluginContext};
32    use anyhow::Result;
33    use std::fs;
34    use std::path::Path;
35    use tempfile::tempdir;
36
37    fn make_html(title: &str, body: &str) -> String {
38        format!(
39            "<html><head><title>{title}</title></head>\
40             <body>{body}</body></html>"
41        )
42    }
43
44    fn test_ctx(site_dir: &Path) -> PluginContext {
45        crate::test_support::init_logger();
46        PluginContext::new(
47            Path::new("content"),
48            Path::new("build"),
49            site_dir,
50            Path::new("templates"),
51        )
52    }
53
54    // -----------------------------------------------------------------
55    // Helper function tests
56    // -----------------------------------------------------------------
57
58    #[test]
59    fn test_extract_title_present() {
60        let html = "<html><head><title>My Page</title></head></html>";
61        assert_eq!(extract_title(html), "My Page");
62    }
63
64    #[test]
65    fn test_extract_title_missing() {
66        let html = "<html><head></head><body></body></html>";
67        assert_eq!(extract_title(html), "");
68    }
69
70    #[test]
71    fn test_extract_description_truncates() {
72        let long = "word ".repeat(100);
73        let html =
74            format!("<html><head></head><body><p>{long}</p></body></html>");
75        let desc = extract_description(&html, 160);
76        assert!(desc.len() <= 160);
77        assert!(!desc.is_empty());
78    }
79
80    // -----------------------------------------------------------------
81    // SeoPlugin tests
82    // -----------------------------------------------------------------
83
84    #[test]
85    fn test_seo_plugin_name() {
86        assert_eq!(SeoPlugin.name(), "seo");
87    }
88
89    #[test]
90    fn test_seo_plugin_injects_meta_tags() -> Result<()> {
91        let tmp = tempdir()?;
92        let ctx = test_ctx(tmp.path());
93        let html = make_html("Hello World", "<p>Some content here</p>");
94
95        let result =
96            SeoPlugin.transform_html(&html, Path::new("index.html"), &ctx)?;
97        assert!(result.contains("<meta name=\"description\""));
98        assert!(result.contains("<meta property=\"og:title\""));
99        assert!(result.contains("Hello World"));
100        assert!(result.contains("<meta property=\"og:description\""));
101        assert!(
102            result.contains("<meta property=\"og:type\" content=\"website\"")
103        );
104        assert!(
105            result.contains("<meta name=\"twitter:card\" content=\"summary\"")
106        );
107        Ok(())
108    }
109
110    #[test]
111    fn test_seo_plugin_idempotent() -> Result<()> {
112        let tmp = tempdir()?;
113        let ctx = test_ctx(tmp.path());
114        let html = make_html("Test", "<p>Content</p>");
115
116        let first =
117            SeoPlugin.transform_html(&html, Path::new("page.html"), &ctx)?;
118        let second =
119            SeoPlugin.transform_html(&first, Path::new("page.html"), &ctx)?;
120
121        assert_eq!(first, second);
122        Ok(())
123    }
124
125    #[test]
126    fn test_extract_description_excludes_nav_header_footer() {
127        let html = r##"<html><head></head><body>
128            <a href="#main">Skip to content</a>
129            <nav><ul><li>Home</li><li>About</li><li>Search</li></ul></nav>
130            <header><h1>Site Header</h1></header>
131            <main><p>This is the actual page content that should be extracted.</p></main>
132            <footer><p>Copyright 2026</p></footer>
133            </body></html>"##;
134        let desc = extract_description(html, 160);
135        assert!(
136            desc.contains("actual page content"),
137            "description should contain main content, got: {desc}"
138        );
139        assert!(
140            !desc.contains("Skip to content"),
141            "description should not contain skip link text"
142        );
143        assert!(
144            !desc.contains("Site Header"),
145            "description should not contain header text"
146        );
147        assert!(
148            !desc.contains("Copyright"),
149            "description should not contain footer text"
150        );
151    }
152
153    #[test]
154    fn test_seo_plugin_handles_missing_title() -> Result<()> {
155        let tmp = tempdir()?;
156        let ctx = test_ctx(tmp.path());
157        let html =
158            "<html><head></head><body><p>No title here</p></body></html>";
159
160        let result =
161            SeoPlugin.transform_html(html, Path::new("no-title.html"), &ctx)?;
162        // Should still inject og:type and twitter:card
163        assert!(result.contains("<meta property=\"og:type\""));
164        assert!(result.contains("<meta name=\"twitter:card\""));
165        // Should not inject og:title (no title available)
166        assert!(!result.contains("<meta property=\"og:title\""));
167        Ok(())
168    }
169
170    #[test]
171    fn test_seo_plugin_empty_dir() -> Result<()> {
172        let tmp = tempdir()?;
173        let ctx = test_ctx(tmp.path());
174        assert!(SeoPlugin.after_compile(&ctx).is_ok());
175        Ok(())
176    }
177
178    #[test]
179    fn test_seo_plugin_nonexistent_dir() -> Result<()> {
180        let ctx = test_ctx(Path::new("/nonexistent/path"));
181        assert!(SeoPlugin.after_compile(&ctx).is_ok());
182        Ok(())
183    }
184
185    // -----------------------------------------------------------------
186    // RobotsPlugin tests
187    // -----------------------------------------------------------------
188
189    #[test]
190    fn test_robots_plugin_name() {
191        let plugin = RobotsPlugin::new("https://example.com");
192        assert_eq!(plugin.name(), "robots");
193    }
194
195    #[test]
196    fn test_robots_plugin_creates_file() -> Result<()> {
197        let tmp = tempdir()?;
198        let ctx = test_ctx(tmp.path());
199        let plugin = RobotsPlugin::new("https://example.com");
200        plugin.after_compile(&ctx)?;
201
202        let path = tmp.path().join("robots.txt");
203        assert!(path.exists());
204        Ok(())
205    }
206
207    #[test]
208    fn test_robots_plugin_correct_content() -> Result<()> {
209        let tmp = tempdir()?;
210        let ctx = test_ctx(tmp.path());
211        let plugin = RobotsPlugin::new("https://example.com");
212        plugin.after_compile(&ctx)?;
213
214        let content = fs::read_to_string(tmp.path().join("robots.txt"))?;
215        assert!(content.contains("User-agent: *"));
216        assert!(content.contains("Allow: /"));
217        assert!(content.contains("Sitemap: https://example.com/sitemap.xml"));
218        Ok(())
219    }
220
221    #[test]
222    fn test_robots_plugin_does_not_overwrite() -> Result<()> {
223        let tmp = tempdir()?;
224        let robots_path = tmp.path().join("robots.txt");
225        fs::write(&robots_path, "User-agent: *\nDisallow: /secret\n")?;
226
227        let ctx = test_ctx(tmp.path());
228        let plugin = RobotsPlugin::new("https://example.com");
229        plugin.after_compile(&ctx)?;
230
231        let content = fs::read_to_string(&robots_path)?;
232        assert!(content.contains("Disallow: /secret"));
233        assert!(!content.contains("Sitemap:"));
234        Ok(())
235    }
236
237    #[test]
238    fn test_robots_plugin_custom_base_url() -> Result<()> {
239        let tmp = tempdir()?;
240        let ctx = test_ctx(tmp.path());
241        let plugin = RobotsPlugin::new("https://my-site.org");
242        plugin.after_compile(&ctx)?;
243
244        let content = fs::read_to_string(tmp.path().join("robots.txt"))?;
245        assert!(content.contains("Sitemap: https://my-site.org/sitemap.xml"));
246        Ok(())
247    }
248
249    // -----------------------------------------------------------------
250    // CanonicalPlugin tests
251    // -----------------------------------------------------------------
252
253    #[test]
254    fn test_canonical_plugin_name() {
255        let plugin = CanonicalPlugin::new("https://example.com");
256        assert_eq!(plugin.name(), "canonical");
257    }
258
259    #[test]
260    fn test_canonical_plugin_injects_tag() -> Result<()> {
261        let tmp = tempdir()?;
262        let ctx = test_ctx(tmp.path());
263        let plugin = CanonicalPlugin::new("https://example.com");
264        let html = make_html("Home", "<p>Welcome</p>");
265        let page_path = tmp.path().join("index.html");
266
267        let result = plugin.transform_html(&html, &page_path, &ctx)?;
268        assert!(result.contains("<link rel=\"canonical\""));
269        assert!(result.contains("https://example.com/index.html"));
270        Ok(())
271    }
272
273    #[test]
274    fn test_canonical_plugin_idempotent() -> Result<()> {
275        let tmp = tempdir()?;
276        let ctx = test_ctx(tmp.path());
277        let plugin = CanonicalPlugin::new("https://example.com");
278        let html = make_html("Page", "<p>Content</p>");
279        let page_path = tmp.path().join("page.html");
280
281        let first = plugin.transform_html(&html, &page_path, &ctx)?;
282        let second = plugin.transform_html(&first, &page_path, &ctx)?;
283
284        assert_eq!(first, second);
285        Ok(())
286    }
287
288    #[test]
289    fn test_canonical_plugin_nested_files() -> Result<()> {
290        let tmp = tempdir()?;
291        fs::create_dir_all(tmp.path().join("blog"))?;
292        let ctx = test_ctx(tmp.path());
293        let plugin = CanonicalPlugin::new("https://example.com");
294        let html = make_html("Post", "<p>Blog post</p>");
295        let page_path = tmp.path().join("blog/post.html");
296
297        let result = plugin.transform_html(&html, &page_path, &ctx)?;
298        assert!(result.contains("https://example.com/blog/post.html"));
299        Ok(())
300    }
301
302    // -----------------------------------------------------------------
303    // Registration tests
304    // -----------------------------------------------------------------
305
306    #[test]
307    fn test_all_plugins_register() {
308        use crate::plugin::PluginManager;
309        let mut pm = PluginManager::new();
310        pm.register(SeoPlugin);
311        pm.register(RobotsPlugin::new("https://example.com"));
312        pm.register(CanonicalPlugin::new("https://example.com"));
313        assert_eq!(pm.len(), 3);
314        assert_eq!(pm.names(), vec!["seo", "robots", "canonical"]);
315    }
316
317    // -----------------------------------------------------------------
318    // Additional edge-case tests
319    // -----------------------------------------------------------------
320
321    #[test]
322    fn extract_description_unicode_truncation_respects_char_boundary() {
323        // Arrange: multi-byte chars (é = 2 bytes, 日 = 3 bytes)
324        let text = "café 日本語 ".repeat(30);
325        let html =
326            format!("<html><head></head><body><p>{text}</p></body></html>");
327
328        // Act
329        let desc = extract_description(&html, 50);
330
331        // Assert: result is valid UTF-8 and within limit
332        assert!(desc.len() <= 50);
333        assert!(!desc.is_empty());
334        // Verify it doesn't panic and is a valid string
335        let _ = desc.chars().count();
336    }
337
338    #[test]
339    fn extract_description_empty_main_falls_back_to_body() {
340        // Arrange: <main> is present but empty
341        let html = "<html><head></head><body>\
342                     <main></main>\
343                     <p>Body fallback text</p>\
344                     </body></html>";
345
346        // Act
347        let desc = extract_description(html, 160);
348
349        // Assert: empty main yields empty string (main takes priority)
350        assert!(
351            desc.is_empty(),
352            "expected empty description from empty <main>, got: {desc}"
353        );
354    }
355
356    #[test]
357    fn extract_description_no_body_uses_raw_html() {
358        // Arrange: no <body> tag at all
359        let html = "<div><p>Raw content without body</p></div>";
360
361        // Act
362        let desc = extract_description(html, 160);
363
364        // Assert: falls back to raw HTML content
365        assert!(
366            desc.contains("Raw content without body"),
367            "expected raw content fallback, got: {desc}"
368        );
369    }
370
371    #[test]
372    fn extract_title_with_nested_tags() {
373        // Arrange: title contains nested HTML tags
374        let html = "<html><head><title><span>Foo</span></title></head></html>";
375
376        // Act
377        let title = extract_title(html);
378
379        // Assert: nested tags are stripped, text is preserved
380        assert_eq!(title, "Foo");
381    }
382
383    #[test]
384    fn escape_attr_all_special_chars() {
385        // Arrange
386        let input = r#"Tom & "Jerry" <script>alert('xss')</script>"#;
387
388        // Act
389        let escaped = escape_attr(input);
390
391        // Assert: all special chars are escaped
392        assert!(escaped.contains("&amp;"), "& should be escaped");
393        assert!(escaped.contains("&quot;"), "\" should be escaped");
394        assert!(escaped.contains("&lt;"), "< should be escaped");
395        assert!(escaped.contains("&gt;"), "> should be escaped");
396        assert_eq!(
397            escaped,
398            "Tom &amp; &quot;Jerry&quot; &lt;script&gt;alert('xss')&lt;/script&gt;"
399        );
400    }
401
402    #[test]
403    fn seo_plugin_skips_existing_single_quote_meta() -> Result<()> {
404        // Arrange: meta tags use single quotes
405        let html = "<html><head>\
406                     <meta name='description' content='Already set'>\
407                     <meta property='og:title' content='Title'>\
408                     <meta property='og:description' content='Desc'>\
409                     <meta property='og:type' content='website'>\
410                     <meta name='twitter:card' content='summary'>\
411                     <title>Test</title></head>\
412                     <body><p>Content</p></body></html>";
413        let tmp = tempdir()?;
414        let ctx = test_ctx(tmp.path());
415
416        // Act
417        let result = SeoPlugin.transform_html(
418            html,
419            Path::new("single-quote.html"),
420            &ctx,
421        )?;
422        assert_eq!(
423            result.matches("meta name=\"description\"").count()
424                + result.matches("meta name='description'").count(),
425            1,
426            "description meta should not be duplicated"
427        );
428        assert_eq!(
429            result.matches("og:title").count(),
430            1,
431            "og:title should not be duplicated"
432        );
433        Ok(())
434    }
435
436    #[test]
437    fn canonical_plugin_trailing_slash_base_url() -> Result<()> {
438        let tmp = tempdir()?;
439        let ctx = test_ctx(tmp.path());
440        let plugin = CanonicalPlugin::new("https://example.com/");
441        let html = make_html("Home", "<p>Welcome</p>");
442        let page_path = tmp.path().join("index.html");
443
444        let result = plugin.transform_html(&html, &page_path, &ctx)?;
445        assert!(
446            result.contains("https://example.com/index.html"),
447            "should produce clean URL without double slash"
448        );
449        assert!(
450            !result.contains("https://example.com//"),
451            "should not contain double slash in canonical URL"
452        );
453        Ok(())
454    }
455
456    #[test]
457    fn robots_plugin_trailing_slash_base_url() -> Result<()> {
458        // Arrange: base_url has a trailing slash
459        let tmp = tempdir()?;
460        let ctx = test_ctx(tmp.path());
461        let plugin = RobotsPlugin::new("https://example.com/");
462
463        // Act
464        plugin.after_compile(&ctx)?;
465
466        // Assert: sitemap URL has no double slash
467        let content = fs::read_to_string(tmp.path().join("robots.txt"))?;
468        assert!(
469            content.contains("Sitemap: https://example.com/sitemap.xml"),
470            "sitemap URL should not have double slash, got: {content}"
471        );
472        assert!(
473            !content.contains("https://example.com//"),
474            "should not contain double slash"
475        );
476        Ok(())
477    }
478
479    #[test]
480    fn extract_description_nested_script_in_main() {
481        // Arrange: <main> contains a <script> block alongside real content
482        let html = "<html><head></head><body>\
483                     <main>\
484                     <script>var x = 'ignore me';</script>\
485                     <p>Visible text after script</p>\
486                     </main></body></html>";
487
488        // Act
489        let desc = extract_description(html, 160);
490
491        // Assert: script content is stripped, visible text remains
492        assert!(
493            desc.contains("Visible text after script"),
494            "should contain the paragraph text, got: {desc}"
495        );
496        assert!(
497            !desc.contains("ignore me"),
498            "should not contain script content, got: {desc}"
499        );
500    }
501
502    // -----------------------------------------------------------------
503    // JSON-LD Plugin tests
504    // -----------------------------------------------------------------
505
506    #[test]
507    fn test_jsonld_injects_webpage() {
508        let dir = tempdir().unwrap();
509        let site = dir.path().join("site");
510        fs::create_dir_all(&site).unwrap();
511
512        let html = make_html("About", "<p>About us</p>");
513        let ctx = test_ctx(&site);
514        let plugin = JsonLdPlugin::from_site("https://example.com", "Test Org");
515        let page_path = site.join("about.html");
516
517        let output = plugin.transform_html(&html, &page_path, &ctx).unwrap();
518        assert!(output.contains("application/ld+json"));
519        assert!(output.contains("\"@type\":\"WebPage\""));
520        assert!(output.contains("\"name\":\"About\""));
521    }
522
523    #[test]
524    fn test_jsonld_injects_article() {
525        let dir = tempdir().unwrap();
526        let site = dir.path().join("site");
527        fs::create_dir_all(&site).unwrap();
528
529        let html = "<html><head><title>Post</title></head>\
530                     <body><article><h1>Post</h1></article></body></html>";
531        let ctx = test_ctx(&site);
532        let plugin = JsonLdPlugin::from_site("https://example.com", "My Org");
533        let page_path = site.join("post.html");
534
535        let output = plugin.transform_html(html, &page_path, &ctx).unwrap();
536        assert!(output.contains("\"@type\":\"Article\""));
537        assert!(output.contains("\"headline\":\"Post\""));
538        assert!(output.contains("My Org"));
539    }
540
541    #[test]
542    fn test_jsonld_breadcrumbs() {
543        let dir = tempdir().unwrap();
544        let site = dir.path().join("site");
545        let blog = site.join("blog");
546        fs::create_dir_all(&blog).unwrap();
547
548        let html = make_html("My Post", "<p>Content</p>");
549        let ctx = test_ctx(&site);
550        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
551        let page_path = blog.join("my-post.html");
552
553        let output = plugin.transform_html(&html, &page_path, &ctx).unwrap();
554        assert!(output.contains("BreadcrumbList"));
555        assert!(output.contains("\"name\":\"Home\""));
556        assert!(output.contains("\"name\":\"blog\""));
557    }
558
559    #[test]
560    fn test_jsonld_idempotent() {
561        let dir = tempdir().unwrap();
562        let site = dir.path().join("site");
563        fs::create_dir_all(&site).unwrap();
564
565        let html = "<html><head><title>X</title>\
566                     <script type=\"application/ld+json\">{}</script>\
567                     </head><body></body></html>";
568        let ctx = test_ctx(&site);
569        let plugin = JsonLdPlugin::from_site("https://example.com", "Org");
570        let page_path = site.join("x.html");
571
572        let output = plugin.transform_html(html, &page_path, &ctx).unwrap();
573        // Should have exactly one ld+json (the original), not two
574        let count = output.matches("application/ld+json").count();
575        assert_eq!(count, 1);
576    }
577
578    // -----------------------------------------------------------------
579    // extract_title — edge cases
580    // -----------------------------------------------------------------
581
582    #[test]
583    fn extract_title_empty_tag_returns_empty_string() {
584        assert_eq!(extract_title("<title></title>"), "");
585        assert_eq!(extract_title("<title>   </title>"), "");
586        assert_eq!(extract_title("<title>\n\t </title>"), "");
587    }
588
589    #[test]
590    fn extract_title_without_closing_tag_returns_empty() {
591        assert_eq!(extract_title("<title>Unterminated"), "");
592    }
593
594    #[test]
595    fn extract_title_strips_inner_html_tags() {
596        let out = extract_title("<title>Hello <em>World</em></title>");
597        assert!(out.contains("Hello"));
598        assert!(out.contains("World"));
599    }
600
601    // -----------------------------------------------------------------
602    // extract_description — every branch
603    // -----------------------------------------------------------------
604
605    #[test]
606    fn extract_description_prefers_main_over_body() {
607        let html = r"<html><head></head><body>
608            <nav>menu</nav>
609            <main>The primary content.</main>
610            <footer>Bottom</footer>
611        </body></html>";
612        let desc = extract_description(html, 200);
613        assert!(desc.contains("primary content"));
614        assert!(!desc.contains("menu"));
615    }
616
617    #[test]
618    fn extract_description_main_without_closing_tag_takes_rest() {
619        let html = r"<html><body><main>content without close";
620        let desc = extract_description(html, 200);
621        assert!(desc.contains("content without close"));
622    }
623
624    #[test]
625    fn extract_description_main_without_angle_bracket_returns_empty_fallback() {
626        let html = "<html><body><main";
627        let desc = extract_description(html, 200);
628        assert_eq!(desc, "");
629    }
630
631    #[test]
632    fn extract_description_fallback_to_body_strips_script_and_style() {
633        let html = r"<html><head></head><body>
634            <script>alert('skip');</script>
635            <style>body { color: red; }</style>
636            <nav>menu items here</nav>
637            <header>site title</header>
638            <p>The body text.</p>
639            <footer>copyright</footer>
640        </body></html>";
641        let desc = extract_description(html, 200);
642        assert!(desc.contains("body text"));
643        assert!(!desc.contains("alert"));
644        assert!(!desc.contains("color: red"));
645        assert!(!desc.contains("menu items"));
646        assert!(!desc.contains("site title"));
647        assert!(!desc.contains("copyright"));
648    }
649
650    #[test]
651    fn extract_description_body_without_closing_tag_uses_rest() {
652        let html = "<html><body><p>open-ended body paragraph";
653        let desc = extract_description(html, 200);
654        assert!(desc.contains("open-ended body paragraph"));
655    }
656
657    #[test]
658    fn extract_description_body_without_angle_bracket_returns_empty() {
659        let html = "<html><body";
660        let desc = extract_description(html, 200);
661        assert_eq!(desc, "");
662    }
663
664    #[test]
665    fn extract_description_no_body_no_main_uses_entire_html() {
666        let html = "just plain text no tags here";
667        let desc = extract_description(html, 200);
668        assert!(desc.contains("just plain text"));
669    }
670
671    #[test]
672    fn extract_description_unterminated_script_breaks_out() {
673        let html = "<html><body><main><script>unterminated<p>x</p>";
674        let desc = extract_description(html, 200);
675        let _ = desc;
676    }
677
678    #[test]
679    fn extract_description_truncates_at_word_boundary() {
680        let html = "<html><body><main>one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty twenty-one twenty-two twenty-three twenty-four twenty-five</main></body></html>";
681        let desc = extract_description(html, 80);
682        assert!(desc.len() <= 80);
683        assert!(!desc.ends_with('-'));
684    }
685
686    #[test]
687    fn extract_description_truncates_without_space_falls_to_byte_cut() {
688        let html =
689            "<html><body><main>oneverylongwordwithnospacesanywherehere</main></body></html>";
690        let desc = extract_description(html, 10);
691        assert!(desc.len() <= 10);
692    }
693
694    #[test]
695    fn extract_description_respects_char_boundary_on_truncation() {
696        let html = "<html><body><main>Rust programming — é ñ ü characters everywhere in this text that we want to truncate mid-char</main></body></html>";
697        let desc = extract_description(html, 30);
698        assert!(desc.is_ascii() || !desc.is_empty());
699    }
700
701    #[test]
702    fn extract_description_truncation_walks_back_multiple_bytes() {
703        let mut input = String::from("<html><body><main>");
704        input.push_str(&"a".repeat(20));
705        input.push('🎉'); // 4 bytes
706        input.push_str(&"b".repeat(20));
707        input.push_str("</main></body></html>");
708        let desc = extract_description(&input, 22);
709        assert!(!desc.is_empty(), "expected non-empty desc");
710        let _ = desc.len();
711    }
712
713    #[test]
714    fn extract_description_body_fallback_unterminated_nav_breaks() {
715        let html = "<html><body><nav>unterminated nav block<p>visible</p>";
716        let desc = extract_description(html, 200);
717        let _ = desc;
718    }
719
720    // -----------------------------------------------------------------
721    // SeoPlugin.after_compile — no </head> tag
722    // -----------------------------------------------------------------
723
724    #[test]
725    fn seo_plugin_file_without_head_tag_is_unchanged() {
726        let dir = tempdir().unwrap();
727        fs::write(
728            dir.path().join("fragment.html"),
729            "<p>no html/head/body structure</p>",
730        )
731        .unwrap();
732        let ctx = test_ctx(dir.path());
733        SeoPlugin.after_compile(&ctx).unwrap();
734        let out = fs::read_to_string(dir.path().join("fragment.html")).unwrap();
735        assert_eq!(out, "<p>no html/head/body structure</p>");
736    }
737
738    #[test]
739    fn seo_plugin_missing_site_dir_returns_ok() {
740        let dir = tempdir().unwrap();
741        let missing = dir.path().join("missing");
742        let ctx = test_ctx(&missing);
743        SeoPlugin.after_compile(&ctx).unwrap();
744    }
745
746    // -----------------------------------------------------------------
747    // RobotsPlugin — idempotency + missing dir
748    // -----------------------------------------------------------------
749
750    #[test]
751    fn robots_plugin_skips_existing_robots_txt() {
752        let dir = tempdir().unwrap();
753        let existing = dir.path().join("robots.txt");
754        fs::write(&existing, "USER: existing").unwrap();
755
756        let plugin = RobotsPlugin::new("https://example.com");
757        let ctx = test_ctx(dir.path());
758        plugin.after_compile(&ctx).unwrap();
759
760        assert_eq!(fs::read_to_string(&existing).unwrap(), "USER: existing");
761    }
762
763    #[test]
764    fn robots_plugin_writes_user_agent_and_sitemap() {
765        let dir = tempdir().unwrap();
766        let plugin = RobotsPlugin::new("https://example.com/");
767        let ctx = test_ctx(dir.path());
768        plugin.after_compile(&ctx).unwrap();
769
770        let body = fs::read_to_string(dir.path().join("robots.txt")).unwrap();
771        assert!(body.contains("User-agent: *"));
772        assert!(body.contains("Sitemap: https://example.com/sitemap.xml"));
773    }
774
775    #[test]
776    fn robots_plugin_missing_site_dir_returns_ok() {
777        let dir = tempdir().unwrap();
778        let missing = dir.path().join("missing");
779        let plugin = RobotsPlugin::new("https://example.com");
780        let ctx = test_ctx(&missing);
781        plugin.after_compile(&ctx).unwrap();
782    }
783
784    #[test]
785    fn robots_plugin_name_returns_static_identifier() {
786        assert_eq!(RobotsPlugin::new("").name(), "robots");
787    }
788
789    // -----------------------------------------------------------------
790    // CanonicalPlugin — skip path, missing head, already-canonical
791    // -----------------------------------------------------------------
792
793    #[test]
794    fn canonical_plugin_missing_site_dir_returns_ok() {
795        let dir = tempdir().unwrap();
796        let missing = dir.path().join("missing");
797        let plugin = CanonicalPlugin::new("https://example.com");
798        let ctx = test_ctx(&missing);
799        plugin.after_compile(&ctx).unwrap();
800    }
801
802    #[test]
803    fn canonical_plugin_replaces_existing_canonical_with_correct_url() {
804        let dir = tempdir().unwrap();
805        let html = r#"<html><head><link rel="canonical" href="/original"></head><body></body></html>"#;
806        let plugin = CanonicalPlugin::new("https://example.com");
807        let ctx = test_ctx(dir.path());
808        let page_path = dir.path().join("p.html");
809
810        let out = plugin.transform_html(html, &page_path, &ctx).unwrap();
811        assert_eq!(out.matches(r#"rel="canonical""#).count(), 1);
812        assert!(out.contains("https://example.com/p.html"));
813    }
814
815    #[test]
816    fn canonical_plugin_skips_pages_with_single_quoted_canonical() {
817        let dir = tempdir().unwrap();
818        let html =
819            r"<html><head><link rel='canonical' href='/x'></head></html>";
820        let plugin = CanonicalPlugin::new("https://example.com");
821        let ctx = test_ctx(dir.path());
822        let page_path = dir.path().join("p.html");
823
824        let out = plugin.transform_html(html, &page_path, &ctx).unwrap();
825        assert_eq!(out.matches("canonical").count(), 1);
826    }
827
/// Markup with no `<head>` element comes back from `transform_html`
/// equal to the input.
#[test]
fn canonical_plugin_page_without_head_is_left_unchanged() {
    let tmp = tempdir().unwrap();
    let fragment = "<p>no structure</p>";
    let canonical = CanonicalPlugin::new("https://example.com");
    let ctx = test_ctx(tmp.path());
    let target = tmp.path().join("frag.html");

    let rendered = canonical
        .transform_html(fragment, &target, &ctx)
        .unwrap();

    assert_eq!(rendered, fragment);
}
839
/// A page with a `<head>` but no canonical link gains a `rel="canonical"`
/// attribute; with a base URL ending in `/`, the output contains
/// `https://example.com/a.html`.
#[test]
fn canonical_plugin_injects_canonical_link_before_head_close() {
    let tmp = tempdir().unwrap();
    let input = "<html><head><title>T</title></head><body></body></html>";
    let canonical = CanonicalPlugin::new("https://example.com/");
    let ctx = test_ctx(tmp.path());
    let target = tmp.path().join("a.html");

    let rendered = canonical.transform_html(input, &target, &ctx).unwrap();

    let has_rel = rendered.contains(r#"rel="canonical""#);
    let has_url = rendered.contains("https://example.com/a.html");
    assert!(has_rel);
    assert!(has_url);
}
852
/// The plugin reports `"canonical"` as its name, even with an empty base URL.
#[test]
fn canonical_plugin_name_returns_static_identifier() {
    let canonical = CanonicalPlugin::new("");
    assert_eq!(canonical.name(), "canonical");
}
857
858    // -----------------------------------------------------------------
859    // JsonLdPlugin — WebPage branch + no-head skip
860    // -----------------------------------------------------------------
861
/// `after_compile` returns `Ok` when the context's site directory was
/// never created on disk.
#[test]
fn jsonld_plugin_missing_site_dir_returns_ok() {
    let tmp = tempdir().unwrap();
    let absent_site = tmp.path().join("missing");
    let jsonld = JsonLdPlugin::from_site("https://example.com", "Org");
    let ctx = test_ctx(&absent_site);

    let outcome = jsonld.after_compile(&ctx);
    outcome.unwrap();
}
870
/// Markup with no `<head>` element comes back from `transform_html`
/// equal to the input.
#[test]
fn jsonld_plugin_skips_pages_without_head_tag() {
    let tmp = tempdir().unwrap();
    let site_root = tmp.path().join("site");
    fs::create_dir_all(&site_root).unwrap();
    let ctx = test_ctx(&site_root);
    let jsonld = JsonLdPlugin::from_site("https://example.com", "Org");
    let target = site_root.join("frag.html");

    let rendered = jsonld.transform_html("<p>no head</p>", &target, &ctx);

    assert_eq!(rendered.unwrap(), "<p>no head</p>");
}
885
/// For a page with no `<article>` element, the output contains both
/// `application/ld+json` and `WebPage`.
#[test]
fn jsonld_plugin_generates_webpage_when_no_article_element() {
    let tmp = tempdir().unwrap();
    let site_root = tmp.path().join("site");
    fs::create_dir_all(&site_root).unwrap();
    let input = "<html><head><title>Hello</title></head><body><p>content</p></body></html>";
    let ctx = test_ctx(&site_root);
    let jsonld = JsonLdPlugin::from_site("https://example.com", "Org");
    let target = site_root.join("index.html");

    let rendered = jsonld.transform_html(input, &target, &ctx).unwrap();

    let has_script_type = rendered.contains("application/ld+json");
    let has_webpage = rendered.contains("WebPage");
    assert!(has_script_type);
    assert!(has_webpage);
}
900
/// For a page whose body holds an `<article>` element, the output contains
/// `application/ld+json` and the quoted string `"Article"`.
#[test]
fn jsonld_plugin_generates_article_when_article_element_present() {
    let tmp = tempdir().unwrap();
    let site_root = tmp.path().join("site");
    fs::create_dir_all(&site_root).unwrap();
    let input = "<html><head><title>Post</title></head><body><article><h1>Post</h1></article></body></html>";
    let ctx = test_ctx(&site_root);
    let jsonld = JsonLdPlugin::from_site("https://example.com", "Org");
    let target = site_root.join("post.html");

    let rendered = jsonld.transform_html(input, &target, &ctx).unwrap();

    let has_script_type = rendered.contains("application/ld+json");
    let has_article = rendered.contains(r#""Article""#);
    assert!(has_script_type);
    assert!(has_article);
}
915
/// `JsonLdPlugin::new` keeps the `base_url`, `org_name` and `breadcrumbs`
/// values of the config it is given.
#[test]
fn jsonld_plugin_new_stores_supplied_config() {
    let supplied = JsonLdConfig {
        base_url: "https://a".to_string(),
        org_name: "Org".to_string(),
        breadcrumbs: false,
    };

    let jsonld = JsonLdPlugin::new(supplied);
    let stored = &jsonld.config;

    assert_eq!(stored.base_url, "https://a");
    assert_eq!(stored.org_name, "Org");
    assert!(!stored.breadcrumbs);
}
928
/// The plugin reports `"json-ld"` as its name.
#[test]
fn jsonld_plugin_name_returns_static_identifier() {
    let reported =
        JsonLdPlugin::from_site("https://example.com", "Org").name();
    assert_eq!(reported, "json-ld");
}
934
935    // -----------------------------------------------------------------
936    // collect_html_files_recursive
937    // -----------------------------------------------------------------
938
/// With two `.html` files at the top level, one in `sub/`, and one
/// `.css` file, the collector returns three entries.
///
/// Only the count is asserted here; ordering is not checked.
#[test]
fn collect_html_files_recursive_filters_and_sorts() {
    let root = tempdir().unwrap();
    let nested = root.path().join("sub");
    fs::create_dir(&nested).unwrap();
    for name in ["z.html", "a.html"] {
        fs::write(root.path().join(name), "").unwrap();
    }
    fs::write(nested.join("m.html"), "").unwrap();
    fs::write(root.path().join("ignore.css"), "").unwrap();

    let collected = collect_html_files_recursive(root.path()).unwrap();

    assert_eq!(collected.len(), 3);
}
952
/// Pointing the collector at a directory that does not exist yields `Ok`
/// with an empty collection.
#[test]
fn collect_html_files_recursive_missing_dir_returns_empty() {
    let root = tempdir().unwrap();
    let absent = root.path().join("missing");

    let collected = collect_html_files_recursive(&absent).unwrap();

    assert!(collected.is_empty());
}
960
961    // -----------------------------------------------------------------
962    // has_meta_tag — name= and property= variants
963    // -----------------------------------------------------------------
964
/// A double-quoted `name="description"` attribute is detected.
#[test]
fn has_meta_tag_detects_name_double_quote() {
    let markup = r#"<meta name="description" content="hello">"#;
    let found = has_meta_tag(markup, "description");
    assert!(found);
}
970
/// A single-quoted `name='description'` attribute is detected.
#[test]
fn has_meta_tag_detects_name_single_quote() {
    let markup = "<meta name='description' content='hello'>";
    let found = has_meta_tag(markup, "description");
    assert!(found);
}
976
/// A double-quoted `property="og:title"` attribute is detected.
#[test]
fn has_meta_tag_detects_property_double_quote() {
    let markup = r#"<meta property="og:title" content="T">"#;
    let found = has_meta_tag(markup, "og:title");
    assert!(found);
}
982
/// A single-quoted `property='og:title'` attribute is detected.
#[test]
fn has_meta_tag_detects_property_single_quote() {
    let markup = "<meta property='og:title' content='T'>";
    let found = has_meta_tag(markup, "og:title");
    assert!(found);
}
988
/// A document with an empty `<head>` reports no `description` meta tag.
#[test]
fn has_meta_tag_returns_false_when_absent() {
    let markup = "<html><head></head></html>";
    let found = has_meta_tag(markup, "description");
    assert!(!found);
}
994
/// HTML comments that merely mention "Open Graph" are not counted as an
/// `og:title` meta tag.
#[test]
fn has_meta_tag_ignores_comment_markers() {
    let markup = concat!(
        "<!-- # Start Open Graph / Facebook Meta Tags -->\n",
        "<!-- # End Open Graph / Facebook Meta Tags -->",
    );
    let found = has_meta_tag(markup, "og:title");
    assert!(!found);
}
1001
1002    // -----------------------------------------------------------------
1003    // extract_canonical
1004    // -----------------------------------------------------------------
1005
/// The `href` of a canonical `<link>` is returned verbatim.
#[test]
fn extract_canonical_finds_url() {
    let markup = r#"<link rel="canonical" href="https://example.com/page">"#;
    let href = extract_canonical(markup);
    assert_eq!(href, "https://example.com/page");
}
1011
/// A document without a canonical link yields the empty string.
#[test]
fn extract_canonical_returns_empty_when_missing() {
    let markup = "<html><head><title>No canonical</title></head></html>";
    let href = extract_canonical(markup);
    assert_eq!(href, "");
}
1017
1018    // -----------------------------------------------------------------
1019    // extract_existing_meta — name and property attributes
1020    // -----------------------------------------------------------------
1021
/// The `content` of a `name="author"` meta tag is returned.
#[test]
fn extract_existing_meta_name_variant() {
    let markup = r#"<meta name="author" content="Alice">"#;
    let content = extract_existing_meta(markup, "author");
    assert_eq!(content, "Alice");
}
1027
/// The `content` of a `property="article:published_time"` meta tag is
/// returned.
#[test]
fn extract_existing_meta_property_variant() {
    let markup =
        r#"<meta property="article:published_time" content="2026-01-01">"#;
    let content = extract_existing_meta(markup, "article:published_time");
    assert_eq!(content, "2026-01-01");
}
1037
/// Single-quoted attributes are handled: the `content` value is returned.
#[test]
fn extract_existing_meta_single_quote_variant() {
    let markup = "<meta name='author' content='Bob'>";
    let content = extract_existing_meta(markup, "author");
    assert_eq!(content, "Bob");
}
1043
/// A document with no matching meta tag yields the empty string.
#[test]
fn extract_existing_meta_returns_empty_when_absent() {
    let markup = "<html><head></head></html>";
    let content = extract_existing_meta(markup, "author");
    assert_eq!(content, "");
}
1049
1050    // -----------------------------------------------------------------
1051    // extract_meta_author
1052    // -----------------------------------------------------------------
1053
/// The author is read from a `name="author"` meta tag.
#[test]
fn extract_meta_author_from_meta_tag() {
    let markup = r#"<meta name="author" content="Jane Doe">"#;
    let author = extract_meta_author(markup);
    assert_eq!(author, "Jane Doe");
}
1059
/// The author is read from the text of a `class="author"` span.
#[test]
fn extract_meta_author_from_class_author_span() {
    let markup = r#"<span class="author">John Smith</span>"#;
    let author = extract_meta_author(markup);
    assert_eq!(author, "John Smith");
}
1065
/// A leading `by ` in the author span is dropped from the result.
#[test]
fn extract_meta_author_strips_by_prefix() {
    let markup = r#"<span class="author">by Alice Wonder</span>"#;
    let author = extract_meta_author(markup);
    assert_eq!(author, "Alice Wonder");
}
1071
/// A document with neither an author meta tag nor an author span yields
/// the empty string.
#[test]
fn extract_meta_author_returns_empty_when_absent() {
    let markup = "<html><body><p>No author</p></body></html>";
    let author = extract_meta_author(markup);
    assert_eq!(author, "");
}
1077
1078    // -----------------------------------------------------------------
1079    // extract_date_from_html (JSON-LD)
1080    // -----------------------------------------------------------------
1081
/// The `datePublished` value inside an embedded JSON-LD script is returned
/// as `Some`.
#[test]
fn extract_date_from_html_finds_date_published() {
    let markup = r#"<script type="application/ld+json">{"datePublished":"2026-03-15"}</script>"#;
    let expected = Some(String::from("2026-03-15"));

    let found = extract_date_from_html(markup, "datePublished");

    assert_eq!(found, expected);
}
1090
/// A document with no `datePublished` key yields `None`.
#[test]
fn extract_date_from_html_returns_none_when_absent() {
    let markup = "<html><body></body></html>";
    let found = extract_date_from_html(markup, "datePublished");
    assert_eq!(found, None);
}
1096
1097    // -----------------------------------------------------------------
1098    // extract_meta_date
1099    // -----------------------------------------------------------------
1100
/// The date is read from an `article:published_time` meta property.
#[test]
fn extract_meta_date_from_published_time() {
    let markup =
        r#"<meta property="article:published_time" content="2026-06-01">"#;
    let expected = Some(String::from("2026-06-01"));

    let found = extract_meta_date(markup);

    assert_eq!(found, expected);
}
1107
/// The date is read from the `datetime` attribute of a `<time>` element.
#[test]
fn extract_meta_date_from_time_datetime() {
    let markup = r#"<time datetime="2026-07-04">July 4</time>"#;
    let expected = Some(String::from("2026-07-04"));

    let found = extract_meta_date(markup);

    assert_eq!(found, expected);
}
1113
/// A document with no date markup yields `None`.
#[test]
fn extract_meta_date_returns_none_when_absent() {
    let markup = "<html><body><p>No date</p></body></html>";
    let found = extract_meta_date(markup);
    assert_eq!(found, None);
}
1119
1120    // -----------------------------------------------------------------
1121    // extract_html_lang
1122    // -----------------------------------------------------------------
1123
/// A double-quoted `lang` attribute on `<html>` is returned.
#[test]
fn extract_html_lang_double_quotes() {
    let markup = r#"<html lang="fr-FR"><head></head></html>"#;
    let lang = extract_html_lang(markup);
    assert_eq!(lang, "fr-FR");
}
1129
/// A single-quoted `lang` attribute on `<html>` is returned.
#[test]
fn extract_html_lang_single_quotes() {
    let markup = "<html lang='de-DE'><head></head></html>";
    let lang = extract_html_lang(markup);
    assert_eq!(lang, "de-DE");
}
1135
/// An `<html>` element without a `lang` attribute yields the empty string.
#[test]
fn extract_html_lang_missing_returns_empty() {
    let markup = "<html><head></head></html>";
    let lang = extract_html_lang(markup);
    assert_eq!(lang, "");
}
1141
1142    // -----------------------------------------------------------------
1143    // extract_first_content_image
1144    // -----------------------------------------------------------------
1145
/// The `src` of an image inside `<main>` is returned.
#[test]
fn extract_first_content_image_from_main() {
    let markup = r#"<html><body><main><img src="/img/hero.jpg"></main></body></html>"#;
    let src = extract_first_content_image(markup);
    assert_eq!(src, "/img/hero.jpg");
}
1151
/// The `src` of an image inside `<article>` is returned.
#[test]
fn extract_first_content_image_from_article() {
    let markup = r#"<html><body><article><img src="/img/post.png"></article></body></html>"#;
    let src = extract_first_content_image(markup);
    assert_eq!(src, "/img/post.png");
}
1157
/// A `<main>` region that holds no `<img>` yields the empty string.
#[test]
fn extract_first_content_image_no_image_returns_empty() {
    let markup = "<html><body><main><p>No images</p></main></body></html>";
    let src = extract_first_content_image(markup);
    assert_eq!(src, "");
}
1163
/// An image that sits only inside a `<div>` (no `<main>` or `<article>`)
/// is not picked up; the result is the empty string.
#[test]
fn extract_first_content_image_no_main_or_article_returns_empty() {
    let markup = r#"<html><body><div><img src="/img/sidebar.jpg"></div></body></html>"#;
    let src = extract_first_content_image(markup);
    assert_eq!(src, "");
}
1169
1170    // -----------------------------------------------------------------
1171    // inject_seo_tags — article page triggers summary_large_image
1172    // -----------------------------------------------------------------
1173
/// For a page containing an `<article>` element, `SeoPlugin` output holds
/// both `content="summary_large_image"` and `content="article"`.
#[test]
fn inject_seo_tags_article_page_uses_large_image_card() -> Result<()> {
    let workdir = tempdir()?;
    let input = concat!(
        "<html><head><title>Blog Post</title></head>",
        "<body><article><p>Article content</p></article></body></html>",
    );
    let ctx = test_ctx(workdir.path());
    let target = Path::new("post.html");

    let rendered = SeoPlugin.transform_html(input, target, &ctx)?;

    let has_large_card = rendered.contains("content=\"summary_large_image\"");
    assert!(
        has_large_card,
        "article pages should use summary_large_image twitter card"
    );
    let has_article_type = rendered.contains("content=\"article\"");
    assert!(
        has_article_type,
        "article pages should use og:type=article"
    );
    Ok(())
}
1193
1194    // -----------------------------------------------------------------
1195    // CanonicalPlugin — replaces existing canonicals
1196    // -----------------------------------------------------------------
1197
/// A stale canonical pointing at another host is swapped out: the output
/// contains the URL for the plugin's base, no longer contains the old
/// URL, and mentions `canonical` exactly once.
#[test]
fn canonical_plugin_replaces_not_skips_existing() -> Result<()> {
    let workdir = tempdir()?;
    let input = r#"<html><head><link rel="canonical" href="https://old.com/wrong"></head><body></body></html>"#;
    let canonical = CanonicalPlugin::new("https://correct.com");
    let ctx = test_ctx(workdir.path());
    let target = workdir.path().join("page.html");

    let rendered = canonical.transform_html(input, &target, &ctx)?;

    let has_new_url = rendered.contains("https://correct.com/page.html");
    assert!(has_new_url, "canonical should be replaced with correct URL");

    let has_old_url = rendered.contains("https://old.com/wrong");
    assert!(!has_old_url, "old canonical should be removed");

    let occurrences = rendered.matches("canonical").count();
    assert_eq!(occurrences, 1, "should have exactly one canonical link");
    Ok(())
}
1222}
ssg/seo/mod.rs

ssg/seo/
mod.rs