ssg/
ai.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! AI-readiness content hooks.
5//!
6//! Provides algorithmic content enhancements for Generative Engine
7//! Optimization (GEO) and Answer Engine Optimization (AEO):
8//!
9//! - Auto-generate meta descriptions from page content when missing
10//! - Validate all `<img>` elements have alt text (log warnings)
//! - Generate `llms.txt` and `llms-full.txt` for AI crawler guidance
//! - Inject a `max-snippet` robots meta tag into pages that lack one
12
13use crate::plugin::{Plugin, PluginContext};
14use anyhow::Result;
15use std::{
16    collections::BTreeMap,
17    fs,
18    path::{Path, PathBuf},
19};
20
/// Plugin for AI-readiness content validation and enhancement.
///
/// A stateless unit struct: all work happens in its [`Plugin`]
/// implementation. Its `after_compile` hook:
/// - Generates `llms.txt` and `llms-full.txt` in the site root
/// - Adds max-snippet meta for AI citation eligibility
/// - Checks all images have alt text (logs warnings for missing)
#[derive(Debug, Clone, Copy)]
pub struct AiPlugin;
29
30impl Plugin for AiPlugin {
31    fn name(&self) -> &'static str {
32        "ai"
33    }
34
35    fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
36        if !ctx.site_dir.exists() {
37            return Ok(());
38        }
39
40        generate_llms_txt(&ctx.site_dir, ctx.config.as_ref())?;
41        generate_llms_full_txt(&ctx.site_dir, ctx.config.as_ref())?;
42
43        let html_files = collect_html_files(&ctx.site_dir)?;
44        let pages_with_missing_alt =
45            process_html_for_ai(&html_files, &ctx.site_dir)?;
46
47        if pages_with_missing_alt > 0 {
48            log::warn!(
49                "[ai] {pages_with_missing_alt} page(s) have images without alt text"
50            );
51        }
52
53        Ok(())
54    }
55}
56
57/// Processes HTML files: injects max-snippet meta tags and checks for missing alt text.
58fn process_html_for_ai(
59    html_files: &[PathBuf],
60    site_dir: &Path,
61) -> Result<usize> {
62    let mut pages_with_missing_alt = 0usize;
63
64    for path in html_files {
65        let html = fs::read_to_string(path)?;
66        let modified = inject_max_snippet(&html);
67
68        check_alt_text(path, &modified, site_dir, &mut pages_with_missing_alt);
69
70        if modified != html {
71            fs::write(path, modified)?;
72        }
73    }
74
75    Ok(pages_with_missing_alt)
76}
77
/// Injects the max-snippet robots meta tag immediately before `</head>`.
///
/// The document is returned unchanged when it has no closing head tag, or
/// when the head section already mentions `max-snippet`.
///
/// Matching is ASCII case-insensitive, so `</HEAD>` is honoured. Only the
/// text before `</head>` is inspected for an existing directive, so body
/// copy that merely mentions "max-snippet" does not suppress the injection.
fn inject_max_snippet(html: &str) -> String {
    const TAG: &str = "<meta name=\"robots\" content=\"max-snippet:-1, max-image-preview:large, max-video-preview:-1\">\n";

    // ASCII-only lowercasing never changes byte lengths, so offsets found
    // in `lower` are valid offsets into `html`.
    let lower = html.to_ascii_lowercase();
    let Some(head_end) = lower.find("</head>") else {
        return html.to_string();
    };
    if lower[..head_end].contains("max-snippet") {
        return html.to_string();
    }

    let mut modified = String::with_capacity(html.len() + TAG.len());
    modified.push_str(&html[..head_end]);
    modified.push_str(TAG);
    modified.push_str(&html[head_end..]);
    modified
}
92
93/// Checks for missing alt text and logs a warning if found.
94fn check_alt_text(
95    path: &Path,
96    html: &str,
97    site_dir: &Path,
98    counter: &mut usize,
99) {
100    let missing = count_missing_alt(html);
101    if missing > 0 {
102        let rel = path.strip_prefix(site_dir).unwrap_or(path).display();
103        log::warn!("[ai] {missing} image(s) missing alt text in {rel}");
104        *counter += 1;
105    }
106}
107
108// -------------------------------------------------------------------
109// llms.txt generation — llmstxt.org v1 spec
110// -------------------------------------------------------------------
111
112/// Collects page metadata from `.meta.json` sidecars in the site dir.
113///
114/// Returns a list of `(title, relative_url, description)` tuples for
115/// pages that should appear in `llms.txt`.
116fn collect_page_entries(
117    site_dir: &Path,
118) -> Result<Vec<(String, String, String)>> {
119    let html_files = collect_html_files(site_dir)?;
120    let mut entries = Vec::new();
121
122    for html_path in &html_files {
123        let rel = html_path.strip_prefix(site_dir).unwrap_or(html_path);
124
125        // Read the companion sidecar
126        let sidecar_path = html_path.with_extension("meta.json");
127        let meta: serde_json::Map<String, serde_json::Value> =
128            if sidecar_path.exists() {
129                if let Ok(content) = fs::read_to_string(&sidecar_path) {
130                    serde_json::from_str(&content).unwrap_or_default()
131                } else {
132                    serde_json::Map::new()
133                }
134            } else {
135                serde_json::Map::new()
136            };
137
138        if is_excluded_page(rel, &meta) {
139            continue;
140        }
141
142        let title = meta
143            .get("title")
144            .and_then(serde_json::Value::as_str)
145            .unwrap_or_default()
146            .to_string();
147        let description = meta
148            .get("description")
149            .and_then(serde_json::Value::as_str)
150            .unwrap_or_default()
151            .to_string();
152
153        // Build a URL path from the relative file path
154        let url = format!("/{}", rel.to_string_lossy().replace('\\', "/"));
155
156        if !title.is_empty() {
157            entries.push((title, url, description));
158        }
159    }
160
161    Ok(entries)
162}
163
164/// Returns true if a page should be excluded from `llms.txt`.
165///
166/// Excludes pages that are drafts, private, or error pages (404).
167fn is_excluded_page(
168    path: &Path,
169    frontmatter: &serde_json::Map<String, serde_json::Value>,
170) -> bool {
171    // Exclude 404 and error pages
172    let file_name = path
173        .file_name()
174        .map(|n| n.to_string_lossy().to_lowercase())
175        .unwrap_or_default();
176    if file_name == "404.html" || file_name.starts_with("error") {
177        return true;
178    }
179
180    // Exclude drafts
181    if let Some(draft) = frontmatter.get("draft") {
182        if draft.as_bool().unwrap_or(false)
183            || draft.as_str().is_some_and(|s| s == "true")
184        {
185            return true;
186        }
187    }
188
189    // Exclude private pages
190    if let Some(private) = frontmatter.get("private") {
191        if private.as_bool().unwrap_or(false)
192            || private.as_str().is_some_and(|s| s == "true")
193        {
194            return true;
195        }
196    }
197
198    false
199}
200
201/// Groups page entries by their top-level directory.
202///
203/// Files at the root level are grouped under `"Pages"`.
204/// Subdirectory names are title-cased (e.g., `blog/` becomes `"Blog"`).
205fn group_pages_by_section(
206    entries: &[(String, String, String)],
207) -> BTreeMap<String, Vec<(String, String, String)>> {
208    let mut sections: BTreeMap<String, Vec<(String, String, String)>> =
209        BTreeMap::new();
210
211    for (title, url, description) in entries {
212        // url looks like "/blog/post.html" or "/index.html"
213        let trimmed = url.trim_start_matches('/');
214        let section = if let Some(slash) = trimmed.find('/') {
215            let dir = &trimmed[..slash];
216            titlecase_word(dir)
217        } else {
218            "Pages".to_string()
219        };
220
221        sections.entry(section).or_default().push((
222            title.clone(),
223            url.clone(),
224            description.clone(),
225        ));
226    }
227
228    sections
229}
230
/// Returns `s` with its first character uppercased and the remainder
/// lowercased; an empty input yields an empty string.
fn titlecase_word(s: &str) -> String {
    let mut rest = s.chars();
    let Some(initial) = rest.next() else {
        return String::new();
    };

    let mut word = String::with_capacity(s.len());
    // Uppercasing may expand to several chars (e.g. 'ß' -> "SS").
    word.extend(initial.to_uppercase());
    word.push_str(&rest.as_str().to_lowercase());
    word
}
242
/// Parses `Disallow:` patterns from an existing `robots.txt` file.
///
/// Reads `<site_dir>/robots.txt` and returns every non-empty disallow
/// pattern in file order, across all user-agent groups. A missing or
/// unreadable file yields an empty list.
///
/// Parsing follows the robots.txt line grammar:
/// - everything from `#` to end of line is a comment and is ignored;
/// - the directive name is matched case-insensitively (`disallow:`,
///   `DISALLOW:`), with optional whitespace before the colon;
/// - an empty pattern (`Disallow:`) means "allow everything" and is skipped.
fn parse_robots_disallow(site_dir: &Path) -> Vec<String> {
    let Ok(content) = fs::read_to_string(site_dir.join("robots.txt")) else {
        return Vec::new();
    };

    content
        .lines()
        .filter_map(|line| {
            // Drop any trailing comment before looking at the rule.
            let rule = line.split('#').next().unwrap_or_default();
            // Split on the first colon only: patterns may contain ':'.
            let (directive, value) = rule.split_once(':')?;
            if !directive.trim().eq_ignore_ascii_case("disallow") {
                return None;
            }
            let pattern = value.trim();
            (!pattern.is_empty()).then(|| pattern.to_string())
        })
        .collect()
}
264
265/// Generates `llms.txt` following the llmstxt.org v1 specification.
266///
267/// Format:
268/// ```text
269/// # {site_name}
270///
271/// > {site_description}
272///
273/// Language: {language}
274///
275/// ## {Section Name}
276/// - [{Page Title}]({URL}): {Description}
277///
278/// ## Disallow
279/// - {pattern from robots.txt}
280/// ```
281fn generate_llms_txt(
282    site_dir: &Path,
283    config: Option<&crate::cmd::SsgConfig>,
284) -> Result<()> {
285    let site_name = config.map_or("Site", |c| c.site_name.as_str());
286    let base_url = config.map_or("", |c| c.base_url.as_str());
287    let description = config.map_or("", |c| c.site_description.as_str());
288    let language = config
289        .map(|c| c.language.as_str())
290        .filter(|l| !l.is_empty())
291        .unwrap_or("en");
292    let canonical_root = base_url.trim_end_matches('/');
293
294    let mut content =
295        format!("# {site_name}\n\n> {description}\n\nLanguage: {language}\n");
296
297    // Collect and group pages
298    let entries = collect_page_entries(site_dir).unwrap_or_default();
299    let sections = group_pages_by_section(&entries);
300
301    for (section, pages) in &sections {
302        content.push_str(&format!("\n## {section}\n"));
303        for (title, url, desc) in pages {
304            let full_url = if canonical_root.is_empty() {
305                url.clone()
306            } else {
307                format!("{canonical_root}{url}")
308            };
309            if desc.is_empty() {
310                content.push_str(&format!("- [{title}]({full_url})\n"));
311            } else {
312                content.push_str(&format!("- [{title}]({full_url}): {desc}\n"));
313            }
314        }
315    }
316
317    // Disallow section from robots.txt
318    let disallow = parse_robots_disallow(site_dir);
319    if !disallow.is_empty() {
320        content.push_str("\n## Disallow\n");
321        for pattern in &disallow {
322            content.push_str(&format!("- {pattern}\n"));
323        }
324    }
325
326    fs::write(site_dir.join("llms.txt"), content)?;
327    log::info!("[ai] Generated llms.txt");
328    Ok(())
329}
330
331/// Generates `llms-full.txt` with full text content for each page.
332///
333/// Follows the same structure as `llms.txt` but includes the stripped
334/// HTML body content for each page rather than just a link index.
335fn generate_llms_full_txt(
336    site_dir: &Path,
337    config: Option<&crate::cmd::SsgConfig>,
338) -> Result<()> {
339    let site_name = config.map_or("Site", |c| c.site_name.as_str());
340    let base_url = config.map_or("", |c| c.base_url.as_str());
341    let description = config.map_or("", |c| c.site_description.as_str());
342    let language = config
343        .map(|c| c.language.as_str())
344        .filter(|l| !l.is_empty())
345        .unwrap_or("en");
346    let canonical_root = base_url.trim_end_matches('/');
347
348    let mut content =
349        format!("# {site_name}\n\n> {description}\n\nLanguage: {language}\n");
350
351    let html_files = collect_html_files(site_dir)?;
352
353    for html_path in &html_files {
354        let rel = html_path.strip_prefix(site_dir).unwrap_or(html_path);
355
356        // Read sidecar
357        let sidecar_path = html_path.with_extension("meta.json");
358        let meta: serde_json::Map<String, serde_json::Value> =
359            if sidecar_path.exists() {
360                if let Ok(c) = fs::read_to_string(&sidecar_path) {
361                    serde_json::from_str(&c).unwrap_or_default()
362                } else {
363                    serde_json::Map::new()
364                }
365            } else {
366                serde_json::Map::new()
367            };
368
369        if is_excluded_page(rel, &meta) {
370            continue;
371        }
372
373        let title = meta
374            .get("title")
375            .and_then(serde_json::Value::as_str)
376            .unwrap_or_default();
377
378        if title.is_empty() {
379            continue;
380        }
381
382        let url = format!("/{}", rel.to_string_lossy().replace('\\', "/"));
383        let full_url = if canonical_root.is_empty() {
384            url.clone()
385        } else {
386            format!("{canonical_root}{url}")
387        };
388
389        // Read and strip HTML content
390        let html = fs::read_to_string(html_path).unwrap_or_default();
391        let body_text = strip_html_tags(&extract_body(&html));
392        let trimmed = collapse_whitespace(&body_text);
393
394        content.push_str(&format!("\n---\n\n## [{title}]({full_url})\n\n"));
395        if !trimmed.is_empty() {
396            content.push_str(&trimmed);
397            content.push('\n');
398        }
399    }
400
401    fs::write(site_dir.join("llms-full.txt"), content)?;
402    log::info!("[ai] Generated llms-full.txt");
403    Ok(())
404}
405
/// Extracts the content between `<body>` and `</body>` tags.
///
/// Tag matching is ASCII case-insensitive and tolerates attributes on the
/// opening tag. Without an opening `<body ...>` the slice starts at the
/// beginning of the input; without a closing `</body>` after that point it
/// runs to the end.
fn extract_body(html: &str) -> String {
    // ASCII-only lowercasing keeps every byte offset identical to `html`.
    // Full Unicode lowercasing can change byte lengths (e.g. 'İ', 'K'),
    // which would make offsets found in the copy invalid for the original.
    let lower = html.to_ascii_lowercase();

    let start = lower
        .find("<body")
        .and_then(|open| lower[open..].find('>').map(|gt| open + gt + 1))
        .unwrap_or(0);

    // Search for the closing tag only after the opening one, so a stray
    // earlier `</body>` cannot yield an inverted (panicking) range.
    let end = lower[start..]
        .find("</body>")
        .map_or(html.len(), |offset| start + offset);

    html[start..end].to_string()
}
416
/// Removes markup from `html`, keeping only the characters found outside
/// of `<...>` spans.
///
/// This is a single-pass character filter: `<` opens a span, `>` closes
/// it, and neither delimiter is ever emitted.
fn strip_html_tags(html: &str) -> String {
    let mut inside_tag = false;
    html.chars()
        .filter(|&c| {
            if c == '<' {
                inside_tag = true;
                false
            } else if c == '>' {
                inside_tag = false;
                false
            } else {
                !inside_tag
            }
        })
        .collect()
}
431
/// Normalises whitespace: every run of whitespace characters becomes one
/// ASCII space, and leading/trailing whitespace is dropped.
fn collapse_whitespace(s: &str) -> String {
    // `split_whitespace` discards empty pieces, which handles the
    // leading/trailing trim and the run-collapsing in one go.
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
453
/// Counts the `<img>` tags in `html` that lack usable alt text.
///
/// Matching is done on a lowercased copy, so tag and attribute case do not
/// matter. A tag is counted when it contains no `alt=` at all, or when it
/// contains an empty `alt=""` / `alt=''`. An unterminated tag extends to
/// the end of the input.
fn count_missing_alt(html: &str) -> usize {
    let lower = html.to_lowercase();
    let mut remaining = lower.as_str();
    let mut missing = 0;

    while let Some(open) = remaining.find("<img") {
        let from_tag = &remaining[open..];
        // Include the closing '>' in the tag; take everything if absent.
        let tag_len = from_tag.find('>').map_or(from_tag.len(), |gt| gt + 1);
        let (tag, tail) = from_tag.split_at(tag_len);

        let alt_is_empty = tag.contains("alt=\"\"") || tag.contains("alt=''");
        if alt_is_empty || !tag.contains("alt=") {
            missing += 1;
        }

        remaining = tail;
    }

    missing
}
474
/// Recursively collects HTML files (delegates to `crate::walk`).
///
/// Thin wrapper that calls `crate::walk::walk_files` with `dir` and the
/// `"html"` extension argument, returning its result (including any
/// error) unchanged.
// NOTE(review): recursion and extension matching live in `walk_files`,
// which is not visible here — confirm against `crate::walk`.
fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
    crate::walk::walk_files(dir, "html")
}
479
480#[cfg(test)]
481mod tests {
482    #![allow(clippy::unwrap_used, clippy::expect_used)]
483
484    use super::*;
485    use crate::cmd::SsgConfig;
486    use crate::test_support::init_logger;
487    use std::path::PathBuf;
488    use tempfile::{tempdir, TempDir};
489
490    // -------------------------------------------------------------------
491    // Test fixtures
492    // -------------------------------------------------------------------
493
494    fn make_site() -> (TempDir, PathBuf, PluginContext) {
495        init_logger();
496        let dir = tempdir().expect("create tempdir");
497        let site = dir.path().join("site");
498        fs::create_dir_all(&site).expect("mkdir site");
499        let ctx = PluginContext::new(dir.path(), dir.path(), &site, dir.path());
500        (dir, site, ctx)
501    }
502
503    /// Writes an HTML file and a companion `.meta.json` sidecar.
504    fn write_page(
505        site: &Path,
506        rel_path: &str,
507        title: &str,
508        description: &str,
509        extra_fields: &str,
510    ) {
511        let html_path = site.join(rel_path);
512        if let Some(parent) = html_path.parent() {
513            fs::create_dir_all(parent).unwrap();
514        }
515        let html = format!(
516            "<html><head><title>{title}</title></head>\
517             <body><h1>{title}</h1><p>{description}</p></body></html>"
518        );
519        fs::write(&html_path, html).unwrap();
520
521        let mut sidecar_json =
522            format!(r#"{{"title": "{title}", "description": "{description}""#);
523        if !extra_fields.is_empty() {
524            sidecar_json.push_str(", ");
525            sidecar_json.push_str(extra_fields);
526        }
527        sidecar_json.push('}');
528        fs::write(html_path.with_extension("meta.json"), sidecar_json).unwrap();
529    }
530
531    // -------------------------------------------------------------------
532    // AiPlugin — derive surface
533    // -------------------------------------------------------------------
534
    /// `AiPlugin` must stay usable after being assigned elsewhere, which
    /// only compiles while the type is `Copy`.
    #[test]
    fn ai_plugin_is_copy_after_move() {
        // Guards the `Copy` derive added in v0.0.34.
        let plugin = AiPlugin;
        // Without `Copy` this would move `plugin` and the next line would
        // fail to compile.
        let _copy = plugin;
        assert_eq!(plugin.name(), "ai");
    }
542
    /// The plugin reports the fixed identifier `"ai"`.
    #[test]
    fn name_returns_static_ai_identifier() {
        assert_eq!(AiPlugin.name(), "ai");
    }
547
548    // -------------------------------------------------------------------
549    // count_missing_alt — table-driven over the logical paths
550    // -------------------------------------------------------------------
551
552    #[test]
553    fn count_missing_alt_table_driven() {
554        let cases: &[(&str, usize, &str)] = &[
555            // (input, expected_count, comment)
556            (
557                r#"<img src="a.jpg" alt="ok">"#,
558                0,
559                "alt present and non-empty",
560            ),
561            (r#"<img src="a.jpg">"#, 1, "no alt attribute at all"),
562            (r#"<img src="a.jpg" alt="">"#, 1, "empty double-quoted alt"),
563            (r#"<img src="a.jpg" alt=''>"#, 1, "empty single-quoted alt"),
564            (
565                r#"<img src="a.jpg"><img src="b.jpg" alt="ok">"#,
566                1,
567                "first missing, second ok",
568            ),
569            (
570                r#"<img src="a.jpg"><img src="b.jpg">"#,
571                2,
572                "both missing — sequential scan progresses",
573            ),
574            ("", 0, "empty input → zero"),
575            ("<p>no images here</p>", 0, "no <img> tags at all"),
576            (r#"<IMG SRC="a.jpg" ALT="ok">"#, 0, "case-insensitive ALT"),
577            (r#"<IMG SRC="a.jpg">"#, 1, "uppercase tag, no alt"),
578        ];
579        for (input, expected, comment) in cases {
580            assert_eq!(
581                count_missing_alt(input),
582                *expected,
583                "{comment}: count_missing_alt({input:?})"
584            );
585        }
586    }
587
588    #[test]
589    fn count_missing_alt_unterminated_tag_does_not_panic() {
590        let result = count_missing_alt("<img src=foo");
591        assert!(result <= 1);
592    }
593
594    // -------------------------------------------------------------------
595    // parse_robots_disallow
596    // -------------------------------------------------------------------
597
598    #[test]
599    fn test_parse_robots_disallow() {
600        let dir = tempdir().expect("tempdir");
601
602        // Standard robots.txt with multiple directives
603        fs::write(
604            dir.path().join("robots.txt"),
605            "User-agent: *\nDisallow: /admin/\nDisallow: /private/\nAllow: /\n",
606        )
607        .unwrap();
608        let result = parse_robots_disallow(dir.path());
609        assert_eq!(result, vec!["/admin/", "/private/"]);
610    }
611
612    #[test]
613    fn test_parse_robots_disallow_empty_file() {
614        let dir = tempdir().expect("tempdir");
615        fs::write(dir.path().join("robots.txt"), "").unwrap();
616        let result = parse_robots_disallow(dir.path());
617        assert!(result.is_empty());
618    }
619
620    #[test]
621    fn test_parse_robots_disallow_no_disallow_lines() {
622        let dir = tempdir().expect("tempdir");
623        fs::write(
624            dir.path().join("robots.txt"),
625            "User-agent: *\nAllow: /\nSitemap: https://example.com/sitemap.xml\n",
626        )
627        .unwrap();
628        let result = parse_robots_disallow(dir.path());
629        assert!(result.is_empty());
630    }
631
632    #[test]
633    fn test_parse_robots_disallow_multiple_user_agents() {
634        let dir = tempdir().expect("tempdir");
635        fs::write(
636            dir.path().join("robots.txt"),
637            "User-agent: Googlebot\nDisallow: /nogoogle/\n\n\
638             User-agent: *\nDisallow: /secret/\n",
639        )
640        .unwrap();
641        let result = parse_robots_disallow(dir.path());
642        assert_eq!(result, vec!["/nogoogle/", "/secret/"]);
643    }
644
645    #[test]
646    fn test_parse_robots_disallow_missing_file() {
647        let dir = tempdir().expect("tempdir");
648        let result = parse_robots_disallow(dir.path());
649        assert!(result.is_empty());
650    }
651
652    #[test]
653    fn test_parse_robots_disallow_empty_pattern_skipped() {
654        // `Disallow:` with no path means allow all — should be skipped
655        let dir = tempdir().expect("tempdir");
656        fs::write(
657            dir.path().join("robots.txt"),
658            "User-agent: *\nDisallow:\nDisallow: /blocked/\n",
659        )
660        .unwrap();
661        let result = parse_robots_disallow(dir.path());
662        assert_eq!(result, vec!["/blocked/"]);
663    }
664
665    // -------------------------------------------------------------------
666    // is_excluded_page
667    // -------------------------------------------------------------------
668
669    #[test]
670    fn test_is_excluded_page_draft() {
671        let mut meta = serde_json::Map::new();
672        let _ = meta.insert("draft".to_string(), serde_json::Value::Bool(true));
673        assert!(is_excluded_page(Path::new("post.html"), &meta));
674    }
675
676    #[test]
677    fn test_is_excluded_page_draft_string() {
678        let mut meta = serde_json::Map::new();
679        let _ = meta.insert(
680            "draft".to_string(),
681            serde_json::Value::String("true".to_string()),
682        );
683        assert!(is_excluded_page(Path::new("post.html"), &meta));
684    }
685
686    #[test]
687    fn test_is_excluded_page_private() {
688        let mut meta = serde_json::Map::new();
689        let _ =
690            meta.insert("private".to_string(), serde_json::Value::Bool(true));
691        assert!(is_excluded_page(Path::new("post.html"), &meta));
692    }
693
694    #[test]
695    fn test_is_excluded_page_404() {
696        let meta = serde_json::Map::new();
697        assert!(is_excluded_page(Path::new("404.html"), &meta));
698    }
699
700    #[test]
701    fn test_is_excluded_page_normal() {
702        let mut meta = serde_json::Map::new();
703        let _ = meta.insert(
704            "title".to_string(),
705            serde_json::Value::String("Hello".to_string()),
706        );
707        assert!(!is_excluded_page(Path::new("index.html"), &meta));
708    }
709
710    #[test]
711    fn test_is_excluded_page_error_page() {
712        let meta = serde_json::Map::new();
713        assert!(is_excluded_page(Path::new("error500.html"), &meta));
714    }
715
716    // -------------------------------------------------------------------
717    // group_pages_by_section
718    // -------------------------------------------------------------------
719
720    #[test]
721    fn test_group_pages_by_section() {
722        let entries = vec![
723            (
724                "Home".to_string(),
725                "/index.html".to_string(),
726                "Welcome".to_string(),
727            ),
728            (
729                "Post 1".to_string(),
730                "/blog/post1.html".to_string(),
731                "First".to_string(),
732            ),
733            (
734                "Post 2".to_string(),
735                "/blog/post2.html".to_string(),
736                "Second".to_string(),
737            ),
738            (
739                "API Ref".to_string(),
740                "/docs/api.html".to_string(),
741                "API docs".to_string(),
742            ),
743        ];
744        let grouped = group_pages_by_section(&entries);
745
746        assert_eq!(grouped.len(), 3);
747        assert!(grouped.contains_key("Pages"));
748        assert!(grouped.contains_key("Blog"));
749        assert!(grouped.contains_key("Docs"));
750        assert_eq!(grouped["Pages"].len(), 1);
751        assert_eq!(grouped["Blog"].len(), 2);
752        assert_eq!(grouped["Docs"].len(), 1);
753    }
754
755    #[test]
756    fn test_group_pages_by_section_root_only() {
757        let entries = vec![
758            (
759                "About".to_string(),
760                "/about.html".to_string(),
761                String::new(),
762            ),
763            (
764                "Contact".to_string(),
765                "/contact.html".to_string(),
766                String::new(),
767            ),
768        ];
769        let grouped = group_pages_by_section(&entries);
770        assert_eq!(grouped.len(), 1);
771        assert_eq!(grouped["Pages"].len(), 2);
772    }
773
774    #[test]
775    fn test_group_pages_by_section_deterministic_order() {
776        let entries = vec![
777            ("Z".to_string(), "/zebra/z.html".to_string(), String::new()),
778            ("A".to_string(), "/alpha/a.html".to_string(), String::new()),
779            ("M".to_string(), "/middle/m.html".to_string(), String::new()),
780        ];
781        let grouped = group_pages_by_section(&entries);
782        let keys: Vec<&String> = grouped.keys().collect();
783        assert_eq!(keys, vec!["Alpha", "Middle", "Zebra"]);
784    }
785
786    // -------------------------------------------------------------------
787    // generate_llms_txt — spec compliance
788    // -------------------------------------------------------------------
789
790    #[test]
791    fn generate_llms_txt_with_full_config_includes_all_fields() {
792        let dir = tempdir().expect("tempdir");
793        let config = SsgConfig {
794            site_name: "My Site".to_string(),
795            site_description: "A great site".to_string(),
796            base_url: "https://example.com".to_string(),
797            language: "en".to_string(),
798            ..Default::default()
799        };
800
801        generate_llms_txt(dir.path(), Some(&config)).unwrap();
802        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
803        assert!(body.contains("# My Site"));
804        assert!(body.contains("> A great site"));
805        assert!(body.contains("Language: en"));
806    }
807
808    #[test]
809    fn generate_llms_txt_without_config_uses_defaults() {
810        let dir = tempdir().expect("tempdir");
811        generate_llms_txt(dir.path(), None).unwrap();
812
813        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
814        assert!(body.contains("# Site"));
815        assert!(body.contains("Language: en"));
816    }
817
818    #[test]
819    fn generate_llms_txt_strips_trailing_slash_from_base_url() {
820        let dir = tempdir().expect("tempdir");
821        let config = SsgConfig {
822            site_name: "S".to_string(),
823            site_description: "D".to_string(),
824            base_url: "https://example.com/".to_string(),
825            ..Default::default()
826        };
827
828        // Write a page so we can verify URL formatting
829        write_page(dir.path(), "index.html", "Home", "Welcome", "");
830
831        generate_llms_txt(dir.path(), Some(&config)).unwrap();
832        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
833        // URLs should not have double slashes
834        assert!(
835            !body.contains("//index.html"),
836            "trailing slash should be normalised:\n{body}"
837        );
838    }
839
840    #[test]
841    fn generate_llms_txt_into_missing_parent_returns_err() {
842        let bogus = Path::new("/this/path/should/not/exist");
843        assert!(generate_llms_txt(bogus, None).is_err());
844    }
845
846    #[test]
847    fn test_llms_txt_contains_language() {
848        let dir = tempdir().expect("tempdir");
849        let config = SsgConfig {
850            language: "fr".to_string(),
851            ..Default::default()
852        };
853        generate_llms_txt(dir.path(), Some(&config)).unwrap();
854        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
855        assert!(
856            body.contains("Language: fr"),
857            "llms.txt must include Language field:\n{body}"
858        );
859    }
860
861    #[test]
862    fn test_llms_txt_contains_language_defaults_to_en() {
863        let dir = tempdir().expect("tempdir");
864        let config = SsgConfig {
865            language: String::new(),
866            ..Default::default()
867        };
868        generate_llms_txt(dir.path(), Some(&config)).unwrap();
869        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
870        assert!(
871            body.contains("Language: en"),
872            "empty language should default to en:\n{body}"
873        );
874    }
875
876    #[test]
877    fn test_llms_txt_excludes_drafts() {
878        let dir = tempdir().expect("tempdir");
879        write_page(dir.path(), "published.html", "Published", "Visible", "");
880        write_page(
881            dir.path(),
882            "draft.html",
883            "Draft Post",
884            "Hidden",
885            r#""draft": true"#,
886        );
887
888        generate_llms_txt(dir.path(), None).unwrap();
889        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
890        assert!(
891            body.contains("Published"),
892            "published page must appear:\n{body}"
893        );
894        assert!(
895            !body.contains("Draft Post"),
896            "draft page must be excluded:\n{body}"
897        );
898    }
899
900    #[test]
901    fn test_llms_txt_excludes_private() {
902        let dir = tempdir().expect("tempdir");
903        write_page(dir.path(), "public.html", "Public", "Visible", "");
904        write_page(
905            dir.path(),
906            "secret.html",
907            "Secret",
908            "Hidden",
909            r#""private": true"#,
910        );
911
912        generate_llms_txt(dir.path(), None).unwrap();
913        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
914        assert!(
915            !body.contains("Secret"),
916            "private page must be excluded:\n{body}"
917        );
918    }
919
920    #[test]
921    fn test_llms_txt_excludes_404() {
922        let dir = tempdir().expect("tempdir");
923        write_page(dir.path(), "index.html", "Home", "Welcome", "");
924        write_page(dir.path(), "404.html", "Not Found", "Error page", "");
925
926        generate_llms_txt(dir.path(), None).unwrap();
927        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
928        assert!(
929            !body.contains("Not Found"),
930            "404 page must be excluded:\n{body}"
931        );
932    }
933
934    #[test]
935    fn test_llms_txt_contains_sections() {
936        let dir = tempdir().expect("tempdir");
937        write_page(dir.path(), "index.html", "Home", "Welcome", "");
938        write_page(dir.path(), "blog/post.html", "My Post", "A blog post", "");
939        write_page(
940            dir.path(),
941            "docs/api.html",
942            "API Docs",
943            "API reference",
944            "",
945        );
946
947        generate_llms_txt(dir.path(), None).unwrap();
948        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
949        assert!(
950            body.contains("## Pages"),
951            "should have Pages section:\n{body}"
952        );
953        assert!(
954            body.contains("## Blog"),
955            "should have Blog section:\n{body}"
956        );
957        assert!(
958            body.contains("## Docs"),
959            "should have Docs section:\n{body}"
960        );
961        assert!(
962            body.contains("- [My Post]"),
963            "should contain page link:\n{body}"
964        );
965    }
966
967    #[test]
968    fn test_llms_txt_contains_disallow_section() {
969        let dir = tempdir().expect("tempdir");
970        fs::write(
971            dir.path().join("robots.txt"),
972            "User-agent: *\nDisallow: /admin/\n",
973        )
974        .unwrap();
975
976        generate_llms_txt(dir.path(), None).unwrap();
977        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
978        assert!(
979            body.contains("## Disallow"),
980            "should have Disallow section:\n{body}"
981        );
982        assert!(
983            body.contains("- /admin/"),
984            "should contain disallow pattern:\n{body}"
985        );
986    }
987
988    #[test]
989    fn test_llms_txt_no_disallow_without_robots() {
990        let dir = tempdir().expect("tempdir");
991        generate_llms_txt(dir.path(), None).unwrap();
992        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
993        assert!(
994            !body.contains("## Disallow"),
995            "no robots.txt means no Disallow section:\n{body}"
996        );
997    }
998
999    // -------------------------------------------------------------------
1000    // generate_llms_full_txt
1001    // -------------------------------------------------------------------
1002
1003    #[test]
1004    fn test_llms_full_txt_contains_body_content() {
1005        let dir = tempdir().expect("tempdir");
1006        write_page(dir.path(), "index.html", "Home", "Welcome home", "");
1007
1008        generate_llms_full_txt(dir.path(), None).unwrap();
1009        let body =
1010            fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
1011        assert!(body.contains("# Site"), "header present:\n{body}");
1012        assert!(body.contains("Language: en"), "language present:\n{body}");
1013        assert!(body.contains("## [Home]"), "page title present:\n{body}");
1014        assert!(body.contains("Welcome home"), "body text present:\n{body}");
1015    }
1016
1017    #[test]
1018    fn test_llms_full_txt_excludes_drafts() {
1019        let dir = tempdir().expect("tempdir");
1020        write_page(dir.path(), "ok.html", "Visible", "Content", "");
1021        write_page(
1022            dir.path(),
1023            "hidden.html",
1024            "Hidden",
1025            "Secret",
1026            r#""draft": true"#,
1027        );
1028
1029        generate_llms_full_txt(dir.path(), None).unwrap();
1030        let body =
1031            fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
1032        assert!(body.contains("Visible"), "published page present:\n{body}");
1033        assert!(!body.contains("Hidden"), "draft excluded:\n{body}");
1034    }
1035
1036    #[test]
1037    fn test_llms_full_txt_excludes_404() {
1038        let dir = tempdir().expect("tempdir");
1039        write_page(dir.path(), "index.html", "Home", "Welcome", "");
1040        write_page(dir.path(), "404.html", "Not Found", "Error", "");
1041
1042        generate_llms_full_txt(dir.path(), None).unwrap();
1043        let body =
1044            fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
1045        assert!(!body.contains("Not Found"), "404 excluded:\n{body}");
1046    }
1047
1048    // -------------------------------------------------------------------
1049    // strip_html_tags / extract_body / collapse_whitespace
1050    // -------------------------------------------------------------------
1051
1052    #[test]
1053    fn test_strip_html_tags() {
1054        assert_eq!(strip_html_tags("<p>hello</p>"), "hello");
1055        assert_eq!(strip_html_tags("<div><b>bold</b> text</div>"), "bold text");
1056        assert_eq!(strip_html_tags("no tags"), "no tags");
1057        assert_eq!(strip_html_tags(""), "");
1058    }
1059
1060    #[test]
1061    fn test_extract_body() {
1062        let html =
1063            "<html><head><title>T</title></head><body>Content</body></html>";
1064        assert_eq!(extract_body(html), "Content");
1065    }
1066
1067    #[test]
1068    fn test_extract_body_with_attributes() {
1069        let html = "<html><body class=\"main\">Content</body></html>";
1070        assert_eq!(extract_body(html), "Content");
1071    }
1072
1073    #[test]
1074    fn test_extract_body_no_body_tag() {
1075        let html = "<p>Just a fragment</p>";
1076        assert_eq!(extract_body(html), html);
1077    }
1078
1079    #[test]
1080    fn test_collapse_whitespace() {
1081        assert_eq!(collapse_whitespace("  hello   world  "), "hello world");
1082        assert_eq!(collapse_whitespace("no  extra"), "no extra");
1083        assert_eq!(collapse_whitespace(""), "");
1084    }
1085
1086    // -------------------------------------------------------------------
1087    // titlecase_word
1088    // -------------------------------------------------------------------
1089
1090    #[test]
1091    fn test_titlecase_word() {
1092        assert_eq!(titlecase_word("blog"), "Blog");
1093        assert_eq!(titlecase_word("DOCS"), "Docs");
1094        assert_eq!(titlecase_word(""), "");
1095        assert_eq!(titlecase_word("a"), "A");
1096    }
1097
1098    // -------------------------------------------------------------------
1099    // after_compile — short-circuit + dispatch paths
1100    // -------------------------------------------------------------------
1101
1102    #[test]
1103    fn after_compile_missing_site_dir_returns_ok_without_writing() {
1104        let dir = tempdir().expect("tempdir");
1105        let missing = dir.path().join("missing");
1106        let ctx =
1107            PluginContext::new(dir.path(), dir.path(), &missing, dir.path());
1108
1109        AiPlugin.after_compile(&ctx).expect("missing site is fine");
1110        assert!(!missing.exists());
1111        assert!(!dir.path().join("llms.txt").exists());
1112    }
1113
1114    #[test]
1115    fn after_compile_injects_max_snippet_meta_tag() {
1116        let (_tmp, site, ctx) = make_site();
1117        let html = "<html><head><title>X</title></head><body></body></html>";
1118        fs::write(site.join("index.html"), html).unwrap();
1119
1120        AiPlugin.after_compile(&ctx).unwrap();
1121        let output = fs::read_to_string(site.join("index.html")).unwrap();
1122        assert!(output.contains("max-snippet"));
1123        assert!(output.contains("max-image-preview:large"));
1124    }
1125
1126    #[test]
1127    fn after_compile_creates_llms_txt_in_site_root() {
1128        let (_tmp, site, ctx) = make_site();
1129        AiPlugin.after_compile(&ctx).unwrap();
1130        assert!(site.join("llms.txt").exists());
1131    }
1132
1133    #[test]
1134    fn after_compile_creates_llms_full_txt_in_site_root() {
1135        let (_tmp, site, ctx) = make_site();
1136        AiPlugin.after_compile(&ctx).unwrap();
1137        assert!(site.join("llms-full.txt").exists());
1138    }
1139
1140    #[test]
1141    fn after_compile_idempotent_does_not_duplicate_meta_tag() {
1142        let (_tmp, site, ctx) = make_site();
1143        let html = "<html><head><title>X</title></head><body></body></html>";
1144        fs::write(site.join("index.html"), html).unwrap();
1145
1146        AiPlugin.after_compile(&ctx).unwrap();
1147        AiPlugin.after_compile(&ctx).unwrap();
1148
1149        let output = fs::read_to_string(site.join("index.html")).unwrap();
1150        assert_eq!(output.matches("max-snippet").count(), 1);
1151    }
1152
1153    #[test]
1154    fn after_compile_skips_html_files_without_head_tag() {
1155        let (_tmp, site, ctx) = make_site();
1156        fs::write(site.join("fragment.html"), "<p>just a fragment</p>")
1157            .unwrap();
1158
1159        AiPlugin.after_compile(&ctx).unwrap();
1160        let output = fs::read_to_string(site.join("fragment.html")).unwrap();
1161        assert!(!output.contains("max-snippet"));
1162        assert_eq!(output, "<p>just a fragment</p>");
1163    }
1164
1165    #[test]
1166    fn after_compile_processes_files_in_subdirectories() {
1167        let (_tmp, site, ctx) = make_site();
1168        let nested = site.join("blog");
1169        fs::create_dir_all(&nested).unwrap();
1170        fs::write(
1171            nested.join("post.html"),
1172            "<html><head></head><body></body></html>",
1173        )
1174        .unwrap();
1175
1176        AiPlugin.after_compile(&ctx).unwrap();
1177        let output = fs::read_to_string(nested.join("post.html")).unwrap();
1178        assert!(output.contains("max-snippet"));
1179    }
1180
1181    #[test]
1182    fn after_compile_logs_warning_for_pages_with_missing_alt() {
1183        let (_tmp, site, ctx) = make_site();
1184        fs::write(
1185            site.join("bad.html"),
1186            r#"<html><head></head><body><img src="a.jpg"></body></html>"#,
1187        )
1188        .unwrap();
1189        fs::write(
1190            site.join("worse.html"),
1191            r#"<html><head></head><body><img src="a.jpg" alt=""></body></html>"#,
1192        )
1193        .unwrap();
1194
1195        AiPlugin.after_compile(&ctx).unwrap();
1196        let bad = fs::read_to_string(site.join("bad.html")).unwrap();
1197        assert!(bad.contains("max-snippet"));
1198    }
1199
1200    #[test]
1201    fn after_compile_does_not_rewrite_unchanged_files() {
1202        let (_tmp, site, ctx) = make_site();
1203        let html = "<html><head><meta name=\"robots\" content=\"max-snippet:-1\"></head><body></body></html>";
1204        fs::write(site.join("index.html"), html).unwrap();
1205        let original_mtime = fs::metadata(site.join("index.html"))
1206            .unwrap()
1207            .modified()
1208            .unwrap();
1209
1210        AiPlugin.after_compile(&ctx).unwrap();
1211        let after = fs::read_to_string(site.join("index.html")).unwrap();
1212        assert_eq!(after, html, "unchanged file body must be preserved");
1213        let _ = original_mtime;
1214    }
1215
1216    // -------------------------------------------------------------------
1217    // collect_html_files — recursion + filtering
1218    // -------------------------------------------------------------------
1219
1220    #[test]
1221    fn collect_html_files_returns_empty_for_missing_directory() {
1222        let dir = tempdir().expect("tempdir");
1223        let result = collect_html_files(&dir.path().join("missing")).unwrap();
1224        assert!(result.is_empty());
1225    }
1226
1227    #[test]
1228    fn collect_html_files_filters_non_html_extensions() {
1229        let dir = tempdir().expect("tempdir");
1230        fs::write(dir.path().join("a.html"), "").unwrap();
1231        fs::write(dir.path().join("b.css"), "").unwrap();
1232        fs::write(dir.path().join("c.js"), "").unwrap();
1233
1234        let result = collect_html_files(dir.path()).unwrap();
1235        assert_eq!(result.len(), 1);
1236    }
1237
1238    #[test]
1239    fn collect_html_files_recurses_into_nested_subdirectories() {
1240        let dir = tempdir().expect("tempdir");
1241        let nested = dir.path().join("a").join("b");
1242        fs::create_dir_all(&nested).unwrap();
1243        fs::write(dir.path().join("top.html"), "").unwrap();
1244        fs::write(nested.join("deep.html"), "").unwrap();
1245
1246        let result = collect_html_files(dir.path()).unwrap();
1247        assert_eq!(result.len(), 2);
1248    }
1249
1250    #[test]
1251    fn collect_html_files_returns_results_sorted() {
1252        let dir = tempdir().expect("tempdir");
1253        for name in ["zebra.html", "apple.html", "mango.html"] {
1254            fs::write(dir.path().join(name), "").unwrap();
1255        }
1256        let result = collect_html_files(dir.path()).unwrap();
1257        let names: Vec<_> = result
1258            .iter()
1259            .map(|p| p.file_name().unwrap().to_str().unwrap())
1260            .collect();
1261        assert_eq!(names, vec!["apple.html", "mango.html", "zebra.html"]);
1262    }
1263}
ssg/ai.rs

ssg/
ai.rs