1use crate::plugin::{Plugin, PluginContext};
14use anyhow::Result;
15use std::{
16 collections::BTreeMap,
17 fs,
18 path::{Path, PathBuf},
19};
20
/// Stateless plugin that prepares a compiled site for AI / LLM consumers.
///
/// The type has no fields; all behaviour is provided by its `Plugin`
/// implementation.
#[derive(Clone, Copy, Debug)]
pub struct AiPlugin;
29
30impl Plugin for AiPlugin {
31 fn name(&self) -> &'static str {
32 "ai"
33 }
34
35 fn after_compile(&self, ctx: &PluginContext) -> Result<()> {
36 if !ctx.site_dir.exists() {
37 return Ok(());
38 }
39
40 generate_llms_txt(&ctx.site_dir, ctx.config.as_ref())?;
41 generate_llms_full_txt(&ctx.site_dir, ctx.config.as_ref())?;
42
43 let html_files = collect_html_files(&ctx.site_dir)?;
44 let pages_with_missing_alt =
45 process_html_for_ai(&html_files, &ctx.site_dir)?;
46
47 if pages_with_missing_alt > 0 {
48 log::warn!(
49 "[ai] {pages_with_missing_alt} page(s) have images without alt text"
50 );
51 }
52
53 Ok(())
54 }
55}
56
57fn process_html_for_ai(
59 html_files: &[PathBuf],
60 site_dir: &Path,
61) -> Result<usize> {
62 let mut pages_with_missing_alt = 0usize;
63
64 for path in html_files {
65 let html = fs::read_to_string(path)?;
66 let modified = inject_max_snippet(&html);
67
68 check_alt_text(path, &modified, site_dir, &mut pages_with_missing_alt);
69
70 if modified != html {
71 fs::write(path, modified)?;
72 }
73 }
74
75 Ok(pages_with_missing_alt)
76}
77
/// Inserts a `robots` meta tag carrying `max-snippet` directives
/// immediately before the closing `</head>` tag.
///
/// The input is returned unchanged when:
/// - it already contains the text `max-snippet` anywhere (this keeps the
///   operation idempotent), or
/// - it has no closing `</head>` tag.
///
/// The closing tag is matched ASCII-case-insensitively, so `</HEAD>` and
/// `</Head>` are recognised as well as `</head>`.
fn inject_max_snippet(html: &str) -> String {
    const ROBOTS_META: &str = "<meta name=\"robots\" content=\"max-snippet:-1, max-image-preview:large, max-video-preview:-1\">\n";
    const HEAD_CLOSE: &str = "</head>";

    if html.contains("max-snippet") {
        return html.to_string();
    }

    // ASCII lowercasing never changes byte lengths, so an offset found in
    // the lowered copy is a valid offset (and char boundary) in `html`.
    let Some(pos) = html.to_ascii_lowercase().find(HEAD_CLOSE) else {
        return html.to_string();
    };

    let mut modified = String::with_capacity(html.len() + ROBOTS_META.len());
    modified.push_str(&html[..pos]);
    modified.push_str(ROBOTS_META);
    modified.push_str(&html[pos..]);
    modified
}
92
93fn check_alt_text(
95 path: &Path,
96 html: &str,
97 site_dir: &Path,
98 counter: &mut usize,
99) {
100 let missing = count_missing_alt(html);
101 if missing > 0 {
102 let rel = path.strip_prefix(site_dir).unwrap_or(path).display();
103 log::warn!("[ai] {missing} image(s) missing alt text in {rel}");
104 *counter += 1;
105 }
106}
107
108fn collect_page_entries(
117 site_dir: &Path,
118) -> Result<Vec<(String, String, String)>> {
119 let html_files = collect_html_files(site_dir)?;
120 let mut entries = Vec::new();
121
122 for html_path in &html_files {
123 let rel = html_path.strip_prefix(site_dir).unwrap_or(html_path);
124
125 let sidecar_path = html_path.with_extension("meta.json");
127 let meta: serde_json::Map<String, serde_json::Value> =
128 if sidecar_path.exists() {
129 if let Ok(content) = fs::read_to_string(&sidecar_path) {
130 serde_json::from_str(&content).unwrap_or_default()
131 } else {
132 serde_json::Map::new()
133 }
134 } else {
135 serde_json::Map::new()
136 };
137
138 if is_excluded_page(rel, &meta) {
139 continue;
140 }
141
142 let title = meta
143 .get("title")
144 .and_then(serde_json::Value::as_str)
145 .unwrap_or_default()
146 .to_string();
147 let description = meta
148 .get("description")
149 .and_then(serde_json::Value::as_str)
150 .unwrap_or_default()
151 .to_string();
152
153 let url = format!("/{}", rel.to_string_lossy().replace('\\', "/"));
155
156 if !title.is_empty() {
157 entries.push((title, url, description));
158 }
159 }
160
161 Ok(entries)
162}
163
164fn is_excluded_page(
168 path: &Path,
169 frontmatter: &serde_json::Map<String, serde_json::Value>,
170) -> bool {
171 let file_name = path
173 .file_name()
174 .map(|n| n.to_string_lossy().to_lowercase())
175 .unwrap_or_default();
176 if file_name == "404.html" || file_name.starts_with("error") {
177 return true;
178 }
179
180 if let Some(draft) = frontmatter.get("draft") {
182 if draft.as_bool().unwrap_or(false)
183 || draft.as_str().is_some_and(|s| s == "true")
184 {
185 return true;
186 }
187 }
188
189 if let Some(private) = frontmatter.get("private") {
191 if private.as_bool().unwrap_or(false)
192 || private.as_str().is_some_and(|s| s == "true")
193 {
194 return true;
195 }
196 }
197
198 false
199}
200
201fn group_pages_by_section(
206 entries: &[(String, String, String)],
207) -> BTreeMap<String, Vec<(String, String, String)>> {
208 let mut sections: BTreeMap<String, Vec<(String, String, String)>> =
209 BTreeMap::new();
210
211 for (title, url, description) in entries {
212 let trimmed = url.trim_start_matches('/');
214 let section = if let Some(slash) = trimmed.find('/') {
215 let dir = &trimmed[..slash];
216 titlecase_word(dir)
217 } else {
218 "Pages".to_string()
219 };
220
221 sections.entry(section).or_default().push((
222 title.clone(),
223 url.clone(),
224 description.clone(),
225 ));
226 }
227
228 sections
229}
230
/// Upper-cases the first character of `s` and lower-cases the rest.
///
/// Returns an empty string for empty input.
fn titlecase_word(s: &str) -> String {
    let mut rest = s.chars();
    let Some(initial) = rest.next() else {
        return String::new();
    };

    // Upper-casing one char may yield several chars, hence the collect.
    let mut word: String = initial.to_uppercase().collect();
    word.push_str(&rest.as_str().to_lowercase());
    word
}
242
/// Extracts every non-empty `Disallow` pattern from `robots.txt` in
/// `site_dir`.
///
/// Patterns from all user-agent groups are returned in file order,
/// without de-duplication. A missing or unreadable `robots.txt` yields an
/// empty list.
///
/// Parsing details:
/// - the directive name is matched case-insensitively (`disallow:` and
///   `DISALLOW:` count), with optional whitespace before the colon;
/// - everything from a `#` to the end of the line is treated as a comment
///   and ignored;
/// - a `Disallow` with an empty value is skipped.
fn parse_robots_disallow(site_dir: &Path) -> Vec<String> {
    let Ok(content) = fs::read_to_string(site_dir.join("robots.txt")) else {
        return Vec::new();
    };

    content
        .lines()
        .filter_map(|line| {
            // Drop any trailing comment before looking at the directive.
            let rule = line.split('#').next().unwrap_or_default();
            let (name, value) = rule.split_once(':')?;
            if !name.trim().eq_ignore_ascii_case("disallow") {
                return None;
            }
            let pattern = value.trim();
            (!pattern.is_empty()).then(|| pattern.to_string())
        })
        .collect()
}
264
265fn generate_llms_txt(
282 site_dir: &Path,
283 config: Option<&crate::cmd::SsgConfig>,
284) -> Result<()> {
285 let site_name = config.map_or("Site", |c| c.site_name.as_str());
286 let base_url = config.map_or("", |c| c.base_url.as_str());
287 let description = config.map_or("", |c| c.site_description.as_str());
288 let language = config
289 .map(|c| c.language.as_str())
290 .filter(|l| !l.is_empty())
291 .unwrap_or("en");
292 let canonical_root = base_url.trim_end_matches('/');
293
294 let mut content =
295 format!("# {site_name}\n\n> {description}\n\nLanguage: {language}\n");
296
297 let entries = collect_page_entries(site_dir).unwrap_or_default();
299 let sections = group_pages_by_section(&entries);
300
301 for (section, pages) in §ions {
302 content.push_str(&format!("\n## {section}\n"));
303 for (title, url, desc) in pages {
304 let full_url = if canonical_root.is_empty() {
305 url.clone()
306 } else {
307 format!("{canonical_root}{url}")
308 };
309 if desc.is_empty() {
310 content.push_str(&format!("- [{title}]({full_url})\n"));
311 } else {
312 content.push_str(&format!("- [{title}]({full_url}): {desc}\n"));
313 }
314 }
315 }
316
317 let disallow = parse_robots_disallow(site_dir);
319 if !disallow.is_empty() {
320 content.push_str("\n## Disallow\n");
321 for pattern in &disallow {
322 content.push_str(&format!("- {pattern}\n"));
323 }
324 }
325
326 fs::write(site_dir.join("llms.txt"), content)?;
327 log::info!("[ai] Generated llms.txt");
328 Ok(())
329}
330
331fn generate_llms_full_txt(
336 site_dir: &Path,
337 config: Option<&crate::cmd::SsgConfig>,
338) -> Result<()> {
339 let site_name = config.map_or("Site", |c| c.site_name.as_str());
340 let base_url = config.map_or("", |c| c.base_url.as_str());
341 let description = config.map_or("", |c| c.site_description.as_str());
342 let language = config
343 .map(|c| c.language.as_str())
344 .filter(|l| !l.is_empty())
345 .unwrap_or("en");
346 let canonical_root = base_url.trim_end_matches('/');
347
348 let mut content =
349 format!("# {site_name}\n\n> {description}\n\nLanguage: {language}\n");
350
351 let html_files = collect_html_files(site_dir)?;
352
353 for html_path in &html_files {
354 let rel = html_path.strip_prefix(site_dir).unwrap_or(html_path);
355
356 let sidecar_path = html_path.with_extension("meta.json");
358 let meta: serde_json::Map<String, serde_json::Value> =
359 if sidecar_path.exists() {
360 if let Ok(c) = fs::read_to_string(&sidecar_path) {
361 serde_json::from_str(&c).unwrap_or_default()
362 } else {
363 serde_json::Map::new()
364 }
365 } else {
366 serde_json::Map::new()
367 };
368
369 if is_excluded_page(rel, &meta) {
370 continue;
371 }
372
373 let title = meta
374 .get("title")
375 .and_then(serde_json::Value::as_str)
376 .unwrap_or_default();
377
378 if title.is_empty() {
379 continue;
380 }
381
382 let url = format!("/{}", rel.to_string_lossy().replace('\\', "/"));
383 let full_url = if canonical_root.is_empty() {
384 url.clone()
385 } else {
386 format!("{canonical_root}{url}")
387 };
388
389 let html = fs::read_to_string(html_path).unwrap_or_default();
391 let body_text = strip_html_tags(&extract_body(&html));
392 let trimmed = collapse_whitespace(&body_text);
393
394 content.push_str(&format!("\n---\n\n## [{title}]({full_url})\n\n"));
395 if !trimmed.is_empty() {
396 content.push_str(&trimmed);
397 content.push('\n');
398 }
399 }
400
401 fs::write(site_dir.join("llms-full.txt"), content)?;
402 log::info!("[ai] Generated llms-full.txt");
403 Ok(())
404}
405
/// Returns the content between the opening `<body ...>` tag and the
/// following `</body>` tag, matching both tags ASCII-case-insensitively.
///
/// - Without a complete opening `<body ...>` tag, extraction starts at the
///   beginning of the input.
/// - Without a `</body>` after the start position, extraction runs to the
///   end of the input.
///
/// A fragment with neither tag is therefore returned unchanged.
fn extract_body(html: &str) -> String {
    // ASCII lowercasing preserves byte lengths, so offsets found in `lower`
    // index `html` correctly. Full Unicode lowercasing can change the byte
    // length of some characters and would misalign (or split) the slice.
    let lower = html.to_ascii_lowercase();

    let start = lower
        .find("<body")
        .and_then(|open| lower[open..].find('>').map(|close| open + close + 1))
        .unwrap_or(0);

    // Search from `start` so `end` can never precede it.
    let end = lower[start..]
        .find("</body>")
        .map_or(html.len(), |close| start + close);

    html[start..end].to_string()
}
416
/// Removes everything between `<` and `>` (inclusive) from `html`.
///
/// This is a character-level filter, not an HTML parser: a `<` always
/// opens a tag, a `>` always closes one, and neither character is ever
/// emitted. Text inside elements is kept verbatim.
fn strip_html_tags(html: &str) -> String {
    let mut inside_tag = false;
    html.chars()
        .filter(|&ch| {
            if ch == '<' {
                inside_tag = true;
                false
            } else if ch == '>' {
                inside_tag = false;
                false
            } else {
                !inside_tag
            }
        })
        .collect()
}
431
/// Normalises whitespace in `s`: every run of whitespace becomes a single
/// ASCII space, and leading and trailing whitespace is removed.
fn collapse_whitespace(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // A separator goes between words only, never at either end.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
453
/// Counts `<img` tags in `html` that have no `alt=` attribute or whose
/// alt is the empty `alt=""` / `alt=''`.
///
/// Matching is case-insensitive and purely textual: a tag spans from
/// `<img` to the next `>`, or to the end of the input when unterminated.
fn count_missing_alt(html: &str) -> usize {
    let lowered = html.to_lowercase();
    let mut remaining = lowered.as_str();
    let mut flagged = 0usize;

    while let Some(open) = remaining.find("<img") {
        let from_tag = &remaining[open..];
        // Split into the tag itself and whatever follows it.
        let (tag, tail) = match from_tag.find('>') {
            Some(close) => from_tag.split_at(close + 1),
            None => (from_tag, ""),
        };

        let lacks_alt = !tag.contains("alt=");
        let blank_alt = tag.contains("alt=\"\"") || tag.contains("alt=''");
        if lacks_alt || blank_alt {
            flagged += 1;
        }

        remaining = tail;
    }

    flagged
}
474
475fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
477 crate::walk::walk_files(dir, "html")
478}
479
#[cfg(test)]
mod tests {
    // Tests may unwrap freely: a panic is the desired failure mode here.
    #![allow(clippy::unwrap_used, clippy::expect_used)]

    use super::*;
    use crate::cmd::SsgConfig;
    use crate::test_support::init_logger;
    use std::path::PathBuf;
    use tempfile::{tempdir, TempDir};

    // ------------------------------------------------------------------
    // Fixtures
    // ------------------------------------------------------------------

    /// Creates a temp directory containing an empty `site/` directory and
    /// a `PluginContext` pointing at it. The `TempDir` guard is returned
    /// so the directory outlives the test body.
    fn make_site() -> (TempDir, PathBuf, PluginContext) {
        init_logger();
        let dir = tempdir().expect("create tempdir");
        let site = dir.path().join("site");
        fs::create_dir_all(&site).expect("mkdir site");
        let ctx = PluginContext::new(dir.path(), dir.path(), &site, dir.path());
        (dir, site, ctx)
    }

    /// Writes an HTML page at `site/rel_path` together with its
    /// `meta.json` sidecar. `extra_fields` is raw JSON appended to the
    /// sidecar object (for example `"draft": true`); pass `""` for none.
    fn write_page(
        site: &Path,
        rel_path: &str,
        title: &str,
        description: &str,
        extra_fields: &str,
    ) {
        let html_path = site.join(rel_path);
        if let Some(parent) = html_path.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        let html = format!(
            "<html><head><title>{title}</title></head>\
             <body><h1>{title}</h1><p>{description}</p></body></html>"
        );
        fs::write(&html_path, html).unwrap();

        let mut sidecar_json =
            format!(r#"{{"title": "{title}", "description": "{description}""#);
        if !extra_fields.is_empty() {
            sidecar_json.push_str(", ");
            sidecar_json.push_str(extra_fields);
        }
        sidecar_json.push('}');
        fs::write(html_path.with_extension("meta.json"), sidecar_json).unwrap();
    }

    // ------------------------------------------------------------------
    // AiPlugin basics
    // ------------------------------------------------------------------

    #[test]
    fn ai_plugin_is_copy_after_move() {
        let plugin = AiPlugin;
        // Using `plugin` after this binding only compiles because it is Copy.
        let _copy = plugin;
        assert_eq!(plugin.name(), "ai");
    }

    #[test]
    fn name_returns_static_ai_identifier() {
        assert_eq!(AiPlugin.name(), "ai");
    }

    // ------------------------------------------------------------------
    // count_missing_alt
    // ------------------------------------------------------------------

    #[test]
    fn count_missing_alt_table_driven() {
        let cases: &[(&str, usize, &str)] = &[
            (
                r#"<img src="a.jpg" alt="ok">"#,
                0,
                "alt present and non-empty",
            ),
            (r#"<img src="a.jpg">"#, 1, "no alt attribute at all"),
            (r#"<img src="a.jpg" alt="">"#, 1, "empty double-quoted alt"),
            (r#"<img src="a.jpg" alt=''>"#, 1, "empty single-quoted alt"),
            (
                r#"<img src="a.jpg"><img src="b.jpg" alt="ok">"#,
                1,
                "first missing, second ok",
            ),
            (
                r#"<img src="a.jpg"><img src="b.jpg">"#,
                2,
                "both missing — sequential scan progresses",
            ),
            ("", 0, "empty input → zero"),
            ("<p>no images here</p>", 0, "no <img> tags at all"),
            (r#"<IMG SRC="a.jpg" ALT="ok">"#, 0, "case-insensitive ALT"),
            (r#"<IMG SRC="a.jpg">"#, 1, "uppercase tag, no alt"),
        ];
        for (input, expected, comment) in cases {
            assert_eq!(
                count_missing_alt(input),
                *expected,
                "{comment}: count_missing_alt({input:?})"
            );
        }
    }

    #[test]
    fn count_missing_alt_unterminated_tag_does_not_panic() {
        let result = count_missing_alt("<img src=foo");
        assert!(result <= 1);
    }

    // ------------------------------------------------------------------
    // parse_robots_disallow
    // ------------------------------------------------------------------

    #[test]
    fn test_parse_robots_disallow() {
        let dir = tempdir().expect("tempdir");

        fs::write(
            dir.path().join("robots.txt"),
            "User-agent: *\nDisallow: /admin/\nDisallow: /private/\nAllow: /\n",
        )
        .unwrap();
        let result = parse_robots_disallow(dir.path());
        assert_eq!(result, vec!["/admin/", "/private/"]);
    }

    #[test]
    fn test_parse_robots_disallow_empty_file() {
        let dir = tempdir().expect("tempdir");
        fs::write(dir.path().join("robots.txt"), "").unwrap();
        let result = parse_robots_disallow(dir.path());
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_robots_disallow_no_disallow_lines() {
        let dir = tempdir().expect("tempdir");
        fs::write(
            dir.path().join("robots.txt"),
            "User-agent: *\nAllow: /\nSitemap: https://example.com/sitemap.xml\n",
        )
        .unwrap();
        let result = parse_robots_disallow(dir.path());
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_robots_disallow_multiple_user_agents() {
        let dir = tempdir().expect("tempdir");
        fs::write(
            dir.path().join("robots.txt"),
            "User-agent: Googlebot\nDisallow: /nogoogle/\n\n\
             User-agent: *\nDisallow: /secret/\n",
        )
        .unwrap();
        let result = parse_robots_disallow(dir.path());
        assert_eq!(result, vec!["/nogoogle/", "/secret/"]);
    }

    #[test]
    fn test_parse_robots_disallow_missing_file() {
        let dir = tempdir().expect("tempdir");
        let result = parse_robots_disallow(dir.path());
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_robots_disallow_empty_pattern_skipped() {
        let dir = tempdir().expect("tempdir");
        fs::write(
            dir.path().join("robots.txt"),
            "User-agent: *\nDisallow:\nDisallow: /blocked/\n",
        )
        .unwrap();
        let result = parse_robots_disallow(dir.path());
        assert_eq!(result, vec!["/blocked/"]);
    }

    // ------------------------------------------------------------------
    // is_excluded_page
    // ------------------------------------------------------------------

    #[test]
    fn test_is_excluded_page_draft() {
        let mut meta = serde_json::Map::new();
        let _ = meta.insert("draft".to_string(), serde_json::Value::Bool(true));
        assert!(is_excluded_page(Path::new("post.html"), &meta));
    }

    #[test]
    fn test_is_excluded_page_draft_string() {
        let mut meta = serde_json::Map::new();
        let _ = meta.insert(
            "draft".to_string(),
            serde_json::Value::String("true".to_string()),
        );
        assert!(is_excluded_page(Path::new("post.html"), &meta));
    }

    #[test]
    fn test_is_excluded_page_private() {
        let mut meta = serde_json::Map::new();
        let _ =
            meta.insert("private".to_string(), serde_json::Value::Bool(true));
        assert!(is_excluded_page(Path::new("post.html"), &meta));
    }

    #[test]
    fn test_is_excluded_page_404() {
        let meta = serde_json::Map::new();
        assert!(is_excluded_page(Path::new("404.html"), &meta));
    }

    #[test]
    fn test_is_excluded_page_normal() {
        let mut meta = serde_json::Map::new();
        let _ = meta.insert(
            "title".to_string(),
            serde_json::Value::String("Hello".to_string()),
        );
        assert!(!is_excluded_page(Path::new("index.html"), &meta));
    }

    #[test]
    fn test_is_excluded_page_error_page() {
        let meta = serde_json::Map::new();
        assert!(is_excluded_page(Path::new("error500.html"), &meta));
    }

    // ------------------------------------------------------------------
    // group_pages_by_section
    // ------------------------------------------------------------------

    #[test]
    fn test_group_pages_by_section() {
        let entries = vec![
            (
                "Home".to_string(),
                "/index.html".to_string(),
                "Welcome".to_string(),
            ),
            (
                "Post 1".to_string(),
                "/blog/post1.html".to_string(),
                "First".to_string(),
            ),
            (
                "Post 2".to_string(),
                "/blog/post2.html".to_string(),
                "Second".to_string(),
            ),
            (
                "API Ref".to_string(),
                "/docs/api.html".to_string(),
                "API docs".to_string(),
            ),
        ];
        let grouped = group_pages_by_section(&entries);

        assert_eq!(grouped.len(), 3);
        assert!(grouped.contains_key("Pages"));
        assert!(grouped.contains_key("Blog"));
        assert!(grouped.contains_key("Docs"));
        assert_eq!(grouped["Pages"].len(), 1);
        assert_eq!(grouped["Blog"].len(), 2);
        assert_eq!(grouped["Docs"].len(), 1);
    }

    #[test]
    fn test_group_pages_by_section_root_only() {
        let entries = vec![
            (
                "About".to_string(),
                "/about.html".to_string(),
                String::new(),
            ),
            (
                "Contact".to_string(),
                "/contact.html".to_string(),
                String::new(),
            ),
        ];
        let grouped = group_pages_by_section(&entries);
        assert_eq!(grouped.len(), 1);
        assert_eq!(grouped["Pages"].len(), 2);
    }

    #[test]
    fn test_group_pages_by_section_deterministic_order() {
        let entries = vec![
            ("Z".to_string(), "/zebra/z.html".to_string(), String::new()),
            ("A".to_string(), "/alpha/a.html".to_string(), String::new()),
            ("M".to_string(), "/middle/m.html".to_string(), String::new()),
        ];
        let grouped = group_pages_by_section(&entries);
        let keys: Vec<&String> = grouped.keys().collect();
        assert_eq!(keys, vec!["Alpha", "Middle", "Zebra"]);
    }

    // ------------------------------------------------------------------
    // generate_llms_txt
    // ------------------------------------------------------------------

    #[test]
    fn generate_llms_txt_with_full_config_includes_all_fields() {
        let dir = tempdir().expect("tempdir");
        let config = SsgConfig {
            site_name: "My Site".to_string(),
            site_description: "A great site".to_string(),
            base_url: "https://example.com".to_string(),
            language: "en".to_string(),
            ..Default::default()
        };

        generate_llms_txt(dir.path(), Some(&config)).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(body.contains("# My Site"));
        assert!(body.contains("> A great site"));
        assert!(body.contains("Language: en"));
    }

    #[test]
    fn generate_llms_txt_without_config_uses_defaults() {
        let dir = tempdir().expect("tempdir");
        generate_llms_txt(dir.path(), None).unwrap();

        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(body.contains("# Site"));
        assert!(body.contains("Language: en"));
    }

    #[test]
    fn generate_llms_txt_strips_trailing_slash_from_base_url() {
        let dir = tempdir().expect("tempdir");
        let config = SsgConfig {
            site_name: "S".to_string(),
            site_description: "D".to_string(),
            base_url: "https://example.com/".to_string(),
            ..Default::default()
        };

        write_page(dir.path(), "index.html", "Home", "Welcome", "");

        generate_llms_txt(dir.path(), Some(&config)).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            !body.contains("//index.html"),
            "trailing slash should be normalised:\n{body}"
        );
        // Positive check: the negative assertion alone would also pass if
        // the page were missing from the index altogether.
        assert!(
            body.contains("(https://example.com/index.html)"),
            "page should be listed under the canonical root:\n{body}"
        );
    }

    #[test]
    fn generate_llms_txt_into_missing_parent_returns_err() {
        let bogus = Path::new("/this/path/should/not/exist");
        assert!(generate_llms_txt(bogus, None).is_err());
    }

    #[test]
    fn test_llms_txt_contains_language() {
        let dir = tempdir().expect("tempdir");
        let config = SsgConfig {
            language: "fr".to_string(),
            ..Default::default()
        };
        generate_llms_txt(dir.path(), Some(&config)).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            body.contains("Language: fr"),
            "llms.txt must include Language field:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_contains_language_defaults_to_en() {
        let dir = tempdir().expect("tempdir");
        let config = SsgConfig {
            language: String::new(),
            ..Default::default()
        };
        generate_llms_txt(dir.path(), Some(&config)).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            body.contains("Language: en"),
            "empty language should default to en:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_excludes_drafts() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "published.html", "Published", "Visible", "");
        write_page(
            dir.path(),
            "draft.html",
            "Draft Post",
            "Hidden",
            r#""draft": true"#,
        );

        generate_llms_txt(dir.path(), None).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            body.contains("Published"),
            "published page must appear:\n{body}"
        );
        assert!(
            !body.contains("Draft Post"),
            "draft page must be excluded:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_excludes_private() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "public.html", "Public", "Visible", "");
        write_page(
            dir.path(),
            "secret.html",
            "Secret",
            "Hidden",
            r#""private": true"#,
        );

        generate_llms_txt(dir.path(), None).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            !body.contains("Secret"),
            "private page must be excluded:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_excludes_404() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "index.html", "Home", "Welcome", "");
        write_page(dir.path(), "404.html", "Not Found", "Error page", "");

        generate_llms_txt(dir.path(), None).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            !body.contains("Not Found"),
            "404 page must be excluded:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_contains_sections() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "index.html", "Home", "Welcome", "");
        write_page(dir.path(), "blog/post.html", "My Post", "A blog post", "");
        write_page(
            dir.path(),
            "docs/api.html",
            "API Docs",
            "API reference",
            "",
        );

        generate_llms_txt(dir.path(), None).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            body.contains("## Pages"),
            "should have Pages section:\n{body}"
        );
        assert!(
            body.contains("## Blog"),
            "should have Blog section:\n{body}"
        );
        assert!(
            body.contains("## Docs"),
            "should have Docs section:\n{body}"
        );
        assert!(
            body.contains("- [My Post]"),
            "should contain page link:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_contains_disallow_section() {
        let dir = tempdir().expect("tempdir");
        fs::write(
            dir.path().join("robots.txt"),
            "User-agent: *\nDisallow: /admin/\n",
        )
        .unwrap();

        generate_llms_txt(dir.path(), None).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            body.contains("## Disallow"),
            "should have Disallow section:\n{body}"
        );
        assert!(
            body.contains("- /admin/"),
            "should contain disallow pattern:\n{body}"
        );
    }

    #[test]
    fn test_llms_txt_no_disallow_without_robots() {
        let dir = tempdir().expect("tempdir");
        generate_llms_txt(dir.path(), None).unwrap();
        let body = fs::read_to_string(dir.path().join("llms.txt")).unwrap();
        assert!(
            !body.contains("## Disallow"),
            "no robots.txt means no Disallow section:\n{body}"
        );
    }

    // ------------------------------------------------------------------
    // generate_llms_full_txt
    // ------------------------------------------------------------------

    #[test]
    fn test_llms_full_txt_contains_body_content() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "index.html", "Home", "Welcome home", "");

        generate_llms_full_txt(dir.path(), None).unwrap();
        let body =
            fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
        assert!(body.contains("# Site"), "header present:\n{body}");
        assert!(body.contains("Language: en"), "language present:\n{body}");
        assert!(body.contains("## [Home]"), "page title present:\n{body}");
        assert!(body.contains("Welcome home"), "body text present:\n{body}");
    }

    #[test]
    fn test_llms_full_txt_excludes_drafts() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "ok.html", "Visible", "Content", "");
        write_page(
            dir.path(),
            "hidden.html",
            "Hidden",
            "Secret",
            r#""draft": true"#,
        );

        generate_llms_full_txt(dir.path(), None).unwrap();
        let body =
            fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
        assert!(body.contains("Visible"), "published page present:\n{body}");
        assert!(!body.contains("Hidden"), "draft excluded:\n{body}");
    }

    #[test]
    fn test_llms_full_txt_excludes_404() {
        let dir = tempdir().expect("tempdir");
        write_page(dir.path(), "index.html", "Home", "Welcome", "");
        write_page(dir.path(), "404.html", "Not Found", "Error", "");

        generate_llms_full_txt(dir.path(), None).unwrap();
        let body =
            fs::read_to_string(dir.path().join("llms-full.txt")).unwrap();
        assert!(!body.contains("Not Found"), "404 excluded:\n{body}");
    }

    // ------------------------------------------------------------------
    // Text helpers
    // ------------------------------------------------------------------

    #[test]
    fn test_strip_html_tags() {
        assert_eq!(strip_html_tags("<p>hello</p>"), "hello");
        assert_eq!(strip_html_tags("<div><b>bold</b> text</div>"), "bold text");
        assert_eq!(strip_html_tags("no tags"), "no tags");
        assert_eq!(strip_html_tags(""), "");
    }

    #[test]
    fn test_extract_body() {
        let html =
            "<html><head><title>T</title></head><body>Content</body></html>";
        assert_eq!(extract_body(html), "Content");
    }

    #[test]
    fn test_extract_body_with_attributes() {
        let html = "<html><body class=\"main\">Content</body></html>";
        assert_eq!(extract_body(html), "Content");
    }

    #[test]
    fn test_extract_body_no_body_tag() {
        let html = "<p>Just a fragment</p>";
        assert_eq!(extract_body(html), html);
    }

    #[test]
    fn test_collapse_whitespace() {
        assert_eq!(collapse_whitespace("  hello   world  "), "hello world");
        assert_eq!(collapse_whitespace("no extra"), "no extra");
        assert_eq!(collapse_whitespace(""), "");
    }

    #[test]
    fn test_titlecase_word() {
        assert_eq!(titlecase_word("blog"), "Blog");
        assert_eq!(titlecase_word("DOCS"), "Docs");
        assert_eq!(titlecase_word(""), "");
        assert_eq!(titlecase_word("a"), "A");
    }

    // ------------------------------------------------------------------
    // after_compile
    // ------------------------------------------------------------------

    #[test]
    fn after_compile_missing_site_dir_returns_ok_without_writing() {
        let dir = tempdir().expect("tempdir");
        let missing = dir.path().join("missing");
        let ctx =
            PluginContext::new(dir.path(), dir.path(), &missing, dir.path());

        AiPlugin.after_compile(&ctx).expect("missing site is fine");
        assert!(!missing.exists());
        assert!(!dir.path().join("llms.txt").exists());
    }

    #[test]
    fn after_compile_injects_max_snippet_meta_tag() {
        let (_tmp, site, ctx) = make_site();
        let html = "<html><head><title>X</title></head><body></body></html>";
        fs::write(site.join("index.html"), html).unwrap();

        AiPlugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(site.join("index.html")).unwrap();
        assert!(output.contains("max-snippet"));
        assert!(output.contains("max-image-preview:large"));
    }

    #[test]
    fn after_compile_creates_llms_txt_in_site_root() {
        let (_tmp, site, ctx) = make_site();
        AiPlugin.after_compile(&ctx).unwrap();
        assert!(site.join("llms.txt").exists());
    }

    #[test]
    fn after_compile_creates_llms_full_txt_in_site_root() {
        let (_tmp, site, ctx) = make_site();
        AiPlugin.after_compile(&ctx).unwrap();
        assert!(site.join("llms-full.txt").exists());
    }

    #[test]
    fn after_compile_idempotent_does_not_duplicate_meta_tag() {
        let (_tmp, site, ctx) = make_site();
        let html = "<html><head><title>X</title></head><body></body></html>";
        fs::write(site.join("index.html"), html).unwrap();

        AiPlugin.after_compile(&ctx).unwrap();
        AiPlugin.after_compile(&ctx).unwrap();

        let output = fs::read_to_string(site.join("index.html")).unwrap();
        assert_eq!(output.matches("max-snippet").count(), 1);
    }

    #[test]
    fn after_compile_skips_html_files_without_head_tag() {
        let (_tmp, site, ctx) = make_site();
        fs::write(site.join("fragment.html"), "<p>just a fragment</p>")
            .unwrap();

        AiPlugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(site.join("fragment.html")).unwrap();
        assert!(!output.contains("max-snippet"));
        assert_eq!(output, "<p>just a fragment</p>");
    }

    #[test]
    fn after_compile_processes_files_in_subdirectories() {
        let (_tmp, site, ctx) = make_site();
        let nested = site.join("blog");
        fs::create_dir_all(&nested).unwrap();
        fs::write(
            nested.join("post.html"),
            "<html><head></head><body></body></html>",
        )
        .unwrap();

        AiPlugin.after_compile(&ctx).unwrap();
        let output = fs::read_to_string(nested.join("post.html")).unwrap();
        assert!(output.contains("max-snippet"));
    }

    #[test]
    fn after_compile_logs_warning_for_pages_with_missing_alt() {
        let (_tmp, site, ctx) = make_site();
        fs::write(
            site.join("bad.html"),
            r#"<html><head></head><body><img src="a.jpg"></body></html>"#,
        )
        .unwrap();
        fs::write(
            site.join("worse.html"),
            r#"<html><head></head><body><img src="a.jpg" alt=""></body></html>"#,
        )
        .unwrap();

        AiPlugin.after_compile(&ctx).unwrap();
        let bad = fs::read_to_string(site.join("bad.html")).unwrap();
        assert!(bad.contains("max-snippet"));

        // The log output is not captured, so check the count that drives
        // the warning: both pages must be flagged.
        let pages = collect_html_files(&site).unwrap();
        assert_eq!(process_html_for_ai(&pages, &site).unwrap(), 2);
    }

    #[test]
    fn after_compile_does_not_rewrite_unchanged_files() {
        let (_tmp, site, ctx) = make_site();
        let page = site.join("index.html");
        let html = "<html><head><meta name=\"robots\" content=\"max-snippet:-1\"></head><body></body></html>";
        fs::write(&page, html).unwrap();
        let original_mtime = fs::metadata(&page).unwrap().modified().unwrap();

        AiPlugin.after_compile(&ctx).unwrap();

        let after = fs::read_to_string(&page).unwrap();
        assert_eq!(after, html, "unchanged file body must be preserved");
        // A file that is never written keeps its modification time.
        let current_mtime = fs::metadata(&page).unwrap().modified().unwrap();
        assert_eq!(
            current_mtime, original_mtime,
            "unchanged file must not be rewritten"
        );
    }

    // ------------------------------------------------------------------
    // collect_html_files
    // ------------------------------------------------------------------

    #[test]
    fn collect_html_files_returns_empty_for_missing_directory() {
        let dir = tempdir().expect("tempdir");
        let result = collect_html_files(&dir.path().join("missing")).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn collect_html_files_filters_non_html_extensions() {
        let dir = tempdir().expect("tempdir");
        fs::write(dir.path().join("a.html"), "").unwrap();
        fs::write(dir.path().join("b.css"), "").unwrap();
        fs::write(dir.path().join("c.js"), "").unwrap();

        let result = collect_html_files(dir.path()).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn collect_html_files_recurses_into_nested_subdirectories() {
        let dir = tempdir().expect("tempdir");
        let nested = dir.path().join("a").join("b");
        fs::create_dir_all(&nested).unwrap();
        fs::write(dir.path().join("top.html"), "").unwrap();
        fs::write(nested.join("deep.html"), "").unwrap();

        let result = collect_html_files(dir.path()).unwrap();
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn collect_html_files_returns_results_sorted() {
        let dir = tempdir().expect("tempdir");
        for name in ["zebra.html", "apple.html", "mango.html"] {
            fs::write(dir.path().join(name), "").unwrap();
        }
        let result = collect_html_files(dir.path()).unwrap();
        let names: Vec<_> = result
            .iter()
            .map(|p| p.file_name().unwrap().to_str().unwrap())
            .collect();
        assert_eq!(names, vec!["apple.html", "mango.html", "zebra.html"]);
    }
}