Skip to main content

ssg/seo/
helpers.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Internal helper functions for SEO plugins.
5
6use anyhow::Result;
7use std::path::{Path, PathBuf};
8
9/// Extract the page title from the `<title>` tag.
10pub fn extract_title(html: &str) -> String {
11    if let Some(start) = html.find("<title>") {
12        let after = &html[start + 7..];
13        if let Some(end) = after.find("</title>") {
14            let title = strip_tags(&after[..end]);
15            let trimmed = title.trim();
16            if !trimmed.is_empty() {
17                return trimmed.to_string();
18            }
19        }
20    }
21    String::new()
22}
23
24/// Extract plain text from the page content, strip tags, and truncate to
25/// `max_len` characters.
26///
27/// Prefers `<main>` content if present. Falls back to `<body>` with nav,
28/// header, footer, script, and style blocks removed.
29pub(super) fn extract_description(html: &str, max_len: usize) -> String {
30    let content = extract_main_content(html);
31
32    let clean = strip_inline_tags(&content, &["script", "style"]);
33
34    let text = strip_tags(&clean);
35    let trimmed = text.trim();
36    truncate_at_word_boundary(trimmed, max_len)
37}
38
39/// Extracts the inner content of `<main>`, or falls back to `<body>` with
40/// non-content elements removed.
41fn extract_main_content(html: &str) -> String {
42    if let Some(inner) = extract_tag_inner(html, "main") {
43        return inner;
44    }
45
46    let body =
47        extract_tag_inner(html, "body").unwrap_or_else(|| html.to_string());
48    strip_inline_tags(&body, &["script", "style", "nav", "header", "footer"])
49}
50
/// Extracts the inner HTML of the first `<tag_name ...>...</tag_name>` element.
///
/// The opening tag must be exactly `tag_name`: the character right after the
/// name has to be `>`, `/` or whitespace, so asking for `head` does not match
/// `<header>`. Matching is case-sensitive.
///
/// Returns `None` when no such opening tag exists or the opening tag is never
/// terminated by `>`. If the closing tag is missing, everything after the
/// opening tag is returned.
fn extract_tag_inner(html: &str, tag_name: &str) -> Option<String> {
    let open = format!("<{tag_name}");
    let close = format!("</{tag_name}>");

    // First `<tag_name` occurrence whose name really ends there.
    let start = html
        .match_indices(open.as_str())
        .map(|(idx, _)| idx)
        .find(|&idx| {
            matches!(
                html[idx + open.len()..].chars().next(),
                Some(c) if c == '>' || c == '/' || c.is_whitespace()
            )
        })?;

    let after = &html[start..];
    // Skip past the attributes of the opening tag.
    let gt = after.find('>')?;
    let inner = &after[gt + 1..];
    // An unclosed element extends to the end of the input.
    let end = inner.find(&close).unwrap_or(inner.len());
    Some(inner[..end].to_string())
}
65
/// Removes every `<tag ...>...</tag>` block for each tag name in `tags`,
/// replacing each removed block with a single space.
///
/// An opening tag only counts when the name is followed by `>`, `/` or
/// whitespace, so stripping `nav` leaves `<navigation>` elements alone.
/// Matching is case-sensitive and does not understand nesting: a block ends
/// at the first closing tag after its opening tag. An opening tag without any
/// later closing tag is left in place.
fn strip_inline_tags(html: &str, tags: &[&str]) -> String {
    let mut clean = html.to_string();
    for tag in tags {
        let open = format!("<{tag}");
        let close = format!("</{tag}>");
        // Byte offset from which the next search starts; always on a char
        // boundary because it is derived from `find` results.
        let mut cursor = 0;
        while let Some(found) = clean[cursor..].find(&open) {
            let start = cursor + found;
            let name_end = start + open.len();
            let is_exact_tag = matches!(
                clean[name_end..].chars().next(),
                Some(c) if c == '>' || c == '/' || c.is_whitespace()
            );
            if !is_exact_tag {
                // A longer tag name sharing this prefix: skip it, keep looking.
                cursor = name_end;
                continue;
            }
            match clean[start..].find(&close) {
                Some(end) => {
                    clean.replace_range(start..start + end + close.len(), " ");
                    // Resume at the inserted space.
                    cursor = start;
                }
                // No closing tag anywhere after this point: nothing more to
                // remove for this tag name.
                None => break,
            }
        }
    }
    clean
}
82
/// Truncates `text` to at most `max_len` bytes, cutting at a word boundary.
///
/// * Text that already fits is returned unchanged.
/// * The cut position is first moved back to a UTF-8 character boundary, so
///   multi-byte text never causes a panic.
/// * If the cut lands exactly between two words, every word that fits is
///   kept. Otherwise the partially cut last word is dropped.
/// * If the kept prefix contains no whitespace at all (one very long word),
///   the hard-cut prefix is returned.
///
/// The result never ends in whitespace when a word boundary was used.
fn truncate_at_word_boundary(text: &str, max_len: usize) -> String {
    if text.len() <= max_len {
        return text.to_string();
    }

    // Back off to the nearest char boundary at or below `max_len`.
    let mut end = max_len;
    while end > 0 && !text.is_char_boundary(end) {
        end -= 1;
    }
    let (head, tail) = text.split_at(end);

    // The cut fell right after a complete word: keep it instead of
    // discarding it along with a non-existent partial word.
    if tail.starts_with(char::is_whitespace) {
        return head.trim_end().to_string();
    }

    match head.rfind(char::is_whitespace) {
        Some(last_gap) => head[..last_gap].trim_end().to_string(),
        None => head.to_string(),
    }
}
99
100/// Remove all HTML tags and collapse whitespace.
101pub(super) fn strip_tags(html: &str) -> String {
102    let mut result = String::with_capacity(html.len());
103    let mut in_tag = false;
104    for ch in html.chars() {
105        match ch {
106            '<' => in_tag = true,
107            '>' => {
108                in_tag = false;
109                result.push(' ');
110            }
111            _ if !in_tag => result.push(ch),
112            _ => {}
113        }
114    }
115    // Collapse whitespace
116    let mut collapsed = String::with_capacity(result.len());
117    let mut prev_space = false;
118    for ch in result.chars() {
119        if ch.is_whitespace() {
120            if !prev_space {
121                collapsed.push(' ');
122                prev_space = true;
123            }
124        } else {
125            collapsed.push(ch);
126            prev_space = false;
127        }
128    }
129    collapsed.trim().to_string()
130}
131
/// Collect all `.html` files under `dir` (delegates to `crate::walk`).
///
/// Thin wrapper: forwards `dir` and the extension string `"html"` to
/// `crate::walk::walk_files` and returns its result unchanged, including any
/// error it reports.
#[allow(dead_code)] // used only by tests in seo::mod
pub(super) fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
    crate::walk::walk_files(dir, "html")
}
137
138/// Escape a string for safe inclusion in an HTML attribute value.
139pub(super) fn escape_attr(s: &str) -> String {
140    s.replace('&', "&amp;")
141        .replace('"', "&quot;")
142        .replace('<', "&lt;")
143        .replace('>', "&gt;")
144}
145
/// Reports whether `html` contains a real `<meta>` tag for `attr`.
///
/// A tag counts when it starts with `<meta property=` or `<meta name=`
/// immediately followed by `attr` in double or single quotes. The attribute
/// must therefore be the first one in the tag.
///
/// Staticdatagen may emit empty comment blocks like:
/// ```html
/// <!-- # Start Open Graph / Facebook Meta Tags -->
/// <!-- # End Open Graph / Facebook Meta Tags -->
/// ```
/// Those markers do not contain `<meta` and so are not treated as a present
/// tag.
pub fn has_meta_tag(html: &str, attr: &str) -> bool {
    ["property", "name"].iter().any(|key| {
        ['"', '\''].iter().any(|quote| {
            let needle = format!("<meta {key}={quote}{attr}{quote}");
            html.contains(needle.as_str())
        })
    })
}
160
161/// Extract the canonical URL from a `<link rel="canonical">` tag.
162pub(super) fn extract_canonical(html: &str) -> String {
163    if let Some(pos) = html.find("rel=\"canonical\"") {
164        let region_start = pos.saturating_sub(200);
165        let region = &html[region_start..html.len().min(pos + 200)];
166        if let Some(href_start) = region.find("href=\"") {
167            let after = &region[href_start + 6..];
168            if let Some(end) = after.find('"') {
169                return after[..end].to_string();
170            }
171        }
172    }
173    String::new()
174}
175
176/// Extract the content of a specific meta tag by name or property.
177pub(super) fn extract_existing_meta(html: &str, attr: &str) -> String {
178    for prefix in &[
179        format!("<meta name=\"{attr}\" content=\""),
180        format!("<meta property=\"{attr}\" content=\""),
181        format!("<meta name='{attr}' content='"),
182        format!("<meta property='{attr}' content='"),
183    ] {
184        if let Some(pos) = html.find(prefix.as_str()) {
185            let after = &html[pos + prefix.len()..];
186            let delim = if prefix.ends_with('\'') { '\'' } else { '"' };
187            if let Some(end) = after.find(delim) {
188                let value = after[..end].trim();
189                if !value.is_empty() {
190                    return value.to_string();
191                }
192            }
193        }
194    }
195    String::new()
196}
197
198/// Extract the `lang` attribute from the `<html>` tag.
199pub(super) fn extract_html_lang(html: &str) -> String {
200    if let Some(start) = html.find("<html") {
201        let tag_end = html[start..].find('>').unwrap_or(200);
202        let tag = &html[start..start + tag_end];
203        if let Some(lang_pos) = tag.find("lang=\"") {
204            let after = &tag[lang_pos + 6..];
205            if let Some(end) = after.find('"') {
206                return after[..end].to_string();
207            }
208        }
209        if let Some(lang_pos) = tag.find("lang='") {
210            let after = &tag[lang_pos + 6..];
211            if let Some(end) = after.find('\'') {
212                return after[..end].to_string();
213            }
214        }
215    }
216    String::new()
217}
218
219/// Extract the first image URL from `<main>` or `<article>` content.
220pub(super) fn extract_first_content_image(html: &str) -> String {
221    // Look in <main> or <article> first
222    let search_region = if let Some(start) = html.find("<main") {
223        &html[start..]
224    } else if let Some(start) = html.find("<article") {
225        &html[start..]
226    } else {
227        return String::new();
228    };
229
230    if let Some(img_pos) = search_region.find("<img") {
231        let after_img = &search_region[img_pos..];
232        let tag_end = after_img.find('>').unwrap_or(500).min(500);
233        let img_tag = &after_img[..tag_end];
234        if let Some(src_pos) = img_tag.find("src=\"") {
235            let after_src = &img_tag[src_pos + 5..];
236            if let Some(end) = after_src.find('"') {
237                return after_src[..end].to_string();
238            }
239        }
240    }
241    String::new()
242}
243
244/// Extract the author name from `<meta name="author">` or byline markup.
245pub(super) fn extract_meta_author(html: &str) -> String {
246    // Try meta tag first
247    let from_meta = extract_existing_meta(html, "author");
248    if !from_meta.is_empty() {
249        return from_meta;
250    }
251    // Try <span class="author"> or similar byline patterns
252    for pattern in &["class=\"author\">", "class='author'>", "rel=\"author\">"]
253    {
254        if let Some(pos) = html.find(pattern) {
255            let after = &html[pos + pattern.len()..];
256            if let Some(end) = after.find('<') {
257                let name = after[..end].trim();
258                // Strip "by " prefix
259                let name = name.strip_prefix("by ").unwrap_or(name).trim();
260                if !name.is_empty() {
261                    return name.to_string();
262                }
263            }
264        }
265    }
266    String::new()
267}
268
269/// Extract a date from an existing JSON-LD block in the HTML.
270pub(super) fn extract_date_from_html(
271    html: &str,
272    field: &str,
273) -> Option<String> {
274    let pattern = format!("\"{field}\":\"");
275    if let Some(pos) = html.find(&pattern) {
276        let after = &html[pos + pattern.len()..];
277        if let Some(end) = after.find('"') {
278            let date = &after[..end];
279            if !date.is_empty() {
280                return Some(date.to_string());
281            }
282        }
283    }
284    None
285}
286
287/// Extract a date from `<time datetime="...">` or `<meta property="article:published_time">`.
288pub(super) fn extract_meta_date(html: &str) -> Option<String> {
289    // Try article:published_time meta
290    let meta = extract_existing_meta(html, "article:published_time");
291    if !meta.is_empty() {
292        return Some(meta);
293    }
294    // Try first <time datetime="..."> in the page
295    if let Some(pos) = html.find("datetime=\"") {
296        let after = &html[pos + 10..];
297        if let Some(end) = after.find('"') {
298            let date = &after[..end];
299            if !date.is_empty() {
300                return Some(date.to_string());
301            }
302        }
303    }
304    None
305}
306
/// Recursively collects HTML files (delegates to `crate::walk`).
///
/// Thin wrapper: forwards `dir` and the extension string `"html"` to
/// `crate::walk::walk_files` and returns its result unchanged. The body is
/// the same call that `collect_html_files` makes.
#[allow(dead_code)] // used only by tests in seo::mod
pub(super) fn collect_html_files_recursive(dir: &Path) -> Result<Vec<PathBuf>> {
    crate::walk::walk_files(dir, "html")
}
312
// Unit tests for the SEO helper functions in this module. File-system tests
// create their fixtures in a temporary directory from the `tempfile` crate.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    // ---- extract_title ----

    #[test]
    fn extract_title_from_html() {
        let html = "<html><head><title>Test Page</title></head></html>";
        assert_eq!(extract_title(html), "Test Page");
    }

    #[test]
    fn extract_title_empty_no_tag() {
        let html = "<html><head></head><body>Hello</body></html>";
        assert_eq!(extract_title(html), "");
    }

    #[test]
    fn extract_title_empty_tag() {
        let html = "<html><head><title></title></head></html>";
        assert_eq!(extract_title(html), "");
    }

    #[test]
    fn extract_title_nested_tags() {
        let html = "<title><span>Inner</span></title>";
        // strip_tags removes the inner span, leaving "Inner"
        assert_eq!(extract_title(html), "Inner");
    }

    // ---- extract_description ----

    #[test]
    fn extract_description_from_body() {
        let html = "<html><body><main><p>Short description here.</p></main></body></html>";
        let desc = extract_description(html, 200);
        assert!(desc.contains("Short description here"));
    }

    #[test]
    fn extract_description_truncation() {
        // 100 repetitions of "word " is far longer than the 50-byte limit.
        let long_text = "word ".repeat(100);
        let html = format!("<main><p>{long_text}</p></main>");
        let desc = extract_description(&html, 50);
        assert!(desc.len() <= 50);
    }

    // ---- strip_tags ----

    #[test]
    fn strip_tags_basic() {
        assert_eq!(strip_tags("<p>Hello <b>world</b></p>"), "Hello world");
    }

    #[test]
    fn strip_tags_empty() {
        assert_eq!(strip_tags(""), "");
    }

    #[test]
    fn strip_tags_no_tags() {
        assert_eq!(strip_tags("plain text"), "plain text");
    }

    #[test]
    fn strip_tags_self_closing() {
        let result = strip_tags("<img src=\"x\"/>text");
        assert!(result.contains("text"));
        assert!(!result.contains("img"));
    }

    // ---- truncate_at_word_boundary ----

    #[test]
    fn truncate_short_text_unchanged() {
        assert_eq!(truncate_at_word_boundary("short", 100), "short");
    }

    #[test]
    fn truncate_long_text_at_word() {
        let text = "one two three four five six";
        let result = truncate_at_word_boundary(text, 15);
        assert!(result.len() <= 15);
        // Should cut at a space
        assert!(!result.ends_with(' '));
        assert_eq!(result, "one two three");
    }

    #[test]
    fn truncate_unicode() {
        // Each of these characters is three bytes in UTF-8, so byte offset 15
        // falls inside a character.
        let text = "日本語 テスト データ";
        let result = truncate_at_word_boundary(text, 15);
        // Must not panic on multi-byte boundaries
        assert!(result.len() <= 15);
    }

    // ---- collect_html_files / collect_html_files_recursive ----

    #[test]
    fn collect_html_files_finds_files() {
        let tmp = tempdir().unwrap();
        let sub = tmp.path().join("sub");
        fs::create_dir_all(&sub).unwrap();
        fs::write(tmp.path().join("index.html"), "<html></html>").unwrap();
        fs::write(sub.join("page.html"), "<html></html>").unwrap();

        // One file at the top level and one in the sub-directory.
        let files = collect_html_files(tmp.path()).unwrap();
        assert_eq!(files.len(), 2);
    }

    #[test]
    fn collect_html_files_recursive_finds_files() {
        let tmp = tempdir().unwrap();
        let sub = tmp.path().join("sub");
        fs::create_dir_all(&sub).unwrap();
        fs::write(tmp.path().join("index.html"), "<html></html>").unwrap();
        fs::write(sub.join("page.html"), "<html></html>").unwrap();
        // A non-HTML file that must not be reported.
        fs::write(sub.join("style.css"), "body{}").unwrap();

        let files = collect_html_files_recursive(tmp.path()).unwrap();
        assert_eq!(files.len(), 2);
        assert!(files.iter().all(|p| p.extension().unwrap() == "html"));
    }

    #[test]
    fn collect_html_files_recursive_empty_dir() {
        let tmp = tempdir().unwrap();
        let files = collect_html_files_recursive(tmp.path()).unwrap();
        assert!(files.is_empty());
    }

    // ---- escape_attr ----

    #[test]
    fn escape_attr_special_chars() {
        assert_eq!(escape_attr("a&b<c>d\"e"), "a&amp;b&lt;c&gt;d&quot;e");
    }

    // ---- has_meta_tag ----

    #[test]
    fn has_meta_tag_present() {
        let html = r#"<meta property="og:title" content="Hi">"#;
        assert!(has_meta_tag(html, "og:title"));
    }

    #[test]
    fn has_meta_tag_absent() {
        let html = "<html><head></head></html>";
        assert!(!has_meta_tag(html, "og:title"));
    }

    // ---- extract_canonical ----

    #[test]
    fn extract_canonical_found() {
        let html = r#"<link rel="canonical" href="https://example.com/page">"#;
        assert_eq!(extract_canonical(html), "https://example.com/page");
    }

    #[test]
    fn extract_canonical_missing() {
        let html = "<html><head></head></html>";
        assert_eq!(extract_canonical(html), "");
    }

    // ---- extract_existing_meta ----

    #[test]
    fn extract_existing_meta_by_name() {
        let html = r#"<meta name="author" content="Alice">"#;
        assert_eq!(extract_existing_meta(html, "author"), "Alice");
    }

    // ---- extract_html_lang ----

    #[test]
    fn extract_html_lang_found() {
        let html = r#"<html lang="fr"><head></head></html>"#;
        assert_eq!(extract_html_lang(html), "fr");
    }

    #[test]
    fn extract_html_lang_missing() {
        let html = "<html><head></head></html>";
        assert_eq!(extract_html_lang(html), "");
    }

    // ---- extract_date_from_html ----

    #[test]
    fn extract_date_from_html_found() {
        let html = r#"{"datePublished":"2025-01-15"}"#;
        assert_eq!(
            extract_date_from_html(html, "datePublished"),
            Some("2025-01-15".to_string())
        );
    }

    #[test]
    fn extract_date_from_html_missing() {
        assert_eq!(
            extract_date_from_html("<html></html>", "datePublished"),
            None
        );
    }
}