ssg/seo/
jsonld.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! JSON-LD structured data injection plugin.
5
6use super::helpers::{
7    extract_date_from_html, extract_description, extract_first_content_image,
8    extract_html_lang, extract_meta_author, extract_meta_date, extract_title,
9};
10use crate::plugin::{Plugin, PluginContext};
11use anyhow::Result;
12use std::path::Path;
13
/// Configuration for the JSON-LD structured data plugin.
///
/// Consumed by [`JsonLdPlugin`] when it builds the blocks injected into
/// each page.
#[derive(Debug, Clone)]
pub struct JsonLdConfig {
    /// Base URL of the site, used as the prefix of every absolute URL in
    /// the generated JSON-LD. Trailing `/` characters are trimmed before
    /// use, so `https://example.com` and `https://example.com/` behave the
    /// same.
    pub base_url: String,
    /// Organization name, emitted as the `publisher.name` of `Article`
    /// blocks.
    pub org_name: String,
    /// Whether to also emit a `BreadcrumbList` derived from each page's
    /// path. A page whose path has no segments besides `index.html` never
    /// gets one, regardless of this flag.
    pub breadcrumbs: bool,
}
24
/// Injects JSON-LD structured data into HTML files.
///
/// Auto-detects the schema.org type from the page markup:
/// - Pages whose HTML contains `<article` → `Article`
/// - All other pages → `WebPage`
/// - `BreadcrumbList` derived from the URL path (only when
///   `config.breadcrumbs` is set)
///
/// Idempotent: pages that already contain `application/ld+json` are
/// returned unchanged, as are pages without a `</head>` tag to inject
/// before.
#[derive(Debug, Clone)]
pub struct JsonLdPlugin {
    /// Settings applied to every page this plugin transforms.
    pub(crate) config: JsonLdConfig,
}
37
38impl JsonLdPlugin {
39    /// Creates a new `JsonLdPlugin` with the given configuration.
40    #[must_use]
41    pub const fn new(config: JsonLdConfig) -> Self {
42        Self { config }
43    }
44
45    /// Creates a `JsonLdPlugin` from site config values.
46    #[must_use]
47    pub fn from_site(base_url: &str, site_name: &str) -> Self {
48        Self {
49            config: JsonLdConfig {
50                base_url: base_url.to_string(),
51                org_name: site_name.to_string(),
52                breadcrumbs: true,
53            },
54        }
55    }
56}
57
58/// Builds an Article JSON-LD object from page metadata.
59fn build_article_jsonld(
60    title: &str,
61    description: &str,
62    page_url: &str,
63    org_name: &str,
64    author_name: &str,
65    image_url: &str,
66    date_published: Option<&String>,
67    date_modified: Option<&String>,
68    lang: &str,
69) -> serde_json::Value {
70    let mut article = serde_json::json!({
71        "@context": "https://schema.org",
72        "@type": "Article",
73        "headline": title,
74        "description": description,
75        "url": page_url,
76        "inLanguage": if lang.is_empty() { "en" } else { lang },
77        "mainEntityOfPage": {
78            "@type": "WebPage",
79            "@id": page_url
80        },
81        "publisher": {
82            "@type": "Organization",
83            "name": org_name
84        }
85    });
86
87    if !author_name.is_empty() {
88        article["author"] = serde_json::json!({
89            "@type": "Person",
90            "name": author_name
91        });
92    }
93
94    if !image_url.is_empty() {
95        article["image"] = serde_json::json!({
96            "@type": "ImageObject",
97            "url": image_url
98        });
99    }
100
101    if let Some(dp) = date_published {
102        article["datePublished"] = serde_json::json!(dp);
103    }
104    if let Some(dm) = date_modified {
105        article["dateModified"] = serde_json::json!(dm);
106    } else if let Some(dp) = date_published {
107        article["dateModified"] = serde_json::json!(dp);
108    }
109
110    article
111}
112
113/// Builds a `WebPage` JSON-LD object from page metadata.
114fn build_webpage_jsonld(
115    title: &str,
116    description: &str,
117    page_url: &str,
118    author_name: &str,
119    image_url: &str,
120    date_published: Option<&String>,
121    lang: &str,
122) -> serde_json::Value {
123    let mut webpage = serde_json::json!({
124        "@context": "https://schema.org",
125        "@type": "WebPage",
126        "name": title,
127        "description": description,
128        "url": page_url,
129        "inLanguage": if lang.is_empty() { "en" } else { lang }
130    });
131
132    if !author_name.is_empty() {
133        webpage["author"] = serde_json::json!({
134            "@type": "Person",
135            "name": author_name
136        });
137    }
138
139    if !image_url.is_empty() {
140        webpage["image"] = serde_json::json!({
141            "@type": "ImageObject",
142            "url": image_url
143        });
144    }
145
146    if let Some(dp) = date_published {
147        webpage["datePublished"] = serde_json::json!(dp);
148    }
149
150    webpage
151}
152
153/// Builds a `BreadcrumbList` JSON-LD object from the URL path, if applicable.
154fn build_breadcrumb_jsonld(
155    base: &str,
156    rel_path: &str,
157) -> Option<serde_json::Value> {
158    let parts: Vec<&str> = rel_path
159        .trim_matches('/')
160        .split('/')
161        .filter(|p| !p.is_empty() && *p != "index.html")
162        .collect();
163
164    if parts.is_empty() {
165        return None;
166    }
167
168    let mut items = vec![serde_json::json!({
169        "@type": "ListItem",
170        "position": 1,
171        "name": "Home",
172        "item": format!("{}/", base)
173    })];
174
175    let mut accumulated = String::new();
176    for (i, part) in parts.iter().enumerate() {
177        accumulated = format!("{accumulated}/{part}");
178        let name = part.trim_end_matches(".html").replace('-', " ");
179        items.push(serde_json::json!({
180            "@type": "ListItem",
181            "position": i + 2,
182            "name": name,
183            "item": format!("{}{}", base, accumulated)
184        }));
185    }
186
187    Some(serde_json::json!({
188        "@context": "https://schema.org",
189        "@type": "BreadcrumbList",
190        "itemListElement": items
191    }))
192}
193
194/// Builds all JSON-LD scripts for a single page.
195fn build_jsonld_scripts(
196    html: &str,
197    base: &str,
198    rel_path: &str,
199    org_name: &str,
200    breadcrumbs: bool,
201) -> Vec<serde_json::Value> {
202    let title = extract_title(html);
203    let description = extract_description(html, 160);
204    let page_url = format!("{base}/{rel_path}");
205    let author_name = extract_meta_author(html);
206    let image_url = extract_first_content_image(html);
207    let date_published = extract_date_from_html(html, "datePublished")
208        .or_else(|| extract_meta_date(html));
209    let date_modified = extract_date_from_html(html, "dateModified");
210    let lang = extract_html_lang(html);
211
212    let mut scripts = Vec::new();
213
214    if html.contains("<article") {
215        scripts.push(build_article_jsonld(
216            &title,
217            &description,
218            &page_url,
219            org_name,
220            &author_name,
221            &image_url,
222            date_published.as_ref(),
223            date_modified.as_ref(),
224            &lang,
225        ));
226    } else {
227        scripts.push(build_webpage_jsonld(
228            &title,
229            &description,
230            &page_url,
231            &author_name,
232            &image_url,
233            date_published.as_ref(),
234            &lang,
235        ));
236    }
237
238    if breadcrumbs {
239        if let Some(breadcrumb) = build_breadcrumb_jsonld(base, rel_path) {
240            scripts.push(breadcrumb);
241        }
242    }
243
244    scripts
245}
246
247impl Plugin for JsonLdPlugin {
248    fn name(&self) -> &'static str {
249        "json-ld"
250    }
251
252    fn has_transform(&self) -> bool {
253        true
254    }
255
256    fn transform_html(
257        &self,
258        html: &str,
259        path: &Path,
260        ctx: &PluginContext,
261    ) -> Result<String> {
262        if html.contains("application/ld+json") {
263            return Ok(html.to_string());
264        }
265
266        let Some(head_pos) = html.find("</head>") else {
267            return Ok(html.to_string());
268        };
269
270        let base = self.config.base_url.trim_end_matches('/');
271        let site_dir = &ctx.site_dir;
272
273        let rel_path = path
274            .strip_prefix(site_dir)
275            .unwrap_or(path)
276            .to_string_lossy()
277            .replace('\\', "/");
278
279        let scripts = build_jsonld_scripts(
280            html,
281            base,
282            &rel_path,
283            &self.config.org_name,
284            self.config.breadcrumbs,
285        );
286
287        let mut injection = String::new();
288        for script in &scripts {
289            let json = serde_json::to_string(script)?;
290            injection.push_str(&format!(
291                "<script type=\"application/ld+json\">{json}</script>\n"
292            ));
293        }
294
295        let result =
296            format!("{}{}{}", &html[..head_pos], injection, &html[head_pos..]);
297        Ok(result)
298    }
299
300    fn after_compile(&self, _ctx: &PluginContext) -> Result<()> {
301        Ok(())
302    }
303}
304
305// =====================================================================
306// JSON-LD validation (resolves #467)
307// =====================================================================
308
/// A single validation failure against a JSON-LD block.
///
/// Rendered by its `Display` impl as
/// ``[schema_type] missing/invalid `field` — reason``.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct JsonLdValidationError {
    /// The schema.org `@type` of the offending block, or `"Unparseable"`
    /// when the block's payload was not valid JSON.
    pub schema_type: String,
    /// Required field that was missing or had the wrong shape. For
    /// breadcrumb entries this is an indexed path such as
    /// `itemListElement[0].position`; for unparseable blocks it is
    /// `(payload)`.
    pub field: String,
    /// Human-readable reason.
    pub reason: String,
}
319
320impl std::fmt::Display for JsonLdValidationError {
321    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
322        write!(
323            f,
324            "[{}] missing/invalid `{}` — {}",
325            self.schema_type, self.field, self.reason
326        )
327    }
328}
329
330/// Walks an HTML string, extracts every `<script type="application/ld+json">`
331/// block, parses it as JSON, and validates required fields per
332/// schema.org `@type`.
333///
334/// Supported types (with their required-field guards):
335///
336/// - **`Article`** — `headline`, `datePublished`, `author`, `image`
337/// - **`WebPage`** — `name` (Google rich-results requirement; `url`
338///   and `inLanguage` are Recommended only and not flagged here)
339/// - **`BreadcrumbList`** — `itemListElement` (non-empty array)
340/// - **`FAQPage`** — `mainEntity` (non-empty array of `Question`)
341/// - **`LocalBusiness`** — `name`, `address`
342/// - **`Organization`** — `name`, `url`
343///
344/// Returns the empty vector if every block parses and passes its
345/// required-field check. Unknown `@type` values are treated as
346/// pass-through (no required fields enforced) so user-extended
347/// schemas don't trigger false negatives.
348#[must_use]
349pub fn validate_jsonld(html: &str) -> Vec<JsonLdValidationError> {
350    let mut errors = Vec::new();
351
352    for block in extract_jsonld_blocks(html) {
353        match serde_json::from_str::<serde_json::Value>(&block) {
354            Ok(value) => validate_one(&value, &mut errors),
355            Err(parse_err) => {
356                errors.push(JsonLdValidationError {
357                    schema_type: "Unparseable".to_string(),
358                    field: "(payload)".to_string(),
359                    reason: format!("invalid JSON: {parse_err}"),
360                });
361            }
362        }
363    }
364
365    errors
366}
367
368/// Returns the inner JSON of every `<script type="application/ld+json">`
369/// block. Tolerant of attribute order and whitespace.
370///
371/// Resolves audit items #4 + #5:
372/// - `type` is parsed as a discrete attribute value rather than
373///   substring-matched, so `type="application/ld+json/extra"` no
374///   longer falsely qualifies.
375/// - The `</script>` close finder is JSON-string-aware: a literal
376///   `</script>` *inside* a JSON string value (e.g.
377///   `"description": "code: </script>"`) is correctly skipped over.
378///   The HTML5 spec actually forbids `</script>` inside script
379///   bodies even in strings — most authors escape as `<\/script>`
380///   — but our extractor handles either form gracefully.
381fn extract_jsonld_blocks(html: &str) -> Vec<String> {
382    let mut blocks = Vec::new();
383    let lower = html.to_lowercase();
384    let mut cursor = 0;
385
386    while let Some(rel_open) = lower[cursor..].find("<script") {
387        let abs_open = cursor + rel_open;
388        // Use find_tag_end equivalent: advance past `>` while
389        // skipping any `>` characters that appear inside quoted
390        // attribute values. Without this, `<script type="text/x>y">`
391        // would close prematurely at the inner `>`.
392        let tag_end = find_html_tag_end(&lower, abs_open);
393        let tag = &lower[abs_open..tag_end];
394        cursor = tag_end;
395
396        if !is_jsonld_script_tag(tag) {
397            continue;
398        }
399
400        let Some(close) = find_script_close_skipping_strings(&html[cursor..])
401        else {
402            break;
403        };
404        // Use the original-case slice for the JSON payload —
405        // schema.org values are case-sensitive.
406        blocks.push(html[cursor..cursor + close].trim().to_string());
407        cursor += close + "</script>".len();
408    }
409
410    blocks
411}
412
413/// Returns `true` if the `<script ...>` tag declares
414/// `type="application/ld+json"` exactly (any quoting; no
415/// substring match).
416fn is_jsonld_script_tag(tag: &str) -> bool {
417    extract_attr(tag, "type")
418        .is_some_and(|v| v.eq_ignore_ascii_case("application/ld+json"))
419}
420
/// Extracts the value of an HTML attribute from an open-tag string.
///
/// The attribute name is matched ASCII case-insensitively and must start at
/// a token boundary (start of `tag`, after whitespace, or after `<`), so
/// asking for `type` does not match `data-type=`. Every occurrence of
/// `name=` is considered, not just the first, so a look-alike earlier in
/// the tag does not hide the real attribute.
///
/// The value may be double-quoted, single-quoted, or unquoted (ending at
/// whitespace or `>`); whitespace after the `=` is skipped. The value is
/// returned in its original case.
///
/// Returns `None` if the attribute is absent or its quoted value is never
/// closed.
fn extract_attr(tag: &str, name: &str) -> Option<String> {
    // ASCII-only folding keeps byte offsets identical between `lower` and
    // `tag`; full Unicode lowercasing can change byte lengths and would
    // misalign the slice taken from `tag` below.
    let lower = tag.to_ascii_lowercase();
    let needle = format!("{}=", name.to_ascii_lowercase());
    let bytes = lower.as_bytes();

    let value_start = lower
        .match_indices(needle.as_str())
        .find_map(|(idx, _)| {
            let at_boundary = match idx.checked_sub(1).map(|p| bytes[p]) {
                None => true,
                Some(prev) => prev.is_ascii_whitespace() || prev == b'<',
            };
            at_boundary.then_some(idx + needle.len())
        })?;

    let value = tag[value_start..].trim_start();
    if let Some(quoted) = value.strip_prefix('"') {
        quoted.split_once('"').map(|(inner, _)| inner.to_string())
    } else if let Some(quoted) = value.strip_prefix('\'') {
        quoted.split_once('\'').map(|(inner, _)| inner.to_string())
    } else {
        let end = value
            .find(|c: char| c.is_whitespace() || c == '>')
            .unwrap_or(value.len());
        Some(value[..end].to_string())
    }
}
449
/// Locates the first `</script>` (ASCII case-insensitive) in `body` that
/// is not inside a JSON string literal, returning its byte offset.
///
/// Two flags drive the scan: whether the current byte lies inside a
/// `"..."` string, and whether the previous byte inside that string was
/// the escape character `\` (so `\"` does not end the string). The scan is
/// byte-wise, since every delimiter of interest is ASCII.
///
/// Returns `None` when no such closing tag exists, including when a string
/// literal is never terminated.
fn find_script_close_skipping_strings(body: &str) -> Option<usize> {
    const CLOSE_TAG: &[u8] = b"</script>";

    let raw = body.as_bytes();
    let mut inside_string = false;
    let mut escaped = false;

    for (pos, &byte) in raw.iter().enumerate() {
        if inside_string {
            if escaped {
                // The byte after a backslash is consumed verbatim.
                escaped = false;
            } else {
                match byte {
                    b'\\' => escaped = true,
                    b'"' => inside_string = false,
                    _ => {}
                }
            }
            continue;
        }

        if byte == b'"' {
            inside_string = true;
            continue;
        }

        // `get` yields `None` when fewer than `CLOSE_TAG.len()` bytes
        // remain.
        let window = raw.get(pos..pos + CLOSE_TAG.len());
        if window.is_some_and(|w| w.eq_ignore_ascii_case(CLOSE_TAG)) {
            return Some(pos);
        }
    }

    None
}
490
/// Like `accessibility::find_tag_end`: given the byte offset where an open
/// tag starts, returns the index one past the `>` that closes it.
///
/// A `>` inside a single- or double-quoted attribute value is ignored; a
/// quote is closed only by the same quote character that opened it. If no
/// closing `>` is found, the length of `html` is returned.
fn find_html_tag_end(html: &str, tag_start: usize) -> usize {
    let mut open_quote: Option<u8> = None;

    for (offset, &byte) in html.as_bytes().iter().enumerate().skip(tag_start)
    {
        match (open_quote, byte) {
            // Matching quote: the attribute value ends here.
            (Some(quote), b) if b == quote => open_quote = None,
            // Anything else inside a quoted value is inert.
            (Some(_), _) => {}
            (None, b'"' | b'\'') => open_quote = Some(byte),
            (None, b'>') => return offset + 1,
            (None, _) => {}
        }
    }

    html.len()
}
513
514/// Validates a single parsed JSON-LD value (object or array).
515fn validate_one(
516    value: &serde_json::Value,
517    errors: &mut Vec<JsonLdValidationError>,
518) {
519    // schema.org allows top-level @graph arrays; descend into them.
520    if let Some(graph) = value.get("@graph").and_then(|v| v.as_array()) {
521        for entry in graph {
522            validate_one(entry, errors);
523        }
524        return;
525    }
526
527    // Array at top level — validate each entry.
528    if let Some(array) = value.as_array() {
529        for entry in array {
530            validate_one(entry, errors);
531        }
532        return;
533    }
534
535    let schema_type = value
536        .get("@type")
537        .and_then(|v| v.as_str())
538        .unwrap_or("Unknown")
539        .to_string();
540
541    // Required-field sets aligned with Google's rich-results
542    // requirements (https://developers.google.com/search/docs/appearance/structured-data),
543    // not the broader schema.org vocabulary. schema.org marks many
544    // useful fields as `Recommended` rather than `Required` — this
545    // validator only fires on truly-missing fields the search
546    // engines actually penalise.
547    let required: &[&str] = match schema_type.as_str() {
548        "Article" | "NewsArticle" | "BlogPosting" => {
549            // Google requires headline + datePublished + author +
550            // image for Article rich results.
551            &["headline", "datePublished", "author", "image"]
552        }
553        // WebPage's only hard requirement is `name`. `url` and
554        // `inLanguage` are Recommended but not penalised when
555        // absent — auto-generated stub pages (taxonomy indexes,
556        // 404, offline) routinely omit them.
557        "WebPage" => &["name"],
558        "BreadcrumbList" => &["itemListElement"],
559        "FAQPage" => &["mainEntity"],
560        "LocalBusiness" | "Restaurant" | "Store" => &["name", "address"],
561        "Organization" => &["name", "url"],
562        // Unknown types: don't enforce required fields. Users may ship
563        // custom @types that are still valid schema.org extensions.
564        _ => return,
565    };
566
567    for field in required {
568        match value.get(*field) {
569            None => errors.push(JsonLdValidationError {
570                schema_type: schema_type.clone(),
571                field: (*field).to_string(),
572                reason: "field absent".to_string(),
573            }),
574            Some(serde_json::Value::Null) => {
575                errors.push(JsonLdValidationError {
576                    schema_type: schema_type.clone(),
577                    field: (*field).to_string(),
578                    reason: "field is null".to_string(),
579                });
580            }
581            Some(serde_json::Value::String(s)) if s.trim().is_empty() => {
582                errors.push(JsonLdValidationError {
583                    schema_type: schema_type.clone(),
584                    field: (*field).to_string(),
585                    reason: "field is empty string".to_string(),
586                });
587            }
588            Some(serde_json::Value::Array(a)) if a.is_empty() => {
589                errors.push(JsonLdValidationError {
590                    schema_type: schema_type.clone(),
591                    field: (*field).to_string(),
592                    reason: "array is empty".to_string(),
593                });
594            }
595            _ => {}
596        }
597    }
598
599    // BreadcrumbList: itemListElement entries should each be ListItem
600    // with a `position` and `name`. Catch the most common regression.
601    if schema_type == "BreadcrumbList" {
602        if let Some(items) =
603            value.get("itemListElement").and_then(|v| v.as_array())
604        {
605            for (idx, item) in items.iter().enumerate() {
606                if item.get("position").is_none() {
607                    errors.push(JsonLdValidationError {
608                        schema_type: schema_type.clone(),
609                        field: format!("itemListElement[{idx}].position"),
610                        reason: "ListItem missing position".to_string(),
611                    });
612                }
613                if item.get("name").is_none() && item.get("item").is_none() {
614                    errors.push(JsonLdValidationError {
615                        schema_type: schema_type.clone(),
616                        field: format!("itemListElement[{idx}].name|item"),
617                        reason: "ListItem missing name and item".to_string(),
618                    });
619                }
620            }
621        }
622    }
623}
624
625#[cfg(test)]
626#[allow(clippy::unwrap_used, clippy::expect_used)]
627mod tests {
628    use super::*;
629    use std::path::Path;
630    use tempfile::tempdir;
631
632    fn ctx(site: &Path) -> PluginContext {
633        PluginContext::new(
634            Path::new("content"),
635            Path::new("build"),
636            site,
637            Path::new("templates"),
638        )
639    }
640
641    fn cfg() -> JsonLdConfig {
642        JsonLdConfig {
643            base_url: "https://example.com".to_string(),
644            org_name: "Example Org".to_string(),
645            breadcrumbs: true,
646        }
647    }
648
649    #[test]
650    fn name_is_stable() {
651        let p = JsonLdPlugin::new(cfg());
652        assert_eq!(p.name(), "json-ld");
653    }
654
655    #[test]
656    fn from_site_constructs_with_breadcrumbs_enabled() {
657        let p = JsonLdPlugin::from_site("https://x.example", "X");
658        assert_eq!(p.config.base_url, "https://x.example");
659        assert_eq!(p.config.org_name, "X");
660        assert!(p.config.breadcrumbs);
661    }
662
663    // ── build_article_jsonld ───────────────────────────────────
664
665    #[test]
666    fn article_includes_author_when_provided() {
667        let v = build_article_jsonld(
668            "T",
669            "D",
670            "https://x/p",
671            "Org",
672            "Jane",
673            "",
674            None,
675            None,
676            "en",
677        );
678        assert_eq!(v["author"]["name"], "Jane");
679        assert_eq!(v["author"]["@type"], "Person");
680    }
681
682    #[test]
683    fn article_omits_author_when_empty() {
684        let v = build_article_jsonld(
685            "T",
686            "D",
687            "https://x/p",
688            "Org",
689            "",
690            "",
691            None,
692            None,
693            "en",
694        );
695        assert!(v.get("author").is_none());
696    }
697
698    #[test]
699    fn article_includes_image_when_url_present() {
700        let v = build_article_jsonld(
701            "T",
702            "D",
703            "https://x/p",
704            "Org",
705            "",
706            "https://x/img.png",
707            None,
708            None,
709            "en",
710        );
711        assert_eq!(v["image"]["@type"], "ImageObject");
712        assert_eq!(v["image"]["url"], "https://x/img.png");
713    }
714
715    #[test]
716    fn article_uses_date_published_for_date_modified_fallback() {
717        let dp = "2025-01-01".to_string();
718        let v = build_article_jsonld(
719            "T",
720            "D",
721            "https://x/p",
722            "Org",
723            "",
724            "",
725            Some(&dp),
726            None,
727            "en",
728        );
729        assert_eq!(v["datePublished"], "2025-01-01");
730        assert_eq!(
731            v["dateModified"], "2025-01-01",
732            "missing dateModified should fall back to datePublished"
733        );
734    }
735
736    #[test]
737    fn article_keeps_distinct_date_modified() {
738        let dp = "2025-01-01".to_string();
739        let dm = "2025-06-15".to_string();
740        let v = build_article_jsonld(
741            "T",
742            "D",
743            "https://x/p",
744            "Org",
745            "",
746            "",
747            Some(&dp),
748            Some(&dm),
749            "en",
750        );
751        assert_eq!(v["datePublished"], "2025-01-01");
752        assert_eq!(v["dateModified"], "2025-06-15");
753    }
754
755    #[test]
756    fn article_defaults_lang_to_en_when_empty() {
757        let v = build_article_jsonld(
758            "T",
759            "D",
760            "https://x/p",
761            "Org",
762            "",
763            "",
764            None,
765            None,
766            "",
767        );
768        assert_eq!(v["inLanguage"], "en");
769    }
770
771    // ── build_webpage_jsonld ───────────────────────────────────
772
773    #[test]
774    fn webpage_includes_author_image_date_when_present() {
775        let dp = "2025-01-01".to_string();
776        let v = build_webpage_jsonld(
777            "T",
778            "D",
779            "https://x/p",
780            "Jane",
781            "https://x/i.png",
782            Some(&dp),
783            "fr",
784        );
785        assert_eq!(v["@type"], "WebPage");
786        assert_eq!(v["author"]["name"], "Jane");
787        assert_eq!(v["image"]["url"], "https://x/i.png");
788        assert_eq!(v["datePublished"], "2025-01-01");
789        assert_eq!(v["inLanguage"], "fr");
790    }
791
792    #[test]
793    fn webpage_omits_optional_fields_when_empty() {
794        let v = build_webpage_jsonld("T", "D", "https://x/p", "", "", None, "");
795        assert!(v.get("author").is_none());
796        assert!(v.get("image").is_none());
797        assert!(v.get("datePublished").is_none());
798        assert_eq!(v["inLanguage"], "en");
799    }
800
801    // ── build_breadcrumb_jsonld ────────────────────────────────
802
803    #[test]
804    fn breadcrumb_returns_none_for_root_path() {
805        // Just `index.html` (or empty path) → no breadcrumb chain.
806        assert!(build_breadcrumb_jsonld("https://x", "/").is_none());
807        assert!(build_breadcrumb_jsonld("https://x", "index.html").is_none());
808    }
809
810    #[test]
811    fn breadcrumb_builds_chain_for_nested_path() {
812        let v = build_breadcrumb_jsonld("https://x", "blog/my-post/index.html")
813            .expect("should produce breadcrumb for nested path");
814        assert_eq!(v["@type"], "BreadcrumbList");
815        let items = v["itemListElement"].as_array().unwrap();
816        assert_eq!(items.len(), 3); // Home + blog + my-post
817        assert_eq!(items[0]["name"], "Home");
818        assert_eq!(items[1]["name"], "blog");
819        assert_eq!(items[2]["name"], "my post"); // hyphens → spaces
820    }
821
822    #[test]
823    fn breadcrumb_handles_html_extension_in_part_name() {
824        let v = build_breadcrumb_jsonld("https://x", "page.html").unwrap();
825        let items = v["itemListElement"].as_array().unwrap();
826        assert_eq!(items.len(), 2);
827        assert_eq!(items[1]["name"], "page");
828    }
829
830    // ── build_jsonld_scripts ───────────────────────────────────
831
832    #[test]
833    fn build_scripts_picks_article_when_article_tag_present() {
834        let html = r#"<html><head><title>Post</title></head>
835            <body><article>content</article></body></html>"#;
836        let scripts =
837            build_jsonld_scripts(html, "https://x", "p/", "Org", false);
838        assert_eq!(scripts[0]["@type"], "Article");
839    }
840
841    #[test]
842    fn build_scripts_picks_webpage_when_no_article_tag() {
843        let html = "<html><head><title>P</title></head><body>x</body></html>";
844        let scripts =
845            build_jsonld_scripts(html, "https://x", "p/", "Org", false);
846        assert_eq!(scripts[0]["@type"], "WebPage");
847    }
848
849    #[test]
850    fn build_scripts_includes_breadcrumb_when_enabled() {
851        let html = "<html><head><title>P</title></head><body>x</body></html>";
852        let scripts =
853            build_jsonld_scripts(html, "https://x", "blog/post/", "Org", true);
854        assert!(
855            scripts.iter().any(|s| s["@type"] == "BreadcrumbList"),
856            "breadcrumb should be present when enabled and path nested"
857        );
858    }
859
860    #[test]
861    fn build_scripts_skips_breadcrumb_when_disabled() {
862        let html = "<html><head><title>P</title></head><body>x</body></html>";
863        let scripts =
864            build_jsonld_scripts(html, "https://x", "blog/post/", "Org", false);
865        assert!(!scripts.iter().any(|s| s["@type"] == "BreadcrumbList"));
866    }
867
868    // ── after_compile end-to-end ───────────────────────────────
869
870    #[test]
871    fn after_compile_no_op_when_site_missing() {
872        let dir = tempdir().unwrap();
873        let nope = dir.path().join("nope");
874        JsonLdPlugin::new(cfg()).after_compile(&ctx(&nope)).unwrap();
875    }
876
877    #[test]
878    fn transform_html_injects_jsonld() {
879        let dir = tempdir().unwrap();
880        let c = ctx(dir.path());
881        let html = "<html><head><title>X</title></head><body>x</body></html>";
882        let page_path = dir.path().join("index.html");
883        let after = JsonLdPlugin::new(cfg())
884            .transform_html(html, &page_path, &c)
885            .unwrap();
886        assert!(after.contains("application/ld+json"));
887        assert!(after.contains("\"@type\":\"WebPage\""));
888    }
889
890    #[test]
891    fn transform_html_skips_existing_jsonld() {
892        let dir = tempdir().unwrap();
893        let c = ctx(dir.path());
894        let html = r#"<html><head><script type="application/ld+json">{"@type":"X"}</script><title>X</title></head></html>"#;
895        let page_path = dir.path().join("p.html");
896        let after = JsonLdPlugin::new(cfg())
897            .transform_html(html, &page_path, &c)
898            .unwrap();
899        // Only one JSON-LD block — no duplicate injected.
900        assert_eq!(after.matches("application/ld+json").count(), 1);
901        assert!(after.contains(r#"{"@type":"X"}"#));
902    }
903
904    #[test]
905    fn transform_html_skips_without_head_tag() {
906        let dir = tempdir().unwrap();
907        let c = ctx(dir.path());
908        let raw = "<!doctype html><html><body>only</body></html>";
909        let page_path = dir.path().join("frag.html");
910        let after = JsonLdPlugin::new(cfg())
911            .transform_html(raw, &page_path, &c)
912            .unwrap();
913        assert_eq!(after, raw);
914    }
915
916    // ── JSON-LD validation (issue #467) ────────────────────────────
917
/// A `WebPage` block carrying `name`, `url` and `inLanguage` yields no
/// validation errors.
#[test]
fn validate_extracts_block() {
    let page = r#"<html><head>
            <script type="application/ld+json">
            {"@context":"https://schema.org","@type":"WebPage",
             "name":"Hi","url":"https://x.test/","inLanguage":"en"}
            </script></head><body></body></html>"#;
    let violations = validate_jsonld(page);
    assert!(violations.is_empty());
}
927
/// An `Article` that omits `image` must produce a violation whose
/// `schema_type` is `Article` and whose `field` is `image`.
#[test]
fn validate_flags_missing_required_field_on_article() {
    let markup = r#"<script type="application/ld+json">
            {"@context":"https://schema.org","@type":"Article",
             "headline":"H","datePublished":"2026-05-10","author":"A"}
        </script>"#;
    let errs = validate_jsonld(markup);

    // Look for the specific violation rather than pinning the total count.
    let image_violation = errs
        .iter()
        .find(|e| e.schema_type == "Article" && e.field == "image");
    assert!(
        image_violation.is_some(),
        "expected Article.image violation, got {errs:?}"
    );
}
941
/// A `BreadcrumbList` whose `itemListElement` is an empty array must be
/// reported against the `itemListElement` field.
#[test]
fn validate_flags_empty_breadcrumb_list() {
    let markup = r#"<script type="application/ld+json">
            {"@context":"https://schema.org","@type":"BreadcrumbList",
             "itemListElement":[]}
        </script>"#;
    let errs = validate_jsonld(markup);

    let empty_list_reported = errs.iter().any(|e| e.field == "itemListElement");
    assert!(
        empty_list_reported,
        "expected itemListElement empty-array error, got {errs:?}"
    );
}
954
/// A breadcrumb entry without `position` must be reported with the
/// indexed field path `itemListElement[0].position`.
#[test]
fn validate_breadcrumb_listitem_missing_position() {
    let markup = r#"<script type="application/ld+json">
            {"@type":"BreadcrumbList",
             "itemListElement":[{"name":"Home","item":"https://x/"}]}
        </script>"#;
    let errs = validate_jsonld(markup);

    let position_reported = errs
        .iter()
        .find(|e| e.field == "itemListElement[0].position")
        .is_some();
    assert!(
        position_reported,
        "expected position-missing error, got {errs:?}"
    );
}
968
/// A script body that is not valid JSON yields exactly one error,
/// tagged with the `Unparseable` schema type.
#[test]
fn validate_unparseable_json() {
    let markup = r#"<script type="application/ld+json">{not json}</script>"#;
    let report = validate_jsonld(markup);

    assert_eq!(report.len(), 1);
    let sole = &report[0];
    assert_eq!(sole.schema_type, "Unparseable");
}
976
/// Nodes nested under `@graph` are validated too: an `Article` there
/// that only supplies `headline` must be flagged for each of the other
/// required fields.
#[test]
fn validate_descends_into_graph() {
    let markup = r#"<script type="application/ld+json">
            {"@context":"https://schema.org","@graph":[
                {"@type":"Article","headline":"H"}
            ]}
        </script>"#;
    let errs = validate_jsonld(markup);

    // True when an Article violation exists for the named field.
    let article_flags = |field: &str| {
        errs.iter()
            .any(|e| e.schema_type == "Article" && e.field == field)
    };

    // headline was provided; the remaining three required fields were not.
    assert!(article_flags("datePublished"));
    assert!(article_flags("author"));
    assert!(article_flags("image"));
}
999
/// A block with the type `CustomThing` produces no validation errors.
#[test]
fn validate_unknown_type_passes_through() {
    let markup = r#"<script type="application/ld+json">
            {"@type":"CustomThing","foo":"bar"}
        </script>"#;
    let violations = validate_jsonld(markup);
    assert!(violations.is_empty());
}
1007
/// Two JSON-LD scripts in one document are validated independently:
/// the complete `Organization` is clean while the sparse `Article`
/// accumulates violations.
#[test]
fn validate_handles_multiple_blocks() {
    let markup = r#"
            <script type="application/ld+json">{"@type":"Organization","name":"O","url":"https://o/"}</script>
            <script type="application/ld+json">{"@type":"Article","headline":"H"}</script>
        "#;
    let errs = validate_jsonld(markup);

    // Number of reported violations attributed to a given schema type.
    let violations_for =
        |kind: &str| errs.iter().filter(|e| e.schema_type == kind).count();

    // Organization supplies everything it needs.
    assert_eq!(violations_for("Organization"), 0);
    // Article only has headline, so at least the other three fields fire.
    assert!(violations_for("Article") >= 3);
}
1026
1027    // ── Strict type-attribute parsing (audit fix item #4) ──────────
1028
/// A script whose `type` is `application/ld+json/extra` is not JSON-LD
/// and must be ignored by the validator.
#[test]
fn validate_skips_extra_qualified_type() {
    // A substring test on the tag (`contains("application/ld+json")`)
    // would wrongly accept this attribute value; the bare Article inside
    // would then raise required-field errors.
    let markup = r#"<script type="application/ld+json/extra">
            {"@type":"Article"}
        </script>"#;
    let violations = validate_jsonld(markup);
    assert!(
        violations.is_empty(),
        "non-JSON-LD type must not be validated"
    );
}
1042
/// A single-quoted `type='application/ld+json'` attribute wrapping a
/// complete `Organization` yields no validation errors.
#[test]
fn validate_recognises_type_with_single_quotes() {
    let markup = r#"<script type='application/ld+json'>
            {"@type":"Organization","name":"O","url":"https://o/"}
        </script>"#;
    let violations = validate_jsonld(markup);
    assert!(violations.is_empty());
}
1050
/// A `type` attribute that follows another attribute (`id`) on the
/// script tag, wrapping a complete `Organization`, yields no errors.
#[test]
fn validate_recognises_type_after_other_attrs() {
    let markup = r#"<script id="ld1" type="application/ld+json">
            {"@type":"Organization","name":"O","url":"https://o/"}
        </script>"#;
    let violations = validate_jsonld(markup);
    assert!(violations.is_empty());
}
1058
1059    // ── String-literal-aware </script> finder (audit fix item #5) ──
1060
/// A literal `</script>` inside a JSON string value must not end the
/// block early; the payload has to reach the parser intact.
#[test]
fn validate_handles_close_script_inside_json_string() {
    // Cutting the block at the first `</script>` would hand the parser
    // truncated JSON. Only a closing tag outside a JSON string may
    // terminate the block.
    let markup = r#"<script type="application/ld+json">
            {"@type":"Article",
             "headline":"H","datePublished":"2026-01-01",
             "author":"A","image":"https://x/i.png",
             "description":"this contains a </script> inside the string and is still valid JSON"}
        </script>"#;
    let errs = validate_jsonld(markup);

    // All four required Article fields are supplied, so the only way a
    // truncation bug could show up is as an `Unparseable` entry.
    let saw_parse_failure = errs.iter().any(|e| e.schema_type == "Unparseable");
    assert!(
        !saw_parse_failure,
        "no parse errors expected, got {errs:?}"
    );
}
1082
/// Asking for `type` on a tag that only carries `src` returns `None`.
#[test]
fn extract_attr_returns_none_when_attribute_absent() {
    let found = extract_attr("<script src=x>", "type");
    assert_eq!(found, None);
}
1087
/// A double-quoted attribute value is returned without its quotes.
#[test]
fn extract_attr_handles_double_quoted_value() {
    let tag = r#"<script type="application/ld+json">"#;
    let expected = Some(String::from("application/ld+json"));
    assert_eq!(extract_attr(tag, "type"), expected);
}
1095
/// An attribute whose name merely ends in `type` (`data-mytype`) must
/// not satisfy a lookup for `type`.
#[test]
fn extract_attr_rejects_substring_match_in_other_attribute() {
    let tag = r#"<script data-mytype="foo">"#;
    let found = extract_attr(tag, "type");
    assert_eq!(found, None);
}
1101}
ssg/seo/jsonld.rs

ssg/seo/
jsonld.rs