Skip to main content

ssg/postprocess/
html_fix.rs

1// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! HTML fix plugin.
5
6use super::helpers::rfc2822_to_iso8601;
7use crate::plugin::{Plugin, PluginContext};
8use anyhow::Result;
9use std::path::Path;
10
/// Post-processing plugin that repairs common defects in generated HTML.
///
/// Every page handed to `transform_html` is passed through
/// `apply_html_fixes`, which applies the following repairs (each one only
/// when its cheap pre-check matches):
/// - Fix 7: Upgrades JSON-LD `@context` from `http://schema.org/` to
///   `https://schema.org`.
/// - Converts JSON-LD `datePublished` / `dateModified` values from
///   RFC 2822 to ISO 8601.
/// - Fix 9: Repairs broken `.class=` image syntax where `<p` is
///   injected into `<img>` tags, and turns literal `.class="..."` text
///   into a real `class` attribute.
/// - Adds a `mobile-web-app-capable` meta tag next to the legacy
///   `apple-mobile-web-app-capable` one when only the latter is present.
/// - Removes `<link rel="preload">` tags whose `href` is empty or missing.
#[derive(Debug, Clone, Copy)]
pub struct HtmlFixPlugin;
18
impl Plugin for HtmlFixPlugin {
    /// Returns the plugin identifier, `"html-fix"`.
    fn name(&self) -> &'static str {
        "html-fix"
    }

    /// Always returns `true`.
    ///
    /// NOTE(review): presumably this tells the plugin host to call
    /// `transform_html` for each page — confirm against the `Plugin` trait.
    fn has_transform(&self) -> bool {
        true
    }

    /// Runs all HTML fixes over `html` and returns the repaired page.
    ///
    /// The page path and the plugin context are not consulted, and this
    /// implementation never returns `Err`.
    fn transform_html(
        &self,
        html: &str,
        _path: &Path,
        _ctx: &PluginContext,
    ) -> Result<String> {
        Ok(apply_html_fixes(html))
    }

    /// No-op: all work happens per page in `transform_html`.
    fn after_compile(&self, _ctx: &PluginContext) -> Result<()> {
        Ok(())
    }
}
41
42/// Applies all HTML fixes to a single page and returns the modified content.
43fn apply_html_fixes(html: &str) -> String {
44    let mut modified = html.to_string();
45
46    if needs_schema_context_fix(&modified) {
47        modified = modified
48            .replace("\"http://schema.org/\"", "\"https://schema.org\"")
49            .replace("\"http://schema.org\"", "\"https://schema.org\"");
50    }
51
52    if modified.contains("application/ld+json") {
53        modified = fix_jsonld_dates(&modified);
54    }
55
56    if modified.contains("<p src=") {
57        modified = fix_broken_img_tags(&modified);
58    }
59
60    if needs_class_syntax_fix(&modified) {
61        modified = fix_literal_class_syntax(&modified);
62    }
63
64    if needs_mobile_web_app_capable_meta(&modified) {
65        modified = inject_mobile_web_app_capable_meta(&modified);
66    }
67
68    if has_empty_preload(&modified) {
69        modified = remove_empty_preload_links(&modified);
70    }
71
72    modified
73}
74
/// Reports whether the page still carries a quoted plain-HTTP schema.org
/// context (with or without the trailing slash) that should be upgraded.
fn needs_schema_context_fix(html: &str) -> bool {
    const LEGACY_CONTEXTS: [&str; 2] =
        ["\"http://schema.org/\"", "\"http://schema.org\""];
    LEGACY_CONTEXTS
        .iter()
        .any(|legacy| html.contains(*legacy))
}
80
/// Reports whether the page contains a literal `.class=` marker, either
/// with HTML-encoded (`&quot;`) or plain double quotes.
fn needs_class_syntax_fix(html: &str) -> bool {
    const MARKERS: [&str; 2] = [".class=&quot;", ".class=\""];
    MARKERS.iter().any(|marker| html.contains(*marker))
}
85
/// Cheap document-level pre-check for `<link rel="preload">` tags whose
/// `href` is empty or absent (Chrome logs
/// "<link rel=preload> has an invalid href value" for these).
///
/// Returns `true` when the document mentions a preload `rel` *anywhere*
/// and also contains an empty or bare `href` *anywhere*. The two signals
/// need not belong to the same tag, so false positives are possible; they
/// only cost a run of `remove_empty_preload_links`, which does the precise
/// per-tag work.
fn has_empty_preload(html: &str) -> bool {
    const PRELOAD_MARKERS: [&str; 3] =
        ["rel=preload", "rel=\"preload\"", "rel='preload'"];
    // Empty quoted values, plus a bare `href` followed by space, `>` or `/>`.
    const EMPTY_HREF_MARKERS: [&str; 5] =
        ["href=\"\"", "href=''", " href ", " href>", " href/>"];

    let mentions_any =
        |markers: &[&str]| markers.iter().any(|m| html.contains(*m));

    mentions_any(&PRELOAD_MARKERS) && mentions_any(&EMPTY_HREF_MARKERS)
}
106
107/// Removes any `<link>` tag that declares `rel="preload"` and has an empty
108/// or missing `href`. Idempotent.
109pub(super) fn remove_empty_preload_links(html: &str) -> String {
110    let mut out = String::with_capacity(html.len());
111    let mut cursor = 0;
112    while cursor < html.len() {
113        // Find the next `<link` (case-insensitive) starting at cursor.
114        let Some(rel_offset) =
115            html[cursor..].to_ascii_lowercase().find("<link")
116        else {
117            out.push_str(&html[cursor..]);
118            break;
119        };
120        let tag_start = cursor + rel_offset;
121        out.push_str(&html[cursor..tag_start]);
122
123        // Walk forward to the closing `>`, respecting quoted attribute values.
124        let bytes = html.as_bytes();
125        let mut j = tag_start;
126        let mut quote: Option<u8> = None;
127        while j < bytes.len() {
128            let b = bytes[j];
129            match quote {
130                Some(q) if b == q => quote = None,
131                Some(_) => {}
132                None => match b {
133                    b'"' | b'\'' => quote = Some(b),
134                    b'>' => break,
135                    _ => {}
136                },
137            }
138            j += 1;
139        }
140        let tag_end = (j + 1).min(html.len());
141        let tag = &html[tag_start..tag_end];
142        let lower = tag.to_ascii_lowercase();
143        let is_preload = lower.contains("rel=\"preload\"")
144            || lower.contains("rel='preload'")
145            || lower.contains("rel=preload");
146        let has_real_href = href_is_present_and_non_empty(&lower);
147        // Drop only empty-href preload tags; keep everything else.
148        if !is_preload || has_real_href {
149            out.push_str(tag);
150        }
151        cursor = tag_end;
152    }
153    out
154}
155
156/// Returns `true` if a (lowercased) tag string has a `href` attribute that
157/// is present and non-empty. Tolerates double, single, and unquoted forms.
158fn href_is_present_and_non_empty(lower_tag: &str) -> bool {
159    if lower_tag.contains("href=\"\"") || lower_tag.contains("href=''") {
160        return false;
161    }
162    let Some(idx) = lower_tag.find("href") else {
163        return false;
164    };
165    // Must be followed by `=`, possibly with surrounding whitespace.
166    let after = lower_tag[idx + 4..].trim_start();
167    let Some(rest) = after.strip_prefix('=') else {
168        return false;
169    };
170    let rest = rest.trim_start();
171    match rest.chars().next() {
172        None | Some('>') => false,
173        Some('"') => rest.len() > 1 && !rest.starts_with("\"\""),
174        Some('\'') => rest.len() > 1 && !rest.starts_with("''"),
175        Some(c) if c.is_whitespace() => false,
176        Some(_) => true,
177    }
178}
179
180/// Returns `true` if the HTML emits the legacy
181/// `apple-mobile-web-app-capable` meta but lacks the modern
182/// `mobile-web-app-capable` meta that Chrome now requires. Tolerates
183/// quoted, single-quoted, or unquoted attribute values (post-minify HTML
184/// often drops quotes around short values like `yes`).
185fn needs_mobile_web_app_capable_meta(html: &str) -> bool {
186    let has_legacy = html.contains("apple-mobile-web-app-capable");
187    let has_modern = find_modern_mobile_web_app_capable(html).is_some();
188    has_legacy && !has_modern
189}
190
/// Locates a `name=` attribute whose value is the modern
/// `mobile-web-app-capable`, in any of the three quoting styles.
///
/// Each needle starts with `name=` (plus an optional quote) immediately
/// followed by `mobile-…`, so the legacy `apple-mobile-web-app-capable`
/// name can never match. Needles are tried in the order double-quoted,
/// single-quoted, unquoted; the byte offset of the first needle that
/// occurs is returned, which is not necessarily the earliest match in
/// the document. Returns `None` when no needle occurs.
fn find_modern_mobile_web_app_capable(html: &str) -> Option<usize> {
    const MODERN_NAME_ATTRS: [&str; 3] = [
        "name=\"mobile-web-app-capable\"",
        "name='mobile-web-app-capable'",
        "name=mobile-web-app-capable",
    ];
    MODERN_NAME_ATTRS
        .iter()
        .find_map(|attr| html.find(*attr))
}
209
210/// Injects `<meta name="mobile-web-app-capable" content="yes">` immediately
211/// after the legacy Apple variant so installed-PWA support works in Chrome
212/// without console deprecation warnings. Handles minified HTML where the
213/// `name=` attribute may be unquoted and may appear after `content=`.
214pub(super) fn inject_mobile_web_app_capable_meta(html: &str) -> String {
215    let modern = "<meta name=\"mobile-web-app-capable\" content=\"yes\">";
216    // Find the apple-variant attribute name. Tolerate quoted/unquoted forms.
217    let candidates = [
218        "name=\"apple-mobile-web-app-capable\"",
219        "name='apple-mobile-web-app-capable'",
220        "name=apple-mobile-web-app-capable",
221    ];
222    let name_pos = candidates.iter().find_map(|n| html.find(n));
223    let Some(name_pos) = name_pos else {
224        return html.to_string();
225    };
226    // Walk forward to the next `>` that closes this <meta> tag.
227    let after = &html[name_pos..];
228    let Some(rel_close) = after.find('>') else {
229        return html.to_string();
230    };
231    let insert_at = name_pos + rel_close + 1;
232    format!("{}{modern}{}", &html[..insert_at], &html[insert_at..])
233}
234
235/// Fix JSON-LD date fields from RFC 2822 to ISO 8601.
236pub(super) fn fix_jsonld_dates(html: &str) -> String {
237    let mut result = html.to_string();
238
239    // Match "datePublished":"..." and "dateModified":"..." patterns
240    for field in &["datePublished", "dateModified"] {
241        let pattern = format!("\"{field}\":\"");
242        let mut search_from = 0;
243        while let Some(start) = result[search_from..].find(&pattern) {
244            let abs_start = search_from + start + pattern.len();
245            if let Some(end) = result[abs_start..].find('"') {
246                let date_str = &result[abs_start..abs_start + end];
247                // Only convert if it looks like RFC 2822 (starts with
248                // a day abbreviation like "Mon," "Tue,", etc.)
249                if date_str.len() > 5
250                    && date_str.as_bytes()[3] == b','
251                    && date_str.as_bytes()[0].is_ascii_alphabetic()
252                {
253                    let iso = rfc2822_to_iso8601(date_str);
254                    if iso != date_str {
255                        result = format!(
256                            "{}{}{}",
257                            &result[..abs_start],
258                            iso,
259                            &result[abs_start + end..]
260                        );
261                    }
262                }
263                search_from = abs_start + 1;
264            } else {
265                break;
266            }
267        }
268    }
269
270    result
271}
272
273/// Repair broken `<img ... <p src="...">` patterns by reconstructing
274/// valid `<img>` tags.
275pub(super) fn fix_broken_img_tags(html: &str) -> String {
276    let mut result = html.to_string();
277    // Pattern: <img ... <p src="URL">
278    // Replace with: <img ... src="URL" />
279    while let Some(p_pos) = result.find("<p src=") {
280        // Look backwards for the <img tag start
281        let before = &result[..p_pos];
282        if let Some(img_start) = before.rfind("<img") {
283            // Extract the src value from <p src="...">
284            let after_p = &result[p_pos..]; // includes "<p src="
285            if let Some(quote_start) = after_p.find("src=\"") {
286                let val_start = quote_start + 5; // skip src="
287                let remaining = &after_p[val_start..];
288                if let Some(quote_end) = remaining.find('"') {
289                    let src_value = remaining[..quote_end].to_string();
290                    // Find the closing > of this broken tag
291                    let close_offset = remaining[quote_end..]
292                        .find('>')
293                        .map_or(result.len(), |i| {
294                            p_pos + val_start + quote_end + i + 1
295                        });
296
297                    // Extract existing attributes from the img tag portion
298                    let img_attrs = result[img_start + 4..p_pos].trim();
299                    let img_attrs_clean =
300                        img_attrs.trim_end_matches(|c: char| {
301                            c.is_whitespace() || c == '<'
302                        });
303
304                    let new_img = format!(
305                        "<img {img_attrs_clean} src=\"{src_value}\" />"
306                    );
307                    result = format!(
308                        "{}{}{}",
309                        &result[..img_start],
310                        new_img,
311                        &result[close_offset..]
312                    );
313                    continue;
314                }
315            }
316        }
317        // If we can't parse, skip to avoid infinite loop
318        break;
319    }
320    result
321}
322
323/// Remove literal `.class=&quot;...&quot;` or `.class="..."` from HTML
324/// and apply them as actual class attributes.
325pub(super) fn fix_literal_class_syntax(html: &str) -> String {
326    let mut result = html.to_string();
327
328    // Handle .class=&quot;...&quot; (HTML-encoded quotes)
329    result = fix_class_syntax_variant(&result, ".class=&quot;", "&quot;");
330    // Handle .class="..." (literal quotes)
331    result = fix_class_syntax_variant(&result, ".class=\"", "\"");
332
333    result
334}
335
336/// Handles one variant of the `.class=` syntax fix.
337fn fix_class_syntax_variant(
338    html: &str,
339    open_pattern: &str,
340    close_pattern: &str,
341) -> String {
342    let mut result = html.to_string();
343    while let Some(start) = result.find(open_pattern) {
344        let after = &result[start + open_pattern.len()..];
345        if let Some(end) = after.find(close_pattern) {
346            let class_value = after[..end].to_string();
347            let remove_end =
348                start + open_pattern.len() + end + close_pattern.len();
349            result = format!("{}{}", &result[..start], &result[remove_end..]);
350            inject_class_attr(&mut result, start, &class_value);
351        } else {
352            break;
353        }
354    }
355    result
356}
357
/// Injects a class attribute into the nearest preceding tag if it doesn't already have one.
///
/// The "nearest preceding tag" is the span from the last `<` up to the
/// last `>` found before byte offset `pos`. Nothing happens when no such
/// tag exists or when the tag already contains `class=`.
///
/// For self-closing tags (`<img ... />`, `<br/>`) the attribute is placed
/// *before* the closing slash, producing `<img ... class="x" />` rather
/// than the malformed `<img ... / class="x">`.
///
/// # Panics
///
/// Panics if `pos` is past the end of `html` or not on a character
/// boundary.
fn inject_class_attr(html: &mut String, pos: usize, class_value: &str) {
    let Some(tag_end) = html[..pos].rfind('>') else {
        return;
    };
    let Some(tag_start) = html[..tag_end].rfind('<') else {
        return;
    };

    let tag = &html[tag_start..tag_end];
    if tag.contains("class=") {
        return;
    }

    let insert_at = tag_start + class_attr_offset(tag);
    html.insert_str(insert_at, &format!(" class=\"{class_value}\""));
}

/// Returns the byte offset within `tag` (the text from `<` up to, but not
/// including, `>`) at which a new attribute should be inserted.
///
/// This is the end of the tag, except for self-closing tags, where it is
/// the position just before the trailing slash and the whitespace in
/// front of it. A trailing slash that belongs to an unquoted attribute
/// value (`<a href=/docs/>`) is not a self-closing slash and is left
/// alone.
fn class_attr_offset(tag: &str) -> usize {
    let Some(body) = tag.trim_end().strip_suffix('/') else {
        return tag.len();
    };

    let follows_gap_or_quote = body
        .ends_with(|c: char| c.is_whitespace() || c == '"' || c == '\'');
    let last_token = body.rsplit(char::is_whitespace).next().unwrap_or(body);
    if !follows_gap_or_quote && last_token.contains('=') {
        // The slash is the tail of an unquoted attribute value.
        return tag.len();
    }

    body.trim_end().len()
}
375
// Unit tests for the HTML fix plugin: two end-to-end checks through
// `HtmlFixPlugin::transform_html`, followed by focused tests for the
// individual fixer functions.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use crate::plugin::PluginContext;
    use std::path::Path;
    use tempfile::tempdir;

    /// Builds a `PluginContext` with fixed placeholder paths and the given
    /// site directory, after initialising the shared test logger.
    fn test_ctx(site_dir: &Path) -> PluginContext {
        crate::test_support::init_logger();
        PluginContext::new(
            Path::new("content"),
            Path::new("build"),
            site_dir,
            Path::new("templates"),
        )
    }

    // End-to-end: a plain-HTTP schema.org context is upgraded to HTTPS.
    #[test]
    fn test_html_fix_upgrades_jsonld_context() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());

        let html = r#"<html><head>
<script type="application/ld+json">
{"@context":"http://schema.org/","@type":"WebPage"}
</script>
</head><body></body></html>"#;

        let result = HtmlFixPlugin.transform_html(
            html,
            Path::new("index.html"),
            &ctx,
        )?;
        assert!(result.contains("\"https://schema.org\""));
        assert!(!result.contains("\"http://schema.org/\""));
        Ok(())
    }

    // End-to-end: both JSON-LD date fields are converted to ISO 8601.
    #[test]
    fn test_html_fix_converts_jsonld_dates() -> Result<()> {
        let tmp = tempdir()?;
        let ctx = test_ctx(tmp.path());

        let html = r#"<html><head>
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"Article","datePublished":"Thu, 11 Apr 2026 06:06:06 +0000","dateModified":"Mon, 01 Sep 2025 06:06:06 +0000"}
</script>
</head><body></body></html>"#;

        let result = HtmlFixPlugin.transform_html(
            html,
            Path::new("article.html"),
            &ctx,
        )?;
        assert!(
            result.contains("\"datePublished\":\"2026-04-11"),
            "Expected ISO date, got: {result}"
        );
        assert!(
            result.contains("\"dateModified\":\"2025-09-01"),
            "Expected ISO date, got: {result}"
        );
        assert!(!result.contains("Thu, 11 Apr"));
        Ok(())
    }

    // A `<p src=...>` injected into an `<img>` is folded back into the tag.
    #[test]
    fn test_fix_broken_img_tags() {
        let input =
            r#"<img alt="test" class="w-25" title="test" <p src="image.jpg">"#;
        let result = fix_broken_img_tags(input);
        assert!(result.contains("src=\"image.jpg\""));
        assert!(!result.contains("<p src="));
    }

    // The HTML-encoded `.class=` marker is removed from the output.
    #[test]
    fn test_fix_literal_class_syntax() {
        let input = r#"<img alt="test" src="img.jpg">.class=&quot;w-25 float-start&quot;"#;
        let result = fix_literal_class_syntax(input);
        assert!(!result.contains(".class=&quot;"));
    }

    // -----------------------------------------------------------------
    // fix_jsonld_dates
    // -----------------------------------------------------------------

    // Values that are already ISO 8601 must not be touched.
    #[test]
    fn test_fix_jsonld_dates_iso_passthrough() {
        let input =
            r#"{"datePublished":"2026-04-11","dateModified":"2025-09-01"}"#;
        let result = fix_jsonld_dates(input);
        assert_eq!(result, input, "ISO dates should pass through unchanged");
    }

    // A single RFC 2822 value is converted, including time and offset.
    #[test]
    fn test_fix_jsonld_dates_converts_rfc2822() {
        let input = r#"{"datePublished":"Thu, 11 Apr 2026 06:06:06 +0000"}"#;
        let result = fix_jsonld_dates(input);
        assert!(
            result.contains("\"datePublished\":\"2026-04-11T06:06:06+00:00\""),
            "Should convert RFC 2822 to ISO 8601, got: {result}"
        );
    }

    // Both supported fields are converted within the same document.
    #[test]
    fn test_fix_jsonld_dates_both_fields() {
        let input = r#"{"datePublished":"Mon, 01 Sep 2025 12:00:00 +0000","dateModified":"Tue, 02 Sep 2025 14:30:00 +0000"}"#;
        let result = fix_jsonld_dates(input);
        assert!(result.contains("2025-09-01T12:00:00+00:00"));
        assert!(result.contains("2025-09-02T14:30:00+00:00"));
    }

    // -----------------------------------------------------------------
    // fix_broken_img_tags
    // -----------------------------------------------------------------

    // Several broken images in one document are all repaired.
    #[test]
    fn test_fix_broken_img_tags_multiple() {
        let input =
            r#"<img alt="a" <p src="one.jpg"><img alt="b" <p src="two.jpg">"#;
        let result = fix_broken_img_tags(input);
        assert!(result.contains("src=\"one.jpg\""), "first img: {result}");
        assert!(result.contains("src=\"two.jpg\""), "second img: {result}");
        assert!(
            !result.contains("<p src="),
            "no broken tags remain: {result}"
        );
    }

    // Well-formed images pass through byte-for-byte.
    #[test]
    fn test_fix_broken_img_tags_none() {
        let input = r#"<img alt="ok" src="good.jpg" />"#;
        let result = fix_broken_img_tags(input);
        assert_eq!(
            result, input,
            "No broken tags should leave input unchanged"
        );
    }

    // -----------------------------------------------------------------
    // fix_literal_class_syntax
    // -----------------------------------------------------------------

    // Encoded marker: removed from the text and applied as an attribute.
    #[test]
    fn test_fix_literal_class_syntax_html_encoded() {
        let input =
            r#"<img src="img.jpg">.class=&quot;w-25 float-start&quot; rest"#;
        let result = fix_literal_class_syntax(input);
        assert!(
            !result.contains(".class=&quot;"),
            "should remove .class=&quot;"
        );
        assert!(
            result.contains("class=\"w-25 float-start\""),
            "should inject class attr, got: {result}"
        );
    }

    // Plain-quote marker: removed from the text and applied as an attribute.
    #[test]
    fn test_fix_literal_class_syntax_literal_quotes() {
        let input = r#"<img src="img.jpg">.class="my-class" rest"#;
        let result = fix_literal_class_syntax(input);
        assert!(
            !result.contains(".class=\""),
            "should remove .class=\", got: {result}"
        );
        assert!(
            result.contains("class=\"my-class\""),
            "should inject class attr, got: {result}"
        );
    }

    // Without a marker the input is returned unchanged.
    #[test]
    fn test_fix_literal_class_syntax_no_class() {
        let input = r#"<img src="img.jpg"> some text"#;
        let result = fix_literal_class_syntax(input);
        assert_eq!(result, input, "No .class= should leave input unchanged");
    }

    // -----------------------------------------------------------------
    // inject_mobile_web_app_capable_meta
    // -----------------------------------------------------------------

    // The modern meta is added while the legacy Apple meta is kept.
    #[test]
    fn test_inject_mobile_web_app_capable_meta_added() {
        let input = r#"<head><meta name="apple-mobile-web-app-capable" content="yes"></head>"#;
        let result = inject_mobile_web_app_capable_meta(input);
        assert!(
            result.contains(
                r#"<meta name="mobile-web-app-capable" content="yes">"#
            ),
            "modern meta should be injected, got: {result}"
        );
        assert!(
            result.contains(
                r#"<meta name="apple-mobile-web-app-capable" content="yes">"#
            ),
            "legacy meta must remain for backwards compatibility"
        );
    }

    // -----------------------------------------------------------------
    // remove_empty_preload_links
    // -----------------------------------------------------------------

    // A bare `href` (minified, no value) counts as missing.
    #[test]
    fn test_remove_empty_preload_drops_bare_href() {
        let input = r#"<head><link as=image fetchpriority=high href rel=preload type=image/webp><title>x</title></head>"#;
        let result = remove_empty_preload_links(input);
        assert!(
            !result.contains("rel=preload"),
            "empty preload should be removed, got: {result}"
        );
        assert!(result.contains("<title>x</title>"), "rest preserved");
    }

    // `href=""` counts as empty; the whole tag is dropped.
    #[test]
    fn test_remove_empty_preload_drops_quoted_empty_href() {
        let input = r#"<link rel="preload" href="" as="image">"#;
        let result = remove_empty_preload_links(input);
        assert_eq!(result, "");
    }

    // A preload with a real href is left untouched.
    #[test]
    fn test_remove_empty_preload_keeps_valid_preload() {
        let input = r#"<link rel="preload" href="/banner.webp" as="image">"#;
        let result = remove_empty_preload_links(input);
        assert_eq!(result, input);
    }

    // Multi-byte text around the removed tag must survive intact.
    #[test]
    fn test_remove_empty_preload_preserves_utf8() {
        let input = r#"<title>日本語</title><link rel=preload href as=image><p>テスト</p>"#;
        let result = remove_empty_preload_links(input);
        assert!(result.contains("日本語"));
        assert!(result.contains("テスト"));
        assert!(!result.contains("rel=preload"));
    }

    // Full pipeline: an existing modern meta suppresses injection.
    #[test]
    fn test_apply_html_fixes_idempotent_on_modern_meta() {
        let input = r#"<head><meta name="apple-mobile-web-app-capable" content="yes"><meta name="mobile-web-app-capable" content="yes"></head>"#;
        let result = apply_html_fixes(input);
        // Should not double-inject when modern meta already exists.
        let count = result.matches("name=\"mobile-web-app-capable\"").count();
        assert_eq!(count, 1, "no duplicate injection, got: {result}");
    }
}