1use anyhow::Result;
7use std::path::{Path, PathBuf};
8
9pub fn extract_title(html: &str) -> String {
11 if let Some(start) = html.find("<title>") {
12 let after = &html[start + 7..];
13 if let Some(end) = after.find("</title>") {
14 let title = strip_tags(&after[..end]);
15 let trimmed = title.trim();
16 if !trimmed.is_empty() {
17 return trimmed.to_string();
18 }
19 }
20 }
21 String::new()
22}
23
24pub(super) fn extract_description(html: &str, max_len: usize) -> String {
30 let content = extract_main_content(html);
31
32 let clean = strip_inline_tags(&content, &["script", "style"]);
33
34 let text = strip_tags(&clean);
35 let trimmed = text.trim();
36 truncate_at_word_boundary(trimmed, max_len)
37}
38
39fn extract_main_content(html: &str) -> String {
42 if let Some(inner) = extract_tag_inner(html, "main") {
43 return inner;
44 }
45
46 let body =
47 extract_tag_inner(html, "body").unwrap_or_else(|| html.to_string());
48 strip_inline_tags(&body, &["script", "style", "nav", "header", "footer"])
49}
50
/// Returns the markup between the first `<tag_name …>` opener and the next
/// `</tag_name>` closer.
///
/// Returns `None` when the opener is missing or is never terminated by `>`.
/// When the closer is missing, everything after the opener is returned.
fn extract_tag_inner(html: &str, tag_name: &str) -> Option<String> {
    let opening = format!("<{tag_name}");
    let closing = format!("</{tag_name}>");

    let tag_start = html.find(&opening)?;
    // Skip the rest of the opening tag (attributes included).
    let (_, body) = html[tag_start..].split_once('>')?;
    let inner = body
        .split_once(closing.as_str())
        .map_or(body, |(before_close, _)| before_close);
    Some(inner.to_string())
}
65
/// Removes every `<tag …>…</tag>` element named in `tags` from `html`,
/// replacing each removed element with a single space.
///
/// Tags are processed in the order given. Processing of a tag stops at the
/// first opener that has no closer after it; that opener and everything
/// after it is left in place.
fn strip_inline_tags(html: &str, tags: &[&str]) -> String {
    let mut text = html.to_string();
    for tag in tags {
        let opening = format!("<{tag}");
        let closing = format!("</{tag}>");
        loop {
            // Byte range of the next complete element, if there is one.
            let element = text.find(&opening).and_then(|from| {
                text[from..]
                    .find(&closing)
                    .map(|len| from..from + len + closing.len())
            });
            match element {
                Some(range) => text.replace_range(range, " "),
                None => break,
            }
        }
    }
    text
}
82
/// Shortens `text` to at most `max_len` bytes without splitting a word.
///
/// * Text that already fits is returned unchanged.
/// * If the cut lands exactly at the end of a word, that word is kept.
/// * Otherwise the text is cut back to the last whitespace before the limit.
/// * If there is no whitespace before the limit, the text is hard-cut at the
///   nearest character boundary at or below `max_len`.
///
/// The result never ends with whitespace and never exceeds `max_len` bytes.
fn truncate_at_word_boundary(text: &str, max_len: usize) -> String {
    if text.len() <= max_len {
        return text.to_string();
    }

    // Back up to a char boundary so slicing cannot panic on multi-byte text.
    let mut end = max_len;
    while end > 0 && !text.is_char_boundary(end) {
        end -= 1;
    }
    let (head, tail) = text.split_at(end);

    // The limit falls between two words: nothing is split, keep the prefix.
    if tail.starts_with(char::is_whitespace) {
        return head.trim_end().to_string();
    }

    match head.rfind(char::is_whitespace) {
        // Drop the partially included last word.
        Some(last_space) => head[..last_space].trim_end().to_string(),
        // One long word: no boundary available, fall back to a hard cut.
        None => head.to_string(),
    }
}
99
100pub(super) fn strip_tags(html: &str) -> String {
102 let mut result = String::with_capacity(html.len());
103 let mut in_tag = false;
104 for ch in html.chars() {
105 match ch {
106 '<' => in_tag = true,
107 '>' => {
108 in_tag = false;
109 result.push(' ');
110 }
111 _ if !in_tag => result.push(ch),
112 _ => {}
113 }
114 }
115 let mut collapsed = String::with_capacity(result.len());
117 let mut prev_space = false;
118 for ch in result.chars() {
119 if ch.is_whitespace() {
120 if !prev_space {
121 collapsed.push(' ');
122 prev_space = true;
123 }
124 } else {
125 collapsed.push(ch);
126 prev_space = false;
127 }
128 }
129 collapsed.trim().to_string()
130}
131
132#[allow(dead_code)] pub(super) fn collect_html_files(dir: &Path) -> Result<Vec<PathBuf>> {
135 crate::walk::walk_files(dir, "html")
136}
137
138pub(super) fn escape_attr(s: &str) -> String {
140 s.replace('&', "&")
141 .replace('"', """)
142 .replace('<', "<")
143 .replace('>', ">")
144}
145
/// Reports whether `html` contains a `<meta>` tag whose first attribute is
/// `property` or `name` with the value `attr`.
///
/// Both double- and single-quoted values are recognised. The match is a
/// literal substring test, so the attribute must directly follow `<meta `.
pub fn has_meta_tag(html: &str, attr: &str) -> bool {
    ["property", "name"].iter().any(|key| {
        ['"', '\''].iter().any(|quote| {
            let needle = format!("<meta {key}={quote}{attr}{quote}");
            html.contains(&needle)
        })
    })
}
160
161pub(super) fn extract_canonical(html: &str) -> String {
163 if let Some(pos) = html.find("rel=\"canonical\"") {
164 let region_start = pos.saturating_sub(200);
165 let region = &html[region_start..html.len().min(pos + 200)];
166 if let Some(href_start) = region.find("href=\"") {
167 let after = ®ion[href_start + 6..];
168 if let Some(end) = after.find('"') {
169 return after[..end].to_string();
170 }
171 }
172 }
173 String::new()
174}
175
176pub(super) fn extract_existing_meta(html: &str, attr: &str) -> String {
178 for prefix in &[
179 format!("<meta name=\"{attr}\" content=\""),
180 format!("<meta property=\"{attr}\" content=\""),
181 format!("<meta name='{attr}' content='"),
182 format!("<meta property='{attr}' content='"),
183 ] {
184 if let Some(pos) = html.find(prefix.as_str()) {
185 let after = &html[pos + prefix.len()..];
186 let delim = if prefix.ends_with('\'') { '\'' } else { '"' };
187 if let Some(end) = after.find(delim) {
188 let value = after[..end].trim();
189 if !value.is_empty() {
190 return value.to_string();
191 }
192 }
193 }
194 }
195 String::new()
196}
197
198pub(super) fn extract_html_lang(html: &str) -> String {
200 if let Some(start) = html.find("<html") {
201 let tag_end = html[start..].find('>').unwrap_or(200);
202 let tag = &html[start..start + tag_end];
203 if let Some(lang_pos) = tag.find("lang=\"") {
204 let after = &tag[lang_pos + 6..];
205 if let Some(end) = after.find('"') {
206 return after[..end].to_string();
207 }
208 }
209 if let Some(lang_pos) = tag.find("lang='") {
210 let after = &tag[lang_pos + 6..];
211 if let Some(end) = after.find('\'') {
212 return after[..end].to_string();
213 }
214 }
215 }
216 String::new()
217}
218
219pub(super) fn extract_first_content_image(html: &str) -> String {
221 let search_region = if let Some(start) = html.find("<main") {
223 &html[start..]
224 } else if let Some(start) = html.find("<article") {
225 &html[start..]
226 } else {
227 return String::new();
228 };
229
230 if let Some(img_pos) = search_region.find("<img") {
231 let after_img = &search_region[img_pos..];
232 let tag_end = after_img.find('>').unwrap_or(500).min(500);
233 let img_tag = &after_img[..tag_end];
234 if let Some(src_pos) = img_tag.find("src=\"") {
235 let after_src = &img_tag[src_pos + 5..];
236 if let Some(end) = after_src.find('"') {
237 return after_src[..end].to_string();
238 }
239 }
240 }
241 String::new()
242}
243
244pub(super) fn extract_meta_author(html: &str) -> String {
246 let from_meta = extract_existing_meta(html, "author");
248 if !from_meta.is_empty() {
249 return from_meta;
250 }
251 for pattern in &["class=\"author\">", "class='author'>", "rel=\"author\">"]
253 {
254 if let Some(pos) = html.find(pattern) {
255 let after = &html[pos + pattern.len()..];
256 if let Some(end) = after.find('<') {
257 let name = after[..end].trim();
258 let name = name.strip_prefix("by ").unwrap_or(name).trim();
260 if !name.is_empty() {
261 return name.to_string();
262 }
263 }
264 }
265 }
266 String::new()
267}
268
269pub(super) fn extract_date_from_html(
271 html: &str,
272 field: &str,
273) -> Option<String> {
274 let pattern = format!("\"{field}\":\"");
275 if let Some(pos) = html.find(&pattern) {
276 let after = &html[pos + pattern.len()..];
277 if let Some(end) = after.find('"') {
278 let date = &after[..end];
279 if !date.is_empty() {
280 return Some(date.to_string());
281 }
282 }
283 }
284 None
285}
286
287pub(super) fn extract_meta_date(html: &str) -> Option<String> {
289 let meta = extract_existing_meta(html, "article:published_time");
291 if !meta.is_empty() {
292 return Some(meta);
293 }
294 if let Some(pos) = html.find("datetime=\"") {
296 let after = &html[pos + 10..];
297 if let Some(end) = after.find('"') {
298 let date = &after[..end];
299 if !date.is_empty() {
300 return Some(date.to_string());
301 }
302 }
303 }
304 None
305}
306
307#[allow(dead_code)] pub(super) fn collect_html_files_recursive(dir: &Path) -> Result<Vec<PathBuf>> {
310 crate::walk::walk_files(dir, "html")
311}
312
// Unit tests for the HTML extraction helpers in the parent module.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    #[test]
    fn extract_title_from_html() {
        let html = "<html><head><title>Test Page</title></head></html>";
        assert_eq!(extract_title(html), "Test Page");
    }

    #[test]
    fn extract_title_empty_no_tag() {
        let html = "<html><head></head><body>Hello</body></html>";
        assert_eq!(extract_title(html), "");
    }

    #[test]
    fn extract_title_empty_tag() {
        let html = "<html><head><title></title></head></html>";
        assert_eq!(extract_title(html), "");
    }

    #[test]
    fn extract_title_nested_tags() {
        let html = "<title><span>Inner</span></title>";
        assert_eq!(extract_title(html), "Inner");
    }

    #[test]
    fn extract_description_from_body() {
        let html = "<html><body><main><p>Short description here.</p></main></body></html>";
        let desc = extract_description(html, 200);
        assert!(desc.contains("Short description here"));
    }

    #[test]
    fn extract_description_truncation() {
        let long_text = "word ".repeat(100);
        let html = format!("<main><p>{long_text}</p></main>");
        let desc = extract_description(&html, 50);
        assert!(desc.len() <= 50);
    }

    #[test]
    fn strip_tags_basic() {
        assert_eq!(strip_tags("<p>Hello <b>world</b></p>"), "Hello world");
    }

    #[test]
    fn strip_tags_empty() {
        assert_eq!(strip_tags(""), "");
    }

    #[test]
    fn strip_tags_no_tags() {
        assert_eq!(strip_tags("plain text"), "plain text");
    }

    #[test]
    fn strip_tags_self_closing() {
        let result = strip_tags("<img src=\"x\"/>text");
        assert!(result.contains("text"));
        assert!(!result.contains("img"));
    }

    #[test]
    fn truncate_short_text_unchanged() {
        assert_eq!(truncate_at_word_boundary("short", 100), "short");
    }

    #[test]
    fn truncate_long_text_at_word() {
        let text = "one two three four five six";
        let result = truncate_at_word_boundary(text, 15);
        assert!(result.len() <= 15);
        assert!(!result.ends_with(' '));
        assert_eq!(result, "one two three");
    }

    #[test]
    fn truncate_unicode() {
        let text = "日本語 テスト データ";
        let result = truncate_at_word_boundary(text, 15);
        assert!(result.len() <= 15);
    }

    #[test]
    fn collect_html_files_finds_files() {
        let tmp = tempdir().unwrap();
        let sub = tmp.path().join("sub");
        fs::create_dir_all(&sub).unwrap();
        fs::write(tmp.path().join("index.html"), "<html></html>").unwrap();
        fs::write(sub.join("page.html"), "<html></html>").unwrap();

        let files = collect_html_files(tmp.path()).unwrap();
        assert_eq!(files.len(), 2);
    }

    #[test]
    fn collect_html_files_recursive_finds_files() {
        let tmp = tempdir().unwrap();
        let sub = tmp.path().join("sub");
        fs::create_dir_all(&sub).unwrap();
        fs::write(tmp.path().join("index.html"), "<html></html>").unwrap();
        fs::write(sub.join("page.html"), "<html></html>").unwrap();
        fs::write(sub.join("style.css"), "body{}").unwrap();

        let files = collect_html_files_recursive(tmp.path()).unwrap();
        assert_eq!(files.len(), 2);
        assert!(files.iter().all(|p| p.extension().unwrap() == "html"));
    }

    #[test]
    fn collect_html_files_recursive_empty_dir() {
        let tmp = tempdir().unwrap();
        let files = collect_html_files_recursive(tmp.path()).unwrap();
        assert!(files.is_empty());
    }

    #[test]
    fn escape_attr_special_chars() {
        // Each special character must come back as its HTML entity.
        assert_eq!(
            escape_attr("a&b<c>d\"e"),
            "a&amp;b&lt;c&gt;d&quot;e"
        );
    }

    #[test]
    fn has_meta_tag_present() {
        let html = r#"<meta property="og:title" content="Hi">"#;
        assert!(has_meta_tag(html, "og:title"));
    }

    #[test]
    fn has_meta_tag_absent() {
        let html = "<html><head></head></html>";
        assert!(!has_meta_tag(html, "og:title"));
    }

    #[test]
    fn extract_canonical_found() {
        let html = r#"<link rel="canonical" href="https://example.com/page">"#;
        assert_eq!(extract_canonical(html), "https://example.com/page");
    }

    #[test]
    fn extract_canonical_missing() {
        let html = "<html><head></head></html>";
        assert_eq!(extract_canonical(html), "");
    }

    #[test]
    fn extract_existing_meta_by_name() {
        let html = r#"<meta name="author" content="Alice">"#;
        assert_eq!(extract_existing_meta(html, "author"), "Alice");
    }

    #[test]
    fn extract_html_lang_found() {
        let html = r#"<html lang="fr"><head></head></html>"#;
        assert_eq!(extract_html_lang(html), "fr");
    }

    #[test]
    fn extract_html_lang_missing() {
        let html = "<html><head></head></html>";
        assert_eq!(extract_html_lang(html), "");
    }

    #[test]
    fn extract_date_from_html_found() {
        let html = r#"{"datePublished":"2025-01-15"}"#;
        assert_eq!(
            extract_date_from_html(html, "datePublished"),
            Some("2025-01-15".to_string())
        );
    }

    #[test]
    fn extract_date_from_html_missing() {
        assert_eq!(
            extract_date_from_html("<html></html>", "datePublished"),
            None
        );
    }
}