Skip to main content

ssg/
streaming.rs

// Copyright © 2023 - 2026 Static Site Generator (SSG). All rights reserved.
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Streaming compilation for large sites.
//!
//! Processes content files in batches to cap peak memory usage, enabling
//! compilation of 100K+ page sites within a configurable memory budget.
//!
//! The streaming compiler divides content files into chunks based on the
//! memory budget, compiles each chunk, then releases it before processing
//! the next. After all chunks, a merge pass unifies cross-page artefacts
//! (sitemap, search index, feeds).
13
14use crate::walk;
15use anyhow::{Context, Result};
16use std::{
17    fs,
18    path::{Path, PathBuf},
19};
20
/// Default peak memory budget: 512 MB.
pub const DEFAULT_MEMORY_BUDGET_MB: usize = 512;

/// Estimated memory per page in bytes (HTML + metadata + buffers).
///
/// Conservative estimate, used only for batch sizing.
const ESTIMATED_BYTES_PER_PAGE: usize = 64 * 1024; // 64 KB

/// Smallest batch size [`MemoryBudget::from_mb`] will derive, however small
/// the budget is.
const MIN_BATCH_SIZE: usize = 10;

/// Memory budget configuration for streaming compilation.
#[derive(Debug, Clone, Copy)]
pub struct MemoryBudget {
    /// Maximum memory in bytes.
    pub max_bytes: usize,
    /// Pages per batch, derived from `max_bytes`.
    pub batch_size: usize,
}

impl MemoryBudget {
    /// Creates a memory budget from a megabyte limit.
    ///
    /// `batch_size` is `max_bytes / ESTIMATED_BYTES_PER_PAGE`, with a floor
    /// of 10 pages so that a tiny (or zero) budget still makes progress.
    /// A limit too large to express in bytes saturates at `usize::MAX`
    /// instead of overflowing.
    #[must_use]
    pub fn from_mb(mb: usize) -> Self {
        // Plain `mb * 1024 * 1024` panics in debug builds and silently
        // wraps in release builds once `mb` is large enough.
        let max_bytes = mb.saturating_mul(1024 * 1024);
        let batch_size =
            (max_bytes / ESTIMATED_BYTES_PER_PAGE).max(MIN_BATCH_SIZE);
        Self {
            max_bytes,
            batch_size,
        }
    }

    /// Creates the default 512 MB budget.
    #[must_use]
    pub fn default_budget() -> Self {
        Self::from_mb(DEFAULT_MEMORY_BUDGET_MB)
    }
}
55
56/// Collects content files and returns them as batches.
57///
58/// Each batch contains at most `budget.batch_size` files.
59pub fn batched_content_files(
60    content_dir: &Path,
61    budget: &MemoryBudget,
62) -> Result<Vec<Vec<PathBuf>>> {
63    let all_files = walk::walk_files(content_dir, "md")
64        .with_context(|| format!("cannot walk {}", content_dir.display()))?;
65
66    if all_files.is_empty() {
67        return Ok(vec![]);
68    }
69
70    let batches: Vec<Vec<PathBuf>> = all_files
71        .chunks(budget.batch_size)
72        .map(|chunk| chunk.to_vec())
73        .collect();
74
75    log::info!(
76        "[streaming] {} file(s) in {} batch(es) (budget: {} MB, {} pages/batch)",
77        all_files.len(),
78        batches.len(),
79        budget.max_bytes / (1024 * 1024),
80        budget.batch_size,
81    );
82
83    Ok(batches)
84}
85
86/// Compiles a single batch of content files into the build directory.
87///
88/// Creates a temporary content directory containing only the batch files,
89/// runs `staticdatagen::compile` on it, then merges the output into the
90/// final site directory.
91pub fn compile_batch(
92    batch: &[PathBuf],
93    content_dir: &Path,
94    build_dir: &Path,
95    site_dir: &Path,
96    template_dir: &Path,
97    batch_idx: usize,
98) -> Result<()> {
99    if batch.is_empty() {
100        return Ok(());
101    }
102
103    // Create a temporary batch content directory
104    let batch_content = build_dir.join(format!(".batch-{batch_idx}"));
105    fs::create_dir_all(&batch_content)?;
106
107    // Copy batch files preserving directory structure
108    for file in batch {
109        let rel = file.strip_prefix(content_dir).unwrap_or(file);
110        let dest = batch_content.join(rel);
111        if let Some(parent) = dest.parent() {
112            fs::create_dir_all(parent)?;
113        }
114        let _ = fs::copy(file, &dest)?;
115    }
116
117    // Compile the batch
118    let batch_build = build_dir.join(format!(".batch-{batch_idx}-build"));
119    let batch_site = build_dir.join(format!(".batch-{batch_idx}-site"));
120    fs::create_dir_all(&batch_build)?;
121    fs::create_dir_all(&batch_site)?;
122
123    let compile_result = staticdatagen::compile(
124        &batch_build,
125        &batch_content,
126        &batch_site,
127        template_dir,
128    );
129
130    // Merge batch output into the main site directory
131    if compile_result.is_ok() {
132        fs::create_dir_all(site_dir)?;
133        merge_dir(&batch_site, site_dir)?;
134    }
135
136    // Clean up batch temporaries
137    let _ = fs::remove_dir_all(&batch_content);
138    let _ = fs::remove_dir_all(&batch_build);
139    let _ = fs::remove_dir_all(&batch_site);
140
141    compile_result.map_err(|e| anyhow::anyhow!("batch {batch_idx}: {e:?}"))
142}
143
/// Copies the tree rooted at `src` on top of `dst`.
///
/// Sub-directories are created in `dst` as needed and descended into;
/// every other entry is copied with `fs::copy`, replacing a file of the
/// same name already present in `dst`. A `src` that does not exist is a
/// no-op.
fn merge_dir(src: &Path, dst: &Path) -> Result<()> {
    if src.exists() {
        for item in fs::read_dir(src)? {
            let item = item?;
            let from = item.path();
            let to = dst.join(item.file_name());

            if !from.is_dir() {
                // Not a directory: copy it over whatever is at `to`.
                let _ = fs::copy(&from, &to)?;
                continue;
            }

            fs::create_dir_all(&to)?;
            merge_dir(&from, &to)?;
        }
    }

    Ok(())
}
164
165/// Determines whether streaming compilation should be used.
166///
167/// Returns `true` if the content directory has more files than a single
168/// batch can hold, or if `--max-memory` was explicitly set.
169#[must_use]
170pub fn should_stream(
171    content_dir: &Path,
172    budget: &MemoryBudget,
173    explicitly_set: bool,
174) -> bool {
175    if explicitly_set {
176        return true;
177    }
178
179    let count = walk::walk_files(content_dir, "md").map_or(0, |f| f.len());
180
181    count > budget.batch_size
182}
183
#[cfg(test)]
// Tests unwrap freely: a failed fixture step should abort the test.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use tempfile::{tempdir, TempDir};

    // -----------------------------------------------------------------
    // Fixtures
    // -----------------------------------------------------------------

    /// Creates a temp dir holding an empty `content` directory.
    ///
    /// The returned guard must stay alive for as long as the path is used.
    fn content_fixture() -> (TempDir, PathBuf) {
        let dir = tempdir().unwrap();
        let content = dir.path().join("content");
        fs::create_dir_all(&content).unwrap();
        (dir, content)
    }

    /// Creates empty `src` and `dst` directories in a fresh temp dir.
    fn merge_fixture() -> (TempDir, PathBuf, PathBuf) {
        let dir = tempdir().unwrap();
        let src = dir.path().join("src");
        let dst = dir.path().join("dst");
        fs::create_dir_all(&src).unwrap();
        fs::create_dir_all(&dst).unwrap();
        (dir, src, dst)
    }

    /// Writes `count` markdown pages (`p0.md`, `p1.md`, ...) into `dir`.
    fn write_pages(dir: &Path, count: usize) {
        for i in 0..count {
            fs::write(dir.join(format!("p{i}.md")), "# Hi").unwrap();
        }
    }

    /// Builds a budget with a hand-picked batch size, bypassing `from_mb`.
    fn budget_with_batch_size(batch_size: usize) -> MemoryBudget {
        MemoryBudget {
            max_bytes: 0,
            batch_size,
        }
    }

    // -----------------------------------------------------------------
    // MemoryBudget
    // -----------------------------------------------------------------

    #[test]
    fn memory_budget_from_mb() {
        let budget = MemoryBudget::from_mb(256);
        assert_eq!(budget.max_bytes, 256 * 1024 * 1024);
        assert!(budget.batch_size > 0);
    }

    #[test]
    fn memory_budget_default() {
        let budget = MemoryBudget::default_budget();
        assert_eq!(budget.max_bytes, 512 * 1024 * 1024);
    }

    #[test]
    fn memory_budget_minimum_batch_size() {
        let budget = MemoryBudget::from_mb(0);
        assert!(
            budget.batch_size >= 10,
            "batch size should have a floor of 10"
        );
    }

    #[test]
    fn memory_budget_from_mb_one() {
        let budget = MemoryBudget::from_mb(1);
        assert_eq!(budget.max_bytes, 1024 * 1024);
        // 1 MB / 64 KB = 16 pages per batch
        assert_eq!(budget.batch_size, 16);
    }

    // 4 GB does not fit in a 32-bit `usize`, so the expected value below
    // would not even compile there.
    #[cfg(target_pointer_width = "64")]
    #[test]
    fn memory_budget_from_mb_very_large() {
        let budget = MemoryBudget::from_mb(4096);
        assert_eq!(budget.max_bytes, 4096 * 1024 * 1024);
        // 4 GB / 64 KB = 65536 pages per batch
        assert_eq!(budget.batch_size, 65536);
    }

    #[test]
    fn memory_budget_batch_size_floor_is_ten() {
        // Even with 0 MB, the floor ensures at least 10 pages/batch
        let budget = MemoryBudget::from_mb(0);
        assert_eq!(budget.max_bytes, 0);
        assert_eq!(budget.batch_size, 10);
    }

    #[test]
    fn memory_budget_default_budget_matches_constant() {
        let budget = MemoryBudget::default_budget();
        assert_eq!(budget.max_bytes, DEFAULT_MEMORY_BUDGET_MB * 1024 * 1024);
        assert_eq!(
            budget.batch_size,
            MemoryBudget::from_mb(DEFAULT_MEMORY_BUDGET_MB).batch_size
        );
    }

    #[test]
    fn memory_budget_clone_copy_debug() {
        let a = MemoryBudget::from_mb(128);
        let b = a; // Copy
        assert_eq!(a.max_bytes, b.max_bytes);
        assert_eq!(a.batch_size, b.batch_size);
        let debug = format!("{a:?}");
        assert!(debug.contains("MemoryBudget"));
    }

    // -----------------------------------------------------------------
    // batched_content_files
    // -----------------------------------------------------------------

    #[test]
    fn batched_content_files_empty_dir() {
        let (_dir, content) = content_fixture();

        let budget = MemoryBudget::from_mb(512);
        let batches = batched_content_files(&content, &budget).unwrap();
        assert!(batches.is_empty());
    }

    #[test]
    fn batched_content_files_splits_correctly() {
        let (_dir, content) = content_fixture();
        write_pages(&content, 25);

        let budget = budget_with_batch_size(10);
        let batches = batched_content_files(&content, &budget).unwrap();

        assert_eq!(batches.len(), 3); // 10 + 10 + 5
        assert_eq!(batches[0].len(), 10);
        assert_eq!(batches[1].len(), 10);
        assert_eq!(batches[2].len(), 5);
    }

    #[test]
    fn batched_content_files_nonexistent_dir_yields_no_batches() {
        let dir = tempdir().unwrap();
        let budget = MemoryBudget::from_mb(512);
        let result =
            batched_content_files(&dir.path().join("nonexistent"), &budget);
        // Whether a missing directory is reported as empty or as an error
        // is decided by `walk_files`; both are acceptable here, but an
        // `Ok` must never carry batches.
        if let Ok(batches) = result {
            assert!(batches.is_empty());
        }
    }

    #[test]
    fn batched_content_files_single_file() {
        let (_dir, content) = content_fixture();
        fs::write(content.join("index.md"), "# Home").unwrap();

        let budget = MemoryBudget::from_mb(512);
        let batches = batched_content_files(&content, &budget).unwrap();
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0].len(), 1);
    }

    #[test]
    fn batched_content_files_ignores_non_md() {
        let (_dir, content) = content_fixture();
        fs::write(content.join("page.md"), "# Page").unwrap();
        fs::write(content.join("image.png"), "fakepng").unwrap();
        fs::write(content.join("style.css"), "body{}").unwrap();

        let budget = MemoryBudget::from_mb(512);
        let batches = batched_content_files(&content, &budget).unwrap();
        let total: usize = batches.iter().map(|b| b.len()).sum();
        assert_eq!(total, 1, "only .md files should be collected");
    }

    #[test]
    fn batched_content_files_exact_batch_boundary() {
        let (_dir, content) = content_fixture();
        write_pages(&content, 10);

        let budget = budget_with_batch_size(10);
        let batches = batched_content_files(&content, &budget).unwrap();
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0].len(), 10);
    }

    #[test]
    fn batched_content_files_many_small_batches() {
        let (_dir, content) = content_fixture();
        write_pages(&content, 7);

        let budget = budget_with_batch_size(2);
        let batches = batched_content_files(&content, &budget).unwrap();
        assert_eq!(batches.len(), 4); // 2+2+2+1
        assert_eq!(batches[3].len(), 1);
    }

    #[test]
    fn batched_content_files_nested_directories() {
        let (_dir, content) = content_fixture();
        fs::create_dir_all(content.join("blog")).unwrap();
        fs::create_dir_all(content.join("docs")).unwrap();
        fs::write(content.join("index.md"), "# Index").unwrap();
        fs::write(content.join("blog/post.md"), "# Post").unwrap();
        fs::write(content.join("docs/api.md"), "# API").unwrap();

        let budget = MemoryBudget::from_mb(512);
        let batches = batched_content_files(&content, &budget).unwrap();
        let total: usize = batches.iter().map(|b| b.len()).sum();
        assert_eq!(total, 3);
    }

    // -----------------------------------------------------------------
    // merge_dir
    // -----------------------------------------------------------------

    #[test]
    fn merge_dir_combines_files() {
        let (_dir, src, dst) = merge_fixture();
        fs::write(src.join("a.html"), "from src").unwrap();
        fs::write(dst.join("b.html"), "existing").unwrap();

        merge_dir(&src, &dst).unwrap();

        assert_eq!(fs::read_to_string(dst.join("a.html")).unwrap(), "from src");
        assert_eq!(fs::read_to_string(dst.join("b.html")).unwrap(), "existing");
    }

    #[test]
    fn merge_dir_overwrites_on_conflict() {
        let (_dir, src, dst) = merge_fixture();
        fs::write(src.join("a.html"), "new").unwrap();
        fs::write(dst.join("a.html"), "old").unwrap();

        merge_dir(&src, &dst).unwrap();

        assert_eq!(fs::read_to_string(dst.join("a.html")).unwrap(), "new");
    }

    #[test]
    fn merge_dir_nonexistent_src_is_noop() {
        let dir = tempdir().unwrap();
        let result =
            merge_dir(&dir.path().join("nonexistent"), &dir.path().join("dst"));
        assert!(result.is_ok());
    }

    #[test]
    fn merge_dir_nested() {
        let (_dir, src, dst) = merge_fixture();
        let nested = src.join("sub");
        fs::create_dir_all(&nested).unwrap();
        fs::write(nested.join("file.txt"), "nested").unwrap();

        merge_dir(&src, &dst).unwrap();
        assert_eq!(
            fs::read_to_string(dst.join("sub/file.txt")).unwrap(),
            "nested"
        );
    }

    #[test]
    fn merge_dir_deeply_nested() {
        let (_dir, src, dst) = merge_fixture();
        fs::create_dir_all(src.join("a/b/c")).unwrap();
        fs::write(src.join("a/b/c/deep.txt"), "deep content").unwrap();

        merge_dir(&src, &dst).unwrap();
        assert_eq!(
            fs::read_to_string(dst.join("a/b/c/deep.txt")).unwrap(),
            "deep content"
        );
    }

    #[test]
    fn merge_dir_empty_src() {
        let (_dir, src, dst) = merge_fixture();
        fs::write(dst.join("existing.txt"), "keep").unwrap();

        merge_dir(&src, &dst).unwrap();
        assert_eq!(
            fs::read_to_string(dst.join("existing.txt")).unwrap(),
            "keep"
        );
    }

    #[test]
    fn merge_dir_multiple_files() {
        let (_dir, src, dst) = merge_fixture();
        for i in 0..5 {
            fs::write(src.join(format!("f{i}.txt")), format!("data{i}"))
                .unwrap();
        }

        merge_dir(&src, &dst).unwrap();
        for i in 0..5 {
            assert_eq!(
                fs::read_to_string(dst.join(format!("f{i}.txt"))).unwrap(),
                format!("data{i}")
            );
        }
    }

    // -----------------------------------------------------------------
    // should_stream
    // -----------------------------------------------------------------

    #[test]
    fn should_stream_when_explicitly_set() {
        let (_dir, content) = content_fixture();

        let budget = MemoryBudget::default_budget();
        assert!(should_stream(&content, &budget, true));
    }

    #[test]
    fn should_stream_explicitly_set_overrides_count() {
        // Even with zero files, explicit flag forces streaming
        let (_dir, content) = content_fixture();

        let budget = MemoryBudget::from_mb(512);
        assert!(should_stream(&content, &budget, true));
    }

    #[test]
    fn should_stream_large_site() {
        let (_dir, content) = content_fixture();
        // Exceeding the default batch size (8192) would need thousands of
        // files, so shrink the batch size instead.
        let budget = budget_with_batch_size(2);
        write_pages(&content, 5);
        assert!(should_stream(&content, &budget, false));
    }

    #[test]
    fn should_not_stream_small_site() {
        let (_dir, content) = content_fixture();
        fs::write(content.join("index.md"), "# Home").unwrap();

        let budget = MemoryBudget::default_budget();
        assert!(!should_stream(&content, &budget, false));
    }

    #[test]
    fn should_stream_with_no_content_dir() {
        let dir = tempdir().unwrap();
        let budget = MemoryBudget::from_mb(512);
        // Non-existent dir, not explicitly set => false
        assert!(!should_stream(
            &dir.path().join("no-content"),
            &budget,
            false
        ));
    }

    #[test]
    fn should_stream_exactly_at_batch_boundary() {
        let (_dir, content) = content_fixture();
        let budget = budget_with_batch_size(3);
        write_pages(&content, 3);
        // 3 files, batch_size 3 => count is NOT > batch_size => false
        assert!(!should_stream(&content, &budget, false));
    }

    #[test]
    fn should_stream_one_over_boundary() {
        let (_dir, content) = content_fixture();
        let budget = budget_with_batch_size(3);
        write_pages(&content, 4);
        // 4 files, batch_size 3 => true
        assert!(should_stream(&content, &budget, false));
    }

    // -----------------------------------------------------------------
    // compile_batch
    // -----------------------------------------------------------------

    #[test]
    fn compile_batch_empty_is_noop() {
        let dir = tempdir().unwrap();
        let build = dir.path().join("build");
        let result = compile_batch(
            &[],
            dir.path(),
            &build,
            &dir.path().join("site"),
            &dir.path().join("templates"),
            0,
        );
        assert!(result.is_ok());
        // An empty batch returns before anything is created.
        assert!(!build.exists());
    }

    #[test]
    fn compile_batch_with_nonexistent_file_returns_error() {
        let (dir, content) = content_fixture();

        // The source file is missing, so the copy step must fail; this has
        // to surface as an `Err`, not as a panic or a silent success.
        let result = compile_batch(
            &[content.join("nonexistent.md")],
            &content,
            &dir.path().join("build"),
            &dir.path().join("site"),
            &dir.path().join("templates"),
            0,
        );
        assert!(result.is_err());
    }

    #[test]
    fn compile_batch_removes_batch_dirs() {
        let (dir, content) = content_fixture();
        let build = dir.path().join("build");
        let templates = dir.path().join("templates");
        fs::create_dir_all(&templates).unwrap();
        fs::write(content.join("page.md"), "---\ntitle: T\n---\n# Hi").unwrap();

        // Compilation itself may fail without real templates, so the
        // result is deliberately not asserted on; either way the batch
        // temporaries must be gone afterwards.
        let _result = compile_batch(
            &[content.join("page.md")],
            &content,
            &build,
            &dir.path().join("site"),
            &templates,
            42,
        );

        for name in [".batch-42", ".batch-42-build", ".batch-42-site"] {
            assert!(!build.join(name).exists(), "{name} was not cleaned up");
        }
    }
}