Skip to main content

icb_parser/
manager.rs

1//! Universal high-performance parser manager.
2//!
3//! # Overview
4//!
5//! Provides a unified, fault‑tolerant entry point for parsing source code
6//! across multiple programming languages.
7//!
8//! # Design Goals
9//!
10//! * Zero‑panic parsing pipeline
11//! * Deterministic behaviour
12//! * Efficient large‑scale directory traversal
13//! * Minimal memory overhead
14//! * Extensible language backends
15//!
16//! # Features
17//!
18//! * Multi‑language dispatch
19//! * Extension‑based file filtering
20//! * Graceful failure handling (I/O errors, unsupported languages)
21//! * Relative path normalisation
22//! * Batched file processing
23//!
24//! # Execution Model
25//!
26//! 1. Discover files
27//! 2. Filter by extension
28//! 3. Read sources
29//! 4. Dispatch parser
30//! 5. Collect facts
31//!
32//! # Safety Guarantees
33//!
34//! * No `unwrap()`
35//! * No panic propagation
36//! * I/O errors isolated per file (skipped silently)
37//! * Parse errors isolated per file (skipped silently)
38//!
39//! # Output
40//!
41//! Returns a vector of `(relative_path, Vec<RawNode>)`.
42
43use crate::facts::RawNode;
44use icb_common::{IcbError, Language};
45use std::path::{Path, PathBuf};
46use walkdir::WalkDir;
47
/// Stateless parser manager.
///
/// The type carries no fields, so it is free to copy and every instance is
/// interchangeable. `Debug` is derived so the manager can be embedded in
/// other types that derive `Debug` and shown in diagnostics.
#[derive(Debug, Clone, Copy, Default)]
pub struct ParserManager;
51
52impl ParserManager {
53    /// Create a new parser manager.
54    pub fn new() -> Self {
55        Self
56    }
57
58    /// Parse a single source file using the most appropriate backend for the
59    /// given language.
60    ///
61    /// # Arguments
62    ///
63    /// * `lang` – the programming language of the source.
64    /// * `source` – the raw source code as a UTF‑8 string.
65    ///
66    /// # Errors
67    ///
68    /// Returns [`IcbError::Parse`] if the specialised parser fails.
69    /// Unknown / unsupported languages are handled by the universal
70    /// heuristic parser and never produce an error.
71    pub fn parse_file(&self, lang: Language, source: &str) -> Result<Vec<RawNode>, IcbError> {
72        match lang {
73            Language::Python => crate::lang::python::parse_python(source),
74            Language::CppTreeSitter => crate::cpp_tree_sitter::parse_cpp_file(source),
75            Language::Go => crate::lang::go::parse_go(source),
76            Language::Ruby => crate::lang::ruby::parse_ruby(source),
77
78            Language::JavaScript | Language::Rust | Language::Unknown => {
79                Ok(crate::heuristic_parser::parse_universal(source, ""))
80            }
81
82            Language::Cpp => Ok(crate::heuristic_parser::parse_universal(source, "")),
83
84            _ => Ok(crate::heuristic_parser::parse_universal(source, "")),
85        }
86    }
87
88    /// Recursively discover and parse files under `root` for the given
89    /// language.
90    ///
91    /// The language determines both the file extensions and the parser
92    /// backend.  Files that cannot be read as UTF‑8 or that cause a parser
93    /// error are silently skipped.
94    ///
95    /// # Arguments
96    ///
97    /// * `lang` – the programming language to use for parsing.
98    /// * `root` – the root directory to walk.
99    ///
100    /// # Errors
101    ///
102    /// Returns [`IcbError::Parse`] if directory traversal fails (e.g.
103    /// permission denied).  Individual file failures are not propagated.
104    pub fn parse_directory(
105        &self,
106        lang: Language,
107        root: &Path,
108    ) -> Result<Vec<(String, Vec<RawNode>)>, IcbError> {
109        let files = discover_files(root, lang)?;
110        let base = normalize_root(root);
111        let mut results = Vec::with_capacity(files.len());
112
113        for path in files {
114            match process_file(self, lang, &path, &base) {
115                Ok(Some(entry)) => results.push(entry),
116                Ok(None) | Err(_) => continue,
117            }
118        }
119
120        Ok(results)
121    }
122}
123
124// ---- Internal helpers ---------------------------------------------------
125
126/// Walk a directory and collect every file whose extension matches the
127/// language filter.
128fn discover_files(root: &Path, lang: Language) -> Result<Vec<PathBuf>, IcbError> {
129    let extensions = extensions_for_language(lang);
130    let mut out = Vec::new();
131
132    for entry in WalkDir::new(root).follow_links(false) {
133        let entry = match entry {
134            Ok(e) => e,
135            Err(e) => return Err(IcbError::Parse(e.to_string())),
136        };
137        if !entry.file_type().is_file() {
138            continue;
139        }
140        let path = entry.path();
141        if should_include(path, &extensions) {
142            out.push(path.to_path_buf());
143        }
144    }
145
146    Ok(out)
147}
148
149/// Read a single file, parse it, and return `None` on any failure.
150fn process_file(
151    manager: &ParserManager,
152    lang: Language,
153    path: &Path,
154    base: &Path,
155) -> Result<Option<(String, Vec<RawNode>)>, IcbError> {
156    let source = match std::fs::read_to_string(path) {
157        Ok(s) => s,
158        Err(_) => return Ok(None),
159    };
160    let facts = match manager.parse_file(lang, &source) {
161        Ok(f) => f,
162        Err(_) => return Ok(None),
163    };
164    if facts.is_empty() {
165        return Ok(None);
166    }
167    let rel = relative_path(path, base);
168    Ok(Some((rel, facts)))
169}
170
/// Resolve `root` to its canonical form.
///
/// If canonicalisation fails (for example because the path does not exist),
/// an owned copy of `root` is returned unchanged.
fn normalize_root(root: &Path) -> PathBuf {
    match root.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
175
/// Render `path` relative to `base` as a (lossily converted) string.
///
/// When `path` does not start with `base`, the whole of `path` is rendered
/// instead.
fn relative_path(path: &Path, base: &Path) -> String {
    let shown = match path.strip_prefix(base) {
        Ok(tail) => tail,
        Err(_) => path,
    };
    shown.to_string_lossy().into_owned()
}
183
/// Decide whether `path` passes the extension filter `exts`.
///
/// An empty `exts` accepts **every** path (universal mode).  Otherwise the
/// path's extension is lower‑cased and must equal one of the entries; paths
/// without an extension, or whose extension is not valid UTF‑8, are rejected.
fn should_include(path: &Path, exts: &[&str]) -> bool {
    if exts.is_empty() {
        return true;
    }

    path.extension()
        .and_then(|raw| raw.to_str())
        .map(|raw| raw.to_lowercase())
        .map_or(false, |lowered| exts.iter().any(|allowed| lowered == *allowed))
}
199
200/// Return the list of file extensions accepted for a given language.
201fn extensions_for_language(lang: Language) -> Vec<&'static str> {
202    match lang {
203        Language::Python => vec!["py"],
204        Language::Cpp | Language::CppTreeSitter => vec![
205            "c", "cpp", "cc", "cxx", "h", "hpp", "hxx", "hh", "inl", "inc",
206        ],
207        Language::Rust => vec!["rs"],
208        Language::JavaScript => vec!["js", "jsx", "ts", "tsx"],
209        Language::Go => vec!["go"],
210        Language::Java => vec!["java"],
211        Language::Ruby => vec!["rb"],
212        Language::Php => vec!["php"],
213        Language::Swift => vec!["swift"],
214        Language::Kotlin => vec!["kt", "kts"],
215        Language::Scala => vec!["scala"],
216        Language::CSharp => vec!["cs"],
217        Language::Lua => vec!["lua"],
218        Language::R => vec!["r"],
219        Language::Bash => vec!["sh", "bash"],
220        Language::Perl => vec!["pl", "pm"],
221        Language::Tcl => vec!["tcl"],
222        Language::Dart => vec!["dart"],
223        Language::Unknown => vec![],
224    }
225}