Skip to main content

icb_parser/
manager.rs

1//! Universal high-performance parser manager.
2//!
3//! # Overview
4//!
5//! Provides a unified, fault‑tolerant entry point for parsing source code
6//! across multiple programming languages.
7//!
8//! # Design Goals
9//!
10//! * Zero‑panic parsing pipeline
11//! * Deterministic behaviour
12//! * Efficient large‑scale directory traversal
13//! * Minimal memory overhead
14//! * Extensible language backends
15//!
16//! # Features
17//!
18//! * Multi‑language dispatch
19//! * Extension‑based file filtering
20//! * Graceful failure handling (I/O errors, unsupported languages)
21//! * Relative path normalisation
22//! * Batched file processing
23//!
24//! # Execution Model
25//!
26//! 1. Discover files
27//! 2. Filter by extension
28//! 3. Read sources
29//! 4. Dispatch parser
30//! 5. Collect facts
31//!
32//! # Safety Guarantees
33//!
34//! * No `unwrap()`
35//! * No panic propagation
36//! * I/O errors isolated per file (skipped silently)
37//! * Parse errors isolated per file (skipped silently)
38//!
39//! # Output
40//!
41//! Returns a vector of `(relative_path, Vec<RawNode>)`.
42
43use crate::facts::RawNode;
44use icb_common::{IcbError, Language};
45use std::path::{Path, PathBuf};
46use walkdir::WalkDir;
47
/// Stateless parser manager.
///
/// The type has no fields, so constructing or cloning it is free. `Debug`
/// and `Clone` are derived so the manager can be embedded in other types
/// that derive those traits.
#[derive(Debug, Clone, Default)]
pub struct ParserManager;
51
52impl ParserManager {
53    /// Create a new parser manager.
54    pub fn new() -> Self {
55        Self
56    }
57
58    /// Parse a single source file using the most appropriate backend for the
59    /// given language.
60    ///
61    /// # Arguments
62    ///
63    /// * `lang` – the programming language of the source.
64    /// * `source` – the raw source code as a UTF‑8 string.
65    ///
66    /// # Errors
67    ///
68    /// Returns [`IcbError::Parse`] if the specialised parser fails.
69    /// Unknown / unsupported languages are handled by the universal
70    /// heuristic parser and never produce an error.
71    pub fn parse_file(&self, lang: Language, source: &str) -> Result<Vec<RawNode>, IcbError> {
72        match lang {
73            Language::Python => crate::lang::python::parse_python(source),
74            Language::CppTreeSitter => crate::cpp_tree_sitter::parse_cpp_file(source),
75            Language::Go => crate::lang::go::parse_go(source),
76            Language::Ruby => crate::lang::ruby::parse_ruby(source),
77            Language::Rust => crate::lang::rust::parse_rust(source),
78
79            Language::JavaScript | Language::Unknown => {
80                Ok(crate::heuristic_parser::parse_universal(source, ""))
81            }
82
83            Language::Cpp => Ok(crate::heuristic_parser::parse_universal(source, "")),
84
85            _ => Ok(crate::heuristic_parser::parse_universal(source, "")),
86        }
87    }
88
89    /// Recursively discover and parse files under `root` for the given
90    /// language.
91    ///
92    /// The language determines both the file extensions and the parser
93    /// backend.  Files that cannot be read as UTF‑8 or that cause a parser
94    /// error are silently skipped.
95    ///
96    /// # Arguments
97    ///
98    /// * `lang` – the programming language to use for parsing.
99    /// * `root` – the root directory to walk.
100    ///
101    /// # Errors
102    ///
103    /// Returns [`IcbError::Parse`] if directory traversal fails (e.g.
104    /// permission denied).  Individual file failures are not propagated.
105    pub fn parse_directory(
106        &self,
107        lang: Language,
108        root: &Path,
109    ) -> Result<Vec<(String, Vec<RawNode>)>, IcbError> {
110        let files = discover_files(root, lang)?;
111        let base = normalize_root(root);
112        let mut results = Vec::with_capacity(files.len());
113
114        for path in files {
115            match process_file(self, lang, &path, &base) {
116                Ok(Some(entry)) => results.push(entry),
117                Ok(None) | Err(_) => continue,
118            }
119        }
120
121        Ok(results)
122    }
123}
124
125// ---- Internal helpers ---------------------------------------------------
126
127/// Walk a directory and collect every file whose extension matches the
128/// language filter.
129fn discover_files(root: &Path, lang: Language) -> Result<Vec<PathBuf>, IcbError> {
130    let extensions = extensions_for_language(lang);
131    let mut out = Vec::new();
132
133    for entry in WalkDir::new(root).follow_links(false) {
134        let entry = match entry {
135            Ok(e) => e,
136            Err(e) => return Err(IcbError::Parse(e.to_string())),
137        };
138        if !entry.file_type().is_file() {
139            continue;
140        }
141        let path = entry.path();
142        if should_include(path, &extensions) {
143            out.push(path.to_path_buf());
144        }
145    }
146
147    Ok(out)
148}
149
150/// Read a single file, parse it, and return `None` on any failure.
151fn process_file(
152    manager: &ParserManager,
153    lang: Language,
154    path: &Path,
155    base: &Path,
156) -> Result<Option<(String, Vec<RawNode>)>, IcbError> {
157    let source = match std::fs::read_to_string(path) {
158        Ok(s) => s,
159        Err(_) => return Ok(None),
160    };
161    let facts = match manager.parse_file(lang, &source) {
162        Ok(f) => f,
163        Err(_) => return Ok(None),
164    };
165    if facts.is_empty() {
166        return Ok(None);
167    }
168    let rel = relative_path(path, base);
169    Ok(Some((rel, facts)))
170}
171
/// Resolve `root` to its canonical form.
///
/// If canonicalisation fails (for example because the path does not exist),
/// an owned copy of `root` is returned unchanged instead.
fn normalize_root(root: &Path) -> PathBuf {
    match std::fs::canonicalize(root) {
        Ok(resolved) => resolved,
        Err(_) => PathBuf::from(root),
    }
}
176
/// Express `path` relative to `base` as an owned string.
///
/// When `path` does not start with `base`, the full `path` is used instead.
/// Non-UTF-8 portions are converted lossily.
fn relative_path(path: &Path, base: &Path) -> String {
    let shown = match path.strip_prefix(base) {
        Ok(tail) => tail,
        Err(_) => path,
    };
    shown.to_string_lossy().into_owned()
}
184
/// Check whether `path` should be included based on the allowed extensions.
///
/// When `exts` is empty, **all** files are included (universal mode).
/// Otherwise the file's extension must match one of `exts`, compared
/// ASCII-case-insensitively.  Paths without an extension, or whose extension
/// is not valid UTF-8, are rejected.
///
/// The comparison is done in place, so no temporary string is allocated for
/// each file visited during traversal.
fn should_include(path: &Path, exts: &[&str]) -> bool {
    if exts.is_empty() {
        return true;
    }
    path.extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| {
            exts.iter().any(|allowed| allowed.eq_ignore_ascii_case(ext))
        })
}
200
201/// Return the list of file extensions accepted for a given language.
202fn extensions_for_language(lang: Language) -> Vec<&'static str> {
203    match lang {
204        Language::Python => vec!["py"],
205        Language::Cpp | Language::CppTreeSitter => vec![
206            "c", "cpp", "cc", "cxx", "h", "hpp", "hxx", "hh", "inl", "inc",
207        ],
208        Language::Rust => vec!["rs"],
209        Language::JavaScript => vec!["js", "jsx", "ts", "tsx"],
210        Language::Go => vec!["go"],
211        Language::Java => vec!["java"],
212        Language::Ruby => vec!["rb"],
213        Language::Php => vec!["php"],
214        Language::Swift => vec!["swift"],
215        Language::Kotlin => vec!["kt", "kts"],
216        Language::Scala => vec!["scala"],
217        Language::CSharp => vec!["cs"],
218        Language::Lua => vec!["lua"],
219        Language::R => vec!["r"],
220        Language::Bash => vec!["sh", "bash"],
221        Language::Perl => vec!["pl", "pm"],
222        Language::Tcl => vec!["tcl"],
223        Language::Dart => vec!["dart"],
224        Language::Unknown => vec![],
225    }
226}