Skip to main content

icb_server/
graph_builder.rs

1//! Graph construction and caching logic for the server.
2//!
3//! # Overview
4//!
5//! This module is responsible for turning raw parser facts into a fully
6//! resolved [`CodePropertyGraph`].  It supports three distinct workflows:
7//!
8//! * **build** – parse a project or a single file and construct the graph
9//!   from scratch.
10//! * **cache** – serialise the graph to a compressed binary format using
11//!   [`icb_graph::cache`] so that subsequent runs can skip parsing
12//!   entirely.
13//! * **load** – restore a previously cached graph, automatically cleaning
14//!   up node names.
15//!
16//! # Incremental fact caching
17//!
18//! When a cache directory is provided (or the default `.icb_cache` is
19//! used), the module stores extracted facts for every source file together
20//! with a SHA‑256 hash of the file content.  On subsequent runs, files that
21//! have not changed are loaded directly from the cache, skipping the parser
22//! entirely.  This can reduce the analysis time for large projects from
23//! seconds to milliseconds.
24//!
25//! The incremental cache is **transparently used by both the Clang and
26//! tree‑sitter backends** – you get fast reloads regardless of the chosen
27//! parser.
28//!
29//! # C/C++: Clang preferred, tree‑sitter fallback
30//!
31//! For C and C++ projects the module **prefers the Clang parser** when the
32//! `icb-clang` crate is available.  Clang provides exact semantic
33//! analysis and never mistakes documentation, HTML, or embedded JavaScript
34//! for C++ code.  If Clang cannot be loaded (e.g. LLVM not installed), the
35//! pipeline automatically falls back to tree‑sitter‑cpp.
36//!
37//! # Strict file extension filtering & multi‑language support
38//!
39//! For every known language, only a curated list of file extensions is
40//! accepted.  This eliminates noise from documentation, build artefacts,
41//! and web assets.  When `build_or_load_graph_multi` is called with a
42//! specific list of languages, only those extensions are scanned.
43//!
44//! # Auto‑detection of language
45//!
46//! When `language` is `"auto"`, the module scans the project directory
47//! and picks the dominant language based on file extensions.  Unknown
48//! extensions fall back to the universal heuristic parser.
49
50use anyhow::anyhow;
51use icb_common::Language;
52use icb_graph::{cache, graph::CodePropertyGraph};
53use icb_parser::facts::RawNode;
54
55use std::collections::{HashMap, HashSet};
56use std::fs;
57use std::path::{Path, PathBuf};
58use std::sync::Arc;
59
60use walkdir::WalkDir;
61
62use crate::display_name;
63use crate::incremental_cache::IncrementalCache;
64
65#[derive(Debug, Clone)]
66struct PipelineConfig {
67    pub languages: HashSet<Language>,
68    pub strict_extensions: bool,
69    pub strip_comments: bool,
70    pub no_system_headers: bool,
71    pub inc_cache_dir: Option<PathBuf>,
72}
73
74impl Default for PipelineConfig {
75    fn default() -> Self {
76        Self {
77            languages: HashSet::new(),
78            strict_extensions: true,
79            strip_comments: true,
80            no_system_headers: true,
81            inc_cache_dir: None,
82        }
83    }
84}
85
86/// Entry point: single-language
87pub fn build_or_load_graph(
88    project: &Path,
89    language: &str,
90    graph_cache_path: Option<&PathBuf>,
91    inc_cache_dir: Option<&PathBuf>,
92    no_system_headers: bool,
93) -> anyhow::Result<CodePropertyGraph> {
94    let lang = resolve_language(project, language)?;
95    let strict = lang != Language::Unknown;
96
97    let cfg = PipelineConfig {
98        languages: {
99            let mut set = HashSet::new();
100            set.insert(lang);
101            set
102        },
103        no_system_headers,
104        strict_extensions: strict,
105        inc_cache_dir: inc_cache_dir.cloned(),
106        ..Default::default()
107    };
108
109    run_pipeline(project, cfg, graph_cache_path)
110}
111
112/// Entry point: multi-language
113pub fn build_or_load_graph_multi(
114    project: &Path,
115    languages: &[String],
116    graph_cache_path: Option<&PathBuf>,
117    inc_cache_dir: Option<&PathBuf>,
118    no_system_headers: bool,
119) -> anyhow::Result<CodePropertyGraph> {
120    if languages.is_empty() || languages.iter().any(|l| l == "auto") {
121        return build_or_load_graph(
122            project,
123            "auto",
124            graph_cache_path,
125            inc_cache_dir,
126            no_system_headers,
127        );
128    }
129
130    let cfg = PipelineConfig {
131        languages: {
132            let mut set = HashSet::new();
133            for l in languages {
134                if let Some(lang) = parse_language(l) {
135                    set.insert(lang);
136                }
137            }
138            set
139        },
140        no_system_headers,
141        strict_extensions: !languages
142            .iter()
143            .any(|l| parse_language(l) == Some(Language::Unknown)),
144        inc_cache_dir: inc_cache_dir.cloned(),
145        ..Default::default()
146    };
147
148    run_pipeline(project, cfg, graph_cache_path)
149}
150
151/// Core pipeline executor
152fn run_pipeline(
153    project: &Path,
154    cfg: PipelineConfig,
155    graph_cache_path: Option<&PathBuf>,
156) -> anyhow::Result<CodePropertyGraph> {
157    if let Some(cache_file) = graph_cache_path {
158        if cache_file.exists() {
159            if let Ok(mut g) = cache::load_graph(cache_file) {
160                display_name::cleanup_node_names(&mut g);
161                return Ok(g);
162            }
163        }
164    }
165
166    let inc_cache = cfg
167        .inc_cache_dir
168        .as_ref()
169        .map(|dir| {
170            if dir.extension().is_some() {
171                let mut d = dir.clone();
172                d.set_extension("");
173                IncrementalCache::new(&d)
174            } else {
175                IncrementalCache::new(dir)
176            }
177        })
178        .transpose()?
179        .or_else(|| IncrementalCache::new(&project.join(".icb_cache")).ok());
180
181    if cfg.languages.contains(&Language::CppTreeSitter) {
182        if let Some(cpg) = try_clang_pipeline(project, &cfg, graph_cache_path, inc_cache.as_ref()) {
183            return Ok(cpg);
184        }
185    }
186
187    let manager = Arc::new(icb_parser::manager::ParserManager::new());
188    let mut facts: Vec<(String, Vec<RawNode>)> = Vec::new();
189
190    for entry in WalkDir::new(project)
191        .into_iter()
192        .filter_map(|e| e.ok())
193        .filter(|e| e.file_type().is_file())
194    {
195        let path = entry.path();
196        let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
197
198        if cfg.strict_extensions {
199            let lang = detect_language_from_extension(ext);
200            if !cfg.languages.contains(&lang) {
201                continue;
202            }
203            let allowed = extensions_for_language(lang);
204            if !allowed.contains(&ext) {
205                continue;
206            }
207        }
208
209        let rel = path
210            .strip_prefix(project)
211            .unwrap_or(path)
212            .display()
213            .to_string();
214
215        let lang = if cfg.languages.len() == 1 {
216            *cfg.languages.iter().next().unwrap()
217        } else {
218            detect_language_from_extension(ext)
219        };
220
221        if let Some(ref cache) = inc_cache {
222            let manager = Arc::clone(&manager);
223            let file_facts = cache.process_file(
224                path,
225                &rel,
226                Box::new(move |source: &str| -> anyhow::Result<Vec<RawNode>> {
227                    manager.parse_file(lang, source).map_err(|e| anyhow!(e))
228                }),
229            )?;
230            facts.push((file_facts.relative_path, file_facts.facts));
231        } else {
232            let raw_source = fs::read_to_string(path).unwrap_or_default();
233            let source = if cfg.strip_comments {
234                strip_comments(&raw_source)
235            } else {
236                raw_source
237            };
238
239            let file_facts =
240                match icb_parser::manager::ParserManager::new().parse_file(lang, &source) {
241                    Ok(f) => f,
242                    Err(_) => continue,
243                };
244
245            facts.push((rel, file_facts));
246        }
247    }
248
249    let mut builder = icb_graph::builder::GraphBuilder::new();
250    for (_, file_facts) in facts {
251        let mut local = icb_graph::builder::GraphBuilder::new();
252        local.ingest_file_facts(&file_facts);
253        builder.merge(local);
254    }
255
256    display_name::cleanup_node_names(&mut builder.cpg);
257    builder.resolve_calls();
258
259    let mut cpg = builder.cpg;
260    display_name::cleanup_node_names(&mut cpg);
261
262    if let Some(cache_file) = graph_cache_path {
263        let _ = cache::save_graph(&cpg, cache_file);
264    }
265
266    Ok(cpg)
267}
268
269fn try_clang_pipeline(
270    project: &Path,
271    cfg: &PipelineConfig,
272    graph_cache_path: Option<&PathBuf>,
273    inc_cache: Option<&IncrementalCache>,
274) -> Option<CodePropertyGraph> {
275    #[cfg(feature = "clang")]
276    {
277        log::info!("Attempting Clang graph construction with incremental cache...");
278        let allow_system = !cfg.no_system_headers;
279
280        let mut facts: Vec<(String, Vec<RawNode>)> = Vec::new();
281
282        for entry in WalkDir::new(project)
283            .into_iter()
284            .filter_map(|e| e.ok())
285            .filter(|e| e.file_type().is_file())
286        {
287            let path = entry.path();
288            let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
289
290            let allowed = extensions_for_language(Language::CppTreeSitter);
291            if !allowed.contains(&ext) {
292                continue;
293            }
294
295            let rel = path
296                .strip_prefix(project)
297                .unwrap_or(path)
298                .display()
299                .to_string();
300
301            if let Some(cache) = inc_cache {
302                let file_facts = cache
303                    .process_file(
304                        path,
305                        &rel,
306                        Box::new(move |source: &str| -> anyhow::Result<Vec<RawNode>> {
307                            icb_clang::parser::parse_cpp_file(
308                                source,
309                                &["-std=c++17".to_string()],
310                                None,
311                                allow_system,
312                            )
313                            .map_err(|e| anyhow!(e))
314                        }),
315                    )
316                    .ok()?;
317                facts.push((file_facts.relative_path, file_facts.facts));
318            } else {
319                let source = std::fs::read_to_string(path).ok()?;
320                let file_facts = icb_clang::parser::parse_cpp_file(
321                    &source,
322                    &["-std=c++17".to_string()],
323                    None,
324                    allow_system,
325                )
326                .ok()?;
327                facts.push((rel, file_facts));
328            }
329        }
330
331        log::info!("Clang processed {} files", facts.len());
332
333        let mut builder = icb_graph::builder::GraphBuilder::new();
334        for (_, file_facts) in facts {
335            let mut local = icb_graph::builder::GraphBuilder::new();
336            local.ingest_file_facts(&file_facts);
337            builder.merge(local);
338        }
339
340        display_name::cleanup_node_names(&mut builder.cpg);
341        builder.resolve_calls();
342        let mut cpg = builder.cpg;
343        display_name::cleanup_node_names(&mut cpg);
344
345        if let Some(cache_file) = graph_cache_path {
346            let _ = cache::save_graph(&cpg, cache_file);
347        }
348        log::info!("Clang graph built successfully");
349        Some(cpg)
350    }
351    #[cfg(not(feature = "clang"))]
352    {
353        log::debug!("Clang feature not compiled in");
354        None
355    }
356}
357
358fn resolve_language(project: &Path, input: &str) -> anyhow::Result<Language> {
359    if input == "auto" {
360        Ok(detect_language_from_project(project))
361    } else {
362        parse_language(input).ok_or_else(|| anyhow!("unknown language"))
363    }
364}
365
366fn parse_language(s: &str) -> Option<Language> {
367    match s {
368        "cpp" | "c++" => Some(Language::CppTreeSitter),
369        "python" => Some(Language::Python),
370        "go" => Some(Language::Go),
371        "ruby" => Some(Language::Ruby),
372        "rust" => Some(Language::Rust),
373        "javascript" => Some(Language::JavaScript),
374        _ => None,
375    }
376}
377
378fn detect_language_from_extension(ext: &str) -> Language {
379    match ext {
380        "cpp" | "cc" | "cxx" | "h" | "hpp" => Language::CppTreeSitter,
381        "py" => Language::Python,
382        "rs" => Language::Rust,
383        "go" => Language::Go,
384        "rb" => Language::Ruby,
385        "js" | "ts" | "tsx" | "jsx" => Language::JavaScript,
386        _ => Language::Unknown,
387    }
388}
389
390fn detect_language_from_project(path: &Path) -> Language {
391    let mut counts: HashMap<Language, usize> = HashMap::new();
392
393    for entry in WalkDir::new(path).into_iter().filter_map(|e| e.ok()) {
394        if let Some(ext) = entry.path().extension().and_then(|s| s.to_str()) {
395            let lang = detect_language_from_extension(ext);
396            *counts.entry(lang).or_insert(0) += 1;
397        }
398    }
399
400    counts
401        .into_iter()
402        .max_by_key(|(_, c)| *c)
403        .map(|(l, _)| l)
404        .unwrap_or(Language::Unknown)
405}
406
407fn extensions_for_language(lang: Language) -> &'static [&'static str] {
408    match lang {
409        Language::CppTreeSitter => &["cpp", "cc", "cxx", "h", "hpp"],
410        Language::Python => &["py"],
411        Language::Rust => &["rs"],
412        Language::Go => &["go"],
413        Language::Ruby => &["rb"],
414        Language::JavaScript => &["js", "ts", "tsx", "jsx"],
415        _ => &[],
416    }
417}
418
/// Removes C-style comments from `s`.
///
/// * A `//` comment is replaced by a single space up to, but not including,
///   the end of the line.
/// * A `/* ... */` comment is replaced by a single space followed by the
///   newlines it contained, so line numbers of the remaining code do not
///   shift.  An unterminated block comment runs to the end of the input.
/// * Double-quoted literals are copied verbatim, so text such as
///   `"http://example"` is left intact.  Single-quoted literals are copied
///   verbatim when they close on the same line; a lone `'` (apostrophe,
///   lifetime marker) is copied as an ordinary character.
///
/// Removing only the comment *markers* would leave the comment text behind
/// as if it were code, which is why whole comments are dropped.
///
/// NOTE(review): the scan assumes C-family comment syntax.  In languages
/// where `//` is an operator (e.g. Python floor division) the rest of the
/// line is dropped - confirm which languages reach this function.
fn strip_comments(s: &str) -> String {
    /// Byte length of the quoted literal at the start of `text`, which must
    /// begin with `quote`.  Backslash escapes are honoured.  An unterminated
    /// `"` literal extends to the end of the input; a `'` that does not close
    /// on the same line counts as a single ordinary character.
    fn quoted_len(text: &str, quote: char) -> usize {
        let mut escaped = false;
        for (idx, ch) in text.char_indices().skip(1) {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == quote {
                return idx + ch.len_utf8();
            } else if ch == '\n' && quote == '\'' {
                return quote.len_utf8();
            }
        }
        if quote == '\'' {
            quote.len_utf8()
        } else {
            text.len()
        }
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(first) = rest.chars().next() {
        if let Some(after) = rest.strip_prefix("//") {
            let end = after.find('\n').unwrap_or(after.len());
            out.push(' ');
            rest = &after[end..];
        } else if let Some(after) = rest.strip_prefix("/*") {
            let (body, tail) = match after.find("*/") {
                Some(close) => (&after[..close], &after[close + 2..]),
                None => (after, ""),
            };
            out.push(' ');
            out.extend(body.chars().filter(|&ch| ch == '\n'));
            rest = tail;
        } else if first == '"' || first == '\'' {
            let (literal, tail) = rest.split_at(quoted_len(rest, first));
            out.push_str(literal);
            rest = tail;
        } else {
            out.push(first);
            rest = &rest[first.len_utf8()..];
        }
    }

    out
}
422
423#[allow(dead_code)]
424fn is_valid_identifier(name: &str, lang: Language) -> bool {
425    if matches!(lang, Language::CppTreeSitter | Language::Cpp) && name.contains("::") {
426        return true;
427    }
428    if name.len() == 1 && name.chars().all(|c| c.is_ascii_alphabetic()) {
429        return true;
430    }
431    if name.len() < 2 {
432        return false;
433    }
434    let first = name.chars().next().unwrap();
435    if !first.is_ascii_alphabetic() && first != '_' && first != '~' {
436        return false;
437    }
438    let allowed = |c: char| {
439        c.is_ascii_alphanumeric()
440            || c == '_'
441            || (matches!(lang, Language::CppTreeSitter | Language::Cpp) && (c == ':' || c == '~'))
442    };
443    if !name.chars().all(allowed) {
444        return false;
445    }
446    if name.chars().all(|c| c.is_ascii_digit()) {
447        return false;
448    }
449    if name.starts_with("class")
450        && name.len() > 5
451        && name[5..].chars().next().unwrap().is_uppercase()
452    {
453        return false;
454    }
455    if name.contains("_1_1") || name.contains("_8cpp") || name.contains("_8h") {
456        return false;
457    }
458    if name.len() > 40 && name.contains('_') {
459        return false;
460    }
461    if name.starts_with("dir_") && name.len() > 30 {
462        return false;
463    }
464    true
465}
466
/// Reports whether `name` is on a fixed deny-list of JavaScript identifiers
/// that should not be treated as project symbols.
///
/// Matching is exact and case-sensitive.
#[allow(dead_code)]
fn is_javascript_noise(name: &str) -> bool {
    const NOISE: &[&str] = &[
        // Language built-ins and global objects.
        "isNaN",
        "eval",
        "parseInt",
        "parseFloat",
        "undefined",
        "NaN",
        "Infinity",
        "Object",
        "Array",
        "String",
        "Number",
        "Boolean",
        "Function",
        "RegExp",
        "Math",
        "Date",
        "JSON",
        "Promise",
        "Symbol",
        "Map",
        "Set",
        "WeakMap",
        "WeakSet",
        "Proxy",
        "Reflect",
        // Browser globals and DOM helpers.
        "console",
        "window",
        "document",
        "navigator",
        "location",
        "history",
        "localStorage",
        "sessionStorage",
        "alert",
        "confirm",
        "prompt",
        "fetch",
        "XMLHttpRequest",
        "getElementById",
        "getElementsByClassName",
        "getElementsByTagName",
        "querySelector",
        "querySelectorAll",
        "addEventListener",
        "removeEventListener",
        "appendChild",
        "removeChild",
        // NOTE(review): the remaining names look like identifiers from
        // generated documentation scripts (search, navigation tree,
        // clipboard, SVG helpers) - confirm their origin.
        "srChild",
        "srResult",
        "srEntry",
        "srScope",
        "srLink",
        "srChildren",
        "clipboard_div",
        "clipboard_icon",
        "clipboard_successIcon",
        "clipboard_successDuration",
        "clipboard_title",
        "pagenav",
        "navtree",
        "menudata",
        "resizeHeight",
        "resizeWidth",
        "domSearchBox",
        "domPopupSearchResults",
        "domPopupSearchResultsWindow",
        "domSearchClose",
        "searchData",
        "searchResults",
        "resultsPath",
        "topOffset",
        "footerHeight",
        "headerHeight",
        "sidenavWidth",
        "pagenavWidth",
        "navSync",
        "navtreeHeight",
        "PAGENAV_COOKIE_NAME",
        "RESIZE_COOKIE_NAME",
        "SEARCH_COOKIE_NAME",
        "NAVPATH_COOKIE_NAME",
        "NAVTREE",
        "NAVTREEINDEX",
        "NAVTREEINDEX0",
        "NAVTREEINDEX1",
        "NAVTREEINDEX2",
        "NAVTREEINDEX3",
        "NAVTREEINDEX4",
        "NAVTREEINDEX5",
        "NAVTREEINDEX6",
        "NAVTREEINDEX7",
        "navTreeSubIndices",
        "entityMap",
        "htmlToNode",
        "codefold",
        "dynsection",
        "showHideNavBar",
        "showSyncOff",
        "showSyncOn",
        "SYNCOFFMSG",
        "SYNCONMSG",
        "toggleVisibility",
        "toggleClass",
        "focusItem",
        "focusName",
        "expandNode",
        "gotoNode",
        "gotoAnchor",
        "showNode",
        "showRoot",
        "selectAndHighlight",
        "highlightAnchor",
        "highlightAdjacentNodes",
        "highlightEdges",
        "loadJS",
        "createIndent",
        "makeTree",
        "makeAbsolut",
        "makeMorphable",
        "makeInstance",
        "makeSetterGetter",
        "getClass",
        "getClassForType",
        "getMethodNames",
        "getMethodsFor",
        "getEvents",
        "getEventTarget",
        "getEventPoint",
        "createResults",
        "SearchResults",
        "handleResults",
    ];

    NOISE.iter().any(|&entry| entry == name)
}
604
/// Reports whether `name` is one of a fixed set of C/C++ built-in type
/// names.
///
/// Matching is exact and case-sensitive.
#[allow(dead_code)]
fn is_type_keyword(name: &str) -> bool {
    const TYPE_KEYWORDS: [&str; 12] = [
        "void", "int", "long", "short", "char", "float", "double", "signed", "unsigned", "bool",
        "wchar_t", "size_t",
    ];

    TYPE_KEYWORDS.contains(&name)
}