Skip to main content

icb_server/
graph_builder.rs

1//! Graph construction and caching logic for the server.
2//!
3//! # Overview
4//!
5//! This module is responsible for turning raw parser facts into a fully
6//! resolved [`CodePropertyGraph`].  It supports three distinct workflows:
7//!
8//! * **build** – parse a project or a single file and construct the graph
9//!   from scratch.
10//! * **cache** – serialise the graph to a compressed binary format using
11//!   [`icb_graph::cache`] so that subsequent runs can skip parsing
12//!   entirely.
13//! * **load** – restore a previously cached graph, automatically cleaning
14//!   up node names.
15//!
16//! # Incremental fact caching
17//!
18//! When a cache directory is provided (or the default `.icb_cache` is
19//! used), the module stores extracted facts for every source file together
20//! with a SHA‑256 hash of the file content.  On subsequent runs, files that
21//! have not changed are loaded directly from the cache, skipping the parser
22//! entirely.  This can reduce the analysis time for large projects from
23//! seconds to milliseconds.
24//!
25//! The incremental cache is **transparently used by both the Clang and
26//! tree‑sitter backends** – you get fast reloads regardless of the chosen
27//! parser.
28//!
29//! # C/C++: Clang preferred, tree‑sitter fallback
30//!
31//! For C and C++ projects the module **prefers the Clang parser** when the
32//! `icb-clang` crate is available.  Clang provides exact semantic
33//! analysis and never mistakes documentation, HTML, or embedded JavaScript
34//! for C++ code.  If Clang cannot be loaded (e.g. LLVM not installed), the
35//! pipeline automatically falls back to tree‑sitter‑cpp.
36//!
37//! # Rust: rustc preferred, tree‑sitter fallback
38//!
39//! For Rust projects, when the `rustc` feature is enabled and a nightly
40//! compiler is used, the pipeline uses the native [`icb_rustc`] backend
41//! which provides precise semantic information (traits, generics, macros).
42//! On stable or without the feature, it falls back to tree‑sitter‑rust.
43//!
44//! # Strict file extension filtering & multi‑language support
45//!
46//! For every known language, only a curated list of file extensions is
47//! accepted.  This eliminates noise from documentation, build artefacts,
48//! and web assets.  When `build_or_load_graph_multi` is called with a
49//! specific list of languages, only those extensions are scanned.
50//!
51//! # Auto‑detection of language
52//!
53//! When `language` is `"auto"`, the module scans the project directory
54//! and picks the dominant language based on file extensions.  Unknown
55//! extensions fall back to the universal heuristic parser.
56
57use anyhow::anyhow;
58use icb_common::Language;
59use icb_graph::{cache, graph::CodePropertyGraph};
60use icb_parser::facts::RawNode;
61
62#[cfg(feature = "rustc")]
63use icb_rustc;
64
65use std::collections::{HashMap, HashSet};
66use std::fs;
67use std::path::{Path, PathBuf};
68use std::sync::Arc;
69
70use walkdir::WalkDir;
71
72use crate::display_name;
73use crate::incremental_cache::IncrementalCache;
74
75#[derive(Debug, Clone)]
76struct PipelineConfig {
77    pub languages: HashSet<Language>,
78    pub strict_extensions: bool,
79    pub strip_comments: bool,
80    pub no_system_headers: bool,
81    pub inc_cache_dir: Option<PathBuf>,
82}
83
84impl Default for PipelineConfig {
85    fn default() -> Self {
86        Self {
87            languages: HashSet::new(),
88            strict_extensions: true,
89            strip_comments: true,
90            no_system_headers: true,
91            inc_cache_dir: None,
92        }
93    }
94}
95
96/// Entry point: single-language
97pub fn build_or_load_graph(
98    project: &Path,
99    language: &str,
100    graph_cache_path: Option<&PathBuf>,
101    inc_cache_dir: Option<&PathBuf>,
102    no_system_headers: bool,
103) -> anyhow::Result<CodePropertyGraph> {
104    let lang = resolve_language(project, language)?;
105    let strict = lang != Language::Unknown;
106
107    let cfg = PipelineConfig {
108        languages: {
109            let mut set = HashSet::new();
110            set.insert(lang);
111            set
112        },
113        no_system_headers,
114        strict_extensions: strict,
115        inc_cache_dir: inc_cache_dir.cloned(),
116        ..Default::default()
117    };
118
119    run_pipeline(project, cfg, graph_cache_path)
120}
121
122/// Entry point: multi-language
123pub fn build_or_load_graph_multi(
124    project: &Path,
125    languages: &[String],
126    graph_cache_path: Option<&PathBuf>,
127    inc_cache_dir: Option<&PathBuf>,
128    no_system_headers: bool,
129) -> anyhow::Result<CodePropertyGraph> {
130    if languages.is_empty() || languages.iter().any(|l| l == "auto") {
131        return build_or_load_graph(
132            project,
133            "auto",
134            graph_cache_path,
135            inc_cache_dir,
136            no_system_headers,
137        );
138    }
139
140    let cfg = PipelineConfig {
141        languages: {
142            let mut set = HashSet::new();
143            for l in languages {
144                if let Some(lang) = parse_language(l) {
145                    set.insert(lang);
146                }
147            }
148            set
149        },
150        no_system_headers,
151        strict_extensions: !languages
152            .iter()
153            .any(|l| parse_language(l) == Some(Language::Unknown)),
154        inc_cache_dir: inc_cache_dir.cloned(),
155        ..Default::default()
156    };
157
158    run_pipeline(project, cfg, graph_cache_path)
159}
160
161/// Core pipeline executor
162fn run_pipeline(
163    project: &Path,
164    cfg: PipelineConfig,
165    graph_cache_path: Option<&PathBuf>,
166) -> anyhow::Result<CodePropertyGraph> {
167    if let Some(cache_file) = graph_cache_path {
168        if cache_file.exists() {
169            if let Ok(mut g) = cache::load_graph(cache_file) {
170                display_name::cleanup_node_names(&mut g);
171                return Ok(g);
172            }
173        }
174    }
175
176    let inc_cache = cfg
177        .inc_cache_dir
178        .as_ref()
179        .map(|dir| {
180            if dir.extension().is_some() {
181                let mut d = dir.clone();
182                d.set_extension("");
183                IncrementalCache::new(&d)
184            } else {
185                IncrementalCache::new(dir)
186            }
187        })
188        .transpose()?
189        .or_else(|| IncrementalCache::new(&project.join(".icb_cache")).ok());
190
191    if cfg.languages.contains(&Language::Rust) {
192        if let Some(cpg) = try_rustc_pipeline(project, &cfg, graph_cache_path) {
193            return Ok(cpg);
194        }
195    }
196
197    if cfg.languages.contains(&Language::CppTreeSitter) {
198        if let Some(cpg) = try_clang_pipeline(project, &cfg, graph_cache_path, inc_cache.as_ref()) {
199            return Ok(cpg);
200        }
201    }
202
203    let manager = Arc::new(icb_parser::manager::ParserManager::new());
204    let mut facts: Vec<(String, Vec<RawNode>)> = Vec::new();
205
206    for entry in WalkDir::new(project)
207        .into_iter()
208        .filter_map(|e| e.ok())
209        .filter(|e| e.file_type().is_file())
210    {
211        let path = entry.path();
212        let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
213
214        if cfg.strict_extensions {
215            let lang = detect_language_from_extension(ext);
216            if !cfg.languages.contains(&lang) {
217                continue;
218            }
219            let allowed = extensions_for_language(lang);
220            if !allowed.contains(&ext) {
221                continue;
222            }
223        }
224
225        let rel = path
226            .strip_prefix(project)
227            .unwrap_or(path)
228            .display()
229            .to_string();
230
231        let lang = if cfg.languages.len() == 1 {
232            *cfg.languages.iter().next().unwrap()
233        } else {
234            detect_language_from_extension(ext)
235        };
236
237        if let Some(ref cache) = inc_cache {
238            let manager = Arc::clone(&manager);
239            let file_facts = cache.process_file(
240                path,
241                &rel,
242                Box::new(move |source: &str| -> anyhow::Result<Vec<RawNode>> {
243                    manager.parse_file(lang, source).map_err(|e| anyhow!(e))
244                }),
245            )?;
246            facts.push((file_facts.relative_path, file_facts.facts));
247        } else {
248            let raw_source = fs::read_to_string(path).unwrap_or_default();
249            let source = if cfg.strip_comments {
250                strip_comments(&raw_source)
251            } else {
252                raw_source
253            };
254
255            let file_facts =
256                match icb_parser::manager::ParserManager::new().parse_file(lang, &source) {
257                    Ok(f) => f,
258                    Err(_) => continue,
259                };
260
261            facts.push((rel, file_facts));
262        }
263    }
264
265    let mut builder = icb_graph::builder::GraphBuilder::new();
266    for (_, file_facts) in facts {
267        let mut local = icb_graph::builder::GraphBuilder::new();
268        local.ingest_file_facts(&file_facts);
269        builder.merge(local);
270    }
271
272    display_name::cleanup_node_names(&mut builder.cpg);
273    builder.resolve_calls();
274
275    let mut cpg = builder.cpg;
276    display_name::cleanup_node_names(&mut cpg);
277
278    if let Some(cache_file) = graph_cache_path {
279        let _ = cache::save_graph(&cpg, cache_file);
280    }
281
282    Ok(cpg)
283}
284
285/// Attempt to use the `icb-rustc` native backend for Rust projects.
286///
287/// Returns `Some(CodePropertyGraph)` if successful, otherwise `None`
288/// and the pipeline will fall back to tree‑sitter.
289fn try_rustc_pipeline(
290    _project: &Path,
291    _cfg: &PipelineConfig,
292    _graph_cache_path: Option<&PathBuf>,
293) -> Option<CodePropertyGraph> {
294    #[cfg(feature = "rustc")]
295    {
296        log::info!("Attempting rustc graph construction...");
297
298        let cargo_toml = _project.join("Cargo.toml");
299        if !cargo_toml.exists() {
300            log::warn!(
301                "Cargo.toml not found in {:?}, falling back to tree-sitter",
302                _project
303            );
304            return None;
305        }
306
307        let main_rs = _project.join("src/main.rs");
308        let lib_rs = _project.join("src/lib.rs");
309        let entry = if main_rs.exists() {
310            main_rs
311        } else if lib_rs.exists() {
312            lib_rs
313        } else {
314            log::warn!("No main.rs or lib.rs found, falling back");
315            return None;
316        };
317
318        let args: Vec<String> = vec!["--edition".to_string(), "2021".to_string()];
319        let facts = match icb_rustc::parse_rust_crate(&entry, &args) {
320            Ok(f) => f,
321            Err(e) => {
322                log::warn!("rustc analysis failed: {}, falling back to tree-sitter", e);
323                return None;
324            }
325        };
326
327        log::info!("rustc produced {} facts", facts.len());
328
329        let mut builder = icb_graph::builder::GraphBuilder::new();
330        let mut local = icb_graph::builder::GraphBuilder::new();
331        local.ingest_file_facts(&facts);
332        builder.merge(local);
333
334        display_name::cleanup_node_names(&mut builder.cpg);
335        builder.resolve_calls();
336        let mut cpg = builder.cpg;
337        display_name::cleanup_node_names(&mut cpg);
338
339        if let Some(cache_file) = _graph_cache_path {
340            let _ = cache::save_graph(&cpg, cache_file);
341        }
342        log::info!("rustc graph built successfully");
343        Some(cpg)
344    }
345    #[cfg(not(feature = "rustc"))]
346    {
347        log::debug!("rustc feature not compiled in");
348        None
349    }
350}
351
352fn try_clang_pipeline(
353    project: &Path,
354    cfg: &PipelineConfig,
355    graph_cache_path: Option<&PathBuf>,
356    inc_cache: Option<&IncrementalCache>,
357) -> Option<CodePropertyGraph> {
358    #[cfg(feature = "clang")]
359    {
360        log::info!("Attempting Clang graph construction with incremental cache...");
361        let allow_system = !cfg.no_system_headers;
362
363        let mut facts: Vec<(String, Vec<RawNode>)> = Vec::new();
364
365        for entry in WalkDir::new(project)
366            .into_iter()
367            .filter_map(|e| e.ok())
368            .filter(|e| e.file_type().is_file())
369        {
370            let path = entry.path();
371            let ext = path.extension().and_then(|s| s.to_str()).unwrap_or("");
372
373            let allowed = extensions_for_language(Language::CppTreeSitter);
374            if !allowed.contains(&ext) {
375                continue;
376            }
377
378            let rel = path
379                .strip_prefix(project)
380                .unwrap_or(path)
381                .display()
382                .to_string();
383
384            if let Some(cache) = inc_cache {
385                let file_facts = cache
386                    .process_file(
387                        path,
388                        &rel,
389                        Box::new(move |source: &str| -> anyhow::Result<Vec<RawNode>> {
390                            icb_clang::parser::parse_cpp_file(
391                                source,
392                                &["-std=c++17".to_string()],
393                                None,
394                                allow_system,
395                            )
396                            .map_err(|e| anyhow!(e))
397                        }),
398                    )
399                    .ok()?;
400                facts.push((file_facts.relative_path, file_facts.facts));
401            } else {
402                let source = std::fs::read_to_string(path).ok()?;
403                let file_facts = icb_clang::parser::parse_cpp_file(
404                    &source,
405                    &["-std=c++17".to_string()],
406                    None,
407                    allow_system,
408                )
409                .ok()?;
410                facts.push((rel, file_facts));
411            }
412        }
413
414        log::info!("Clang processed {} files", facts.len());
415
416        let mut builder = icb_graph::builder::GraphBuilder::new();
417        for (_, file_facts) in facts {
418            let mut local = icb_graph::builder::GraphBuilder::new();
419            local.ingest_file_facts(&file_facts);
420            builder.merge(local);
421        }
422
423        display_name::cleanup_node_names(&mut builder.cpg);
424        builder.resolve_calls();
425        let mut cpg = builder.cpg;
426        display_name::cleanup_node_names(&mut cpg);
427
428        if let Some(cache_file) = graph_cache_path {
429            let _ = cache::save_graph(&cpg, cache_file);
430        }
431        log::info!("Clang graph built successfully");
432        Some(cpg)
433    }
434    #[cfg(not(feature = "clang"))]
435    {
436        log::debug!("Clang feature not compiled in");
437        None
438    }
439}
440
441fn resolve_language(project: &Path, input: &str) -> anyhow::Result<Language> {
442    if input == "auto" {
443        Ok(detect_language_from_project(project))
444    } else {
445        parse_language(input).ok_or_else(|| anyhow!("unknown language"))
446    }
447}
448
449fn parse_language(s: &str) -> Option<Language> {
450    match s {
451        "cpp" | "c++" => Some(Language::CppTreeSitter),
452        "python" => Some(Language::Python),
453        "go" => Some(Language::Go),
454        "ruby" => Some(Language::Ruby),
455        "rust" => Some(Language::Rust),
456        "javascript" => Some(Language::JavaScript),
457        _ => None,
458    }
459}
460
461fn detect_language_from_extension(ext: &str) -> Language {
462    match ext {
463        "cpp" | "cc" | "cxx" | "h" | "hpp" => Language::CppTreeSitter,
464        "py" => Language::Python,
465        "rs" => Language::Rust,
466        "go" => Language::Go,
467        "rb" => Language::Ruby,
468        "js" | "ts" | "tsx" | "jsx" => Language::JavaScript,
469        _ => Language::Unknown,
470    }
471}
472
473fn detect_language_from_project(path: &Path) -> Language {
474    let mut counts: HashMap<Language, usize> = HashMap::new();
475
476    for entry in WalkDir::new(path).into_iter().filter_map(|e| e.ok()) {
477        if let Some(ext) = entry.path().extension().and_then(|s| s.to_str()) {
478            let lang = detect_language_from_extension(ext);
479            *counts.entry(lang).or_insert(0) += 1;
480        }
481    }
482
483    counts
484        .into_iter()
485        .max_by_key(|(_, c)| *c)
486        .map(|(l, _)| l)
487        .unwrap_or(Language::Unknown)
488}
489
490fn extensions_for_language(lang: Language) -> &'static [&'static str] {
491    match lang {
492        Language::CppTreeSitter => &["cpp", "cc", "cxx", "h", "hpp"],
493        Language::Python => &["py"],
494        Language::Rust => &["rs"],
495        Language::Go => &["go"],
496        Language::Ruby => &["rb"],
497        Language::JavaScript => &["js", "ts", "tsx", "jsx"],
498        _ => &[],
499    }
500}
501
/// Removes C-style comments (`// …` and `/* … */`) from `s`.
///
/// * A line comment is dropped up to, but not including, its newline.
/// * A block comment is replaced by a single space followed by the newlines
///   it contained, so tokens stay separated and line numbers are preserved.
///   An unterminated block comment runs to the end of the input.
/// * Text inside `"…"` literals, and inside `'…'` literals that close on the
///   same line, is copied verbatim, so `"http://example.com"` survives.
///   A quote with no matching closing quote (for example a Rust lifetime) is
///   copied as an ordinary character.
///
/// Only C-style comment syntax is recognised: `#` comments are left
/// untouched, and a `//` that is an operator rather than a comment marker
/// (e.g. Python floor division) is still treated as the start of a comment.
fn strip_comments(s: &str) -> String {
    /// Byte length of the quoted literal that starts at the beginning of
    /// `text`, or the length of the bare quote when the literal never closes.
    fn quoted_literal_len(text: &str, quote: char) -> usize {
        let single_line = quote == '\'';
        let mut escaped = false;
        for (i, ch) in text.char_indices().skip(1) {
            if escaped {
                escaped = false;
                continue;
            }
            match ch {
                '\\' => escaped = true,
                '\n' if single_line => break,
                _ if ch == quote => return i + ch.len_utf8(),
                _ => {}
            }
        }
        quote.len_utf8()
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(c) = rest.chars().next() {
        if rest.starts_with("//") {
            // Drop the comment but keep the newline that ends it.
            let end = rest.find('\n').unwrap_or(rest.len());
            rest = &rest[end..];
        } else if rest.starts_with("/*") {
            let body = &rest[2..];
            let (comment, after) = match body.find("*/") {
                Some(i) => (&body[..i], &body[i + 2..]),
                None => (body, ""),
            };
            out.push(' ');
            out.extend(comment.chars().filter(|&ch| ch == '\n'));
            rest = after;
        } else if c == '"' || c == '\'' {
            let len = quoted_literal_len(rest, c);
            out.push_str(&rest[..len]);
            rest = &rest[len..];
        } else {
            out.push(c);
            rest = &rest[c.len_utf8()..];
        }
    }

    out
}
505
506#[allow(dead_code)]
507fn is_valid_identifier(name: &str, lang: Language) -> bool {
508    if matches!(lang, Language::CppTreeSitter | Language::Cpp) && name.contains("::") {
509        return true;
510    }
511    if name.len() == 1 && name.chars().all(|c| c.is_ascii_alphabetic()) {
512        return true;
513    }
514    if name.len() < 2 {
515        return false;
516    }
517    let first = name.chars().next().unwrap();
518    if !first.is_ascii_alphabetic() && first != '_' && first != '~' {
519        return false;
520    }
521    let allowed = |c: char| {
522        c.is_ascii_alphanumeric()
523            || c == '_'
524            || (matches!(lang, Language::CppTreeSitter | Language::Cpp) && (c == ':' || c == '~'))
525    };
526    if !name.chars().all(allowed) {
527        return false;
528    }
529    if name.chars().all(|c| c.is_ascii_digit()) {
530        return false;
531    }
532    if name.starts_with("class")
533        && name.len() > 5
534        && name[5..].chars().next().unwrap().is_uppercase()
535    {
536        return false;
537    }
538    if name.contains("_1_1") || name.contains("_8cpp") || name.contains("_8h") {
539        return false;
540    }
541    if name.len() > 40 && name.contains('_') {
542        return false;
543    }
544    if name.starts_with("dir_") && name.len() > 30 {
545        return false;
546    }
547    true
548}
549
/// Returns `true` when `name` is one of the JavaScript identifiers this
/// module treats as noise.
///
/// The comparison is exact and case-sensitive.
#[allow(dead_code)]
fn is_javascript_noise(name: &str) -> bool {
    matches!(
        name,
        // Language built-ins and global objects.
        "isNaN" | "eval" | "parseInt" | "parseFloat" | "undefined" | "NaN" | "Infinity"
            | "Object" | "Array" | "String" | "Number" | "Boolean" | "Function" | "RegExp"
            | "Math" | "Date" | "JSON" | "Promise" | "Symbol" | "Map" | "Set" | "WeakMap"
            | "WeakSet" | "Proxy" | "Reflect"
            // Browser globals and DOM methods.
            | "console" | "window" | "document" | "navigator" | "location" | "history"
            | "localStorage" | "sessionStorage" | "alert" | "confirm" | "prompt" | "fetch"
            | "XMLHttpRequest" | "getElementById" | "getElementsByClassName"
            | "getElementsByTagName" | "querySelector" | "querySelectorAll"
            | "addEventListener" | "removeEventListener" | "appendChild" | "removeChild"
            // NOTE(review): the remaining names look like helpers from generated
            // documentation scripts (search box, navigation tree) - confirm.
            | "srChild" | "srResult" | "srEntry" | "srScope" | "srLink" | "srChildren"
            | "clipboard_div" | "clipboard_icon" | "clipboard_successIcon"
            | "clipboard_successDuration" | "clipboard_title"
            | "pagenav" | "navtree" | "menudata" | "resizeHeight" | "resizeWidth"
            | "domSearchBox" | "domPopupSearchResults" | "domPopupSearchResultsWindow"
            | "domSearchClose" | "searchData" | "searchResults" | "resultsPath"
            | "topOffset" | "footerHeight" | "headerHeight" | "sidenavWidth" | "pagenavWidth"
            | "navSync" | "navtreeHeight"
            | "PAGENAV_COOKIE_NAME" | "RESIZE_COOKIE_NAME" | "SEARCH_COOKIE_NAME"
            | "NAVPATH_COOKIE_NAME"
            | "NAVTREE" | "NAVTREEINDEX" | "NAVTREEINDEX0" | "NAVTREEINDEX1" | "NAVTREEINDEX2"
            | "NAVTREEINDEX3" | "NAVTREEINDEX4" | "NAVTREEINDEX5" | "NAVTREEINDEX6"
            | "NAVTREEINDEX7" | "navTreeSubIndices"
            | "entityMap" | "htmlToNode" | "codefold" | "dynsection"
            | "showHideNavBar" | "showSyncOff" | "showSyncOn" | "SYNCOFFMSG" | "SYNCONMSG"
            | "toggleVisibility" | "toggleClass" | "focusItem" | "focusName"
            | "expandNode" | "gotoNode" | "gotoAnchor" | "showNode" | "showRoot"
            | "selectAndHighlight" | "highlightAnchor" | "highlightAdjacentNodes"
            | "highlightEdges" | "loadJS" | "createIndent"
            | "makeTree" | "makeAbsolut" | "makeMorphable" | "makeInstance" | "makeSetterGetter"
            | "getClass" | "getClassForType" | "getMethodNames" | "getMethodsFor"
            | "getEvents" | "getEventTarget" | "getEventPoint"
            | "createResults" | "SearchResults" | "handleResults"
    )
}
687
/// Returns `true` when `name` is one of the built-in type names listed below.
///
/// The comparison is exact and case-sensitive.
#[allow(dead_code)]
fn is_type_keyword(name: &str) -> bool {
    const TYPE_KEYWORDS: [&str; 12] = [
        "void", "int", "long", "short", "char", "float", "double", "signed", "unsigned", "bool",
        "wchar_t", "size_t",
    ];

    TYPE_KEYWORDS.iter().any(|&keyword| keyword == name)
}