Skip to main content

icb_clang/
parser.rs

1//! High‑performance Clang AST visitor that extracts ICB facts for the
2//! Code Property Graph.
3//!
4//! # Overview
5//!
6//! The parser translates a single C/C++ translation unit (TU) into a flat
7//! vector of [`RawNode`] values.  It traverses the Clang AST recursively,
8//! but **only materialises nodes that are relevant for call‑graph
9//! construction**:
10//!
11//! * Functions and methods (`Function`),
12//! * Classes, structs (`Class`),
13//! * Call expressions (`CallSite`),
14//! * Variable declarations (`Variable`),
15//! * Parameter declarations (`Parameter`).
16//!
17//! All other AST constructs – compound statements, binary operators,
18//! implicit casts – are transparently skipped.  This keeps memory usage
19//! linear in the number of interesting declarations instead of quadratic in
20//! the AST depth.
21//!
22//! # System header isolation
23//!
24//! When `allow_system` is `false`, any cursor whose location Clang considers
25//! a system header is dropped together with its entire subtree.  This
26//! removes thousands of standard‑library nodes at almost zero cost and is
27//! enabled by default in the CLI and server.
28//!
29//! # Memory safety
30//!
31//! Every `CXString` obtained from the Clang API is disposed via
32//! `clang_disposeString` immediately after its contents have been copied to
//! a Rust `String`.  Temporary C strings created for command‑line arguments
//! are owned by Rust and freed once Clang has finished parsing.
35//!
36//! # Concurrency
37//!
38//! A single `parse_cpp_file` call processes one TU on the calling thread.
39//! Parallelism is achieved by processing multiple TUs concurrently at the
40//! project level (see [`super::project`]).
41//!
42//! # Limitations
43//!
44//! * The parser does **not** resolve overloaded functions – it stores the
45//!   spelling of the call expression or its referenced entity as‑is.
46//! * Template instantiations are visited as part of the parent TU, which may
47//!   cause the same function body to be processed multiple times.
48//! * Precompiled headers are not supported.
49
50#![allow(non_upper_case_globals)]
51
52use clang_sys::*;
53use icb_common::{IcbError, Language, NodeKind};
54use icb_parser::facts::RawNode;
55use std::ffi::{c_uint, CString};
56use std::os::raw::c_void;
57use std::ptr;
58use tempfile::Builder;
59
60/// Returns `true` when the given cursor resides in a file that Clang
61/// treats as a system header.
62fn is_in_system_header(cursor: CXCursor) -> bool {
63    unsafe { clang_Location_isInSystemHeader(clang_getCursorLocation(cursor)) != 0 }
64}
65
66/// Returns the absolute path of the source file that contains `cursor`,
67/// or `None` if the information is unavailable.
68fn cursor_file(cursor: CXCursor) -> Option<String> {
69    unsafe {
70        let loc = clang_getCursorLocation(cursor);
71        let mut file: CXFile = ptr::null_mut();
72        clang_getFileLocation(
73            loc,
74            &mut file,
75            ptr::null_mut(),
76            ptr::null_mut(),
77            ptr::null_mut(),
78        );
79        if file.is_null() {
80            return None;
81        }
82        let name_cx = clang_getFileName(file);
83        let cstr = clang_getCString(name_cx);
84        let result = if cstr.is_null() {
85            None
86        } else {
87            Some(
88                std::ffi::CStr::from_ptr(cstr)
89                    .to_string_lossy()
90                    .into_owned(),
91            )
92        };
93        clang_disposeString(name_cx);
94        result
95    }
96}
97
98/// Parse a single C/C++ source file and return the extracted facts.
99pub fn parse_cpp_file(
100    source: &str,
101    args: &[String],
102    _file_name: Option<&str>,
103    allow_system: bool,
104) -> Result<Vec<RawNode>, IcbError> {
105    let index = unsafe { clang_createIndex(0, 0) };
106    if index.is_null() {
107        return Err(IcbError::Parse("failed to create Clang index".into()));
108    }
109
110    let temp_file = Builder::new()
111        .suffix(".cpp")
112        .tempfile()
113        .map_err(|e| IcbError::Parse(format!("tempfile error: {}", e)))?;
114    std::fs::write(temp_file.path(), source).map_err(IcbError::Io)?;
115    let filename = temp_file
116        .path()
117        .to_str()
118        .ok_or_else(|| IcbError::Parse("non-UTF8 temp path".into()))?;
119    let filename_c = CString::new(filename).unwrap();
120
121    let arg_ptrs: Vec<*const i8> = args
122        .iter()
123        .map(|a| CString::new(a.as_str()).unwrap().into_raw() as *const i8)
124        .collect();
125    let mut tu: CXTranslationUnit = ptr::null_mut();
126
127    let error = unsafe {
128        clang_parseTranslationUnit2(
129            index,
130            filename_c.as_ptr(),
131            arg_ptrs.as_ptr(),
132            args.len() as i32,
133            ptr::null_mut(),
134            0,
135            CXTranslationUnit_None,
136            &mut tu,
137        )
138    };
139
140    for &cstr_ptr in &arg_ptrs {
141        unsafe {
142            let _ = CString::from_raw(cstr_ptr as *mut i8);
143        }
144    }
145
146    if error != CXError_Success {
147        unsafe { clang_disposeIndex(index) };
148        return Err(IcbError::Parse(format!(
149            "failed to parse translation unit, error code {:?}",
150            error
151        )));
152    }
153
154    let cursor = unsafe { clang_getTranslationUnitCursor(tu) };
155    let mut nodes = Vec::new();
156    visit_children(cursor, &mut nodes, None, false, allow_system);
157
158    unsafe {
159        clang_disposeTranslationUnit(tu);
160        clang_disposeIndex(index);
161    }
162
163    Ok(nodes)
164}
165
166struct VisitorContext<'a> {
167    nodes: &'a mut Vec<RawNode>,
168    latest_parent: Option<usize>,
169    in_system: bool,
170    allow_system: bool,
171}
172
173fn visit_children(
174    cursor: CXCursor,
175    nodes: &mut Vec<RawNode>,
176    parent_idx: Option<usize>,
177    in_system: bool,
178    allow_system: bool,
179) -> Option<usize> {
180    let is_sys = is_in_system_header(cursor);
181
182    if !allow_system && is_sys {
183        return parent_idx;
184    }
185    if in_system && is_sys {
186        return parent_idx;
187    }
188
189    let kind = unsafe { clang_getCursorKind(cursor) };
190    let (node_kind, name, usr, is_container) = match kind {
191        CXCursor_FunctionDecl | CXCursor_CXXMethod => (
192            NodeKind::Function,
193            Some(cursor_spelling(cursor)),
194            Some(cursor_usr(cursor)),
195            true,
196        ),
197        CXCursor_ClassDecl | CXCursor_StructDecl => (
198            NodeKind::Class,
199            Some(cursor_spelling(cursor)),
200            Some(cursor_usr(cursor)),
201            true,
202        ),
203        CXCursor_CallExpr => {
204            let referenced = unsafe { clang_getCursorReferenced(cursor) };
205            let spelling = if referenced.kind == CXCursor_InvalidFile {
206                cursor_spelling(cursor)
207            } else {
208                cursor_spelling(referenced)
209            };
210            (NodeKind::CallSite, Some(spelling), None, false)
211        }
212        CXCursor_VarDecl => (
213            NodeKind::Variable,
214            Some(cursor_spelling(cursor)),
215            None,
216            false,
217        ),
218        CXCursor_ParmDecl => (
219            NodeKind::Parameter,
220            Some(cursor_spelling(cursor)),
221            None,
222            false,
223        ),
224        _ => {
225            let mut ctx = VisitorContext {
226                nodes,
227                latest_parent: parent_idx,
228                in_system,
229                allow_system,
230            };
231            let ctx_ptr = &mut ctx as *mut VisitorContext as *mut c_void;
232            unsafe {
233                clang_visitChildren(cursor, visitor_callback, ctx_ptr);
234            }
235            return ctx.latest_parent;
236        }
237    };
238
239    let (start_line, start_col, end_line, end_col) = if node_kind == NodeKind::Function {
240        function_body_location(cursor)
241    } else {
242        cursor_location(cursor)
243    };
244
245    let idx = nodes.len();
246    nodes.push(RawNode {
247        language: Language::Cpp,
248        kind: node_kind,
249        name,
250        usr,
251        start_line,
252        start_col,
253        end_line,
254        end_col,
255        children: Vec::new(),
256        source_file: cursor_file(cursor),
257    });
258
259    if let Some(pidx) = parent_idx {
260        nodes[pidx].children.push(idx);
261    }
262
263    if !is_container {
264        return parent_idx;
265    }
266
267    let next_in_system = in_system || is_sys;
268    let mut ctx = VisitorContext {
269        nodes,
270        latest_parent: Some(idx),
271        in_system: next_in_system,
272        allow_system,
273    };
274    let ctx_ptr = &mut ctx as *mut VisitorContext as *mut c_void;
275    unsafe {
276        clang_visitChildren(cursor, visitor_callback, ctx_ptr);
277    }
278
279    ctx.latest_parent
280}
281
282extern "C" fn visitor_callback(
283    cursor: CXCursor,
284    _parent: CXCursor,
285    client_data: CXClientData,
286) -> CXChildVisitResult {
287    let ctx: &mut VisitorContext = unsafe { &mut *(client_data as *mut VisitorContext) };
288    ctx.latest_parent = visit_children(
289        cursor,
290        ctx.nodes,
291        ctx.latest_parent,
292        ctx.in_system,
293        ctx.allow_system,
294    );
295    CXChildVisit_Continue
296}
297
298fn cursor_location(cursor: CXCursor) -> (usize, usize, usize, usize) {
299    let range = unsafe { clang_getCursorExtent(cursor) };
300    let start = unsafe { clang_getRangeStart(range) };
301    let end = unsafe { clang_getRangeEnd(range) };
302
303    let mut line: c_uint = 0;
304    let mut column: c_uint = 0;
305
306    unsafe {
307        clang_getPresumedLocation(start, ptr::null_mut(), &mut line, &mut column);
308    }
309    let s_line = line as usize;
310    let s_col = column as usize;
311
312    unsafe {
313        clang_getPresumedLocation(end, ptr::null_mut(), &mut line, &mut column);
314    }
315    let e_line = line as usize;
316    let e_col = column as usize;
317
318    (s_line, s_col, e_line, e_col)
319}
320
321fn cursor_spelling(cursor: CXCursor) -> String {
322    unsafe {
323        let cxstring = clang_getCursorSpelling(cursor);
324        let s = clang_getCString(cxstring);
325        let result = if s.is_null() {
326            String::new()
327        } else {
328            std::ffi::CStr::from_ptr(s).to_string_lossy().into_owned()
329        };
330        clang_disposeString(cxstring);
331        result
332    }
333}
334
335fn cursor_usr(cursor: CXCursor) -> String {
336    unsafe {
337        let cxstring = clang_getCursorUSR(cursor);
338        let s = clang_getCString(cxstring);
339        let result = if s.is_null() {
340            String::new()
341        } else {
342            std::ffi::CStr::from_ptr(s).to_string_lossy().into_owned()
343        };
344        clang_disposeString(cxstring);
345        result
346    }
347}
348
349/// Returns the source location of the first `compound_statement` child of
350/// the given function/method cursor.  Falls back to the cursor's own extent
351/// when no compound statement is present (e.g. for forward declarations).
352fn function_body_location(cursor: CXCursor) -> (usize, usize, usize, usize) {
353    let mut result = cursor_location(cursor);
354    let result_ptr = &mut result as *mut (usize, usize, usize, usize);
355    unsafe {
356        clang_visitChildren(cursor, body_visitor_callback, result_ptr as CXClientData);
357    }
358    result
359}
360
361extern "C" fn body_visitor_callback(
362    cursor: CXCursor,
363    _parent: CXCursor,
364    client_data: CXClientData,
365) -> CXChildVisitResult {
366    if unsafe { clang_getCursorKind(cursor) } == CXCursor_CompoundStmt {
367        let (sl, sc, el, ec) = cursor_location(cursor);
368        let out = unsafe { &mut *(client_data as *mut (usize, usize, usize, usize)) };
369        *out = (sl, sc, el, ec);
370        CXChildVisit_Break
371    } else {
372        CXChildVisit_Continue
373    }
374}
375
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone function definition yields exactly one `Function` node.
    #[test]
    fn parse_simple_function() {
        let code = "void foo() {}";
        let facts = parse_cpp_file(code, &[], None, true).unwrap();
        assert_eq!(facts.len(), 1);
        assert_eq!(facts[0].kind, NodeKind::Function);
        assert_eq!(facts[0].name.as_deref(), Some("foo"));
        // For a single‑line function, start_line == end_line is valid.
        assert!(facts[0].end_line >= facts[0].start_line);
    }

    /// A call inside a function body is recorded as a `CallSite` named after
    /// the callee.
    #[test]
    fn parse_function_with_call() {
        let code = "void bar() {} void baz() { bar(); }";
        let facts = parse_cpp_file(code, &[], None, true).unwrap();
        let calls: Vec<_> = facts
            .iter()
            .filter(|n| n.kind == NodeKind::CallSite)
            .collect();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name.as_deref(), Some("bar"));
    }

    /// A class with one inline method yields one `Class` and one `Function`.
    #[test]
    fn parse_class_with_method() {
        let code = "class A { void f() {} };";
        let facts = parse_cpp_file(code, &[], None, true).unwrap();
        let classes: Vec<_> = facts.iter().filter(|n| n.kind == NodeKind::Class).collect();
        assert_eq!(classes.len(), 1);
        let methods: Vec<_> = facts
            .iter()
            .filter(|n| n.kind == NodeKind::Function)
            .collect();
        assert_eq!(methods.len(), 1);
    }

    /// With `allow_system == false`, nothing from `<vector>` is recorded but
    /// the user's own function still is.
    #[test]
    fn exclude_system_headers() {
        let code = "#include <vector>\nvoid func() {}";
        let facts = parse_cpp_file(code, &[], None, false).unwrap();
        // `all` is trivially true for an empty result, so first make sure the
        // user-defined function was actually extracted.
        assert!(facts.iter().any(|n| n.name.as_deref() == Some("func")));
        assert!(facts.iter().all(|n| n.kind == NodeKind::Function));
    }
}