// icb_parser/heuristic_parser.rs
1//! # Ultra Heuristic Near-AST Parser v3
2//!
3//! This version upgrades the parser from token-level heuristics
4//! to a **structure-aware near-AST extractor**.
5//!
6//! ## Core idea
7//!
8//! Instead of parsing grammar, we reconstruct:
9//!
10//! - scopes (brace trees)
11//! - declarations (class/function/namespace)
12//! - call sites
13//! - qualified names
14//! - structural blocks
15//!
16//! ## Design goals
17//!
18//! - near‑AST accuracy without tree‑sitter
19//! - robust C++/JS/Rust style class detection
20//! - template / inheritance tolerant scanning
21//! - O(n) linear pass
22//! - zero unwrap / zero panic
23
24use crate::facts::RawNode;
25use icb_common::{Language, NodeKind};
26use std::collections::HashSet;
27
28/// Entry point for universal parsing.
29pub fn parse_universal(source: &str, file: &str) -> Vec<RawNode> {
30    if looks_like_markup(source) {
31        return Vec::new();
32    }
33
34    let tokens = tokenize(source);
35    if tokens.is_empty() {
36        return Vec::new();
37    }
38
39    let scopes = build_scope_map(&tokens);
40    let mut out = Vec::with_capacity(tokens.len() / 5);
41
42    extract_structures(&tokens, &scopes, file, &mut out);
43    extract_calls(&tokens, &mut out);
44    extract_namespaces(&tokens, &mut out);
45
46    dedup(&mut out);
47    out
48}
49
50fn build_scope_map(tokens: &[Token]) -> Vec<u32> {
51    let mut depth: u32 = 0;
52    let mut map = Vec::with_capacity(tokens.len());
53
54    for t in tokens {
55        match t.kind {
56            TokenKind::OpenBrace => depth = depth.saturating_add(1),
57            TokenKind::CloseBrace => depth = depth.saturating_sub(1),
58            _ => {}
59        }
60        map.push(depth);
61    }
62
63    map
64}
65
66fn extract_structures(tokens: &[Token], _scopes: &[u32], file: &str, out: &mut Vec<RawNode>) {
67    let mut i: usize = 0;
68
69    while i < tokens.len() {
70        let t = &tokens[i];
71
72        if t.kind != TokenKind::Ident {
73            i += 1;
74            continue;
75        }
76
77        let word = t.text.as_str();
78
79        if is_class_keyword(word) {
80            if let Some((name, col)) = find_structural_name(tokens, i + 1) {
81                out.push(make_node(NodeKind::Class, name, t.line, col, file));
82                i += 1;
83                continue;
84            }
85        }
86
87        if is_function_keyword(word) {
88            if let Some((name, col)) = find_structural_name(tokens, i + 1) {
89                out.push(make_node(NodeKind::Function, name, t.line, col, file));
90                i += 1;
91                continue;
92            }
93        }
94
95        i += 1;
96    }
97}
98
99fn find_structural_name(tokens: &[Token], mut i: usize) -> Option<(String, usize)> {
100    let skip_noise = true;
101
102    while i < tokens.len() {
103        let t = &tokens[i];
104
105        match t.kind {
106            TokenKind::Ident if skip_noise => {
107                let s = t.text.as_str();
108
109                if is_noise_word(s) || is_modifier_word(s) {
110                    i += 1;
111                    continue;
112                }
113
114                return Some((t.text.clone(), t.col));
115            }
116
117            TokenKind::Ident => {
118                return Some((t.text.clone(), t.col));
119            }
120
121            TokenKind::LessThan => {
122                i = skip_template(tokens, i);
123            }
124
125            TokenKind::Colon => {
126                i += 1;
127            }
128
129            _ => i += 1,
130        }
131    }
132
133    None
134}
135
136fn skip_template(tokens: &[Token], mut i: usize) -> usize {
137    let mut depth = 0usize;
138
139    while i < tokens.len() {
140        match tokens[i].kind {
141            TokenKind::LessThan => depth += 1,
142            TokenKind::GreaterThan => {
143                if depth == 0 {
144                    break;
145                }
146                depth -= 1;
147                if depth == 0 {
148                    return i + 1;
149                }
150            }
151            _ => {}
152        }
153
154        i += 1;
155    }
156
157    i
158}
159
160fn extract_calls(tokens: &[Token], out: &mut Vec<RawNode>) {
161    let mut i = 0;
162
163    while i + 1 < tokens.len() {
164        if tokens[i].kind == TokenKind::Ident && tokens[i + 1].kind == TokenKind::OpenParen {
165            let name = build_qualified(tokens, i);
166
167            if is_valid_call(&name) {
168                out.push(RawNode {
169                    language: Language::Unknown,
170                    kind: NodeKind::CallSite,
171                    name: Some(name),
172                    usr: None,
173                    start_line: tokens[i].line,
174                    start_col: tokens[i].col,
175                    end_line: tokens[i].line,
176                    end_col: tokens[i].col + 1,
177                    children: Vec::new(),
178                    source_file: None,
179                });
180            }
181        }
182
183        i += 1;
184    }
185}
186
187fn extract_namespaces(tokens: &[Token], out: &mut Vec<RawNode>) {
188    let mut i = 0;
189
190    while i < tokens.len() {
191        if tokens[i].text == "namespace" {
192            if let Some((name, col)) = find_structural_name(tokens, i + 1) {
193                out.push(make_node(NodeKind::Class, name, tokens[i].line, col, ""));
194            }
195        }
196
197        i += 1;
198    }
199}
200
201fn build_qualified(tokens: &[Token], mut pos: usize) -> String {
202    let mut parts = Vec::new();
203
204    while pos > 0 {
205        let t = &tokens[pos];
206
207        if t.kind != TokenKind::Ident {
208            break;
209        }
210
211        parts.push(t.text.clone());
212
213        if pos >= 2 && tokens[pos - 1].kind == TokenKind::Dot {
214            pos -= 2;
215        } else {
216            break;
217        }
218    }
219
220    parts.reverse();
221    parts.join(".")
222}
223
/// Build a `RawNode` for a declaration found by the heuristic scan.
///
/// `line`/`col` come straight from the triggering token, so the node
/// spans a single point: `end_line == start_line`, `end_col == col + 1`.
/// NOTE(review): `end_col` ignores the name's length — confirm that
/// downstream consumers only rely on the start position.
fn make_node(kind: NodeKind, name: String, line: usize, col: usize, file: &str) -> RawNode {
    RawNode {
        language: Language::Unknown,
        kind,
        name: Some(name),
        usr: None,
        start_line: line,
        start_col: col,
        end_line: line,
        end_col: col + 1,
        children: Vec::new(),
        source_file: Some(file.to_string()),
    }
}
238
239fn dedup(facts: &mut Vec<RawNode>) {
240    let mut seen = HashSet::new();
241
242    facts.retain(|f| {
243        let key = (
244            f.name.clone().unwrap_or_default(),
245            f.start_line,
246            match f.kind {
247                NodeKind::Function => 1,
248                NodeKind::Class => 2,
249                NodeKind::CallSite => 3,
250                _ => 0,
251            },
252        );
253
254        seen.insert(key)
255    });
256}
257
/// True for keywords that introduce a class-like (type/namespace) scope.
fn is_class_keyword(s: &str) -> bool {
    const CLASS_KEYWORDS: [&str; 8] = [
        "class", "struct", "interface", "trait", "enum", "namespace", "union", "object",
    ];
    CLASS_KEYWORDS.contains(&s)
}
264
/// True for keywords that introduce a function-like declaration.
fn is_function_keyword(s: &str) -> bool {
    const FN_KEYWORDS: [&str; 5] = ["fn", "def", "func", "function", "method"];
    FN_KEYWORDS.contains(&s)
}
268
/// True for access/storage modifiers that may precede a declaration name.
fn is_modifier_word(s: &str) -> bool {
    const MODIFIERS: [&str; 10] = [
        "public", "private", "protected", "static", "final", "abstract", "virtual", "override",
        "const", "inline",
    ];
    MODIFIERS.contains(&s)
}
284
/// True for control-flow keywords and literals that can never be a
/// declaration or call name.
fn is_noise_word(s: &str) -> bool {
    const NOISE: [&str; 9] = [
        "if", "for", "while", "return", "true", "false", "null", "this", "self",
    ];
    NOISE.contains(&s)
}
291
292fn is_valid_call(name: &str) -> bool {
293    !name.is_empty() && !is_noise_word(name)
294}
295
/// Heuristic: does this source look like HTML/XML rather than code?
///
/// Fix: the old check was byte-exact and anchored at offset 0, so the
/// extremely common lowercase `<!doctype html>` and any leading
/// whitespace defeated it and markup fell through to the tokenizer.
/// The prefixes are now matched case-insensitively after skipping
/// leading whitespace.
fn looks_like_markup(src: &str) -> bool {
    // ASCII-case-insensitive byte-prefix test; safe on any UTF-8 input
    // because it never slices the `str`.
    fn has_prefix_ci(s: &[u8], prefix: &[u8]) -> bool {
        s.len() >= prefix.len() && s[..prefix.len()].eq_ignore_ascii_case(prefix)
    }

    let s = src.trim_start().as_bytes();
    has_prefix_ci(s, b"<html") || has_prefix_ci(s, b"<?xml") || has_prefix_ci(s, b"<!doctype")
}
300
301// ---------------------------------------------------------------------------
302// Tokenizer
303// ---------------------------------------------------------------------------
304
/// Lexical token categories produced by `tokenize`.
///
/// `Copy`, `Eq`, and `Debug` are derived so kinds can be compared,
/// stored, and printed in diagnostics without cloning.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum TokenKind {
    Ident,
    OpenParen,
    CloseParen,
    OpenBrace,
    CloseBrace,
    Dot,
    Comma,
    Colon,
    LessThan,
    GreaterThan,
}
318
/// A single lexical token with its source position.
#[derive(Clone)]
struct Token {
    kind: TokenKind,
    text: String, // exact source text of the token
    line: usize,  // 1-based line of the token's first byte
    col: usize,   // 1-based byte column of the token's first byte
}
326
327fn tokenize(src: &str) -> Vec<Token> {
328    let bytes = src.as_bytes();
329    let mut out = Vec::with_capacity(src.len() / 4);
330
331    let mut i = 0;
332    let mut line = 1;
333    let mut col = 1;
334
335    while i < bytes.len() {
336        match bytes[i] {
337            b'\n' => {
338                line += 1;
339                col = 1;
340                i += 1;
341            }
342
343            b'(' => push(
344                &mut out,
345                TokenKind::OpenParen,
346                "(",
347                line,
348                col,
349                &mut i,
350                &mut col,
351            ),
352            b')' => push(
353                &mut out,
354                TokenKind::CloseParen,
355                ")",
356                line,
357                col,
358                &mut i,
359                &mut col,
360            ),
361            b'{' => push(
362                &mut out,
363                TokenKind::OpenBrace,
364                "{",
365                line,
366                col,
367                &mut i,
368                &mut col,
369            ),
370            b'}' => push(
371                &mut out,
372                TokenKind::CloseBrace,
373                "}",
374                line,
375                col,
376                &mut i,
377                &mut col,
378            ),
379            b'.' => push(&mut out, TokenKind::Dot, ".", line, col, &mut i, &mut col),
380            b',' => push(&mut out, TokenKind::Comma, ",", line, col, &mut i, &mut col),
381            b':' => push(&mut out, TokenKind::Colon, ":", line, col, &mut i, &mut col),
382            b'<' => push(
383                &mut out,
384                TokenKind::LessThan,
385                "<",
386                line,
387                col,
388                &mut i,
389                &mut col,
390            ),
391            b'>' => push(
392                &mut out,
393                TokenKind::GreaterThan,
394                ">",
395                line,
396                col,
397                &mut i,
398                &mut col,
399            ),
400
401            c if c.is_ascii_alphabetic() || c == b'_' => {
402                let start = i;
403
404                while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
405                    i += 1;
406                }
407
408                let text = &src[start..i];
409
410                out.push(Token {
411                    kind: TokenKind::Ident,
412                    text: text.to_string(),
413                    line,
414                    col,
415                });
416
417                col += i - start;
418            }
419
420            _ => {
421                i += 1;
422                col += 1;
423            }
424        }
425    }
426
427    out
428}
429
/// Append a single-character token at (`line`, `col`) and advance both
/// the byte cursor (`i`) and the column counter (`c`) by one.
fn push(
    out: &mut Vec<Token>,
    kind: TokenKind,
    text: &str,
    line: usize,
    col: usize,
    i: &mut usize,
    c: &mut usize,
) {
    out.push(Token {
        kind,
        text: text.into(),
        line,
        col,
    });

    *i += 1;
    *c += 1;
}