Coverage for src/c2puml/core/parser.py: 79%
932 statements
coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
1#!/usr/bin/env python3
2"""
3Parser module for C to PlantUML converter - Step 1: Parse C code files and generate model.json
4"""
5import logging
6from pathlib import Path
7from typing import TYPE_CHECKING, Dict, List, Optional, Set
9from ..models import Enum, EnumValue, Field, FileModel, ProjectModel, Struct
10from .parser_tokenizer import (
11 CTokenizer,
12 StructureFinder,
13 TokenType,
14 find_enum_values,
15 find_struct_fields,
16)
17from .preprocessor import PreprocessorManager
18from .parser_anonymous_processor import AnonymousTypedefProcessor
19from ..utils import detect_file_encoding
21if TYPE_CHECKING:
22 from ..config import Config
23 from ..models import Alias, Enum, Field, Function, Struct, Union
26class CParser:
27 """C/C++ parser for extracting structural information from source code using tokenization"""
29 def __init__(self):
30 self.logger = logging.getLogger(__name__)
31 self.tokenizer = CTokenizer()
32 self.preprocessor = PreprocessorManager()
34 def parse_project(
35 self, source_folder: str, recursive_search: bool = True, config: "Config" = None
36 ) -> ProjectModel:
37 """Parse a C/C++ project and return a model"""
38 # Enhanced source path validation
39 if not source_folder or not isinstance(source_folder, str):
40 raise ValueError(f"Source folder must be a non-empty string, got: {type(source_folder)}")
42 if not source_folder.strip():
43 raise ValueError("Source folder cannot be empty or whitespace")
45 try:
46 source_folder_path = Path(source_folder).resolve()
47 except (OSError, RuntimeError) as e:
48 raise ValueError(f"Failed to resolve source folder path '{source_folder}': {e}")
50 if not source_folder_path.exists():
51 # Provide helpful error message with suggestions
52 error_msg = f"Source folder not found: {source_folder_path}"
54 # Check if it's a relative path issue
55 if not Path(source_folder).is_absolute():
56 current_dir = Path.cwd()
57 error_msg += f"\nCurrent working directory: {current_dir}"
58 error_msg += f"\nTried to resolve relative path: {source_folder}"
60 # Check if parent directory exists
61 parent_dir = source_folder_path.parent
62 if parent_dir.exists():
63 error_msg += f"\nParent directory exists: {parent_dir}"
64 # List contents of parent directory
65 try:
66 contents = [item.name for item in parent_dir.iterdir() if item.is_dir()]
67 if contents:
68 error_msg += f"\nAvailable directories in parent: {', '.join(contents[:10])}"
69 if len(contents) > 10:
70 error_msg += f" (and {len(contents) - 10} more)"
71 except (OSError, PermissionError):
72 error_msg += "\nCannot list parent directory contents (permission denied)"
73 else:
74 error_msg += f"\nParent directory does not exist: {parent_dir}"
76 raise ValueError(error_msg)
78 if not source_folder_path.is_dir():
79 raise ValueError(f"Source folder must be a directory, got: {source_folder_path} (is_file: {source_folder_path.is_file()})")
81 # Check if directory is readable
82 try:
83 source_folder_path.iterdir()
84 except PermissionError:
85 raise ValueError(f"Permission denied accessing source folder: {source_folder_path}")
86 except OSError as e:
87 raise ValueError(f"Error accessing source folder '{source_folder_path}': {e}")
89 self.logger.info("Parsing project: %s", source_folder_path)
91 # Find all C/C++ files in the project
92 try:
93 all_c_files = self._find_c_files(source_folder_path, recursive_search)
94 except OSError as e:
95 raise ValueError(f"Error searching for C/C++ files in '{source_folder_path}': {e}")
97 self.logger.info("Found %d C/C++ files", len(all_c_files))
99 # Apply file filtering based on configuration
100 c_files = []
101 if config:
102 for file_path in all_c_files:
103 if config._should_include_file(file_path.name):
104 c_files.append(file_path)
105 self.logger.debug(
106 "Included file after filtering: %s", file_path.name
107 )
108 else:
109 self.logger.debug(
110 "Excluded file after filtering: %s", file_path.name
111 )
112 else:
113 c_files = all_c_files
115 self.logger.info("After filtering: %d C/C++ files", len(c_files))
117 # Parse each file using filename as key for simplified tracking
118 files = {}
119 failed_files = []
121 for file_path in c_files:
122 try:
123 # Use relative path for tracking and filename as key
124 relative_path = str(file_path.relative_to(source_folder_path))
125 file_model = self.parse_file(file_path, relative_path)
 127 # Use filename as key; duplicate filenames are detected and rejected below
128 if file_model.name in files:
129 raise RuntimeError(
130 f"Duplicate filename detected: '{file_model.name}' from '{file_path}'. "
131 f"Already seen from '{files[file_model.name].file_path}'."
132 )
133 files[file_model.name] = file_model
135 self.logger.debug("Successfully parsed: %s", relative_path)
137 except (OSError, ValueError) as e:
138 self.logger.warning("Failed to parse %s: %s", file_path, e)
139 failed_files.append(str(file_path))
141 if failed_files:
142 error_msg = (
143 f"Failed to parse {len(failed_files)} files: {failed_files}. "
144 "Stopping model processing."
145 )
146 self.logger.error(error_msg)
147 raise RuntimeError(error_msg)
149 model = ProjectModel(
150 project_name=source_folder_path.name,
151 source_folder=str(source_folder_path),
152 files=files,
153 )
155 # Update all uses fields across the entire project
156 model.update_uses_fields()
158 self.logger.info("Parsing complete. Parsed %d files successfully.", len(files))
159 return model
161 def parse_file(self, file_path: Path, relative_path: str) -> FileModel:
162 """Parse a single C/C++ file and return a file model using tokenization"""
163 self.logger.debug("Parsing file: %s", file_path)
165 # Detect encoding
166 encoding = self._detect_encoding(file_path)
168 # Read file content
169 with open(file_path, "r", encoding=encoding) as f:
170 content = f.read()
172 # Tokenize the content
173 tokens = self.tokenizer.tokenize(content)
174 self.logger.debug("Tokenized file into %d tokens", len(tokens))
176 # Process preprocessor directives
177 self.preprocessor.add_defines_from_content(tokens)
178 processed_tokens = self.preprocessor.process_file(tokens)
179 self.logger.debug(
180 "Preprocessor processed %d tokens -> %d tokens",
181 len(tokens),
182 len(processed_tokens),
183 )
185 # Filter out whitespace and comments for structure finding
186 filtered_tokens = self.tokenizer.filter_tokens(processed_tokens)
187 structure_finder = StructureFinder(filtered_tokens)
189 # Parse different structures using tokenizer
190 structs = self._parse_structs_with_tokenizer(processed_tokens, structure_finder)
191 enums = self._parse_enums_with_tokenizer(processed_tokens, structure_finder)
192 unions = self._parse_unions_with_tokenizer(processed_tokens, structure_finder)
193 functions = self._parse_functions_with_tokenizer(
194 processed_tokens, structure_finder
195 )
196 aliases = self._parse_aliases_with_tokenizer(processed_tokens)
198 # "uses" fields will be updated when we have the full project model
 200 # Mapping typedef names to anonymous structs/enums/unions is handled by
 201 # AnonymousTypedefProcessor below; tag names live on the parsed objects
203 file_model = FileModel(
204 file_path=str(file_path),
205 structs=structs,
206 enums=enums,
207 unions=unions,
208 functions=functions,
209 globals=self._parse_globals_with_tokenizer(processed_tokens),
210 includes=self._parse_includes_with_tokenizer(processed_tokens),
211 macros=self._parse_macros_with_tokenizer(processed_tokens),
212 aliases=aliases,
213 # Tag names are now stored in struct/enum/union objects
214 )
216 # Process anonymous typedefs after initial parsing
217 anonymous_processor = AnonymousTypedefProcessor()
218 anonymous_processor.process_file_model(file_model)
220 return file_model
222 def _parse_structs_with_tokenizer(
223 self, tokens, structure_finder
224 ) -> Dict[str, "Struct"]:
225 """Parse struct definitions using tokenizer"""
227 structs = {}
228 struct_infos = structure_finder.find_structs()
230 for start_pos, end_pos, struct_name in struct_infos:
231 # Need to map back to original token positions
232 # Find the original token positions by looking at line/column info
233 original_start = self._find_original_token_pos(
234 tokens, structure_finder.tokens, start_pos
235 )
236 original_end = self._find_original_token_pos(
237 tokens, structure_finder.tokens, end_pos
238 )
240 if original_start is not None and original_end is not None:
241 # Extract field information from original token range
242 field_tuples = find_struct_fields(tokens, original_start, original_end)
244 # Convert to Field objects
245 fields = []
246 for field_name, field_type in field_tuples:
247 try:
248 fields.append(Field(field_name, field_type))
249 except ValueError as e:
250 self.logger.warning(
251 "Error creating field %s: %s", field_name, e
252 )
254 # For anonymous structs, use a special key that can be mapped later
255 if not struct_name:
256 struct_name = "__anonymous_struct__"
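# Illustrative input for this branch (a typedef with no struct tag):
#     typedef struct { int x; int y; } point_t;
# The body is keyed under the placeholder so AnonymousTypedefProcessor can
# attach it to its typedef name afterwards.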
258 # Extract tag name if this is a typedef struct
259 tag_name = ""
260 if struct_name and not struct_name.startswith("__anonymous"):
261 # Check if this struct has a typedef
262 tag_name = self._extract_tag_name_for_struct(tokens, struct_name)
 264 # Register the struct under its parsed name or the anonymous placeholder; the anonymous typedef processor resolves placeholder entries afterwards
265 if struct_name:
266 structs[struct_name] = Struct(
267 struct_name, fields, tag_name=tag_name, uses=[]
268 )
269 self.logger.debug(
270 "Parsed struct: %s with %d fields", struct_name, len(fields)
271 )
273 return structs
275 def _parse_enums_with_tokenizer(
276 self, tokens, structure_finder
277 ) -> Dict[str, "Enum"]:
278 """Parse enum definitions using tokenizer"""
279 enums = {}
280 enum_infos = structure_finder.find_enums()
282 for start_pos, end_pos, enum_name in enum_infos:
283 # Need to map back to original token positions
284 original_start = self._find_original_token_pos(
285 tokens, structure_finder.tokens, start_pos
286 )
287 original_end = self._find_original_token_pos(
288 tokens, structure_finder.tokens, end_pos
289 )
291 if original_start is not None and original_end is not None:
292 # Extract enum values from original token range
293 value_strs = find_enum_values(tokens, original_start, original_end)
294 values = []
295 for v in value_strs:
296 if "=" in v:
297 name, val = v.split("=", 1)
298 name = name.strip()
299 val = val.strip()
300 if name: # Only add if name is not empty
301 values.append(EnumValue(name=name, value=val))
302 else:
303 name = v.strip()
304 if name: # Only add if name is not empty
305 values.append(EnumValue(name=name))
307 # For anonymous enums, use a special key that can be mapped later
308 if not enum_name:
309 enum_name = "__anonymous_enum__"
311 # Extract tag name if this is a typedef enum
312 tag_name = ""
313 if enum_name and not enum_name.startswith("__anonymous"):
314 # Check if this enum has a typedef
315 tag_name = self._extract_tag_name_for_enum(tokens, enum_name)
317 enums[enum_name] = Enum(enum_name, values, tag_name=tag_name)
318 self.logger.debug(
319 "Parsed enum: %s with %d values", enum_name, len(values)
320 )
322 return enums
324 def _parse_unions_with_tokenizer(
325 self, tokens, structure_finder
326 ) -> Dict[str, "Union"]:
327 """Parse union definitions using tokenizer"""
328 from ..models import Field, Union
330 unions = {}
331 union_infos = structure_finder.find_unions()
333 for start_pos, end_pos, union_name in union_infos:
334 # Need to map back to original token positions
335 original_start = self._find_original_token_pos(
336 tokens, structure_finder.tokens, start_pos
337 )
338 original_end = self._find_original_token_pos(
339 tokens, structure_finder.tokens, end_pos
340 )
342 if original_start is not None and original_end is not None:
343 # Extract field information from original token range
344 field_tuples = find_struct_fields(tokens, original_start, original_end)
346 # Convert to Field objects
347 fields = []
348 for field_name, field_type in field_tuples:
349 try:
350 fields.append(Field(field_name, field_type))
351 except ValueError as e:
352 self.logger.warning(
353 "Error creating union field %s: %s", field_name, e
354 )
356 # For anonymous unions, use a special key that can be mapped later
357 if not union_name:
358 union_name = "__anonymous_union__"
360 # Extract tag name if this is a typedef union
361 tag_name = ""
362 if union_name and not union_name.startswith("__anonymous"):
363 # Check if this union has a typedef
364 tag_name = self._extract_tag_name_for_union(tokens, union_name)
366 unions[union_name] = Union(
367 union_name, fields, tag_name=tag_name, uses=[]
368 )
369 self.logger.debug(
370 "Parsed union: %s with %d fields", union_name, len(fields)
371 )
373 return unions
375 def _parse_functions_with_tokenizer(
376 self, tokens, structure_finder
377 ) -> List["Function"]:
378 """Parse function declarations/definitions using tokenizer"""
379 from ..models import Function
381 functions = []
382 function_infos = structure_finder.find_functions()
384 for (
385 start_pos,
386 end_pos,
387 func_name,
388 return_type,
389 is_declaration,
390 is_inline,
391 ) in function_infos:
392 # Map back to original token positions to parse parameters
393 original_start = self._find_original_token_pos(
394 tokens, structure_finder.tokens, start_pos
395 )
396 original_end = self._find_original_token_pos(
397 tokens, structure_finder.tokens, end_pos
398 )
400 parameters = []
401 if original_start is not None and original_end is not None:
402 # Parse parameters from the token range
403 parameters = self._parse_function_parameters(
404 tokens, original_start, original_end, func_name
405 )
407 try:
408 # Create function with declaration flag
409 function = Function(func_name, return_type, parameters)
410 # Add custom attributes to track if this is a declaration and if it's inline
411 function.is_declaration = is_declaration
412 function.is_inline = is_inline
413 functions.append(function)
414 self.logger.debug(
415 f"Parsed function: {func_name} with {len(parameters)} parameters (declaration: {is_declaration}, inline: {is_inline})"
416 )
417 except Exception as e:
418 self.logger.warning("Error creating function %s: %s", func_name, e)
420 return functions
422 def _parse_globals_with_tokenizer(self, tokens) -> List["Field"]:
423 """Parse global variables using tokenizer"""
424 from ..models import Field
426 globals_list = []
428 i = 0
429 while i < len(tokens):
430 # Skip preprocessor directives, comments, etc.
431 if tokens[i].type in [
432 TokenType.INCLUDE,
433 TokenType.DEFINE,
434 TokenType.COMMENT,
435 TokenType.WHITESPACE,
436 TokenType.NEWLINE,
437 ]:
438 i += 1
439 continue
441 # Skip preprocessor directives but keep their content
442 if tokens[i].type == TokenType.PREPROCESSOR:
443 i = self._skip_preprocessor_directives(tokens, i)
444 continue
446 # Skip function definitions (look for parentheses)
447 if self._looks_like_function(tokens, i):
448 i = self._skip_function(tokens, i)
449 continue
451 # Skip struct/enum/union definitions
452 if tokens[i].type in [
453 TokenType.STRUCT,
454 TokenType.ENUM,
455 TokenType.UNION,
456 TokenType.TYPEDEF,
457 ]:
458 i = self._skip_structure_definition(tokens, i)
459 continue
461 # Skip if we're inside a struct definition (look for opening brace)
462 if i > 0 and tokens[i - 1].type == TokenType.LBRACE:
463 # We're inside a struct, skip until closing brace
464 brace_count = 1
465 j = i
466 while j < len(tokens) and brace_count > 0:
467 if tokens[j].type == TokenType.LBRACE:
468 brace_count += 1
469 elif tokens[j].type == TokenType.RBRACE:
470 brace_count -= 1
471 j += 1
472 i = j
473 continue
475 # Skip macros and other preprocessor content
476 if tokens[i].type == TokenType.DEFINE:
477 # Skip the entire macro content (multi-line macros are now merged)
478 i += 1
479 continue
481 # Additional check: skip if we're inside any brace block (struct, function, etc.)
482 brace_count = 0
483 j = i - 1
484 while j >= 0:
485 if tokens[j].type == TokenType.RBRACE:
486 brace_count += 1
487 elif tokens[j].type == TokenType.LBRACE:
488 brace_count -= 1
489 if brace_count < 0:
490 # We're inside a brace block, skip this token
491 i += 1
492 break
493 j -= 1
494 else:
495 # Not inside a brace block, proceed with global variable parsing
496 global_info = self._parse_global_variable(tokens, i)
497 if global_info:
498 var_name, var_type, var_value = global_info
499 # Only add if it looks like a real global variable (not a fragment)
500 if (
501 var_name
502 and var_name.strip()
503 and var_type
504 and var_type.strip()
505 and not var_name.startswith("#")
506 and len(var_type) < 200
507 and not var_type.startswith("\\")
508 and not var_name.startswith("\\")
509 and "\\" not in var_type
510 and "\\" not in var_name
511 ):
512 try:
513 # Additional validation before creating Field
514 stripped_name = var_name.strip()
515 stripped_type = var_type.strip()
516 if stripped_name and stripped_type:
517 globals_list.append(
518 Field(
519 name=stripped_name,
520 type=stripped_type,
521 value=var_value,
522 )
523 )
524 self.logger.debug(
525 f"Parsed global: {stripped_name} : {stripped_type}"
526 )
527 except Exception as e:
528 self.logger.warning(
529 f"Error creating global field {var_name}: {e}"
530 )
531 i = self._skip_to_semicolon(tokens, i)
532 else:
533 i += 1
535 return globals_list
537 def _parse_includes_with_tokenizer(self, tokens) -> List[str]:
538 """Parse #include directives using tokenizer"""
539 includes = []
541 for token in tokens:
542 if token.type == TokenType.INCLUDE:
543 # Extract include filename from the token value
544 # e.g., "#include <stdio.h>" -> "stdio.h"
545 # e.g., '#include "header.h"' -> "header.h"
546 # e.g., "#include 'header.h'" -> "header.h"
547 import re
549 match = re.search(r'[<"\']([^>\'"]+)[>\'"]', token.value)
550 if match:
551 # Return just the filename without quotes or angle brackets
552 includes.append(match.group(1))
554 return includes
556 def _parse_macros_with_tokenizer(self, tokens) -> List[str]:
557 """Parse macro definitions using tokenizer"""
558 macros = []
560 for token in tokens:
561 if token.type == TokenType.DEFINE:
562 # Store the full macro definition for display flexibility
563 # e.g., "#define PI 3.14159" -> "#define PI 3.14159"
564 # e.g., "#define MIN(a, b) ((a) < (b) ? (a) : (b))" -> "#define MIN(a, b) ((a) < (b) ? (a) : (b))"
565 macro_definition = token.value.strip()
566 if macro_definition not in macros:
567 macros.append(macro_definition)
569 return macros
571 def _parse_aliases_with_tokenizer(self, tokens) -> Dict[str, "Alias"]:
572 """Parse type aliases (primitive or derived typedefs) using tokenizer"""
573 from ..models import Alias
575 aliases = {}
577 i = 0
578 while i < len(tokens):
579 if tokens[i].type == TokenType.TYPEDEF:
580 # Found typedef, parse it
581 typedef_info = self._parse_single_typedef(tokens, i)
582 if typedef_info:
583 typedef_name, original_type = typedef_info
585 # Only include if it's NOT a struct/enum/union typedef
586 if original_type not in ["struct", "enum", "union"]:
587 aliases[typedef_name] = Alias(
588 name=typedef_name, original_type=original_type, uses=[]
589 )
591 i += 1
593 return aliases
595 # _parse_typedef_relations_with_tokenizer method removed - tag names are now in struct/enum/union
597 def _extract_tag_name_for_struct(self, tokens, struct_name: str) -> str:
598 """Extract tag name for a struct if it has a typedef"""
599 i = 0
600 while i < len(tokens):
601 if tokens[i].type == TokenType.TYPEDEF:
602 typedef_info = self._parse_single_typedef(tokens, i)
603 if typedef_info:
604 typedef_name, original_type = typedef_info
605 if original_type == "struct" and typedef_name == struct_name:
606 # Extract the tag name from the typedef
607 return self._extract_tag_name_from_typedef(tokens, i)
608 i += 1
609 return ""
611 def _extract_tag_name_for_enum(self, tokens, enum_name: str) -> str:
612 """Extract tag name for an enum if it has a typedef"""
613 i = 0
614 while i < len(tokens):
615 if tokens[i].type == TokenType.TYPEDEF:
616 typedef_info = self._parse_single_typedef(tokens, i)
617 if typedef_info:
618 typedef_name, original_type = typedef_info
619 if original_type == "enum" and typedef_name == enum_name:
620 # Extract the tag name from the typedef
621 return self._extract_tag_name_from_typedef(tokens, i)
622 i += 1
623 return ""
625 def _extract_tag_name_for_union(self, tokens, union_name: str) -> str:
626 """Extract tag name for a union if it has a typedef"""
627 i = 0
628 while i < len(tokens):
629 if tokens[i].type == TokenType.TYPEDEF:
630 typedef_info = self._parse_single_typedef(tokens, i)
631 if typedef_info:
632 typedef_name, original_type = typedef_info
633 if original_type == "union" and typedef_name == union_name:
634 # Extract the tag name from the typedef
635 return self._extract_tag_name_from_typedef(tokens, i)
636 i += 1
637 return ""
639 def _extract_non_primitive_types(
640 self, type_str: str, available_types: Set[str]
641 ) -> List[str]:
642 """Extract non-primitive type names from a type string that exist in available_types"""
643 # Define primitive types
644 primitive_types = {
645 "void",
646 "char",
647 "short",
648 "int",
649 "long",
650 "float",
651 "double",
652 "signed",
653 "unsigned",
654 "const",
655 "volatile",
656 "static",
657 "extern",
658 "auto",
659 "register",
660 "inline",
661 "restrict",
662 "size_t",
663 "ptrdiff_t",
664 "int8_t",
665 "int16_t",
666 "int32_t",
667 "int64_t",
668 "uint8_t",
669 "uint16_t",
670 "uint32_t",
671 "uint64_t",
672 "intptr_t",
673 "uintptr_t",
674 "bool",
675 "true",
676 "false",
677 "NULL",
678 "nullptr",
679 }
681 # Remove common C keywords and operators
682 import re
684 # Split by common delimiters and operators
685 parts = re.split(r"[\[\]\(\)\{\}\s\*&,;]", type_str)
687 # Extract potential type names that exist in available_types
688 types = []
689 for part in parts:
690 part = part.strip()
691 if part and len(part) > 1 and part not in primitive_types:
692 # Check if it looks like a type name (starts with letter, contains letters/numbers/underscores)
693 if re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", part):
694 # Only include if it exists in available_types
695 if part in available_types:
696 types.append(part)
698 return list(set(types)) # Remove duplicates
700 def _find_c_files(
701 self, source_folder_path: Path, recursive_search: bool
702 ) -> List[Path]:
703 """Find all C/C++ files in the source folder"""
704 c_extensions = {".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx"}
705 files = []
707 self.logger.debug("Searching for files with extensions: %s", c_extensions)
709 try:
710 if recursive_search:
711 for ext in c_extensions:
712 try:
713 files.extend(source_folder_path.rglob(f"*{ext}"))
714 except (OSError, PermissionError) as e:
715 self.logger.warning("Error during recursive search for %s files: %s", ext, e)
716 # Continue with other extensions
717 else:
718 for ext in c_extensions:
719 try:
720 files.extend(source_folder_path.glob(f"*{ext}"))
721 except (OSError, PermissionError) as e:
722 self.logger.warning("Error during search for %s files: %s", ext, e)
723 # Continue with other extensions
724 except Exception as e:
725 raise OSError(f"Failed to search for C/C++ files in '{source_folder_path}': {e}")
727 # Filter out hidden files and common exclude patterns
728 filtered_files = []
729 exclude_patterns = {".git", "__pycache__", "node_modules", ".vscode", ".idea"}
731 for file_path in files:
732 try:
733 # Skip hidden files and directories
734 if any(part.startswith(".") for part in file_path.parts):
735 continue
737 # Skip common exclude patterns
738 if any(pattern in file_path.parts for pattern in exclude_patterns):
739 continue
741 # Verify the file is actually accessible
742 if not file_path.exists():
743 self.logger.debug("Skipping non-existent file: %s", file_path)
744 continue
746 if not file_path.is_file():
747 self.logger.debug("Skipping non-file item: %s", file_path)
748 continue
750 filtered_files.append(file_path)
751 except (OSError, PermissionError) as e:
752 self.logger.warning("Error accessing file %s: %s", file_path, e)
753 # Skip files we can't access
754 continue
756 self.logger.debug("Found %d C/C++ files after filtering", len(filtered_files))
757 return sorted(filtered_files)
759 def _detect_encoding(self, file_path: Path) -> str:
760 """Detect file encoding with platform-aware fallbacks"""
761 return detect_file_encoding(file_path)
763 def _find_original_token_pos(self, all_tokens, filtered_tokens, filtered_pos):
764 """Find the position in all_tokens that corresponds to filtered_tokens[filtered_pos]"""
765 if filtered_pos >= len(filtered_tokens):
766 return None
768 target_token = filtered_tokens[filtered_pos]
770 # Search for the token in all_tokens by line and column
771 for i, token in enumerate(all_tokens):
772 if (
773 token.line == target_token.line
774 and token.column == target_token.column
775 and token.value == target_token.value
776 ):
777 return i
779 return None
781 def _parse_single_typedef(self, tokens, start_pos):
782 """Parse a single typedef starting at the given position"""
783 # Skip 'typedef' keyword
784 pos = start_pos + 1
786 # Skip whitespace and comments
787 while pos < len(tokens) and tokens[pos].type in [
788 TokenType.WHITESPACE,
789 TokenType.COMMENT,
790 ]:
791 pos += 1
793 if pos >= len(tokens):
794 return None
796 # Check if it's a struct/enum/union typedef
797 if tokens[pos].type in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
798 # Look ahead to see if this complex type is immediately followed by a function-pointer declarator
799 # Pattern to detect: ... } ( * name ) ( ... )
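# e.g. (illustrative): typedef struct { int a; int b; } (*result_factory_t)(void);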
800 look = pos
801 # Find the matching closing brace of the outer struct/union/enum
802 if tokens[look].type in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
803 # Advance to the opening brace
804 while look < len(tokens) and tokens[look].type != TokenType.LBRACE:
805 look += 1
806 if look < len(tokens) and tokens[look].type == TokenType.LBRACE:
807 brace_count = 1
808 look += 1
809 while look < len(tokens) and brace_count > 0:
810 if tokens[look].type == TokenType.LBRACE:
811 brace_count += 1
812 elif tokens[look].type == TokenType.RBRACE:
813 brace_count -= 1
814 look += 1
815 # Now 'look' is token after the closing brace
816 j = look
817 # Skip whitespace/comments
818 while j < len(tokens) and tokens[j].type in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
819 j += 1
820 # Detect function-pointer declarator: ( * IDENT ) (
821 if (
822 j + 4 < len(tokens)
823 and tokens[j].type == TokenType.LPAREN
824 and tokens[j + 1].type == TokenType.ASTERISK
825 and tokens[j + 2].type == TokenType.IDENTIFIER
826 and tokens[j + 3].type == TokenType.RPAREN
827 and tokens[j + 4].type == TokenType.LPAREN
828 ):
829 typedef_name = tokens[j + 2].value
830 # Collect the full typedef original type up to the semicolon, preserving parentheses/brackets spacing
831 k = pos
832 formatted: list[str] = []
833 while k < len(tokens) and tokens[k].type != TokenType.SEMICOLON:
834 t = tokens[k]
835 if t.type in [TokenType.LPAREN, TokenType.RPAREN, TokenType.LBRACKET, TokenType.RBRACKET]:
836 formatted.append(t.value)
837 elif formatted and formatted[-1] not in ["(", ")", "[", "]"]:
838 # Prepend space before non-bracket tokens when previous isn't a bracket
839 formatted.append(" " + t.value)
840 else:
841 formatted.append(t.value)
842 k += 1
843 original_type = "".join(formatted)
844 # Clean excessive whitespace inside type
845 original_type = self._clean_type_string(original_type)
846 return (typedef_name, original_type)
847 # Fallback to standard complex typedef parsing
848 return self._parse_complex_typedef(tokens, pos)
850 # Collect all non-whitespace/comment tokens until semicolon
851 # But handle nested structures properly
852 all_tokens = []
853 brace_count = 0
854 paren_count = 0
856 while pos < len(tokens):
857 token = tokens[pos]
859 # Track nested braces and parentheses
860 if token.type == TokenType.LBRACE:
861 brace_count += 1
862 elif token.type == TokenType.RBRACE:
863 brace_count -= 1
864 elif token.type == TokenType.LPAREN:
865 paren_count += 1
866 elif token.type == TokenType.RPAREN:
867 paren_count -= 1
868 elif token.type == TokenType.SEMICOLON:
869 # Only treat semicolon as end if we're not inside nested structures
870 # For function pointer typedefs, we need to be outside the parameter list parentheses
871 if brace_count == 0 and paren_count == 0:
872 # We're outside any nested structures and parentheses
873 break
875 if token.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
876 all_tokens.append(token)
877 pos += 1
879 if len(all_tokens) < 2:
880 return None
882 # Function pointer typedef: typedef ret (*name)(params);
883 for i in range(len(all_tokens) - 3):
884 if (
885 all_tokens[i].type
886 in [
887 TokenType.IDENTIFIER,
888 TokenType.INT,
889 TokenType.VOID,
890 TokenType.CHAR,
891 TokenType.FLOAT,
892 TokenType.DOUBLE,
893 TokenType.LONG,
894 TokenType.SHORT,
895 TokenType.UNSIGNED,
896 TokenType.SIGNED,
897 ]
898 and all_tokens[i + 1].type == TokenType.LPAREN
899 and all_tokens[i + 2].type == TokenType.ASTERISK
900 and all_tokens[i + 3].type == TokenType.IDENTIFIER
901 ):
902 # Check if this is followed by a parameter list
903 if i + 4 < len(all_tokens) and all_tokens[i + 4].type == TokenType.RPAREN:
904 if i + 5 < len(all_tokens) and all_tokens[i + 5].type == TokenType.LPAREN:
905 # This is a function pointer with parameters - skip this pattern and use the complex logic
906 break
908 # Simple function pointer typedef without complex parameters
909 typedef_name = all_tokens[i + 3].value
910 # Fix: Properly format function pointer type - preserve spaces between tokens but not around parentheses
911 formatted_tokens = []
912 for j, token in enumerate(all_tokens):
913 if token.type in [TokenType.LPAREN, TokenType.RPAREN]:
914 # Don't add spaces around parentheses
915 formatted_tokens.append(token.value)
916 elif j > 0 and all_tokens[j - 1].type not in [
917 TokenType.LPAREN,
918 TokenType.RPAREN,
919 ]:
920 # Add space before token if previous token wasn't a parenthesis
921 formatted_tokens.append(" " + token.value)
922 else:
923 # No space before token
924 formatted_tokens.append(token.value)
925 original_type = "".join(formatted_tokens)
926 return (typedef_name, original_type)
928 # Complex function pointer typedef: typedef ret (*name)(complex_params);
929 # This handles cases where the function pointer has complex parameters that span multiple tokens
930 if len(all_tokens) >= 6:
931 # Look for pattern: type ( * name ) ( ... )
932 for i in range(len(all_tokens) - 5):
933 if (
934 all_tokens[i].type
935 in [
936 TokenType.IDENTIFIER,
937 TokenType.INT,
938 TokenType.VOID,
939 TokenType.CHAR,
940 TokenType.FLOAT,
941 TokenType.DOUBLE,
942 TokenType.LONG,
943 TokenType.SHORT,
944 TokenType.UNSIGNED,
945 TokenType.SIGNED,
946 ]
947 and all_tokens[i + 1].type == TokenType.LPAREN
948 and all_tokens[i + 2].type == TokenType.ASTERISK
949 and all_tokens[i + 3].type == TokenType.IDENTIFIER
950 and all_tokens[i + 4].type == TokenType.RPAREN
951 and all_tokens[i + 5].type == TokenType.LPAREN
952 ):
953 # Find the closing parenthesis for the parameter list
954 paren_count = 1
955 param_end = i + 6
956 while param_end < len(all_tokens) and paren_count > 0:
957 if all_tokens[param_end].type == TokenType.LPAREN:
958 paren_count += 1
959 elif all_tokens[param_end].type == TokenType.RPAREN:
960 paren_count -= 1
961 param_end += 1
963 if paren_count == 0:
964 typedef_name = all_tokens[i + 3].value
965 # Format the complete typedef properly
966 formatted_tokens = []
967 for j, token in enumerate(all_tokens):
968 if token.type in [TokenType.LPAREN, TokenType.RPAREN]:
969 # Don't add spaces around parentheses
970 formatted_tokens.append(token.value)
971 elif j > 0 and all_tokens[j - 1].type not in [
972 TokenType.LPAREN,
973 TokenType.RPAREN,
974 ]:
975 # Add space before token if previous token wasn't a parenthesis
976 formatted_tokens.append(" " + token.value)
977 else:
978 # No space before token
979 formatted_tokens.append(token.value)
980 original_type = "".join(formatted_tokens)
981 return (typedef_name, original_type)
983 # Array typedef: typedef type name[size];
984 for i in range(len(all_tokens)):
985 if (
986 all_tokens[i].type == TokenType.LBRACKET
987 and i > 0
988 and all_tokens[i - 1].type == TokenType.IDENTIFIER
989 ):
990 typedef_name = all_tokens[i - 1].value
991 # Fix: Properly format array type - preserve spaces between tokens but not around brackets
992 formatted_tokens = []
993 for j, token in enumerate(all_tokens):
994 if token.type in [TokenType.LBRACKET, TokenType.RBRACKET]:
995 # Don't add spaces around brackets
996 formatted_tokens.append(token.value)
997 elif j > 0 and all_tokens[j - 1].type not in [
998 TokenType.LBRACKET,
999 TokenType.RBRACKET,
1000 ]:
1001 # Add space before token if previous token wasn't a bracket
1002 formatted_tokens.append(" " + token.value)
1003 else:
1004 # No space before token
1005 formatted_tokens.append(token.value)
1006 original_type = "".join(formatted_tokens)
1007 return (typedef_name, original_type)
1009 # Pointer typedef: typedef type * name;
1010 for i in range(len(all_tokens) - 2):
1011 if (
1012 all_tokens[i].type == TokenType.ASTERISK
1013 and all_tokens[i + 1].type == TokenType.IDENTIFIER
1014 ):
1015 typedef_name = all_tokens[i + 1].value
1016 # Fix: Properly format pointer type - preserve spaces between tokens
1017 formatted_tokens = []
1018 for j, token in enumerate(all_tokens):
1019 if j > 0:
1020 # Add space before token
1021 formatted_tokens.append(" " + token.value)
1022 else:
1023 # No space before first token
1024 formatted_tokens.append(token.value)
1025 original_type = "".join(formatted_tokens)
1026 return (typedef_name, original_type)
1028 # Basic typedef: the last token is the typedef name, everything else is the type
1029 typedef_name = all_tokens[-1].value
1030 type_tokens = all_tokens[:-1]
1031 original_type = " ".join(t.value for t in type_tokens)
1032 original_type = self._clean_type_string(original_type)
1033 original_type = self._fix_array_bracket_spacing(original_type)
1034 return (typedef_name, original_type)
1036 def _parse_complex_typedef(self, tokens, start_pos):
1037 """Parse complex typedef (struct/enum/union)"""
1038 # Parse complex typedefs with proper structure detection
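# Illustrative example: "typedef struct Node { int v; } node_t;" yields
# ("node_t", "struct"); the struct body itself is parsed separately by
# _parse_structs_with_tokenizer.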
1040 # Find the typedef name by looking for the pattern after the closing brace
1041 brace_count = 0
1042 pos = start_pos
1044 # Find opening brace
1045 while pos < len(tokens) and tokens[pos].type != TokenType.LBRACE:
1046 pos += 1
1048 if pos >= len(tokens):
1049 return None
1051 # Skip to closing brace
1052 brace_count = 1
1053 pos += 1
1055 while pos < len(tokens) and brace_count > 0:
1056 if tokens[pos].type == TokenType.LBRACE:
1057 brace_count += 1
1058 elif tokens[pos].type == TokenType.RBRACE:
1059 brace_count -= 1
1060 pos += 1
1062 if brace_count > 0:
1063 return None
1065 # Find typedef name after closing brace
1066 while pos < len(tokens) and tokens[pos].type in [
1067 TokenType.WHITESPACE,
1068 TokenType.COMMENT,
1069 ]:
1070 pos += 1
1072 if pos < len(tokens) and tokens[pos].type == TokenType.IDENTIFIER:
1073 typedef_name = tokens[pos].value
1074 struct_type = tokens[start_pos].value # struct/enum/union
1075 return (typedef_name, struct_type)
1077 return None
1079 def _extract_tag_name_from_typedef(self, tokens, start_pos):
1080 """Extract the tag name from a typedef like 'typedef struct TagName { ... } TypedefName;'"""
1081 # Skip 'typedef' keyword
1082 pos = start_pos + 1
1084 # Skip whitespace and comments
1085 while pos < len(tokens) and tokens[pos].type in [
1086 TokenType.WHITESPACE,
1087 TokenType.COMMENT,
1088 ]:
1089 pos += 1
1091 if pos >= len(tokens):
1092 return ""
1094 # Check if it's a struct/enum/union
1095 if tokens[pos].type not in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
1096 return ""
1098 # Skip struct/enum/union keyword
1099 pos += 1
1101 # Skip whitespace and comments
1102 while pos < len(tokens) and tokens[pos].type in [
1103 TokenType.WHITESPACE,
1104 TokenType.COMMENT,
1105 ]:
1106 pos += 1
1108 # Look for tag name (identifier before opening brace)
1109 if pos < len(tokens) and tokens[pos].type == TokenType.IDENTIFIER:
1110 tag_name = tokens[pos].value
1111 return tag_name
1113 return ""
1115 def _looks_like_function(self, tokens, start_pos):
1116 """Check if the token sequence starting at start_pos looks like a function"""
1117 # Look ahead for parentheses within a reasonable distance
1118 for i in range(start_pos, min(start_pos + 10, len(tokens))):
1119 if tokens[i].type == TokenType.LPAREN:
1120 return True
1121 if tokens[i].type in [
1122 TokenType.SEMICOLON,
1123 TokenType.LBRACE,
1124 TokenType.RBRACE,
1125 ]:
1126 return False
1127 return False
1129 def _skip_function(self, tokens, start_pos):
1130 """Skip over a function definition or declaration"""
1131 # Find the end (either semicolon for declaration or closing brace for definition)
1132 i = start_pos
1133 brace_count = 0
1134 paren_count = 0
1136 while i < len(tokens):
1137 if tokens[i].type == TokenType.LPAREN:
1138 paren_count += 1
1139 elif tokens[i].type == TokenType.RPAREN:
1140 paren_count -= 1
1141 elif tokens[i].type == TokenType.LBRACE:
1142 brace_count += 1
1143 elif tokens[i].type == TokenType.RBRACE:
1144 brace_count -= 1
1145 if brace_count == 0 and paren_count == 0:
1146 return i + 1
1147 elif (
1148 tokens[i].type == TokenType.SEMICOLON
1149 and paren_count == 0
1150 and brace_count == 0
1151 ):
1152 return i + 1
1153 i += 1
1155 return i
1157 def _skip_structure_definition(self, tokens, start_pos):
1158 """Skip over struct/enum/union/typedef definition"""
1159 i = start_pos
1160 brace_count = 0
1162 while i < len(tokens):
1163 if tokens[i].type == TokenType.LBRACE:
1164 brace_count += 1
1165 elif tokens[i].type == TokenType.RBRACE:
1166 brace_count -= 1
1167 if brace_count == 0:
1168 # Continue until semicolon
1169 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON:
1170 i += 1
1171 return i + 1 if i < len(tokens) else i
1172 elif tokens[i].type == TokenType.SEMICOLON and brace_count == 0:
1173 return i + 1
1174 i += 1
1176 return i
1178 def _parse_global_variable(self, tokens, start_pos):
1179 """Parse a global variable declaration starting at start_pos"""
1180 # Look for pattern: [static/extern] type name [= value];
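# Illustrative inputs and results (sketch):
#     "static int counter = 0;"  -> ("counter", "int", "0")
#     "uint8_t buffer[64];"      -> ("buffer", "uint8_t[64]", None)
# Leading static/extern/const qualifiers are skipped before the type is read.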
1181 i = start_pos
1182 collected_tokens = []
1184 # Collect tokens until semicolon
1185 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON:
1186 if tokens[i].type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
1187 collected_tokens.append(tokens[i])
1188 i += 1
1190 if len(collected_tokens) < 2:
1191 return None
1193 # Skip modifiers
1194 start_idx = 0
1195 while start_idx < len(collected_tokens) and collected_tokens[
1196 start_idx
1197 ].type in [TokenType.STATIC, TokenType.EXTERN, TokenType.CONST]:
1198 start_idx += 1
1200 # Check if there's an assignment
1201 assign_idx = None
1202 for j in range(start_idx, len(collected_tokens)):
1203 if collected_tokens[j].type == TokenType.ASSIGN:
1204 assign_idx = j
1205 break
1207 # Extract variable name and type
1208 if assign_idx is not None:
1209 # Has assignment: type name = value or type name[size] = value
1210 if assign_idx > start_idx + 1:
1211 # Check if this is an array declaration with assignment
1212 bracket_idx = None
1213 for j in range(assign_idx - 1, start_idx, -1):
1214 if collected_tokens[j].type == TokenType.RBRACKET:
1215 bracket_idx = j
1216 break
1218 if bracket_idx is not None:
1219 # Array declaration with assignment: find the identifier before the opening bracket
1220 for j in range(bracket_idx - 1, start_idx, -1):
1221 if collected_tokens[j].type == TokenType.LBRACKET:
1222 # Found opening bracket, look for identifier before it
1223 for k in range(j - 1, start_idx, -1):
1224 if collected_tokens[k].type == TokenType.IDENTIFIER:
1225 var_name = collected_tokens[k].value
1226 type_tokens = collected_tokens[start_idx:k]
1227 # Format array type properly
1228 formatted_type = []
1229 for idx, token in enumerate(type_tokens):
1230 if idx > 0:
1231 formatted_type.append(" " + token.value)
1232 else:
1233 formatted_type.append(token.value)
1234 # Add array brackets without spaces
1235 array_size = collected_tokens[j + 1].value if j + 1 < bracket_idx else ""
1236 var_type = "".join(formatted_type) + "[" + array_size + "]"
1237 var_type = self._clean_type_string(var_type)
1238 value_tokens = collected_tokens[assign_idx + 1 :]
1239 var_value = " ".join(t.value for t in value_tokens)
1240 # Clean the value string to remove excessive whitespace and newlines
1241 var_value = self._clean_value_string(var_value)
1242 return (var_name, var_type, var_value)
1243 break
1244 else:
1245 # Regular assignment: type name = value
1246 var_name = collected_tokens[assign_idx - 1].value
1247 type_tokens = collected_tokens[start_idx : assign_idx - 1]
1248 value_tokens = collected_tokens[assign_idx + 1 :]
1249 var_type = " ".join(t.value for t in type_tokens)
1250 var_type = self._clean_type_string(var_type)
1251 var_type = self._fix_array_bracket_spacing(var_type)
1252 var_value = " ".join(t.value for t in value_tokens)
1253 # Clean the value string to remove excessive whitespace and newlines
1254 var_value = self._clean_value_string(var_value)
1255 return (var_name, var_type, var_value)
1256 else:
1257 # No assignment: type name or type name[size]
1258 if len(collected_tokens) > start_idx + 1:
1259 # Check if this is an array declaration
1260 bracket_idx = None
1261 for j in range(len(collected_tokens) - 1, start_idx, -1):
1262 if collected_tokens[j].type == TokenType.RBRACKET:
1263 bracket_idx = j
1264 break
1266 if bracket_idx is not None:
1267 # Array declaration: find the identifier before the opening bracket
1268 for j in range(bracket_idx - 1, start_idx, -1):
1269 if collected_tokens[j].type == TokenType.LBRACKET:
1270 # Found opening bracket, look for identifier before it
1271 for k in range(j - 1, start_idx, -1):
1272 if collected_tokens[k].type == TokenType.IDENTIFIER:
1273 var_name = collected_tokens[k].value
1274 type_tokens = collected_tokens[start_idx:k]
1275 # Format array type properly - preserve spaces between tokens but not around brackets
1276 formatted_type = []
1277 for idx, token in enumerate(type_tokens):
1278 if idx > 0:
1279 formatted_type.append(" " + token.value)
1280 else:
1281 formatted_type.append(token.value)
1282 # Add array brackets without spaces
1283 array_size = collected_tokens[j + 1].value if j + 1 < bracket_idx else ""
1284 var_type = "".join(formatted_type) + "[" + array_size + "]"
1285 var_type = self._clean_type_string(var_type)
1286 return (var_name, var_type, None)
1287 break
1288 else:
1289 # Regular variable: last token is the name
1290 var_name = collected_tokens[-1].value
1291 type_tokens = collected_tokens[start_idx:-1]
1292 var_type = " ".join(t.value for t in type_tokens)
1293 var_type = self._clean_type_string(var_type)
1294 var_type = self._fix_array_bracket_spacing(var_type)
1295 return (var_name, var_type, None)
1297 return None
1299 def _skip_to_semicolon(self, tokens, start_pos):
1300 """Skip to the next semicolon"""
1301 i = start_pos
1302 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON:
1303 i += 1
1304 return i + 1 if i < len(tokens) else i
1306 def _skip_preprocessor_directives(self, tokens, start_pos):
1307 """Skip preprocessor directives but keep their content for parsing"""
1308 # This method is deprecated - use the PreprocessorManager instead
1309 i = start_pos
1310 while i < len(tokens) and tokens[i].type == TokenType.PREPROCESSOR:
1311 # Skip the preprocessor directive itself
1312 i += 1
1313 return i
1315 def _parse_function_parameters(self, tokens, start_pos, end_pos, func_name):
1316 """Parse function parameters from token range"""
1318 parameters = []
1320 # Find the opening parenthesis for the function
1321 paren_start = None
1322 paren_end = None
1324 for i in range(start_pos, min(end_pos + 1, len(tokens))):
1325 if tokens[i].type == TokenType.IDENTIFIER and tokens[i].value == func_name:
1326 # Look for opening parenthesis after function name
1327 for j in range(i + 1, min(end_pos + 1, len(tokens))):
1328 if tokens[j].type == TokenType.LPAREN:
1329 paren_start = j
1330 break
1331 elif tokens[j].type not in [
1332 TokenType.WHITESPACE,
1333 TokenType.COMMENT,
1334 ]:
1335 break
1336 break
1338 if paren_start is None:
1339 return parameters
1341 # Find matching closing parenthesis
1342 paren_depth = 1
1343 for i in range(paren_start + 1, min(end_pos + 1, len(tokens))):
1344 if tokens[i].type == TokenType.LPAREN:
1345 paren_depth += 1
1346 elif tokens[i].type == TokenType.RPAREN:
1347 paren_depth -= 1
1348 if paren_depth == 0:
1349 paren_end = i
1350 break
1352 if paren_end is None:
1353 return parameters
1355 # Parse parameter tokens between parentheses
1356 param_tokens = []
1357 for i in range(paren_start + 1, paren_end):
1358 if tokens[i].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
1359 param_tokens.append(tokens[i])
1361 # If no parameters or just "void", return empty list
1362 if not param_tokens or (
1363 len(param_tokens) == 1 and param_tokens[0].value == "void"
1364 ):
1365 return parameters
1367 # Split parameters by commas, but handle function pointers correctly
1368 current_param = []
1369 paren_depth = 0
1370 for token in param_tokens:
1371 if token.type == TokenType.LPAREN:
1372 paren_depth += 1
1373 elif token.type == TokenType.RPAREN:
1374 paren_depth -= 1
1375 elif token.type == TokenType.COMMA and paren_depth == 0:
1376 # Only split on commas that are not inside parentheses
1377 if current_param:
1378 param = self._parse_single_parameter(current_param)
1379 if param:
1380 parameters.append(param)
1381 current_param = []
1382 continue
1384 current_param.append(token)
1386 # Handle last parameter
1387 if current_param:
1388 param = self._parse_single_parameter(current_param)
1389 if param:
1390 parameters.append(param)
1392 return parameters
1394 def _parse_single_parameter(self, param_tokens):
1395 """Parse a single function parameter from tokens"""
1396 from ..models import Field
1398 if not param_tokens:
1399 return None
1401 # Handle variadic parameters (three consecutive dots)
1402 if len(param_tokens) == 3 and all(t.value == "." for t in param_tokens):
1403 return Field(name="...", type="...")
1405 # Handle variadic parameters (single ... token)
1406 if len(param_tokens) == 1 and param_tokens[0].value == "...":
1407 return Field(name="...", type="...")
1409 # Handle function pointer parameters: type (*name)(params)
1410 if len(param_tokens) >= 5:
1411 # Look for pattern: type ( * name ) ( params )
1412 for i in range(len(param_tokens) - 4):
1413 if (
1414 param_tokens[i].type == TokenType.LPAREN
1415 and param_tokens[i + 1].type == TokenType.ASTERISK
1416 and param_tokens[i + 2].type == TokenType.IDENTIFIER
1417 and param_tokens[i + 3].type == TokenType.RPAREN
1418 and param_tokens[i + 4].type == TokenType.LPAREN
1419 ):
1420 # Found function pointer pattern
1421 func_name = param_tokens[i + 2].value
1423 # Find the closing parenthesis for the parameter list
1424 paren_count = 1
1425 param_end = i + 5
1426 while param_end < len(param_tokens) and paren_count > 0:
1427 if param_tokens[param_end].type == TokenType.LPAREN:
1428 paren_count += 1
1429 elif param_tokens[param_end].type == TokenType.RPAREN:
1430 paren_count -= 1
1431 param_end += 1
1433 if paren_count == 0:
1434 # Extract the type (everything before the function pointer)
1435 type_tokens = param_tokens[:i]
1436 param_type = " ".join(t.value for t in type_tokens)
1438 # Extract the function pointer part
1439 func_ptr_tokens = param_tokens[i:param_end]
1440 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1442 # Combine type and function pointer
1443 full_type = (param_type + " " + func_ptr_type).strip()
1445 # Fix array bracket spacing
1446 full_type = self._fix_array_bracket_spacing(full_type)
1448 return Field(name=func_name, type=full_type)
1449 else:
1450 # Incomplete function pointer - try to reconstruct
1451 type_tokens = param_tokens[:i]
1452 param_type = " ".join(t.value for t in type_tokens)
1453 func_ptr_tokens = param_tokens[i:]
1454 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1455 full_type = (param_type + " " + func_ptr_type).strip()
1456 full_type = self._fix_array_bracket_spacing(full_type)
1457 return Field(name=func_name, type=full_type)
1459 # Also look for pattern: type ( * name ) ( params ) with spaces
1460 for i in range(len(param_tokens) - 4):
1461 if (
1462 param_tokens[i].type == TokenType.LPAREN
1463 and param_tokens[i + 1].type == TokenType.ASTERISK
1464 and param_tokens[i + 2].type == TokenType.IDENTIFIER
1465 and param_tokens[i + 3].type == TokenType.RPAREN
1466 and param_tokens[i + 4].type == TokenType.LPAREN
1467 ):
1468 # Found function pointer pattern
1469 func_name = param_tokens[i + 2].value
1471 # Find the closing parenthesis for the parameter list
1472 paren_count = 1
1473 param_end = i + 5
1474 while param_end < len(param_tokens) and paren_count > 0:
1475 if param_tokens[param_end].type == TokenType.LPAREN:
1476 paren_count += 1
1477 elif param_tokens[param_end].type == TokenType.RPAREN:
1478 paren_count -= 1
1479 param_end += 1
1481 if paren_count == 0:
1482 # Extract the type (everything before the function pointer)
1483 type_tokens = param_tokens[:i]
1484 param_type = " ".join(t.value for t in type_tokens)
1486 # Extract the function pointer part
1487 func_ptr_tokens = param_tokens[i:param_end]
1488 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1490 # Combine type and function pointer
1491 full_type = (param_type + " " + func_ptr_type).strip()
1493 # Fix array bracket spacing
1494 full_type = self._fix_array_bracket_spacing(full_type)
1496 return Field(name=func_name, type=full_type)
1497 else:
1498 # Incomplete function pointer - try to reconstruct
1499 type_tokens = param_tokens[:i]
1500 param_type = " ".join(t.value for t in type_tokens)
1501 func_ptr_tokens = param_tokens[i:]
1502 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1503 full_type = (param_type + " " + func_ptr_type).strip()
1504 full_type = self._fix_array_bracket_spacing(full_type)
1505 return Field(name=func_name, type=full_type)
1507 # For parameters like "int x" or "const char *name" or "char* argv[]"
1508 if len(param_tokens) >= 2:
1509 # Check if the last token is a closing bracket (array parameter)
1510 if param_tokens[-1].type == TokenType.RBRACKET:
1511 # Find the opening bracket to get the array size
1512 bracket_start = None
1513 for i in range(len(param_tokens) - 1, -1, -1):
1514 if param_tokens[i].type == TokenType.LBRACKET:
1515 bracket_start = i
1516 break
1518 if bracket_start is not None:
1519 # Extract the parameter name (last identifier before the opening bracket)
1520 param_name = None
1521 for i in range(bracket_start - 1, -1, -1):
1522 if param_tokens[i].type == TokenType.IDENTIFIER:
1523 param_name = param_tokens[i].value
1524 break
1526 if param_name:
1527 # Extract the type (everything before the parameter name)
1528 type_tokens = param_tokens[:i]
1529 param_type = " ".join(t.value for t in type_tokens)
1531 # Add the array brackets to the type
1532 array_size = ""
1533 if bracket_start + 1 < len(param_tokens) - 1:
1534 # There's content between brackets
1535 array_content = param_tokens[bracket_start + 1:-1]
1536 array_size = " ".join(t.value for t in array_content)
1538 param_type = param_type + "[" + array_size + "]"
1540 # Fix array bracket spacing
1541 param_type = self._fix_array_bracket_spacing(param_type)
1543 return Field(name=param_name, type=param_type)
1544 else:
1545 # Regular parameter: last token is the parameter name
1546 param_name = param_tokens[-1].value
1547 type_tokens = param_tokens[:-1]
1548 param_type = " ".join(t.value for t in type_tokens)
1550 # Fix array bracket spacing and pointer spacing
1551 param_type = self._fix_array_bracket_spacing(param_type)
1552 param_type = self._fix_pointer_spacing(param_type)
1554 # Handle unnamed parameters (just type)
1555 if param_name in [
1556 "void",
1557 "int",
1558 "char",
1559 "float",
1560 "double",
1561 "long",
1562 "short",
1563 "unsigned",
1564 "signed",
1565 ]:
1566 # This is just a type without a name
1567 return Field(name="unnamed", type=param_type + " " + param_name)
1569 # Additional validation before creating Field
1570 if param_name and param_name.strip() and param_type and param_type.strip():
1571 return Field(name=param_name.strip(), type=param_type.strip())
1572 else:
1573 # Fallback for invalid parameters - try to reconstruct the full parameter
1574 full_param = " ".join(t.value for t in param_tokens)
1575 full_param = self._fix_array_bracket_spacing(full_param)
1576 if full_param.strip():
1577 return Field(name="unnamed", type=full_param.strip())
1578 else:
1579 return Field(name="unnamed", type="unknown")
1580 elif len(param_tokens) == 1:
1581 # Single token - might be just type (like "void") or name
1582 token_value = param_tokens[0].value
1583 if token_value in [
1584 "void",
1585 "int",
1586 "char",
1587 "float",
1588 "double",
1589 "long",
1590 "short",
1591 "unsigned",
1592 "signed",
1593 ]:
1594 return Field(name="unnamed", type=token_value)
1595 else:
1596 # If we can't determine the type, use the token value as type
1597 if token_value and token_value.strip():
1598 return Field(name="unnamed", type=token_value.strip())
1599 else:
1600 return Field(name="unnamed", type="unknown")
1602 return None
1604 def _fix_array_bracket_spacing(self, type_str: str) -> str:
1605 """Fix spacing around array brackets in type strings"""
1606 # First clean the type string to remove newlines
1607 type_str = self._clean_type_string(type_str)
1608 # Replace patterns like "type[ size ]" with "type[size]"
1609 import re
1610 # Remove spaces around array brackets
1611 type_str = re.sub(r'\s*\[\s*', '[', type_str)
1612 type_str = re.sub(r'\s*\]\s*', ']', type_str)
1613 return type_str
1615 def _fix_pointer_spacing(self, type_str: str) -> str:
1616 """Fix spacing around pointer asterisks in type strings"""
1617 import re
 1618 # Collapse any run of spaced asterisks ("type * *" -> "type **",
 1619 # "type * * *" -> "type ***"); the lookahead works for any pointer depth
 1620 # in a single pass and leaves a lone "type * name" untouched.
 1621 type_str = re.sub(r'\*\s+(?=\*)', '*', type_str)
 1622 return type_str
1624 def _clean_type_string(self, type_str: str) -> str:
1625 """Clean type string by removing newlines and normalizing whitespace"""
1626 if not type_str:
1627 return type_str
1628 # Replace newlines with spaces and normalize whitespace
1629 cleaned = type_str.replace('\n', ' ')
1630 # Normalize multiple spaces to single space
1631 import re
1632 cleaned = re.sub(r'\s+', ' ', cleaned)
1633 # Strip leading/trailing whitespace
1634 cleaned = cleaned.strip()
1635 return cleaned
1637 def _clean_value_string(self, value_str: str) -> str:
1638 """Clean value string by removing excessive whitespace and newlines"""
1639 if not value_str:
1640 return value_str
1641 # Replace newlines with spaces and normalize whitespace
1642 cleaned = value_str.replace('\n', ' ')
1643 # Normalize multiple spaces to single space
1644 import re
1645 cleaned = re.sub(r'\s+', ' ', cleaned)
1646 # Strip leading/trailing whitespace
1647 cleaned = cleaned.strip()
1648 # Remove excessive spaces around braces and operators
1649 cleaned = re.sub(r'\s*{\s*', '{', cleaned)
1650 cleaned = re.sub(r'\s*}\s*', '}', cleaned)
1651 cleaned = re.sub(r'\s*,\s*', ', ', cleaned)
1652 cleaned = re.sub(r'\s*&\s*', '&', cleaned)
1653 return cleaned
1655 def _get_timestamp(self) -> str:
1656 """Get current timestamp string"""
1657 from datetime import datetime
1659 return datetime.now().isoformat()
1662class Parser:
1663 """Main parser class for Step 1: Parse C code files and generate model.json"""
1665 def __init__(self):
1666 self.c_parser = CParser()
1667 self.logger = logging.getLogger(__name__)
1669 def parse(
1670 self,
1671 source_folders: "List[str]",
1672 output_file: str = "model.json",
1673 recursive_search: bool = True,
1674 config: "Config" = None,
1675 ) -> str:
1676 """Parse C/C++ projects and generate model.json
1678 Args:
1679 source_folders: List of source folder directories within the project
1680 output_file: Path to the output model.json file
1681 recursive_search: Whether to search subdirectories recursively
1682 config: Configuration object for filtering and processing
1684 Returns:
1685 Path to the generated model.json file
1686 """
1687 # Enhanced validation for source_folders
1688 if not isinstance(source_folders, list):
1689 raise TypeError(f"source_folders must be a list of strings, got: {type(source_folders)}")
1691 if not source_folders:
1692 raise ValueError("At least one source folder must be provided")
1694 # Validate all items are strings and not empty
1695 for i, folder in enumerate(source_folders):
1696 if not isinstance(folder, str):
1697 raise TypeError(f"All source folders must be strings, got {type(folder)} at index {i}: {folder}")
1698 if not folder.strip():
1699 raise ValueError(f"Source folder at index {i} cannot be empty or whitespace: {repr(folder)}")
1701 self.logger.info(
1702 f"Step 1: Parsing C/C++ project with {len(source_folders)} source folders"
1703 )
1705 # Get project name from config or use default
1706 project_name = (
1707 getattr(config, "project_name", "C_Project") if config else "C_Project"
1708 )
1710 # Parse each source folder and combine results
1711 all_files = {}
1712 total_structs = 0
1713 total_enums = 0
1714 total_functions = 0
1715 failed_folders = []
1717 for i, source_folder in enumerate(source_folders):
1718 self.logger.info(
1719 f"Parsing source folder {i+1}/{len(source_folders)}: {source_folder}"
1720 )
1722 try:
1723 # Parse the individual source folder
1724 model = self.c_parser.parse_project(
1725 source_folder, recursive_search, config
1726 )
1728 all_files.update(model.files)
1730 # Update totals
1731 total_structs += sum(len(f.structs) for f in model.files.values())
1732 total_enums += sum(len(f.enums) for f in model.files.values())
1733 total_functions += sum(len(f.functions) for f in model.files.values())
1735 self.logger.info(
1736 f"Successfully parsed source folder {source_folder}: {len(model.files)} files"
1737 )
1739 except Exception as e:
1740 self.logger.error(
1741 "Failed to parse source folder %s: %s", source_folder, e
1742 )
1743 failed_folders.append((source_folder, str(e)))
1745 # If this is the only source folder, re-raise the error
1746 if len(source_folders) == 1:
1747 raise
1749 # For multiple source folders, continue with others but log the failure
1750 self.logger.warning(
1751 "Continuing with other source folders despite failure in %s", source_folder
1752 )
1754 # If all source folders failed, raise an error
1755 if failed_folders and len(failed_folders) == len(source_folders):
1756 error_msg = "All source folders failed to parse:\n"
1757 for folder, error in failed_folders:
1758 error_msg += f" - {folder}: {error}\n"
1759 raise RuntimeError(error_msg)
1761 # If some folders failed, log a warning
1762 if failed_folders:
1763 self.logger.warning(
1764 f"Failed to parse {len(failed_folders)} out of {len(source_folders)} source folders"
1765 )
1767 # Create combined project model
1768 combined_model = ProjectModel(
1769 project_name=project_name,
1770 source_folder=(
1771 ",".join(source_folders)
1772 if len(source_folders) > 1
1773 else source_folders[0]
1774 ),
1775 files=all_files,
1776 )
1778 # Update all uses fields across the entire combined project
1779 combined_model.update_uses_fields()
1781 # Save combined model to JSON file
1782 try:
1783 combined_model.save(output_file)
1784 except Exception as e:
1785 raise RuntimeError(f"Failed to save model to {output_file}: {e}") from e
1787 # Step 1.5: Verify model sanity
1788 self.logger.info("Step 1.5: Verifying model sanity...")
1789 from .verifier import ModelVerifier
1791 verifier = ModelVerifier()
1792 is_valid, issues = verifier.verify_model(combined_model)
1794 if not is_valid:
1795 self.logger.warning(
1796 f"Model verification found {len(issues)} issues - model may contain parsing errors"
1797 )
1798 # Continue processing but warn about potential issues
1799 else:
1800 self.logger.info("Model verification passed - all values look sane")
1802 self.logger.info("Step 1 complete! Model saved to: %s", output_file)
1803 self.logger.info(
1804 f"Found {len(all_files)} total files across {len(source_folders)} source folder(s)"
1805 )
1807 # Print summary
1808 self.logger.info(
1809 f"Summary: {total_structs} structs, {total_enums} enums, "
1810 f"{total_functions} functions"
1811 )
1813 return output_file