Coverage for src/c2puml/core/parser.py: 79%

932 statements  

coverage.py v7.10.4, created at 2025-08-20 03:53 +0000

#!/usr/bin/env python3
"""
Parser module for C to PlantUML converter - Step 1: Parse C code files and generate model.json
"""
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Set

from ..models import Enum, EnumValue, Field, FileModel, ProjectModel, Struct
from .parser_tokenizer import (
    CTokenizer,
    StructureFinder,
    TokenType,
    find_enum_values,
    find_struct_fields,
)
from .preprocessor import PreprocessorManager
from .parser_anonymous_processor import AnonymousTypedefProcessor
from ..utils import detect_file_encoding

if TYPE_CHECKING:
    from ..config import Config
    from ..models import Alias, Enum, Field, Function, Struct, Union


class CParser:
    """C/C++ parser for extracting structural information from source code using tokenization"""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.tokenizer = CTokenizer()
        self.preprocessor = PreprocessorManager()

    def parse_project(
        self, source_folder: str, recursive_search: bool = True, config: "Config" = None
    ) -> ProjectModel:
        """Parse a C/C++ project and return a model"""
        # Enhanced source path validation
        if not source_folder or not isinstance(source_folder, str):
            raise ValueError(f"Source folder must be a non-empty string, got: {type(source_folder)}")

        if not source_folder.strip():
            raise ValueError("Source folder cannot be empty or whitespace")

        try:
            source_folder_path = Path(source_folder).resolve()
        except (OSError, RuntimeError) as e:
            raise ValueError(f"Failed to resolve source folder path '{source_folder}': {e}")

        if not source_folder_path.exists():
            # Provide helpful error message with suggestions
            error_msg = f"Source folder not found: {source_folder_path}"

            # Check if it's a relative path issue
            if not Path(source_folder).is_absolute():
                current_dir = Path.cwd()
                error_msg += f"\nCurrent working directory: {current_dir}"
                error_msg += f"\nTried to resolve relative path: {source_folder}"

            # Check if parent directory exists
            parent_dir = source_folder_path.parent
            if parent_dir.exists():
                error_msg += f"\nParent directory exists: {parent_dir}"
                # List contents of parent directory
                try:
                    contents = [item.name for item in parent_dir.iterdir() if item.is_dir()]
                    if contents:
                        error_msg += f"\nAvailable directories in parent: {', '.join(contents[:10])}"
                        if len(contents) > 10:
                            error_msg += f" (and {len(contents) - 10} more)"
                except (OSError, PermissionError):
                    error_msg += "\nCannot list parent directory contents (permission denied)"
            else:
                error_msg += f"\nParent directory does not exist: {parent_dir}"

            raise ValueError(error_msg)

        if not source_folder_path.is_dir():
            raise ValueError(f"Source folder must be a directory, got: {source_folder_path} (is_file: {source_folder_path.is_file()})")

        # Check if directory is readable
        try:
            source_folder_path.iterdir()
        except PermissionError:
            raise ValueError(f"Permission denied accessing source folder: {source_folder_path}")
        except OSError as e:
            raise ValueError(f"Error accessing source folder '{source_folder_path}': {e}")

        self.logger.info("Parsing project: %s", source_folder_path)

        # Find all C/C++ files in the project
        try:
            all_c_files = self._find_c_files(source_folder_path, recursive_search)
        except OSError as e:
            raise ValueError(f"Error searching for C/C++ files in '{source_folder_path}': {e}")

        self.logger.info("Found %d C/C++ files", len(all_c_files))

        # Apply file filtering based on configuration
        c_files = []
        if config:
            for file_path in all_c_files:
                if config._should_include_file(file_path.name):
                    c_files.append(file_path)
                    self.logger.debug(
                        "Included file after filtering: %s", file_path.name
                    )
                else:
                    self.logger.debug(
                        "Excluded file after filtering: %s", file_path.name
                    )
        else:
            c_files = all_c_files

        self.logger.info("After filtering: %d C/C++ files", len(c_files))

        # Parse each file using filename as key for simplified tracking
        files = {}
        failed_files = []

        for file_path in c_files:
            try:
                # Use relative path for tracking and filename as key
                relative_path = str(file_path.relative_to(source_folder_path))
                file_model = self.parse_file(file_path, relative_path)

                # Use filename as key (filenames are guaranteed to be unique)
                if file_model.name in files:
                    raise RuntimeError(
                        f"Duplicate filename detected: '{file_model.name}' from '{file_path}'. "
                        f"Already seen from '{files[file_model.name].file_path}'."
                    )
                files[file_model.name] = file_model

                self.logger.debug("Successfully parsed: %s", relative_path)

            except (OSError, ValueError) as e:
                self.logger.warning("Failed to parse %s: %s", file_path, e)
                failed_files.append(str(file_path))

        if failed_files:
            error_msg = (
                f"Failed to parse {len(failed_files)} files: {failed_files}. "
                "Stopping model processing."
            )
            self.logger.error(error_msg)
            raise RuntimeError(error_msg)

        model = ProjectModel(
            project_name=source_folder_path.name,
            source_folder=str(source_folder_path),
            files=files,
        )

        # Update all uses fields across the entire project
        model.update_uses_fields()

        self.logger.info("Parsing complete. Parsed %d files successfully.", len(files))
        return model
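    # Example: a minimal sketch of driving parse_project directly; the "./src"
    # path is hypothetical and config is left as None (no file filtering):
    #
    #     parser = CParser()
    #     model = parser.parse_project("./src", recursive_search=True, config=None)
    #     for name, file_model in model.files.items():
    #         print(name, len(file_model.functions), "functions")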

    def parse_file(self, file_path: Path, relative_path: str) -> FileModel:
        """Parse a single C/C++ file and return a file model using tokenization"""
        self.logger.debug("Parsing file: %s", file_path)

        # Detect encoding
        encoding = self._detect_encoding(file_path)

        # Read file content
        with open(file_path, "r", encoding=encoding) as f:
            content = f.read()

        # Tokenize the content
        tokens = self.tokenizer.tokenize(content)
        self.logger.debug("Tokenized file into %d tokens", len(tokens))

        # Process preprocessor directives
        self.preprocessor.add_defines_from_content(tokens)
        processed_tokens = self.preprocessor.process_file(tokens)
        self.logger.debug(
            "Preprocessor processed %d tokens -> %d tokens",
            len(tokens),
            len(processed_tokens),
        )

        # Filter out whitespace and comments for structure finding
        filtered_tokens = self.tokenizer.filter_tokens(processed_tokens)
        structure_finder = StructureFinder(filtered_tokens)

        # Parse different structures using tokenizer
        structs = self._parse_structs_with_tokenizer(processed_tokens, structure_finder)
        enums = self._parse_enums_with_tokenizer(processed_tokens, structure_finder)
        unions = self._parse_unions_with_tokenizer(processed_tokens, structure_finder)
        functions = self._parse_functions_with_tokenizer(
            processed_tokens, structure_finder
        )
        aliases = self._parse_aliases_with_tokenizer(processed_tokens)

        # "uses" fields will be updated when we have the full project model

        # Map typedef names to anonymous structs/enums/unions if needed
        # This logic will be handled by typedef_relations instead

        file_model = FileModel(
            file_path=str(file_path),
            structs=structs,
            enums=enums,
            unions=unions,
            functions=functions,
            globals=self._parse_globals_with_tokenizer(processed_tokens),
            includes=self._parse_includes_with_tokenizer(processed_tokens),
            macros=self._parse_macros_with_tokenizer(processed_tokens),
            aliases=aliases,
            # Tag names are now stored in struct/enum/union objects
        )

        # Process anonymous typedefs after initial parsing
        anonymous_processor = AnonymousTypedefProcessor()
        anonymous_processor.process_file_model(file_model)

        return file_model
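    # Example: for a small header containing (hypothetical C input)
    #
    #     typedef struct Point { int x; int y; } Point;
    #     #define MAX_POINTS 16
    #     int add(int a, int b);
    #
    # parse_file is expected to record "Point" in structs, the #define line in
    # macros, and "add" as a function declaration; the exact field and type
    # strings depend on the tokenizer and the anonymous-typedef processor.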

    def _parse_structs_with_tokenizer(
        self, tokens, structure_finder
    ) -> Dict[str, "Struct"]:
        """Parse struct definitions using tokenizer"""

        structs = {}
        struct_infos = structure_finder.find_structs()

        for start_pos, end_pos, struct_name in struct_infos:
            # Need to map back to original token positions
            # Find the original token positions by looking at line/column info
            original_start = self._find_original_token_pos(
                tokens, structure_finder.tokens, start_pos
            )
            original_end = self._find_original_token_pos(
                tokens, structure_finder.tokens, end_pos
            )

            if original_start is not None and original_end is not None:
                # Extract field information from original token range
                field_tuples = find_struct_fields(tokens, original_start, original_end)

                # Convert to Field objects
                fields = []
                for field_name, field_type in field_tuples:
                    try:
                        fields.append(Field(field_name, field_type))
                    except ValueError as e:
                        self.logger.warning(
                            "Error creating field %s: %s", field_name, e
                        )

                # For anonymous structs, use a special key that can be mapped later
                if not struct_name:
                    struct_name = "__anonymous_struct__"

                # Extract tag name if this is a typedef struct
                tag_name = ""
                if struct_name and not struct_name.startswith("__anonymous"):
                    # Check if this struct has a typedef
                    tag_name = self._extract_tag_name_for_struct(tokens, struct_name)

                # Only register non-empty struct names here; anonymous will be created by the anonymous processor
                if struct_name:
                    structs[struct_name] = Struct(
                        struct_name, fields, tag_name=tag_name, uses=[]
                    )
                    self.logger.debug(
                        "Parsed struct: %s with %d fields", struct_name, len(fields)
                    )

        return structs

    def _parse_enums_with_tokenizer(
        self, tokens, structure_finder
    ) -> Dict[str, "Enum"]:
        """Parse enum definitions using tokenizer"""
        enums = {}
        enum_infos = structure_finder.find_enums()

        for start_pos, end_pos, enum_name in enum_infos:
            # Need to map back to original token positions
            original_start = self._find_original_token_pos(
                tokens, structure_finder.tokens, start_pos
            )
            original_end = self._find_original_token_pos(
                tokens, structure_finder.tokens, end_pos
            )

            if original_start is not None and original_end is not None:
                # Extract enum values from original token range
                value_strs = find_enum_values(tokens, original_start, original_end)
                values = []
                for v in value_strs:
                    if "=" in v:
                        name, val = v.split("=", 1)
                        name = name.strip()
                        val = val.strip()
                        if name:  # Only add if name is not empty
                            values.append(EnumValue(name=name, value=val))
                    else:
                        name = v.strip()
                        if name:  # Only add if name is not empty
                            values.append(EnumValue(name=name))

                # For anonymous enums, use a special key that can be mapped later
                if not enum_name:
                    enum_name = "__anonymous_enum__"

                # Extract tag name if this is a typedef enum
                tag_name = ""
                if enum_name and not enum_name.startswith("__anonymous"):
                    # Check if this enum has a typedef
                    tag_name = self._extract_tag_name_for_enum(tokens, enum_name)

                enums[enum_name] = Enum(enum_name, values, tag_name=tag_name)
                self.logger.debug(
                    "Parsed enum: %s with %d values", enum_name, len(values)
                )

        return enums

    def _parse_unions_with_tokenizer(
        self, tokens, structure_finder
    ) -> Dict[str, "Union"]:
        """Parse union definitions using tokenizer"""
        from ..models import Field, Union

        unions = {}
        union_infos = structure_finder.find_unions()

        for start_pos, end_pos, union_name in union_infos:
            # Need to map back to original token positions
            original_start = self._find_original_token_pos(
                tokens, structure_finder.tokens, start_pos
            )
            original_end = self._find_original_token_pos(
                tokens, structure_finder.tokens, end_pos
            )

            if original_start is not None and original_end is not None:
                # Extract field information from original token range
                field_tuples = find_struct_fields(tokens, original_start, original_end)

                # Convert to Field objects
                fields = []
                for field_name, field_type in field_tuples:
                    try:
                        fields.append(Field(field_name, field_type))
                    except ValueError as e:
                        self.logger.warning(
                            "Error creating union field %s: %s", field_name, e
                        )

                # For anonymous unions, use a special key that can be mapped later
                if not union_name:
                    union_name = "__anonymous_union__"

                # Extract tag name if this is a typedef union
                tag_name = ""
                if union_name and not union_name.startswith("__anonymous"):
                    # Check if this union has a typedef
                    tag_name = self._extract_tag_name_for_union(tokens, union_name)

                unions[union_name] = Union(
                    union_name, fields, tag_name=tag_name, uses=[]
                )
                self.logger.debug(
                    "Parsed union: %s with %d fields", union_name, len(fields)
                )

        return unions

    def _parse_functions_with_tokenizer(
        self, tokens, structure_finder
    ) -> List["Function"]:
        """Parse function declarations/definitions using tokenizer"""
        from ..models import Function

        functions = []
        function_infos = structure_finder.find_functions()

        for (
            start_pos,
            end_pos,
            func_name,
            return_type,
            is_declaration,
            is_inline,
        ) in function_infos:
            # Map back to original token positions to parse parameters
            original_start = self._find_original_token_pos(
                tokens, structure_finder.tokens, start_pos
            )
            original_end = self._find_original_token_pos(
                tokens, structure_finder.tokens, end_pos
            )

            parameters = []
            if original_start is not None and original_end is not None:
                # Parse parameters from the token range
                parameters = self._parse_function_parameters(
                    tokens, original_start, original_end, func_name
                )

            try:
                # Create function with declaration flag
                function = Function(func_name, return_type, parameters)
                # Add custom attributes to track if this is a declaration and if it's inline
                function.is_declaration = is_declaration
                function.is_inline = is_inline
                functions.append(function)
                self.logger.debug(
                    f"Parsed function: {func_name} with {len(parameters)} parameters (declaration: {is_declaration}, inline: {is_inline})"
                )
            except Exception as e:
                self.logger.warning("Error creating function %s: %s", func_name, e)

        return functions

    def _parse_globals_with_tokenizer(self, tokens) -> List["Field"]:
        """Parse global variables using tokenizer"""
        from ..models import Field

        globals_list = []

        i = 0
        while i < len(tokens):
            # Skip preprocessor directives, comments, etc.
            if tokens[i].type in [
                TokenType.INCLUDE,
                TokenType.DEFINE,
                TokenType.COMMENT,
                TokenType.WHITESPACE,
                TokenType.NEWLINE,
            ]:
                i += 1
                continue

            # Skip preprocessor directives but keep their content
            if tokens[i].type == TokenType.PREPROCESSOR:
                i = self._skip_preprocessor_directives(tokens, i)
                continue

            # Skip function definitions (look for parentheses)
            if self._looks_like_function(tokens, i):
                i = self._skip_function(tokens, i)
                continue

            # Skip struct/enum/union definitions
            if tokens[i].type in [
                TokenType.STRUCT,
                TokenType.ENUM,
                TokenType.UNION,
                TokenType.TYPEDEF,
            ]:
                i = self._skip_structure_definition(tokens, i)
                continue

            # Skip if we're inside a struct definition (look for opening brace)
            if i > 0 and tokens[i - 1].type == TokenType.LBRACE:
                # We're inside a struct, skip until closing brace
                brace_count = 1
                j = i
                while j < len(tokens) and brace_count > 0:
                    if tokens[j].type == TokenType.LBRACE:
                        brace_count += 1
                    elif tokens[j].type == TokenType.RBRACE:
                        brace_count -= 1
                    j += 1
                i = j
                continue

            # Skip macros and other preprocessor content
            if tokens[i].type == TokenType.DEFINE:
                # Skip the entire macro content (multi-line macros are now merged)
                i += 1
                continue

            # Additional check: skip if we're inside any brace block (struct, function, etc.)
            brace_count = 0
            j = i - 1
            while j >= 0:
                if tokens[j].type == TokenType.RBRACE:
                    brace_count += 1
                elif tokens[j].type == TokenType.LBRACE:
                    brace_count -= 1
                    if brace_count < 0:
                        # We're inside a brace block, skip this token
                        i += 1
                        break
                j -= 1
            else:
                # Not inside a brace block, proceed with global variable parsing
                global_info = self._parse_global_variable(tokens, i)
                if global_info:
                    var_name, var_type, var_value = global_info
                    # Only add if it looks like a real global variable (not a fragment)
                    if (
                        var_name
                        and var_name.strip()
                        and var_type
                        and var_type.strip()
                        and not var_name.startswith("#")
                        and len(var_type) < 200
                        and not var_type.startswith("\\")
                        and not var_name.startswith("\\")
                        and "\\" not in var_type
                        and "\\" not in var_name
                    ):
                        try:
                            # Additional validation before creating Field
                            stripped_name = var_name.strip()
                            stripped_type = var_type.strip()
                            if stripped_name and stripped_type:
                                globals_list.append(
                                    Field(
                                        name=stripped_name,
                                        type=stripped_type,
                                        value=var_value,
                                    )
                                )
                                self.logger.debug(
                                    f"Parsed global: {stripped_name} : {stripped_type}"
                                )
                        except Exception as e:
                            self.logger.warning(
                                f"Error creating global field {var_name}: {e}"
                            )
                    i = self._skip_to_semicolon(tokens, i)
                else:
                    i += 1

        return globals_list

    def _parse_includes_with_tokenizer(self, tokens) -> List[str]:
        """Parse #include directives using tokenizer"""
        includes = []

        for token in tokens:
            if token.type == TokenType.INCLUDE:
                # Extract include filename from the token value
                # e.g., "#include <stdio.h>" -> "stdio.h"
                # e.g., '#include "header.h"' -> "header.h"
                # e.g., "#include 'header.h'" -> "header.h"
                import re

                match = re.search(r'[<"\']([^>\'"]+)[>\'"]', token.value)
                if match:
                    # Return just the filename without quotes or angle brackets
                    includes.append(match.group(1))

        return includes

    def _parse_macros_with_tokenizer(self, tokens) -> List[str]:
        """Parse macro definitions using tokenizer"""
        macros = []

        for token in tokens:
            if token.type == TokenType.DEFINE:
                # Store the full macro definition for display flexibility
                # e.g., "#define PI 3.14159" -> "#define PI 3.14159"
                # e.g., "#define MIN(a, b) ((a) < (b) ? (a) : (b))" -> "#define MIN(a, b) ((a) < (b) ? (a) : (b))"
                macro_definition = token.value.strip()
                if macro_definition not in macros:
                    macros.append(macro_definition)

        return macros

    def _parse_aliases_with_tokenizer(self, tokens) -> Dict[str, "Alias"]:
        """Parse type aliases (primitive or derived typedefs) using tokenizer"""
        from ..models import Alias

        aliases = {}

        i = 0
        while i < len(tokens):
            if tokens[i].type == TokenType.TYPEDEF:
                # Found typedef, parse it
                typedef_info = self._parse_single_typedef(tokens, i)
                if typedef_info:
                    typedef_name, original_type = typedef_info

                    # Only include if it's NOT a struct/enum/union typedef
                    if original_type not in ["struct", "enum", "union"]:
                        aliases[typedef_name] = Alias(
                            name=typedef_name, original_type=original_type, uses=[]
                        )

            i += 1

        return aliases

    # _parse_typedef_relations_with_tokenizer method removed - tag names are now in struct/enum/union

    def _extract_tag_name_for_struct(self, tokens, struct_name: str) -> str:
        """Extract tag name for a struct if it has a typedef"""
        i = 0
        while i < len(tokens):
            if tokens[i].type == TokenType.TYPEDEF:
                typedef_info = self._parse_single_typedef(tokens, i)
                if typedef_info:
                    typedef_name, original_type = typedef_info
                    if original_type == "struct" and typedef_name == struct_name:
                        # Extract the tag name from the typedef
                        return self._extract_tag_name_from_typedef(tokens, i)
            i += 1
        return ""

    def _extract_tag_name_for_enum(self, tokens, enum_name: str) -> str:
        """Extract tag name for an enum if it has a typedef"""
        i = 0
        while i < len(tokens):
            if tokens[i].type == TokenType.TYPEDEF:
                typedef_info = self._parse_single_typedef(tokens, i)
                if typedef_info:
                    typedef_name, original_type = typedef_info
                    if original_type == "enum" and typedef_name == enum_name:
                        # Extract the tag name from the typedef
                        return self._extract_tag_name_from_typedef(tokens, i)
            i += 1
        return ""

    def _extract_tag_name_for_union(self, tokens, union_name: str) -> str:
        """Extract tag name for a union if it has a typedef"""
        i = 0
        while i < len(tokens):
            if tokens[i].type == TokenType.TYPEDEF:
                typedef_info = self._parse_single_typedef(tokens, i)
                if typedef_info:
                    typedef_name, original_type = typedef_info
                    if original_type == "union" and typedef_name == union_name:
                        # Extract the tag name from the typedef
                        return self._extract_tag_name_from_typedef(tokens, i)
            i += 1
        return ""

    def _extract_non_primitive_types(
        self, type_str: str, available_types: Set[str]
    ) -> List[str]:
        """Extract non-primitive type names from a type string that exist in available_types"""
        # Define primitive types
        primitive_types = {
            "void",
            "char",
            "short",
            "int",
            "long",
            "float",
            "double",
            "signed",
            "unsigned",
            "const",
            "volatile",
            "static",
            "extern",
            "auto",
            "register",
            "inline",
            "restrict",
            "size_t",
            "ptrdiff_t",
            "int8_t",
            "int16_t",
            "int32_t",
            "int64_t",
            "uint8_t",
            "uint16_t",
            "uint32_t",
            "uint64_t",
            "intptr_t",
            "uintptr_t",
            "bool",
            "true",
            "false",
            "NULL",
            "nullptr",
        }

        # Remove common C keywords and operators
        import re

        # Split by common delimiters and operators
        parts = re.split(r"[\[\]\(\)\{\}\s\*&,;]", type_str)

        # Extract potential type names that exist in available_types
        types = []
        for part in parts:
            part = part.strip()
            if part and len(part) > 1 and part not in primitive_types:
                # Check if it looks like a type name (starts with letter, contains letters/numbers/underscores)
                if re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", part):
                    # Only include if it exists in available_types
                    if part in available_types:
                        types.append(part)

        return list(set(types))  # Remove duplicates
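    # Example (illustrative): with available_types = {"MyStruct", "Buffer"}, a
    # type string such as "const MyStruct * items[10]" is split on brackets,
    # spaces and '*', primitives and qualifiers like "const" are dropped, and
    # only names present in available_types survive, yielding ["MyStruct"].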

    def _find_c_files(
        self, source_folder_path: Path, recursive_search: bool
    ) -> List[Path]:
        """Find all C/C++ files in the source folder"""
        c_extensions = {".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx"}
        files = []

        self.logger.debug("Searching for files with extensions: %s", c_extensions)

        try:
            if recursive_search:
                for ext in c_extensions:
                    try:
                        files.extend(source_folder_path.rglob(f"*{ext}"))
                    except (OSError, PermissionError) as e:
                        self.logger.warning("Error during recursive search for %s files: %s", ext, e)
                        # Continue with other extensions
            else:
                for ext in c_extensions:
                    try:
                        files.extend(source_folder_path.glob(f"*{ext}"))
                    except (OSError, PermissionError) as e:
                        self.logger.warning("Error during search for %s files: %s", ext, e)
                        # Continue with other extensions
        except Exception as e:
            raise OSError(f"Failed to search for C/C++ files in '{source_folder_path}': {e}")

        # Filter out hidden files and common exclude patterns
        filtered_files = []
        exclude_patterns = {".git", "__pycache__", "node_modules", ".vscode", ".idea"}

        for file_path in files:
            try:
                # Skip hidden files and directories
                if any(part.startswith(".") for part in file_path.parts):
                    continue

                # Skip common exclude patterns
                if any(pattern in file_path.parts for pattern in exclude_patterns):
                    continue

                # Verify the file is actually accessible
                if not file_path.exists():
                    self.logger.debug("Skipping non-existent file: %s", file_path)
                    continue

                if not file_path.is_file():
                    self.logger.debug("Skipping non-file item: %s", file_path)
                    continue

                filtered_files.append(file_path)
            except (OSError, PermissionError) as e:
                self.logger.warning("Error accessing file %s: %s", file_path, e)
                # Skip files we can't access
                continue

        self.logger.debug("Found %d C/C++ files after filtering", len(filtered_files))
        return sorted(filtered_files)

    def _detect_encoding(self, file_path: Path) -> str:
        """Detect file encoding with platform-aware fallbacks"""
        return detect_file_encoding(file_path)

    def _find_original_token_pos(self, all_tokens, filtered_tokens, filtered_pos):
        """Find the position in all_tokens that corresponds to filtered_tokens[filtered_pos]"""
        if filtered_pos >= len(filtered_tokens):
            return None

        target_token = filtered_tokens[filtered_pos]

        # Search for the token in all_tokens by line and column
        for i, token in enumerate(all_tokens):
            if (
                token.line == target_token.line
                and token.column == target_token.column
                and token.value == target_token.value
            ):
                return i

        return None

    def _parse_single_typedef(self, tokens, start_pos):
        """Parse a single typedef starting at the given position"""
        # Skip 'typedef' keyword
        pos = start_pos + 1

        # Skip whitespace and comments
        while pos < len(tokens) and tokens[pos].type in [
            TokenType.WHITESPACE,
            TokenType.COMMENT,
        ]:
            pos += 1

        if pos >= len(tokens):
            return None

        # Check if it's a struct/enum/union typedef
        if tokens[pos].type in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
            # Look ahead to see if this complex type is immediately followed by a function-pointer declarator
            # Pattern to detect: ... } ( * name ) ( ... )
            look = pos
            # Find the matching closing brace of the outer struct/union/enum
            if tokens[look].type in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
                # Advance to the opening brace
                while look < len(tokens) and tokens[look].type != TokenType.LBRACE:
                    look += 1
                if look < len(tokens) and tokens[look].type == TokenType.LBRACE:
                    brace_count = 1
                    look += 1
                    while look < len(tokens) and brace_count > 0:
                        if tokens[look].type == TokenType.LBRACE:
                            brace_count += 1
                        elif tokens[look].type == TokenType.RBRACE:
                            brace_count -= 1
                        look += 1
                    # Now 'look' is token after the closing brace
                    j = look
                    # Skip whitespace/comments
                    while j < len(tokens) and tokens[j].type in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
                        j += 1
                    # Detect function-pointer declarator: ( * IDENT ) (
                    if (
                        j + 4 < len(tokens)
                        and tokens[j].type == TokenType.LPAREN
                        and tokens[j + 1].type == TokenType.ASTERISK
                        and tokens[j + 2].type == TokenType.IDENTIFIER
                        and tokens[j + 3].type == TokenType.RPAREN
                        and tokens[j + 4].type == TokenType.LPAREN
                    ):
                        typedef_name = tokens[j + 2].value
                        # Collect the full typedef original type up to the semicolon, preserving parentheses/brackets spacing
                        k = pos
                        formatted: list[str] = []
                        while k < len(tokens) and tokens[k].type != TokenType.SEMICOLON:
                            t = tokens[k]
                            if t.type in [TokenType.LPAREN, TokenType.RPAREN, TokenType.LBRACKET, TokenType.RBRACKET]:
                                formatted.append(t.value)
                            elif formatted and formatted[-1] not in ["(", ")", "[", "]"]:
                                # Prepend space before non-bracket tokens when previous isn't a bracket
                                formatted.append(" " + t.value)
                            else:
                                formatted.append(t.value)
                            k += 1
                        original_type = "".join(formatted)
                        # Clean excessive whitespace inside type
                        original_type = self._clean_type_string(original_type)
                        return (typedef_name, original_type)
            # Fallback to standard complex typedef parsing
            return self._parse_complex_typedef(tokens, pos)

        # Collect all non-whitespace/comment tokens until semicolon
        # But handle nested structures properly
        all_tokens = []
        brace_count = 0
        paren_count = 0

        while pos < len(tokens):
            token = tokens[pos]

            # Track nested braces and parentheses
            if token.type == TokenType.LBRACE:
                brace_count += 1
            elif token.type == TokenType.RBRACE:
                brace_count -= 1
            elif token.type == TokenType.LPAREN:
                paren_count += 1
            elif token.type == TokenType.RPAREN:
                paren_count -= 1
            elif token.type == TokenType.SEMICOLON:
                # Only treat semicolon as end if we're not inside nested structures
                # For function pointer typedefs, we need to be outside the parameter list parentheses
                if brace_count == 0 and paren_count == 0:
                    # We're outside any nested structures and parentheses
                    break

            if token.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
                all_tokens.append(token)
            pos += 1

        if len(all_tokens) < 2:
            return None

        # Function pointer typedef: typedef ret (*name)(params);
        for i in range(len(all_tokens) - 3):
            if (
                all_tokens[i].type
                in [
                    TokenType.IDENTIFIER,
                    TokenType.INT,
                    TokenType.VOID,
                    TokenType.CHAR,
                    TokenType.FLOAT,
                    TokenType.DOUBLE,
                    TokenType.LONG,
                    TokenType.SHORT,
                    TokenType.UNSIGNED,
                    TokenType.SIGNED,
                ]
                and all_tokens[i + 1].type == TokenType.LPAREN
                and all_tokens[i + 2].type == TokenType.ASTERISK
                and all_tokens[i + 3].type == TokenType.IDENTIFIER
            ):
                # Check if this is followed by a parameter list
                if i + 4 < len(all_tokens) and all_tokens[i + 4].type == TokenType.RPAREN:
                    if i + 5 < len(all_tokens) and all_tokens[i + 5].type == TokenType.LPAREN:
                        # This is a function pointer with parameters - skip this pattern and use the complex logic
                        break

                # Simple function pointer typedef without complex parameters
                typedef_name = all_tokens[i + 3].value
                # Fix: Properly format function pointer type - preserve spaces between tokens but not around parentheses
                formatted_tokens = []
                for j, token in enumerate(all_tokens):
                    if token.type in [TokenType.LPAREN, TokenType.RPAREN]:
                        # Don't add spaces around parentheses
                        formatted_tokens.append(token.value)
                    elif j > 0 and all_tokens[j - 1].type not in [
                        TokenType.LPAREN,
                        TokenType.RPAREN,
                    ]:
                        # Add space before token if previous token wasn't a parenthesis
                        formatted_tokens.append(" " + token.value)
                    else:
                        # No space before token
                        formatted_tokens.append(token.value)
                original_type = "".join(formatted_tokens)
                return (typedef_name, original_type)

        # Complex function pointer typedef: typedef ret (*name)(complex_params);
        # This handles cases where the function pointer has complex parameters that span multiple tokens
        if len(all_tokens) >= 6:
            # Look for pattern: type ( * name ) ( ... )
            for i in range(len(all_tokens) - 5):
                if (
                    all_tokens[i].type
                    in [
                        TokenType.IDENTIFIER,
                        TokenType.INT,
                        TokenType.VOID,
                        TokenType.CHAR,
                        TokenType.FLOAT,
                        TokenType.DOUBLE,
                        TokenType.LONG,
                        TokenType.SHORT,
                        TokenType.UNSIGNED,
                        TokenType.SIGNED,
                    ]
                    and all_tokens[i + 1].type == TokenType.LPAREN
                    and all_tokens[i + 2].type == TokenType.ASTERISK
                    and all_tokens[i + 3].type == TokenType.IDENTIFIER
                    and all_tokens[i + 4].type == TokenType.RPAREN
                    and all_tokens[i + 5].type == TokenType.LPAREN
                ):
                    # Find the closing parenthesis for the parameter list
                    paren_count = 1
                    param_end = i + 6
                    while param_end < len(all_tokens) and paren_count > 0:
                        if all_tokens[param_end].type == TokenType.LPAREN:
                            paren_count += 1
                        elif all_tokens[param_end].type == TokenType.RPAREN:
                            paren_count -= 1
                        param_end += 1

                    if paren_count == 0:
                        typedef_name = all_tokens[i + 3].value
                        # Format the complete typedef properly
                        formatted_tokens = []
                        for j, token in enumerate(all_tokens):
                            if token.type in [TokenType.LPAREN, TokenType.RPAREN]:
                                # Don't add spaces around parentheses
                                formatted_tokens.append(token.value)
                            elif j > 0 and all_tokens[j - 1].type not in [
                                TokenType.LPAREN,
                                TokenType.RPAREN,
                            ]:
                                # Add space before token if previous token wasn't a parenthesis
                                formatted_tokens.append(" " + token.value)
                            else:
                                # No space before token
                                formatted_tokens.append(token.value)
                        original_type = "".join(formatted_tokens)
                        return (typedef_name, original_type)

        # Array typedef: typedef type name[size];
        for i in range(len(all_tokens)):
            if (
                all_tokens[i].type == TokenType.LBRACKET
                and i > 0
                and all_tokens[i - 1].type == TokenType.IDENTIFIER
            ):
                typedef_name = all_tokens[i - 1].value
                # Fix: Properly format array type - preserve spaces between tokens but not around brackets
                formatted_tokens = []
                for j, token in enumerate(all_tokens):
                    if token.type in [TokenType.LBRACKET, TokenType.RBRACKET]:
                        # Don't add spaces around brackets
                        formatted_tokens.append(token.value)
                    elif j > 0 and all_tokens[j - 1].type not in [
                        TokenType.LBRACKET,
                        TokenType.RBRACKET,
                    ]:
                        # Add space before token if previous token wasn't a bracket
                        formatted_tokens.append(" " + token.value)
                    else:
                        # No space before token
                        formatted_tokens.append(token.value)
                original_type = "".join(formatted_tokens)
                return (typedef_name, original_type)

        # Pointer typedef: typedef type * name;
        for i in range(len(all_tokens) - 2):
            if (
                all_tokens[i].type == TokenType.ASTERISK
                and all_tokens[i + 1].type == TokenType.IDENTIFIER
            ):
                typedef_name = all_tokens[i + 1].value
                # Fix: Properly format pointer type - preserve spaces between tokens
                formatted_tokens = []
                for j, token in enumerate(all_tokens):
                    if j > 0:
                        # Add space before token
                        formatted_tokens.append(" " + token.value)
                    else:
                        # No space before first token
                        formatted_tokens.append(token.value)
                original_type = "".join(formatted_tokens)
                return (typedef_name, original_type)

        # Basic typedef: the last token is the typedef name, everything else is the type
        typedef_name = all_tokens[-1].value
        type_tokens = all_tokens[:-1]
        original_type = " ".join(t.value for t in type_tokens)
        original_type = self._clean_type_string(original_type)
        original_type = self._fix_array_bracket_spacing(original_type)
        return (typedef_name, original_type)
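    # Typedef shapes this method distinguishes (illustrative C inputs; only the
    # extracted name is asserted here, the original_type string depends on the
    # token formatting rules above):
    #
    #     typedef int (*callback_t)(int, int);    ->  name "callback_t"
    #     typedef char buffer_t[64];              ->  name "buffer_t"
    #     typedef char *string_t;                 ->  name "string_t"
    #     typedef unsigned long tick_t;           ->  name "tick_t"
    #     typedef struct Point { ... } Point;     ->  delegated to _parse_complex_typedef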

1036 def _parse_complex_typedef(self, tokens, start_pos): 

1037 """Parse complex typedef (struct/enum/union)""" 

1038 # Parse complex typedefs with proper structure detection 

1039 

1040 # Find the typedef name by looking for the pattern after the closing brace 

1041 brace_count = 0 

1042 pos = start_pos 

1043 

1044 # Find opening brace 

1045 while pos < len(tokens) and tokens[pos].type != TokenType.LBRACE: 

1046 pos += 1 

1047 

1048 if pos >= len(tokens): 

1049 return None 

1050 

1051 # Skip to closing brace 

1052 brace_count = 1 

1053 pos += 1 

1054 

1055 while pos < len(tokens) and brace_count > 0: 

1056 if tokens[pos].type == TokenType.LBRACE: 

1057 brace_count += 1 

1058 elif tokens[pos].type == TokenType.RBRACE: 

1059 brace_count -= 1 

1060 pos += 1 

1061 

1062 if brace_count > 0: 

1063 return None 

1064 

1065 # Find typedef name after closing brace 

1066 while pos < len(tokens) and tokens[pos].type in [ 

1067 TokenType.WHITESPACE, 

1068 TokenType.COMMENT, 

1069 ]: 

1070 pos += 1 

1071 

1072 if pos < len(tokens) and tokens[pos].type == TokenType.IDENTIFIER: 

1073 typedef_name = tokens[pos].value 

1074 struct_type = tokens[start_pos].value # struct/enum/union 

1075 return (typedef_name, struct_type) 

1076 

1077 return None 

1078 

1079 def _extract_tag_name_from_typedef(self, tokens, start_pos): 

1080 """Extract the tag name from a typedef like 'typedef struct TagName { ... } TypedefName;'""" 

1081 # Skip 'typedef' keyword 

1082 pos = start_pos + 1 

1083 

1084 # Skip whitespace and comments 

1085 while pos < len(tokens) and tokens[pos].type in [ 

1086 TokenType.WHITESPACE, 

1087 TokenType.COMMENT, 

1088 ]: 

1089 pos += 1 

1090 

1091 if pos >= len(tokens): 

1092 return "" 

1093 

1094 # Check if it's a struct/enum/union 

1095 if tokens[pos].type not in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]: 

1096 return "" 

1097 

1098 # Skip struct/enum/union keyword 

1099 pos += 1 

1100 

1101 # Skip whitespace and comments 

1102 while pos < len(tokens) and tokens[pos].type in [ 

1103 TokenType.WHITESPACE, 

1104 TokenType.COMMENT, 

1105 ]: 

1106 pos += 1 

1107 

1108 # Look for tag name (identifier before opening brace) 

1109 if pos < len(tokens) and tokens[pos].type == TokenType.IDENTIFIER: 

1110 tag_name = tokens[pos].value 

1111 return tag_name 

1112 

1113 return "" 

1114 

1115 def _looks_like_function(self, tokens, start_pos): 

1116 """Check if the token sequence starting at start_pos looks like a function""" 

1117 # Look ahead for parentheses within a reasonable distance 

1118 for i in range(start_pos, min(start_pos + 10, len(tokens))): 

1119 if tokens[i].type == TokenType.LPAREN: 

1120 return True 

1121 if tokens[i].type in [ 

1122 TokenType.SEMICOLON, 

1123 TokenType.LBRACE, 

1124 TokenType.RBRACE, 

1125 ]: 

1126 return False 

1127 return False 

1128 

1129 def _skip_function(self, tokens, start_pos): 

1130 """Skip over a function definition or declaration""" 

1131 # Find the end (either semicolon for declaration or closing brace for definition) 

1132 i = start_pos 

1133 brace_count = 0 

1134 paren_count = 0 

1135 

1136 while i < len(tokens): 

1137 if tokens[i].type == TokenType.LPAREN: 

1138 paren_count += 1 

1139 elif tokens[i].type == TokenType.RPAREN: 

1140 paren_count -= 1 

1141 elif tokens[i].type == TokenType.LBRACE: 

1142 brace_count += 1 

1143 elif tokens[i].type == TokenType.RBRACE: 

1144 brace_count -= 1 

1145 if brace_count == 0 and paren_count == 0: 

1146 return i + 1 

1147 elif ( 

1148 tokens[i].type == TokenType.SEMICOLON 

1149 and paren_count == 0 

1150 and brace_count == 0 

1151 ): 

1152 return i + 1 

1153 i += 1 

1154 

1155 return i 

1156 

1157 def _skip_structure_definition(self, tokens, start_pos): 

1158 """Skip over struct/enum/union/typedef definition""" 

1159 i = start_pos 

1160 brace_count = 0 

1161 

1162 while i < len(tokens): 

1163 if tokens[i].type == TokenType.LBRACE: 

1164 brace_count += 1 

1165 elif tokens[i].type == TokenType.RBRACE: 

1166 brace_count -= 1 

1167 if brace_count == 0: 

1168 # Continue until semicolon 

1169 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON: 

1170 i += 1 

1171 return i + 1 if i < len(tokens) else i 

1172 elif tokens[i].type == TokenType.SEMICOLON and brace_count == 0: 

1173 return i + 1 

1174 i += 1 

1175 

1176 return i 

1177 

1178 def _parse_global_variable(self, tokens, start_pos): 

1179 """Parse a global variable declaration starting at start_pos""" 

1180 # Look for pattern: [static/extern] type name [= value]; 

1181 i = start_pos 

1182 collected_tokens = [] 

1183 

1184 # Collect tokens until semicolon 

1185 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON: 

1186 if tokens[i].type not in [TokenType.WHITESPACE, TokenType.COMMENT]: 

1187 collected_tokens.append(tokens[i]) 

1188 i += 1 

1189 

1190 if len(collected_tokens) < 2: 

1191 return None 

1192 

1193 # Skip modifiers 

1194 start_idx = 0 

1195 while start_idx < len(collected_tokens) and collected_tokens[ 

1196 start_idx 

1197 ].type in [TokenType.STATIC, TokenType.EXTERN, TokenType.CONST]: 

1198 start_idx += 1 

1199 

1200 # Check if there's an assignment 

1201 assign_idx = None 

1202 for j in range(start_idx, len(collected_tokens)): 

1203 if collected_tokens[j].type == TokenType.ASSIGN: 

1204 assign_idx = j 

1205 break 

1206 

1207 # Extract variable name and type 

1208 if assign_idx is not None: 

1209 # Has assignment: type name = value or type name[size] = value 

1210 if assign_idx > start_idx + 1: 

1211 # Check if this is an array declaration with assignment 

1212 bracket_idx = None 

1213 for j in range(assign_idx - 1, start_idx, -1): 

1214 if collected_tokens[j].type == TokenType.RBRACKET: 

1215 bracket_idx = j 

1216 break 

1217 

1218 if bracket_idx is not None: 

1219 # Array declaration with assignment: find the identifier before the opening bracket 

1220 for j in range(bracket_idx - 1, start_idx, -1): 

1221 if collected_tokens[j].type == TokenType.LBRACKET: 

1222 # Found opening bracket, look for identifier before it 

1223 for k in range(j - 1, start_idx, -1): 

1224 if collected_tokens[k].type == TokenType.IDENTIFIER: 

1225 var_name = collected_tokens[k].value 

1226 type_tokens = collected_tokens[start_idx:k] 

1227 # Format array type properly 

1228 formatted_type = [] 

1229 for idx, token in enumerate(type_tokens): 

1230 if idx > 0: 

1231 formatted_type.append(" " + token.value) 

1232 else: 

1233 formatted_type.append(token.value) 

1234 # Add array brackets without spaces 

1235 array_size = collected_tokens[j + 1].value if j + 1 < bracket_idx else "" 

1236 var_type = "".join(formatted_type) + "[" + array_size + "]" 

1237 var_type = self._clean_type_string(var_type) 

1238 value_tokens = collected_tokens[assign_idx + 1 :] 

1239 var_value = " ".join(t.value for t in value_tokens) 

1240 # Clean the value string to remove excessive whitespace and newlines 

1241 var_value = self._clean_value_string(var_value) 

1242 return (var_name, var_type, var_value) 

1243 break 

1244 else: 

1245 # Regular assignment: type name = value 

1246 var_name = collected_tokens[assign_idx - 1].value 

1247 type_tokens = collected_tokens[start_idx : assign_idx - 1] 

1248 value_tokens = collected_tokens[assign_idx + 1 :] 

1249 var_type = " ".join(t.value for t in type_tokens) 

1250 var_type = self._clean_type_string(var_type) 

1251 var_type = self._fix_array_bracket_spacing(var_type) 

1252 var_value = " ".join(t.value for t in value_tokens) 

1253 # Clean the value string to remove excessive whitespace and newlines 

1254 var_value = self._clean_value_string(var_value) 

1255 return (var_name, var_type, var_value) 

1256 else: 

1257 # No assignment: type name or type name[size] 

1258 if len(collected_tokens) > start_idx + 1: 

1259 # Check if this is an array declaration 

1260 bracket_idx = None 

1261 for j in range(len(collected_tokens) - 1, start_idx, -1): 

1262 if collected_tokens[j].type == TokenType.RBRACKET: 

1263 bracket_idx = j 

1264 break 

1265 

1266 if bracket_idx is not None: 

1267 # Array declaration: find the identifier before the opening bracket 

1268 for j in range(bracket_idx - 1, start_idx, -1): 

1269 if collected_tokens[j].type == TokenType.LBRACKET: 

1270 # Found opening bracket, look for identifier before it 

1271 for k in range(j - 1, start_idx, -1): 

1272 if collected_tokens[k].type == TokenType.IDENTIFIER: 

1273 var_name = collected_tokens[k].value 

1274 type_tokens = collected_tokens[start_idx:k] 

1275 # Format array type properly - preserve spaces between tokens but not around brackets 

1276 formatted_type = [] 

1277 for idx, token in enumerate(type_tokens): 

1278 if idx > 0: 

1279 formatted_type.append(" " + token.value) 

1280 else: 

1281 formatted_type.append(token.value) 

1282 # Add array brackets without spaces 

1283 array_size = collected_tokens[j + 1].value if j + 1 < bracket_idx else "" 

1284 var_type = "".join(formatted_type) + "[" + array_size + "]" 

1285 var_type = self._clean_type_string(var_type) 

1286 return (var_name, var_type, None) 

1287 break 

1288 else: 

1289 # Regular variable: last token is the name 

1290 var_name = collected_tokens[-1].value 

1291 type_tokens = collected_tokens[start_idx:-1] 

1292 var_type = " ".join(t.value for t in type_tokens) 

1293 var_type = self._clean_type_string(var_type) 

1294 var_type = self._fix_array_bracket_spacing(var_type) 

1295 return (var_name, var_type, None) 

1296 

1297 return None 

1298 

1299 def _skip_to_semicolon(self, tokens, start_pos): 

1300 """Skip to the next semicolon""" 

1301 i = start_pos 

1302 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON: 

1303 i += 1 

1304 return i + 1 if i < len(tokens) else i 

1305 

1306 def _skip_preprocessor_directives(self, tokens, start_pos): 

1307 """Skip preprocessor directives but keep their content for parsing""" 

1308 # This method is deprecated - use the PreprocessorManager instead 

1309 i = start_pos 

1310 while i < len(tokens) and tokens[i].type == TokenType.PREPROCESSOR: 

1311 # Skip the preprocessor directive itself 

1312 i += 1 

1313 return i 

1314 

1315 def _parse_function_parameters(self, tokens, start_pos, end_pos, func_name): 

1316 """Parse function parameters from token range""" 

1317 

1318 parameters = [] 

1319 

1320 # Find the opening parenthesis for the function 

1321 paren_start = None 

1322 paren_end = None 

1323 

1324 for i in range(start_pos, min(end_pos + 1, len(tokens))): 

1325 if tokens[i].type == TokenType.IDENTIFIER and tokens[i].value == func_name: 

1326 # Look for opening parenthesis after function name 

1327 for j in range(i + 1, min(end_pos + 1, len(tokens))): 

1328 if tokens[j].type == TokenType.LPAREN: 

1329 paren_start = j 

1330 break 

1331 elif tokens[j].type not in [ 

1332 TokenType.WHITESPACE, 

1333 TokenType.COMMENT, 

1334 ]: 

1335 break 

1336 break 

1337 

1338 if paren_start is None: 

1339 return parameters 

1340 

1341 # Find matching closing parenthesis 

1342 paren_depth = 1 

1343 for i in range(paren_start + 1, min(end_pos + 1, len(tokens))): 

1344 if tokens[i].type == TokenType.LPAREN: 

1345 paren_depth += 1 

1346 elif tokens[i].type == TokenType.RPAREN: 

1347 paren_depth -= 1 

1348 if paren_depth == 0: 

1349 paren_end = i 

1350 break 

1351 

1352 if paren_end is None: 

1353 return parameters 

1354 

1355 # Parse parameter tokens between parentheses 

1356 param_tokens = [] 

1357 for i in range(paren_start + 1, paren_end): 

1358 if tokens[i].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]: 

1359 param_tokens.append(tokens[i]) 

1360 

1361 # If no parameters or just "void", return empty list 

1362 if not param_tokens or ( 

1363 len(param_tokens) == 1 and param_tokens[0].value == "void" 

1364 ): 

1365 return parameters 

1366 

1367 # Split parameters by commas, but handle function pointers correctly 

1368 current_param = [] 

1369 paren_depth = 0 

1370 for token in param_tokens: 

1371 if token.type == TokenType.LPAREN: 

1372 paren_depth += 1 

1373 elif token.type == TokenType.RPAREN: 

1374 paren_depth -= 1 

1375 elif token.type == TokenType.COMMA and paren_depth == 0: 

1376 # Only split on commas that are not inside parentheses 

1377 if current_param: 

1378 param = self._parse_single_parameter(current_param) 

1379 if param: 

1380 parameters.append(param) 

1381 current_param = [] 

1382 continue 

1383 

1384 current_param.append(token) 

1385 

1386 # Handle last parameter 

1387 if current_param: 

1388 param = self._parse_single_parameter(current_param) 

1389 if param: 

1390 parameters.append(param) 

1391 

1392 return parameters 

1393 

1394 def _parse_single_parameter(self, param_tokens): 

1395 """Parse a single function parameter from tokens""" 

1396 from ..models import Field 

1397 

1398 if not param_tokens: 

1399 return None 

1400 

1401 # Handle variadic parameters (three consecutive dots) 

1402 if len(param_tokens) == 3 and all(t.value == "." for t in param_tokens): 

1403 return Field(name="...", type="...") 

1404 

1405 # Handle variadic parameters (single ... token) 

1406 if len(param_tokens) == 1 and param_tokens[0].value == "...": 

1407 return Field(name="...", type="...") 

1408 

1409 # Handle function pointer parameters: type (*name)(params) 

1410 if len(param_tokens) >= 5: 

1411 # Look for pattern: type ( * name ) ( params ) 

1412 for i in range(len(param_tokens) - 4): 

1413 if ( 

1414 param_tokens[i].type == TokenType.LPAREN 

1415 and param_tokens[i + 1].type == TokenType.ASTERISK 

1416 and param_tokens[i + 2].type == TokenType.IDENTIFIER 

1417 and param_tokens[i + 3].type == TokenType.RPAREN 

1418 and param_tokens[i + 4].type == TokenType.LPAREN 

1419 ): 

1420 # Found function pointer pattern 

1421 func_name = param_tokens[i + 2].value 

1422 

1423 # Find the closing parenthesis for the parameter list 

1424 paren_count = 1 

1425 param_end = i + 5 

1426 while param_end < len(param_tokens) and paren_count > 0: 

1427 if param_tokens[param_end].type == TokenType.LPAREN: 

1428 paren_count += 1 

1429 elif param_tokens[param_end].type == TokenType.RPAREN: 

1430 paren_count -= 1 

1431 param_end += 1 

1432 

1433 if paren_count == 0: 

1434 # Extract the type (everything before the function pointer) 

1435 type_tokens = param_tokens[:i] 

1436 param_type = " ".join(t.value for t in type_tokens) 

1437 

1438 # Extract the function pointer part 

1439 func_ptr_tokens = param_tokens[i:param_end] 

1440 func_ptr_type = " ".join(t.value for t in func_ptr_tokens) 

1441 

1442 # Combine type and function pointer 

1443 full_type = (param_type + " " + func_ptr_type).strip() 

1444 

1445 # Fix array bracket spacing 

1446 full_type = self._fix_array_bracket_spacing(full_type) 

1447 

1448 return Field(name=func_name, type=full_type) 

1449 else: 

1450 # Incomplete function pointer - try to reconstruct 

1451 type_tokens = param_tokens[:i] 

1452 param_type = " ".join(t.value for t in type_tokens) 

1453 func_ptr_tokens = param_tokens[i:] 

1454 func_ptr_type = " ".join(t.value for t in func_ptr_tokens) 

1455 full_type = (param_type + " " + func_ptr_type).strip() 

1456 full_type = self._fix_array_bracket_spacing(full_type) 

1457 return Field(name=func_name, type=full_type) 

1458 

1459 # Also look for pattern: type ( * name ) ( params ) with spaces 

1460 for i in range(len(param_tokens) - 4): 

1461 if ( 

1462 param_tokens[i].type == TokenType.LPAREN 

1463 and param_tokens[i + 1].type == TokenType.ASTERISK 

1464 and param_tokens[i + 2].type == TokenType.IDENTIFIER 

1465 and param_tokens[i + 3].type == TokenType.RPAREN 

1466 and param_tokens[i + 4].type == TokenType.LPAREN 

1467 ): 

1468 # Found function pointer pattern 

1469 func_name = param_tokens[i + 2].value 

1470 

1471 # Find the closing parenthesis for the parameter list 

1472 paren_count = 1 

1473 param_end = i + 5 

1474 while param_end < len(param_tokens) and paren_count > 0: 

1475 if param_tokens[param_end].type == TokenType.LPAREN: 

1476 paren_count += 1 

1477 elif param_tokens[param_end].type == TokenType.RPAREN: 

1478 paren_count -= 1 

1479 param_end += 1 

1480 

1481 if paren_count == 0: 

1482 # Extract the type (everything before the function pointer) 

1483 type_tokens = param_tokens[:i] 

1484 param_type = " ".join(t.value for t in type_tokens) 

1485 

1486 # Extract the function pointer part 

1487 func_ptr_tokens = param_tokens[i:param_end] 

1488 func_ptr_type = " ".join(t.value for t in func_ptr_tokens) 

1489 

1490 # Combine type and function pointer 

1491 full_type = (param_type + " " + func_ptr_type).strip() 

1492 

1493 # Fix array bracket spacing 

1494 full_type = self._fix_array_bracket_spacing(full_type) 

1495 

1496 return Field(name=func_name, type=full_type) 

1497 else: 

1498 # Incomplete function pointer - try to reconstruct 

1499 type_tokens = param_tokens[:i] 

1500 param_type = " ".join(t.value for t in type_tokens) 

1501 func_ptr_tokens = param_tokens[i:] 

1502 func_ptr_type = " ".join(t.value for t in func_ptr_tokens) 

1503 full_type = (param_type + " " + func_ptr_type).strip() 

1504 full_type = self._fix_array_bracket_spacing(full_type) 

1505 return Field(name=func_name, type=full_type) 

1506 

1507 # For parameters like "int x" or "const char *name" or "char* argv[]" 

1508 if len(param_tokens) >= 2: 

1509 # Check if the last token is a closing bracket (array parameter) 

1510 if param_tokens[-1].type == TokenType.RBRACKET: 

1511 # Find the opening bracket to get the array size 

1512 bracket_start = None 

1513 for i in range(len(param_tokens) - 1, -1, -1): 

1514 if param_tokens[i].type == TokenType.LBRACKET: 

1515 bracket_start = i 

1516 break 

1517 

1518 if bracket_start is not None: 

1519 # Extract the parameter name (last identifier before the opening bracket) 

1520 param_name = None 

1521 for i in range(bracket_start - 1, -1, -1): 

1522 if param_tokens[i].type == TokenType.IDENTIFIER: 

1523 param_name = param_tokens[i].value 

1524 break 

1525 

1526 if param_name: 

1527 # Extract the type (everything before the parameter name) 

1528 type_tokens = param_tokens[:i] 

1529 param_type = " ".join(t.value for t in type_tokens) 

1530 

1531 # Add the array brackets to the type 

1532 array_size = "" 

1533 if bracket_start + 1 < len(param_tokens) - 1: 

1534 # There's content between brackets 

1535 array_content = param_tokens[bracket_start + 1:-1] 

1536 array_size = " ".join(t.value for t in array_content) 

1537 

1538 param_type = param_type + "[" + array_size + "]" 

1539 

1540 # Fix array bracket spacing 

1541 param_type = self._fix_array_bracket_spacing(param_type) 

1542 

1543 return Field(name=param_name, type=param_type) 

1544 else: 

1545 # Regular parameter: last token is the parameter name 

1546 param_name = param_tokens[-1].value 

1547 type_tokens = param_tokens[:-1] 

1548 param_type = " ".join(t.value for t in type_tokens) 

1549 

1550 # Fix array bracket spacing and pointer spacing 

1551 param_type = self._fix_array_bracket_spacing(param_type) 

1552 param_type = self._fix_pointer_spacing(param_type) 

1553 

1554 # Handle unnamed parameters (just type) 

1555 if param_name in [ 

1556 "void", 

1557 "int", 

1558 "char", 

1559 "float", 

1560 "double", 

1561 "long", 

1562 "short", 

1563 "unsigned", 

1564 "signed", 

1565 ]: 

1566 # This is just a type without a name 

1567 return Field(name="unnamed", type=param_type + " " + param_name) 

1568 

1569 # Additional validation before creating Field 

1570 if param_name and param_name.strip() and param_type and param_type.strip(): 

1571 return Field(name=param_name.strip(), type=param_type.strip()) 

1572 else: 

1573 # Fallback for invalid parameters - try to reconstruct the full parameter 

1574 full_param = " ".join(t.value for t in param_tokens) 

1575 full_param = self._fix_array_bracket_spacing(full_param) 

1576 if full_param.strip(): 

1577 return Field(name="unnamed", type=full_param.strip()) 

1578 else: 

1579 return Field(name="unnamed", type="unknown") 

1580 elif len(param_tokens) == 1: 

1581 # Single token - might be just type (like "void") or name 

1582 token_value = param_tokens[0].value 

1583 if token_value in [ 

1584 "void", 

1585 "int", 

1586 "char", 

1587 "float", 

1588 "double", 

1589 "long", 

1590 "short", 

1591 "unsigned", 

1592 "signed", 

1593 ]: 

1594 return Field(name="unnamed", type=token_value) 

1595 else: 

1596 # If we can't determine the type, use the token value as type 

1597 if token_value and token_value.strip(): 

1598 return Field(name="unnamed", type=token_value.strip()) 

1599 else: 

1600 return Field(name="unnamed", type="unknown") 

1601 

1602 return None 

1603 
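# Rough expectations for _extract_parameter, assuming CTokenizer labels these
# tokens as IDENTIFIER / LBRACKET / RBRACKET in the usual way (the Token objects
# themselves are built elsewhere and are not shown here):
#   tokens of "const char *name"  ->  Field(name="name",    type="const char *")
#   tokens of "char *argv[]"      ->  Field(name="argv",    type="char *[]")
#   tokens of "void"              ->  Field(name="unnamed", type="void")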

1604 def _fix_array_bracket_spacing(self, type_str: str) -> str: 

1605 """Fix spacing around array brackets in type strings""" 

1606 # First clean the type string to remove newlines 

1607 type_str = self._clean_type_string(type_str) 

1608 # Replace patterns like "type[ size ]" with "type[size]" 

1609 import re 

1610 # Remove spaces around array brackets 

1611 type_str = re.sub(r'\s*\[\s*', '[', type_str) 

1612 type_str = re.sub(r'\s*\]\s*', ']', type_str) 

1613 return type_str 

1614 
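# For example, given the two substitutions above, _fix_array_bracket_spacing
# is expected to behave roughly like this:
#   "char buffer [ 64 ]"      ->  "char buffer[64]"
#   "int matrix [ 3 ] [ 4 ]"  ->  "int matrix[3][4]"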

1615 def _fix_pointer_spacing(self, type_str: str) -> str: 

1616 """Fix spacing around pointer asterisks in type strings""" 

1617 import re 

1618 # Fix triple pointer spacing first: "type * * *" -> "type ***" 

1619 type_str = re.sub(r'\*\s+\*\s+\*', '***', type_str) 

1620 # Then fix double pointer spacing: "type * *" -> "type **" 

1621 type_str = re.sub(r'\*\s+\*', '**', type_str) 

1622 return type_str 

1623 
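# With the triple-pointer pattern applied before the double-pointer one, the
# expected behaviour is roughly:
#   "char * *"    ->  "char **"
#   "void * * *"  ->  "void ***"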

1624 def _clean_type_string(self, type_str: str) -> str: 

1625 """Clean type string by removing newlines and normalizing whitespace""" 

1626 if not type_str: 

1627 return type_str 

1628 # Replace newlines with spaces and normalize whitespace 

1629 cleaned = type_str.replace('\n', ' ') 

1630 # Normalize multiple spaces to single space 

1631 import re 

1632 cleaned = re.sub(r'\s+', ' ', cleaned) 

1633 # Strip leading/trailing whitespace 

1634 cleaned = cleaned.strip() 

1635 return cleaned 

1636 
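# For example, _clean_type_string flattens multi-line type text:
#   "unsigned\n    long   int"  ->  "unsigned long int"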

1637 def _clean_value_string(self, value_str: str) -> str: 

1638 """Clean value string by removing excessive whitespace and newlines""" 

1639 if not value_str: 

1640 return value_str 

1641 # Replace newlines with spaces and normalize whitespace 

1642 cleaned = value_str.replace('\n', ' ') 

1643 # Normalize multiple spaces to single space 

1644 import re 

1645 cleaned = re.sub(r'\s+', ' ', cleaned) 

1646 # Strip leading/trailing whitespace 

1647 cleaned = cleaned.strip() 

1648 # Remove excessive spaces around braces and operators 

1649 cleaned = re.sub(r'\s*{\s*', '{', cleaned) 

1650 cleaned = re.sub(r'\s*}\s*', '}', cleaned) 

1651 cleaned = re.sub(r'\s*,\s*', ', ', cleaned) 

1652 cleaned = re.sub(r'\s*&\s*', '&', cleaned) 

1653 return cleaned 

1654 
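# For example, _clean_value_string normalizes initializer text:
#   "{ RED ,\n GREEN , BLUE }"  ->  "{RED, GREEN, BLUE}"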

1655 def _get_timestamp(self) -> str: 

1656 """Get current timestamp string""" 

1657 from datetime import datetime 

1658 

1659 return datetime.now().isoformat() 
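# e.g. "2025-08-20T03:53:07.123456" (local time, ISO 8601)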

1660 

1661 

1662class Parser: 

1663 """Main parser class for Step 1: Parse C code files and generate model.json""" 

1664 

1665 def __init__(self): 

1666 self.c_parser = CParser() 

1667 self.logger = logging.getLogger(__name__) 

1668 

1669 def parse( 

1670 self, 

1671 source_folders: "List[str]", 

1672 output_file: str = "model.json", 

1673 recursive_search: bool = True, 

1674 config: "Config" = None, 

1675 ) -> str: 

1676 """Parse C/C++ projects and generate model.json 

1677 

1678 Args: 

1679 source_folders: List of source directories within the project to parse 

1680 output_file: Path to the output model.json file 

1681 recursive_search: Whether to search subdirectories recursively 

1682 config: Configuration object for filtering and processing 

1683 

1684 Returns: 

1685 Path to the generated model.json file 

1686 """ 

1687 # Enhanced validation for source_folders 

1688 if not isinstance(source_folders, list): 

1689 raise TypeError(f"source_folders must be a list of strings, got: {type(source_folders)}") 

1690 

1691 if not source_folders: 

1692 raise ValueError("At least one source folder must be provided") 

1693 

1694 # Validate all items are strings and not empty 

1695 for i, folder in enumerate(source_folders): 

1696 if not isinstance(folder, str): 

1697 raise TypeError(f"All source folders must be strings, got {type(folder)} at index {i}: {folder}") 

1698 if not folder.strip(): 

1699 raise ValueError(f"Source folder at index {i} cannot be empty or whitespace: {repr(folder)}") 

1700 

1701 self.logger.info( 

1702 f"Step 1: Parsing C/C++ project with {len(source_folders)} source folders" 

1703 ) 

1704 

1705 # Get project name from config or use default 

1706 project_name = ( 

1707 getattr(config, "project_name", "C_Project") if config else "C_Project" 

1708 ) 

1709 

1710 # Parse each source folder and combine results 

1711 all_files = {} 

1712 total_structs = 0 

1713 total_enums = 0 

1714 total_functions = 0 

1715 failed_folders = [] 

1716 

1717 for i, source_folder in enumerate(source_folders): 

1718 self.logger.info( 

1719 f"Parsing source folder {i+1}/{len(source_folders)}: {source_folder}" 

1720 ) 

1721 

1722 try: 

1723 # Parse the individual source folder 

1724 model = self.c_parser.parse_project( 

1725 source_folder, recursive_search, config 

1726 ) 

1727 

1728 all_files.update(model.files) 

1729 

1730 # Update totals 

1731 total_structs += sum(len(f.structs) for f in model.files.values()) 

1732 total_enums += sum(len(f.enums) for f in model.files.values()) 

1733 total_functions += sum(len(f.functions) for f in model.files.values()) 

1734 

1735 self.logger.info( 

1736 f"Successfully parsed source folder {source_folder}: {len(model.files)} files" 

1737 ) 

1738 

1739 except Exception as e: 

1740 self.logger.error( 

1741 "Failed to parse source folder %s: %s", source_folder, e 

1742 ) 

1743 failed_folders.append((source_folder, str(e))) 

1744 

1745 # If this is the only source folder, re-raise the error 

1746 if len(source_folders) == 1: 

1747 raise 

1748 

1749 # For multiple source folders, continue with others but log the failure 

1750 self.logger.warning( 

1751 "Continuing with other source folders despite failure in %s", source_folder 

1752 ) 

1753 

1754 # If all source folders failed, raise an error 

1755 if failed_folders and len(failed_folders) == len(source_folders): 

1756 error_msg = "All source folders failed to parse:\n" 

1757 for folder, error in failed_folders: 

1758 error_msg += f" - {folder}: {error}\n" 

1759 raise RuntimeError(error_msg) 

1760 

1761 # If some folders failed, log a warning 

1762 if failed_folders: 

1763 self.logger.warning( 

1764 f"Failed to parse {len(failed_folders)} out of {len(source_folders)} source folders" 

1765 ) 

1766 

1767 # Create combined project model 

1768 combined_model = ProjectModel( 

1769 project_name=project_name, 

1770 source_folder=( 

1771 ",".join(source_folders) 

1772 if len(source_folders) > 1 

1773 else source_folders[0] 

1774 ), 

1775 files=all_files, 

1776 ) 

1777 

1778 # Update all 'uses' fields across the entire combined project 

1779 combined_model.update_uses_fields() 

1780 

1781 # Save combined model to JSON file 

1782 try: 

1783 combined_model.save(output_file) 

1784 except Exception as e: 

1785 raise RuntimeError(f"Failed to save model to {output_file}: {e}") from e 

1786 

1787 # Step 1.5: Verify model sanity 

1788 self.logger.info("Step 1.5: Verifying model sanity...") 

1789 from .verifier import ModelVerifier 

1790 

1791 verifier = ModelVerifier() 

1792 is_valid, issues = verifier.verify_model(combined_model) 

1793 

1794 if not is_valid: 

1795 self.logger.warning( 

1796 f"Model verification found {len(issues)} issues - model may contain parsing errors" 

1797 ) 

1798 # Continue processing but warn about potential issues 

1799 else: 

1800 self.logger.info("Model verification passed - all values look sane") 

1801 

1802 self.logger.info("Step 1 complete! Model saved to: %s", output_file) 

1803 self.logger.info( 

1804 f"Found {len(all_files)} total files across {len(source_folders)} source folder(s)" 

1805 ) 

1806 

1807 # Log summary 

1808 self.logger.info( 

1809 f"Summary: {total_structs} structs, {total_enums} enums, " 

1810 f"{total_functions} functions" 

1811 ) 

1812 

1813 return output_file
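# Minimal usage sketch: the Parser class and parse() signature come from this
# module; the "./src" directory and output file name below are assumptions for
# the example only, and the import path assumes c2puml is installed as a package.
#
#     from c2puml.core.parser import Parser
#
#     parser = Parser()
#     model_path = parser.parse(
#         source_folders=["./src"],     # directories containing the C/C++ sources
#         output_file="model.json",     # Step 1 output consumed by later steps
#         recursive_search=True,        # also scan subdirectories
#     )
#     print(f"Model written to {model_path}")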