Coverage for src/c2puml/core/parser.py: 79%
932 statements
coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
1#!/usr/bin/env python3
2"""
3Parser module for C to PlantUML converter - Step 1: Parse C code files and generate model.json
4"""
5import logging
6from pathlib import Path
7from typing import TYPE_CHECKING, Dict, List, Optional, Set
9from ..models import Enum, EnumValue, Field, FileModel, ProjectModel, Struct
10from .parser_tokenizer import (
11 CTokenizer,
12 StructureFinder,
13 TokenType,
14 find_enum_values,
15 find_struct_fields,
16)
17from .preprocessor import PreprocessorManager
18from .parser_anonymous_processor import AnonymousTypedefProcessor
19from ..utils import detect_file_encoding
21if TYPE_CHECKING:
22 from ..config import Config
23 from ..models import Alias, Enum, Field, Function, Struct, Union
26class CParser:
27 """C/C++ parser for extracting structural information from source code using tokenization"""
29 def __init__(self):
30 self.logger = logging.getLogger(__name__)
31 self.tokenizer = CTokenizer()
32 self.preprocessor = PreprocessorManager()
34 def parse_project(
35 self, source_folder: str, recursive_search: bool = True, config: "Config" = None
36 ) -> ProjectModel:
37 """Parse a C/C++ project and return a model"""
38 # Enhanced source path validation
39 if not source_folder or not isinstance(source_folder, str):
40 raise ValueError(f"Source folder must be a non-empty string, got: {type(source_folder)}")
42 if not source_folder.strip():
43 raise ValueError("Source folder cannot be empty or whitespace")
45 try:
46 source_folder_path = Path(source_folder).resolve()
47 except (OSError, RuntimeError) as e:
48 raise ValueError(f"Failed to resolve source folder path '{source_folder}': {e}")
50 if not source_folder_path.exists():
51 # Provide helpful error message with suggestions
52 error_msg = f"Source folder not found: {source_folder_path}"
54 # Check if it's a relative path issue
55 if not Path(source_folder).is_absolute():
56 current_dir = Path.cwd()
57 error_msg += f"\nCurrent working directory: {current_dir}"
58 error_msg += f"\nTried to resolve relative path: {source_folder}"
60 # Check if parent directory exists
61 parent_dir = source_folder_path.parent
62 if parent_dir.exists():
63 error_msg += f"\nParent directory exists: {parent_dir}"
64 # List contents of parent directory
65 try:
66 contents = [item.name for item in parent_dir.iterdir() if item.is_dir()]
67 if contents:
68 error_msg += f"\nAvailable directories in parent: {', '.join(contents[:10])}"
69 if len(contents) > 10:
70 error_msg += f" (and {len(contents) - 10} more)"
71 except (OSError, PermissionError):
72 error_msg += "\nCannot list parent directory contents (permission denied)"
73 else:
74 error_msg += f"\nParent directory does not exist: {parent_dir}"
76 raise ValueError(error_msg)
78 if not source_folder_path.is_dir():
79 raise ValueError(f"Source folder must be a directory, got: {source_folder_path} (is_file: {source_folder_path.is_file()})")
81 # Check if directory is readable
82 try:
83 source_folder_path.iterdir()
84 except PermissionError:
85 raise ValueError(f"Permission denied accessing source folder: {source_folder_path}")
86 except OSError as e:
87 raise ValueError(f"Error accessing source folder '{source_folder_path}': {e}")
89 self.logger.info("Parsing project: %s", source_folder_path)
91 # Find all C/C++ files in the project
92 try:
93 all_c_files = self._find_c_files(source_folder_path, recursive_search)
94 except OSError as e:
95 raise ValueError(f"Error searching for C/C++ files in '{source_folder_path}': {e}")
97 self.logger.info("Found %d C/C++ files", len(all_c_files))
99 # Apply file filtering based on configuration
100 c_files = []
101 if config:
102 for file_path in all_c_files:
103 if config._should_include_file(file_path.name):
104 c_files.append(file_path)
105 self.logger.debug(
106 "Included file after filtering: %s", file_path.name
107 )
108 else:
109 self.logger.debug(
110 "Excluded file after filtering: %s", file_path.name
111 )
112 else:
113 c_files = all_c_files
115 self.logger.info("After filtering: %d C/C++ files", len(c_files))
117 # Parse each file using filename as key for simplified tracking
118 files = {}
119 failed_files = []
121 for file_path in c_files:
122 try:
123 # Use relative path for tracking and filename as key
124 relative_path = str(file_path.relative_to(source_folder_path))
125 file_model = self.parse_file(file_path, relative_path)
 127 # Use filename as key; duplicate filenames are detected and rejected below
128 if file_model.name in files:
129 raise RuntimeError(
130 f"Duplicate filename detected: '{file_model.name}' from '{file_path}'. "
131 f"Already seen from '{files[file_model.name].file_path}'."
132 )
133 files[file_model.name] = file_model
135 self.logger.debug("Successfully parsed: %s", relative_path)
137 except (OSError, ValueError) as e:
138 self.logger.warning("Failed to parse %s: %s", file_path, e)
139 failed_files.append(str(file_path))
141 if failed_files:
142 error_msg = (
143 f"Failed to parse {len(failed_files)} files: {failed_files}. "
144 "Stopping model processing."
145 )
146 self.logger.error(error_msg)
147 raise RuntimeError(error_msg)
149 model = ProjectModel(
150 project_name=source_folder_path.name,
151 source_folder=str(source_folder_path),
152 files=files,
153 )
155 # Update all uses fields across the entire project
156 model.update_uses_fields()
158 self.logger.info("Parsing complete. Parsed %d files successfully.", len(files))
159 return model
161 def parse_file(self, file_path: Path, relative_path: str) -> FileModel:
162 """Parse a single C/C++ file and return a file model using tokenization"""
163 self.logger.debug("Parsing file: %s", file_path)
165 # Detect encoding
166 encoding = self._detect_encoding(file_path)
168 # Read file content
169 with open(file_path, "r", encoding=encoding) as f:
170 content = f.read()
172 # Tokenize the content
173 tokens = self.tokenizer.tokenize(content)
174 self.logger.debug("Tokenized file into %d tokens", len(tokens))
176 # Process preprocessor directives
177 self.preprocessor.add_defines_from_content(tokens)
178 processed_tokens = self.preprocessor.process_file(tokens)
179 self.logger.debug(
180 "Preprocessor processed %d tokens -> %d tokens",
181 len(tokens),
182 len(processed_tokens),
183 )
185 # Filter out whitespace and comments for structure finding
186 filtered_tokens = self.tokenizer.filter_tokens(processed_tokens)
187 structure_finder = StructureFinder(filtered_tokens)
189 # Parse different structures using tokenizer
190 structs = self._parse_structs_with_tokenizer(processed_tokens, structure_finder)
191 enums = self._parse_enums_with_tokenizer(processed_tokens, structure_finder)
192 unions = self._parse_unions_with_tokenizer(processed_tokens, structure_finder)
193 functions = self._parse_functions_with_tokenizer(
194 processed_tokens, structure_finder
195 )
196 aliases = self._parse_aliases_with_tokenizer(processed_tokens)
198 # "uses" fields will be updated when we have the full project model
 200 # Mapping typedef names to anonymous structs/enums/unions is handled by
 201 # AnonymousTypedefProcessor below; tag names live on the parsed objects
203 file_model = FileModel(
204 file_path=str(file_path),
205 structs=structs,
206 enums=enums,
207 unions=unions,
208 functions=functions,
209 globals=self._parse_globals_with_tokenizer(processed_tokens),
210 includes=self._parse_includes_with_tokenizer(processed_tokens),
211 macros=self._parse_macros_with_tokenizer(processed_tokens),
212 aliases=aliases,
213 # Tag names are now stored in struct/enum/union objects
214 )
216 # Process anonymous typedefs after initial parsing
217 anonymous_processor = AnonymousTypedefProcessor()
218 anonymous_processor.process_file_model(file_model)
220 return file_model
222 def _parse_structs_with_tokenizer(
223 self, tokens, structure_finder
224 ) -> Dict[str, "Struct"]:
225 """Parse struct definitions using tokenizer"""
227 structs = {}
228 struct_infos = structure_finder.find_structs()
230 for start_pos, end_pos, struct_name in struct_infos:
231 # Need to map back to original token positions
232 # Find the original token positions by looking at line/column info
233 original_start = self._find_original_token_pos(
234 tokens, structure_finder.tokens, start_pos
235 )
236 original_end = self._find_original_token_pos(
237 tokens, structure_finder.tokens, end_pos
238 )
240 if original_start is not None and original_end is not None:
241 # Extract field information from original token range
242 field_tuples = find_struct_fields(tokens, original_start, original_end)
244 # Convert to Field objects
245 fields = []
246 for field_name, field_type in field_tuples:
247 try:
248 fields.append(Field(field_name, field_type))
249 except ValueError as e:
250 self.logger.warning(
251 "Error creating field %s: %s", field_name, e
252 )
254 # For anonymous structs, use a special key that can be mapped later
255 if not struct_name:
256 struct_name = "__anonymous_struct__"
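# Illustrative input for this branch (a typedef with no struct tag):
#     typedef struct { int x; int y; } point_t;
# The body is keyed under the placeholder so AnonymousTypedefProcessor can
# attach it to its typedef name afterwards.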
258 # Extract tag name if this is a typedef struct
259 tag_name = ""
260 if struct_name and not struct_name.startswith("__anonymous"):
261 # Check if this struct has a typedef
262 tag_name = self._extract_tag_name_for_struct(tokens, struct_name)
 264 # Register the struct under its parsed name or the anonymous placeholder; the anonymous typedef processor resolves placeholder entries afterwards
265 if struct_name:
266 structs[struct_name] = Struct(
267 struct_name, fields, tag_name=tag_name, uses=[]
268 )
269 self.logger.debug(
270 "Parsed struct: %s with %d fields", struct_name, len(fields)
271 )
273 return structs
275 def _parse_enums_with_tokenizer(
276 self, tokens, structure_finder
277 ) -> Dict[str, "Enum"]:
278 """Parse enum definitions using tokenizer"""
279 enums = {}
280 enum_infos = structure_finder.find_enums()
282 for start_pos, end_pos, enum_name in enum_infos:
283 # Need to map back to original token positions
284 original_start = self._find_original_token_pos(
285 tokens, structure_finder.tokens, start_pos
286 )
287 original_end = self._find_original_token_pos(
288 tokens, structure_finder.tokens, end_pos
289 )
291 if original_start is not None and original_end is not None:
292 # Extract enum values from original token range
293 value_strs = find_enum_values(tokens, original_start, original_end)
294 values = []
295 for v in value_strs:
296 if "=" in v:
297 name, val = v.split("=", 1)
298 name = name.strip()
299 val = val.strip()
300 if name: # Only add if name is not empty
301 values.append(EnumValue(name=name, value=val))
302 else:
303 name = v.strip()
304 if name: # Only add if name is not empty
305 values.append(EnumValue(name=name))
307 # For anonymous enums, use a special key that can be mapped later
308 if not enum_name:
309 enum_name = "__anonymous_enum__"
311 # Extract tag name if this is a typedef enum
312 tag_name = ""
313 if enum_name and not enum_name.startswith("__anonymous"):
314 # Check if this enum has a typedef
315 tag_name = self._extract_tag_name_for_enum(tokens, enum_name)
317 enums[enum_name] = Enum(enum_name, values, tag_name=tag_name)
318 self.logger.debug(
319 "Parsed enum: %s with %d values", enum_name, len(values)
320 )
322 return enums
324 def _parse_unions_with_tokenizer(
325 self, tokens, structure_finder
326 ) -> Dict[str, "Union"]:
327 """Parse union definitions using tokenizer"""
328 from ..models import Field, Union
330 unions = {}
331 union_infos = structure_finder.find_unions()
333 for start_pos, end_pos, union_name in union_infos:
334 # Need to map back to original token positions
335 original_start = self._find_original_token_pos(
336 tokens, structure_finder.tokens, start_pos
337 )
338 original_end = self._find_original_token_pos(
339 tokens, structure_finder.tokens, end_pos
340 )
342 if original_start is not None and original_end is not None:
343 # Extract field information from original token range
344 field_tuples = find_struct_fields(tokens, original_start, original_end)
346 # Convert to Field objects
347 fields = []
348 for field_name, field_type in field_tuples:
349 try:
350 fields.append(Field(field_name, field_type))
351 except ValueError as e:
352 self.logger.warning(
353 "Error creating union field %s: %s", field_name, e
354 )
356 # For anonymous unions, use a special key that can be mapped later
357 if not union_name:
358 union_name = "__anonymous_union__"
360 # Extract tag name if this is a typedef union
361 tag_name = ""
362 if union_name and not union_name.startswith("__anonymous"):
363 # Check if this union has a typedef
364 tag_name = self._extract_tag_name_for_union(tokens, union_name)
366 unions[union_name] = Union(
367 union_name, fields, tag_name=tag_name, uses=[]
368 )
369 self.logger.debug(
370 "Parsed union: %s with %d fields", union_name, len(fields)
371 )
373 return unions
375 def _parse_functions_with_tokenizer(
376 self, tokens, structure_finder
377 ) -> List["Function"]:
378 """Parse function declarations/definitions using tokenizer"""
379 from ..models import Function
381 functions = []
382 function_infos = structure_finder.find_functions()
384 for (
385 start_pos,
386 end_pos,
387 func_name,
388 return_type,
389 is_declaration,
390 is_inline,
391 ) in function_infos:
392 # Map back to original token positions to parse parameters
393 original_start = self._find_original_token_pos(
394 tokens, structure_finder.tokens, start_pos
395 )
396 original_end = self._find_original_token_pos(
397 tokens, structure_finder.tokens, end_pos
398 )
400 parameters = []
401 if original_start is not None and original_end is not None:
402 # Parse parameters from the token range
403 parameters = self._parse_function_parameters(
404 tokens, original_start, original_end, func_name
405 )
407 try:
408 # Create function with declaration flag
409 function = Function(func_name, return_type, parameters)
410 # Add custom attributes to track if this is a declaration and if it's inline
411 function.is_declaration = is_declaration
412 function.is_inline = is_inline
413 functions.append(function)
414 self.logger.debug(
415 f"Parsed function: {func_name} with {len(parameters)} parameters (declaration: {is_declaration}, inline: {is_inline})"
416 )
417 except Exception as e:
418 self.logger.warning("Error creating function %s: %s", func_name, e)
420 return functions
422 def _parse_globals_with_tokenizer(self, tokens) -> List["Field"]:
423 """Parse global variables using tokenizer"""
424 from ..models import Field
426 globals_list = []
428 i = 0
429 while i < len(tokens):
430 # Skip preprocessor directives, comments, etc.
431 if tokens[i].type in [
432 TokenType.INCLUDE,
433 TokenType.DEFINE,
434 TokenType.COMMENT,
435 TokenType.WHITESPACE,
436 TokenType.NEWLINE,
437 ]:
438 i += 1
439 continue
441 # Skip preprocessor directives but keep their content
442 if tokens[i].type == TokenType.PREPROCESSOR:
443 i = self._skip_preprocessor_directives(tokens, i)
444 continue
446 # Skip function definitions (look for parentheses)
447 if self._looks_like_function(tokens, i):
448 i = self._skip_function(tokens, i)
449 continue
451 # Skip struct/enum/union definitions
452 if tokens[i].type in [
453 TokenType.STRUCT,
454 TokenType.ENUM,
455 TokenType.UNION,
456 TokenType.TYPEDEF,
457 ]:
458 i = self._skip_structure_definition(tokens, i)
459 continue
461 # Skip if we're inside a struct definition (look for opening brace)
462 if i > 0 and tokens[i - 1].type == TokenType.LBRACE:
463 # We're inside a struct, skip until closing brace
464 brace_count = 1
465 j = i
466 while j < len(tokens) and brace_count > 0:
467 if tokens[j].type == TokenType.LBRACE:
468 brace_count += 1
469 elif tokens[j].type == TokenType.RBRACE:
470 brace_count -= 1
471 j += 1
472 i = j
473 continue
475 # Skip macros and other preprocessor content
476 if tokens[i].type == TokenType.DEFINE:
477 # Skip the entire macro content (multi-line macros are now merged)
478 i += 1
479 continue
481 # Additional check: skip if we're inside any brace block (struct, function, etc.)
482 brace_count = 0
483 j = i - 1
484 while j >= 0:
485 if tokens[j].type == TokenType.RBRACE:
486 brace_count += 1
487 elif tokens[j].type == TokenType.LBRACE:
488 brace_count -= 1
489 if brace_count < 0:
490 # We're inside a brace block, skip this token
491 i += 1
492 break
493 j -= 1
494 else:
495 # Not inside a brace block, proceed with global variable parsing
496 global_info = self._parse_global_variable(tokens, i)
497 if global_info:
498 var_name, var_type, var_value = global_info
499 # Only add if it looks like a real global variable (not a fragment)
500 if (
501 var_name
502 and var_name.strip()
503 and var_type
504 and var_type.strip()
505 and not var_name.startswith("#")
506 and len(var_type) < 200
507 and not var_type.startswith("\\")
508 and not var_name.startswith("\\")
509 and "\\" not in var_type
510 and "\\" not in var_name
511 ):
512 try:
513 # Additional validation before creating Field
514 stripped_name = var_name.strip()
515 stripped_type = var_type.strip()
516 if stripped_name and stripped_type:
517 globals_list.append(
518 Field(
519 name=stripped_name,
520 type=stripped_type,
521 value=var_value,
522 )
523 )
524 self.logger.debug(
525 f"Parsed global: {stripped_name} : {stripped_type}"
526 )
527 except Exception as e:
528 self.logger.warning(
529 f"Error creating global field {var_name}: {e}"
530 )
531 i = self._skip_to_semicolon(tokens, i)
532 else:
533 i += 1
535 return globals_list
537 def _parse_includes_with_tokenizer(self, tokens) -> List[str]:
538 """Parse #include directives using tokenizer"""
539 includes = []
541 for token in tokens:
542 if token.type == TokenType.INCLUDE:
543 # Extract include filename from the token value
544 # e.g., "#include <stdio.h>" -> "stdio.h"
545 # e.g., '#include "header.h"' -> "header.h"
546 # e.g., "#include 'header.h'" -> "header.h"
547 import re
549 match = re.search(r'[<"\']([^>\'"]+)[>\'"]', token.value)
550 if match:
551 # Return just the filename without quotes or angle brackets
552 includes.append(match.group(1))
554 return includes
556 def _parse_macros_with_tokenizer(self, tokens) -> List[str]:
557 """Parse macro definitions using tokenizer"""
558 macros = []
560 for token in tokens:
561 if token.type == TokenType.DEFINE:
562 # Store the full macro definition for display flexibility
563 # e.g., "#define PI 3.14159" -> "#define PI 3.14159"
564 # e.g., "#define MIN(a, b) ((a) < (b) ? (a) : (b))" -> "#define MIN(a, b) ((a) < (b) ? (a) : (b))"
565 macro_definition = token.value.strip()
566 if macro_definition not in macros:
567 macros.append(macro_definition)
569 return macros
571 def _parse_aliases_with_tokenizer(self, tokens) -> Dict[str, "Alias"]:
572 """Parse type aliases (primitive or derived typedefs) using tokenizer"""
573 from ..models import Alias
575 aliases = {}
577 i = 0
578 while i < len(tokens):
579 if tokens[i].type == TokenType.TYPEDEF:
580 # Found typedef, parse it
581 typedef_info = self._parse_single_typedef(tokens, i)
582 if typedef_info:
583 typedef_name, original_type = typedef_info
585 # Only include if it's NOT a struct/enum/union typedef
586 if original_type not in ["struct", "enum", "union"]:
587 aliases[typedef_name] = Alias(
588 name=typedef_name, original_type=original_type, uses=[]
589 )
591 i += 1
593 return aliases
595 # _parse_typedef_relations_with_tokenizer method removed - tag names are now in struct/enum/union
597 def _extract_tag_name_for_struct(self, tokens, struct_name: str) -> str:
598 """Extract tag name for a struct if it has a typedef"""
599 i = 0
600 while i < len(tokens):
601 if tokens[i].type == TokenType.TYPEDEF:
602 typedef_info = self._parse_single_typedef(tokens, i)
603 if typedef_info:
604 typedef_name, original_type = typedef_info
605 if original_type == "struct" and typedef_name == struct_name:
606 # Extract the tag name from the typedef
607 return self._extract_tag_name_from_typedef(tokens, i)
608 i += 1
609 return ""
611 def _extract_tag_name_for_enum(self, tokens, enum_name: str) -> str:
612 """Extract tag name for an enum if it has a typedef"""
613 i = 0
614 while i < len(tokens):
615 if tokens[i].type == TokenType.TYPEDEF:
616 typedef_info = self._parse_single_typedef(tokens, i)
617 if typedef_info:
618 typedef_name, original_type = typedef_info
619 if original_type == "enum" and typedef_name == enum_name:
620 # Extract the tag name from the typedef
621 return self._extract_tag_name_from_typedef(tokens, i)
622 i += 1
623 return ""
625 def _extract_tag_name_for_union(self, tokens, union_name: str) -> str:
626 """Extract tag name for a union if it has a typedef"""
627 i = 0
628 while i < len(tokens):
629 if tokens[i].type == TokenType.TYPEDEF:
630 typedef_info = self._parse_single_typedef(tokens, i)
631 if typedef_info:
632 typedef_name, original_type = typedef_info
633 if original_type == "union" and typedef_name == union_name:
634 # Extract the tag name from the typedef
635 return self._extract_tag_name_from_typedef(tokens, i)
636 i += 1
637 return ""
639 def _extract_non_primitive_types(
640 self, type_str: str, available_types: Set[str]
641 ) -> List[str]:
642 """Extract non-primitive type names from a type string that exist in available_types"""
643 # Define primitive types
644 primitive_types = {
645 "void",
646 "char",
647 "short",
648 "int",
649 "long",
650 "float",
651 "double",
652 "signed",
653 "unsigned",
654 "const",
655 "volatile",
656 "static",
657 "extern",
658 "auto",
659 "register",
660 "inline",
661 "restrict",
662 "size_t",
663 "ptrdiff_t",
664 "int8_t",
665 "int16_t",
666 "int32_t",
667 "int64_t",
668 "uint8_t",
669 "uint16_t",
670 "uint32_t",
671 "uint64_t",
672 "intptr_t",
673 "uintptr_t",
674 "bool",
675 "true",
676 "false",
677 "NULL",
678 "nullptr",
679 }
681 # Remove common C keywords and operators
682 import re
684 # Split by common delimiters and operators
685 parts = re.split(r"[\[\]\(\)\{\}\s\*&,;]", type_str)
687 # Extract potential type names that exist in available_types
688 types = []
689 for part in parts:
690 part = part.strip()
691 if part and len(part) > 1 and part not in primitive_types:
692 # Check if it looks like a type name (starts with letter, contains letters/numbers/underscores)
693 if re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", part):
694 # Only include if it exists in available_types
695 if part in available_types:
696 types.append(part)
698 return list(set(types)) # Remove duplicates
700 def _find_c_files(
701 self, source_folder_path: Path, recursive_search: bool
702 ) -> List[Path]:
703 """Find all C/C++ files in the source folder"""
704 c_extensions = {".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx"}
705 files = []
707 self.logger.debug("Searching for files with extensions: %s", c_extensions)
709 try:
710 if recursive_search:
711 for ext in c_extensions:
712 try:
713 files.extend(source_folder_path.rglob(f"*{ext}"))
714 except (OSError, PermissionError) as e:
715 self.logger.warning("Error during recursive search for %s files: %s", ext, e)
716 # Continue with other extensions
717 else:
718 for ext in c_extensions:
719 try:
720 files.extend(source_folder_path.glob(f"*{ext}"))
721 except (OSError, PermissionError) as e:
722 self.logger.warning("Error during search for %s files: %s", ext, e)
723 # Continue with other extensions
724 except Exception as e:
725 raise OSError(f"Failed to search for C/C++ files in '{source_folder_path}': {e}")
727 # Filter out hidden files and common exclude patterns
728 filtered_files = []
729 exclude_patterns = {".git", "__pycache__", "node_modules", ".vscode", ".idea"}
731 for file_path in files:
732 try:
733 # Skip hidden files and directories
734 if any(part.startswith(".") for part in file_path.parts):
735 continue
737 # Skip common exclude patterns
738 if any(pattern in file_path.parts for pattern in exclude_patterns):
739 continue
741 # Verify the file is actually accessible
742 if not file_path.exists():
743 self.logger.debug("Skipping non-existent file: %s", file_path)
744 continue
746 if not file_path.is_file():
747 self.logger.debug("Skipping non-file item: %s", file_path)
748 continue
750 filtered_files.append(file_path)
751 except (OSError, PermissionError) as e:
752 self.logger.warning("Error accessing file %s: %s", file_path, e)
753 # Skip files we can't access
754 continue
756 self.logger.debug("Found %d C/C++ files after filtering", len(filtered_files))
757 return sorted(filtered_files)
759 def _detect_encoding(self, file_path: Path) -> str:
760 """Detect file encoding with platform-aware fallbacks"""
761 return detect_file_encoding(file_path)
763 def _find_original_token_pos(self, all_tokens, filtered_tokens, filtered_pos):
764 """Find the position in all_tokens that corresponds to filtered_tokens[filtered_pos]"""
765 if filtered_pos >= len(filtered_tokens):
766 return None
768 target_token = filtered_tokens[filtered_pos]
770 # Search for the token in all_tokens by line and column
771 for i, token in enumerate(all_tokens):
772 if (
773 token.line == target_token.line
774 and token.column == target_token.column
775 and token.value == target_token.value
776 ):
777 return i
779 return None
781 def _parse_single_typedef(self, tokens, start_pos):
782 """Parse a single typedef starting at the given position"""
783 # Skip 'typedef' keyword
784 pos = start_pos + 1
786 # Skip whitespace and comments
787 while pos < len(tokens) and tokens[pos].type in [
788 TokenType.WHITESPACE,
789 TokenType.COMMENT,
790 ]:
791 pos += 1
793 if pos >= len(tokens):
794 return None
796 # Check if it's a struct/enum/union typedef
797 if tokens[pos].type in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
798 # Look ahead to see if this complex type is immediately followed by a function-pointer declarator
799 # Pattern to detect: ... } ( * name ) ( ... )
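# e.g. (illustrative): typedef struct { int a; int b; } (*result_factory_t)(void);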
800 look = pos
801 # Find the matching closing brace of the outer struct/union/enum
802 if tokens[look].type in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
803 # Advance to the opening brace
804 while look < len(tokens) and tokens[look].type != TokenType.LBRACE:
805 look += 1
806 if look < len(tokens) and tokens[look].type == TokenType.LBRACE:
807 brace_count = 1
808 look += 1
809 while look < len(tokens) and brace_count > 0:
810 if tokens[look].type == TokenType.LBRACE:
811 brace_count += 1
812 elif tokens[look].type == TokenType.RBRACE:
813 brace_count -= 1
814 look += 1
815 # Now 'look' is token after the closing brace
816 j = look
817 # Skip whitespace/comments
818 while j < len(tokens) and tokens[j].type in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
819 j += 1
820 # Detect function-pointer declarator: ( * IDENT ) (
821 if (
822 j + 4 < len(tokens)
823 and tokens[j].type == TokenType.LPAREN
824 and tokens[j + 1].type == TokenType.ASTERISK
825 and tokens[j + 2].type == TokenType.IDENTIFIER
826 and tokens[j + 3].type == TokenType.RPAREN
827 and tokens[j + 4].type == TokenType.LPAREN
828 ):
829 typedef_name = tokens[j + 2].value
830 # Collect the full typedef original type up to the semicolon, preserving parentheses/brackets spacing
831 k = pos
832 formatted: list[str] = []
833 while k < len(tokens) and tokens[k].type != TokenType.SEMICOLON:
834 t = tokens[k]
835 if t.type in [TokenType.LPAREN, TokenType.RPAREN, TokenType.LBRACKET, TokenType.RBRACKET]:
836 formatted.append(t.value)
837 elif formatted and formatted[-1] not in ["(", ")", "[", "]"]:
838 # Prepend space before non-bracket tokens when previous isn't a bracket
839 formatted.append(" " + t.value)
840 else:
841 formatted.append(t.value)
842 k += 1
843 original_type = "".join(formatted)
844 # Clean excessive whitespace inside type
845 original_type = self._clean_type_string(original_type)
846 return (typedef_name, original_type)
847 # Fallback to standard complex typedef parsing
848 return self._parse_complex_typedef(tokens, pos)
850 # Collect all non-whitespace/comment tokens until semicolon
851 # But handle nested structures properly
852 all_tokens = []
853 brace_count = 0
854 paren_count = 0
856 while pos < len(tokens):
857 token = tokens[pos]
859 # Track nested braces and parentheses
860 if token.type == TokenType.LBRACE:
861 brace_count += 1
862 elif token.type == TokenType.RBRACE:
863 brace_count -= 1
864 elif token.type == TokenType.LPAREN:
865 paren_count += 1
866 elif token.type == TokenType.RPAREN:
867 paren_count -= 1
868 elif token.type == TokenType.SEMICOLON:
869 # Only treat semicolon as end if we're not inside nested structures
870 # For function pointer typedefs, we need to be outside the parameter list parentheses
871 if brace_count == 0 and paren_count == 0:
872 # We're outside any nested structures and parentheses
873 break
875 if token.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
876 all_tokens.append(token)
877 pos += 1
879 if len(all_tokens) < 2:
880 return None
882 # Function pointer typedef: typedef ret (*name)(params);
883 for i in range(len(all_tokens) - 3):
884 if (
885 all_tokens[i].type
886 in [
887 TokenType.IDENTIFIER,
888 TokenType.INT,
889 TokenType.VOID,
890 TokenType.CHAR,
891 TokenType.FLOAT,
892 TokenType.DOUBLE,
893 TokenType.LONG,
894 TokenType.SHORT,
895 TokenType.UNSIGNED,
896 TokenType.SIGNED,
897 ]
898 and all_tokens[i + 1].type == TokenType.LPAREN
899 and all_tokens[i + 2].type == TokenType.ASTERISK
900 and all_tokens[i + 3].type == TokenType.IDENTIFIER
901 ):
902 # Check if this is followed by a parameter list
903 if i + 4 < len(all_tokens) and all_tokens[i + 4].type == TokenType.RPAREN:
904 if i + 5 < len(all_tokens) and all_tokens[i + 5].type == TokenType.LPAREN:
905 # This is a function pointer with parameters - skip this pattern and use the complex logic
906 break
908 # Simple function pointer typedef without complex parameters
909 typedef_name = all_tokens[i + 3].value
910 # Fix: Properly format function pointer type - preserve spaces between tokens but not around parentheses
911 formatted_tokens = []
912 for j, token in enumerate(all_tokens):
913 if token.type in [TokenType.LPAREN, TokenType.RPAREN]:
914 # Don't add spaces around parentheses
915 formatted_tokens.append(token.value)
916 elif j > 0 and all_tokens[j - 1].type not in [
917 TokenType.LPAREN,
918 TokenType.RPAREN,
919 ]:
920 # Add space before token if previous token wasn't a parenthesis
921 formatted_tokens.append(" " + token.value)
922 else:
923 # No space before token
924 formatted_tokens.append(token.value)
925 original_type = "".join(formatted_tokens)
926 return (typedef_name, original_type)
928 # Complex function pointer typedef: typedef ret (*name)(complex_params);
929 # This handles cases where the function pointer has complex parameters that span multiple tokens
930 if len(all_tokens) >= 6:
931 # Look for pattern: type ( * name ) ( ... )
932 for i in range(len(all_tokens) - 5):
933 if (
934 all_tokens[i].type
935 in [
936 TokenType.IDENTIFIER,
937 TokenType.INT,
938 TokenType.VOID,
939 TokenType.CHAR,
940 TokenType.FLOAT,
941 TokenType.DOUBLE,
942 TokenType.LONG,
943 TokenType.SHORT,
944 TokenType.UNSIGNED,
945 TokenType.SIGNED,
946 ]
947 and all_tokens[i + 1].type == TokenType.LPAREN
948 and all_tokens[i + 2].type == TokenType.ASTERISK
949 and all_tokens[i + 3].type == TokenType.IDENTIFIER
950 and all_tokens[i + 4].type == TokenType.RPAREN
951 and all_tokens[i + 5].type == TokenType.LPAREN
952 ):
953 # Find the closing parenthesis for the parameter list
954 paren_count = 1
955 param_end = i + 6
956 while param_end < len(all_tokens) and paren_count > 0:
957 if all_tokens[param_end].type == TokenType.LPAREN:
958 paren_count += 1
959 elif all_tokens[param_end].type == TokenType.RPAREN:
960 paren_count -= 1
961 param_end += 1
963 if paren_count == 0:
964 typedef_name = all_tokens[i + 3].value
965 # Format the complete typedef properly
966 formatted_tokens = []
967 for j, token in enumerate(all_tokens):
968 if token.type in [TokenType.LPAREN, TokenType.RPAREN]:
969 # Don't add spaces around parentheses
970 formatted_tokens.append(token.value)
971 elif j > 0 and all_tokens[j - 1].type not in [
972 TokenType.LPAREN,
973 TokenType.RPAREN,
974 ]:
975 # Add space before token if previous token wasn't a parenthesis
976 formatted_tokens.append(" " + token.value)
977 else:
978 # No space before token
979 formatted_tokens.append(token.value)
980 original_type = "".join(formatted_tokens)
981 return (typedef_name, original_type)
983 # Array typedef: typedef type name[size];
984 for i in range(len(all_tokens)):
985 if (
986 all_tokens[i].type == TokenType.LBRACKET
987 and i > 0
988 and all_tokens[i - 1].type == TokenType.IDENTIFIER
989 ):
990 typedef_name = all_tokens[i - 1].value
991 # Fix: Properly format array type - preserve spaces between tokens but not around brackets
992 formatted_tokens = []
993 for j, token in enumerate(all_tokens):
994 if token.type in [TokenType.LBRACKET, TokenType.RBRACKET]:
995 # Don't add spaces around brackets
996 formatted_tokens.append(token.value)
997 elif j > 0 and all_tokens[j - 1].type not in [
998 TokenType.LBRACKET,
999 TokenType.RBRACKET,
1000 ]:
1001 # Add space before token if previous token wasn't a bracket
1002 formatted_tokens.append(" " + token.value)
1003 else:
1004 # No space before token
1005 formatted_tokens.append(token.value)
1006 original_type = "".join(formatted_tokens)
1007 return (typedef_name, original_type)
1009 # Pointer typedef: typedef type * name;
1010 for i in range(len(all_tokens) - 2):
1011 if (
1012 all_tokens[i].type == TokenType.ASTERISK
1013 and all_tokens[i + 1].type == TokenType.IDENTIFIER
1014 ):
1015 typedef_name = all_tokens[i + 1].value
1016 # Fix: Properly format pointer type - preserve spaces between tokens
1017 formatted_tokens = []
1018 for j, token in enumerate(all_tokens):
1019 if j > 0:
1020 # Add space before token
1021 formatted_tokens.append(" " + token.value)
1022 else:
1023 # No space before first token
1024 formatted_tokens.append(token.value)
1025 original_type = "".join(formatted_tokens)
1026 return (typedef_name, original_type)
1028 # Basic typedef: the last token is the typedef name, everything else is the type
1029 typedef_name = all_tokens[-1].value
1030 type_tokens = all_tokens[:-1]
1031 original_type = " ".join(t.value for t in type_tokens)
1032 original_type = self._clean_type_string(original_type)
1033 original_type = self._fix_array_bracket_spacing(original_type)
1034 return (typedef_name, original_type)
1036 def _parse_complex_typedef(self, tokens, start_pos):
1037 """Parse complex typedef (struct/enum/union)"""
1038 # Parse complex typedefs with proper structure detection
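# Illustrative example: "typedef struct Node { int v; } node_t;" yields
# ("node_t", "struct"); the struct body itself is parsed separately by
# _parse_structs_with_tokenizer.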
1040 # Find the typedef name by looking for the pattern after the closing brace
1041 brace_count = 0
1042 pos = start_pos
1044 # Find opening brace
1045 while pos < len(tokens) and tokens[pos].type != TokenType.LBRACE:
1046 pos += 1
1048 if pos >= len(tokens):
1049 return None
1051 # Skip to closing brace
1052 brace_count = 1
1053 pos += 1
1055 while pos < len(tokens) and brace_count > 0:
1056 if tokens[pos].type == TokenType.LBRACE:
1057 brace_count += 1
1058 elif tokens[pos].type == TokenType.RBRACE:
1059 brace_count -= 1
1060 pos += 1
1062 if brace_count > 0:
1063 return None
1065 # Find typedef name after closing brace
1066 while pos < len(tokens) and tokens[pos].type in [
1067 TokenType.WHITESPACE,
1068 TokenType.COMMENT,
1069 ]:
1070 pos += 1
1072 if pos < len(tokens) and tokens[pos].type == TokenType.IDENTIFIER:
1073 typedef_name = tokens[pos].value
1074 struct_type = tokens[start_pos].value # struct/enum/union
1075 return (typedef_name, struct_type)
1077 return None
1079 def _extract_tag_name_from_typedef(self, tokens, start_pos):
1080 """Extract the tag name from a typedef like 'typedef struct TagName { ... } TypedefName;'"""
1081 # Skip 'typedef' keyword
1082 pos = start_pos + 1
1084 # Skip whitespace and comments
1085 while pos < len(tokens) and tokens[pos].type in [
1086 TokenType.WHITESPACE,
1087 TokenType.COMMENT,
1088 ]:
1089 pos += 1
1091 if pos >= len(tokens):
1092 return ""
1094 # Check if it's a struct/enum/union
1095 if tokens[pos].type not in [TokenType.STRUCT, TokenType.ENUM, TokenType.UNION]:
1096 return ""
1098 # Skip struct/enum/union keyword
1099 pos += 1
1101 # Skip whitespace and comments
1102 while pos < len(tokens) and tokens[pos].type in [
1103 TokenType.WHITESPACE,
1104 TokenType.COMMENT,
1105 ]:
1106 pos += 1
1108 # Look for tag name (identifier before opening brace)
1109 if pos < len(tokens) and tokens[pos].type == TokenType.IDENTIFIER:
1110 tag_name = tokens[pos].value
1111 return tag_name
1113 return ""
1115 def _looks_like_function(self, tokens, start_pos):
1116 """Check if the token sequence starting at start_pos looks like a function"""
1117 # Look ahead for parentheses within a reasonable distance
1118 for i in range(start_pos, min(start_pos + 10, len(tokens))):
1119 if tokens[i].type == TokenType.LPAREN:
1120 return True
1121 if tokens[i].type in [
1122 TokenType.SEMICOLON,
1123 TokenType.LBRACE,
1124 TokenType.RBRACE,
1125 ]:
1126 return False
1127 return False
1129 def _skip_function(self, tokens, start_pos):
1130 """Skip over a function definition or declaration"""
1131 # Find the end (either semicolon for declaration or closing brace for definition)
1132 i = start_pos
1133 brace_count = 0
1134 paren_count = 0
1136 while i < len(tokens):
1137 if tokens[i].type == TokenType.LPAREN:
1138 paren_count += 1
1139 elif tokens[i].type == TokenType.RPAREN:
1140 paren_count -= 1
1141 elif tokens[i].type == TokenType.LBRACE:
1142 brace_count += 1
1143 elif tokens[i].type == TokenType.RBRACE:
1144 brace_count -= 1
1145 if brace_count == 0 and paren_count == 0:
1146 return i + 1
1147 elif (
1148 tokens[i].type == TokenType.SEMICOLON
1149 and paren_count == 0
1150 and brace_count == 0
1151 ):
1152 return i + 1
1153 i += 1
1155 return i
1157 def _skip_structure_definition(self, tokens, start_pos):
1158 """Skip over struct/enum/union/typedef definition"""
1159 i = start_pos
1160 brace_count = 0
1162 while i < len(tokens):
1163 if tokens[i].type == TokenType.LBRACE:
1164 brace_count += 1
1165 elif tokens[i].type == TokenType.RBRACE:
1166 brace_count -= 1
1167 if brace_count == 0:
1168 # Continue until semicolon
1169 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON:
1170 i += 1
1171 return i + 1 if i < len(tokens) else i
1172 elif tokens[i].type == TokenType.SEMICOLON and brace_count == 0:
1173 return i + 1
1174 i += 1
1176 return i
1178 def _parse_global_variable(self, tokens, start_pos):
1179 """Parse a global variable declaration starting at start_pos"""
1180 # Look for pattern: [static/extern] type name [= value];
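# Illustrative inputs and results (sketch):
#     "static int counter = 0;"  -> ("counter", "int", "0")
#     "uint8_t buffer[64];"      -> ("buffer", "uint8_t[64]", None)
# Leading static/extern/const qualifiers are skipped before the type is read.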
1181 i = start_pos
1182 collected_tokens = []
1184 # Collect tokens until semicolon
1185 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON:
1186 if tokens[i].type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
1187 collected_tokens.append(tokens[i])
1188 i += 1
1190 if len(collected_tokens) < 2:
1191 return None
1193 # Skip modifiers
1194 start_idx = 0
1195 while start_idx < len(collected_tokens) and collected_tokens[
1196 start_idx
1197 ].type in [TokenType.STATIC, TokenType.EXTERN, TokenType.CONST]:
1198 start_idx += 1
1200 # Check if there's an assignment
1201 assign_idx = None
1202 for j in range(start_idx, len(collected_tokens)):
1203 if collected_tokens[j].type == TokenType.ASSIGN:
1204 assign_idx = j
1205 break
1207 # Extract variable name and type
1208 if assign_idx is not None:
1209 # Has assignment: type name = value or type name[size] = value
1210 if assign_idx > start_idx + 1:
1211 # Check if this is an array declaration with assignment
1212 bracket_idx = None
1213 for j in range(assign_idx - 1, start_idx, -1):
1214 if collected_tokens[j].type == TokenType.RBRACKET:
1215 bracket_idx = j
1216 break
1218 if bracket_idx is not None:
1219 # Array declaration with assignment: find the identifier before the opening bracket
1220 for j in range(bracket_idx - 1, start_idx, -1):
1221 if collected_tokens[j].type == TokenType.LBRACKET:
1222 # Found opening bracket, look for identifier before it
1223 for k in range(j - 1, start_idx, -1):
1224 if collected_tokens[k].type == TokenType.IDENTIFIER:
1225 var_name = collected_tokens[k].value
1226 type_tokens = collected_tokens[start_idx:k]
1227 # Format array type properly
1228 formatted_type = []
1229 for idx, token in enumerate(type_tokens):
1230 if idx > 0:
1231 formatted_type.append(" " + token.value)
1232 else:
1233 formatted_type.append(token.value)
1234 # Add array brackets without spaces
1235 array_size = collected_tokens[j + 1].value if j + 1 < bracket_idx else ""
1236 var_type = "".join(formatted_type) + "[" + array_size + "]"
1237 var_type = self._clean_type_string(var_type)
1238 value_tokens = collected_tokens[assign_idx + 1 :]
1239 var_value = " ".join(t.value for t in value_tokens)
1240 # Clean the value string to remove excessive whitespace and newlines
1241 var_value = self._clean_value_string(var_value)
1242 return (var_name, var_type, var_value)
1243 break
1244 else:
1245 # Regular assignment: type name = value
1246 var_name = collected_tokens[assign_idx - 1].value
1247 type_tokens = collected_tokens[start_idx : assign_idx - 1]
1248 value_tokens = collected_tokens[assign_idx + 1 :]
1249 var_type = " ".join(t.value for t in type_tokens)
1250 var_type = self._clean_type_string(var_type)
1251 var_type = self._fix_array_bracket_spacing(var_type)
1252 var_value = " ".join(t.value for t in value_tokens)
1253 # Clean the value string to remove excessive whitespace and newlines
1254 var_value = self._clean_value_string(var_value)
1255 return (var_name, var_type, var_value)
1256 else:
1257 # No assignment: type name or type name[size]
1258 if len(collected_tokens) > start_idx + 1:
1259 # Check if this is an array declaration
1260 bracket_idx = None
1261 for j in range(len(collected_tokens) - 1, start_idx, -1):
1262 if collected_tokens[j].type == TokenType.RBRACKET:
1263 bracket_idx = j
1264 break
1266 if bracket_idx is not None:
1267 # Array declaration: find the identifier before the opening bracket
1268 for j in range(bracket_idx - 1, start_idx, -1):
1269 if collected_tokens[j].type == TokenType.LBRACKET:
1270 # Found opening bracket, look for identifier before it
1271 for k in range(j - 1, start_idx, -1):
1272 if collected_tokens[k].type == TokenType.IDENTIFIER:
1273 var_name = collected_tokens[k].value
1274 type_tokens = collected_tokens[start_idx:k]
1275 # Format array type properly - preserve spaces between tokens but not around brackets
1276 formatted_type = []
1277 for idx, token in enumerate(type_tokens):
1278 if idx > 0:
1279 formatted_type.append(" " + token.value)
1280 else:
1281 formatted_type.append(token.value)
1282 # Add array brackets without spaces
1283 array_size = collected_tokens[j + 1].value if j + 1 < bracket_idx else ""
1284 var_type = "".join(formatted_type) + "[" + array_size + "]"
1285 var_type = self._clean_type_string(var_type)
1286 return (var_name, var_type, None)
1287 break
1288 else:
1289 # Regular variable: last token is the name
1290 var_name = collected_tokens[-1].value
1291 type_tokens = collected_tokens[start_idx:-1]
1292 var_type = " ".join(t.value for t in type_tokens)
1293 var_type = self._clean_type_string(var_type)
1294 var_type = self._fix_array_bracket_spacing(var_type)
1295 return (var_name, var_type, None)
1297 return None
1299 def _skip_to_semicolon(self, tokens, start_pos):
1300 """Skip to the next semicolon"""
1301 i = start_pos
1302 while i < len(tokens) and tokens[i].type != TokenType.SEMICOLON:
1303 i += 1
1304 return i + 1 if i < len(tokens) else i
1306 def _skip_preprocessor_directives(self, tokens, start_pos):
1307 """Skip preprocessor directives but keep their content for parsing"""
1308 # This method is deprecated - use the PreprocessorManager instead
1309 i = start_pos
1310 while i < len(tokens) and tokens[i].type == TokenType.PREPROCESSOR:
1311 # Skip the preprocessor directive itself
1312 i += 1
1313 return i
1315 def _parse_function_parameters(self, tokens, start_pos, end_pos, func_name):
1316 """Parse function parameters from token range"""
1318 parameters = []
1320 # Find the opening parenthesis for the function
1321 paren_start = None
1322 paren_end = None
1324 for i in range(start_pos, min(end_pos + 1, len(tokens))):
1325 if tokens[i].type == TokenType.IDENTIFIER and tokens[i].value == func_name:
1326 # Look for opening parenthesis after function name
1327 for j in range(i + 1, min(end_pos + 1, len(tokens))):
1328 if tokens[j].type == TokenType.LPAREN:
1329 paren_start = j
1330 break
1331 elif tokens[j].type not in [
1332 TokenType.WHITESPACE,
1333 TokenType.COMMENT,
1334 ]:
1335 break
1336 break
1338 if paren_start is None:
1339 return parameters
1341 # Find matching closing parenthesis
1342 paren_depth = 1
1343 for i in range(paren_start + 1, min(end_pos + 1, len(tokens))):
1344 if tokens[i].type == TokenType.LPAREN:
1345 paren_depth += 1
1346 elif tokens[i].type == TokenType.RPAREN:
1347 paren_depth -= 1
1348 if paren_depth == 0:
1349 paren_end = i
1350 break
1352 if paren_end is None:
1353 return parameters
1355 # Parse parameter tokens between parentheses
1356 param_tokens = []
1357 for i in range(paren_start + 1, paren_end):
1358 if tokens[i].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
1359 param_tokens.append(tokens[i])
1361 # If no parameters or just "void", return empty list
1362 if not param_tokens or (
1363 len(param_tokens) == 1 and param_tokens[0].value == "void"
1364 ):
1365 return parameters
1367 # Split parameters by commas, but handle function pointers correctly
1368 current_param = []
1369 paren_depth = 0
1370 for token in param_tokens:
1371 if token.type == TokenType.LPAREN:
1372 paren_depth += 1
1373 elif token.type == TokenType.RPAREN:
1374 paren_depth -= 1
1375 elif token.type == TokenType.COMMA and paren_depth == 0:
1376 # Only split on commas that are not inside parentheses
1377 if current_param:
1378 param = self._parse_single_parameter(current_param)
1379 if param:
1380 parameters.append(param)
1381 current_param = []
1382 continue
1384 current_param.append(token)
1386 # Handle last parameter
1387 if current_param:
1388 param = self._parse_single_parameter(current_param)
1389 if param:
1390 parameters.append(param)
1392 return parameters
1394 def _parse_single_parameter(self, param_tokens):
1395 """Parse a single function parameter from tokens"""
1396 from ..models import Field
1398 if not param_tokens:
1399 return None
1401 # Handle variadic parameters (three consecutive dots)
1402 if len(param_tokens) == 3 and all(t.value == "." for t in param_tokens):
1403 return Field(name="...", type="...")
1405 # Handle variadic parameters (single ... token)
1406 if len(param_tokens) == 1 and param_tokens[0].value == "...":
1407 return Field(name="...", type="...")
1409 # Handle function pointer parameters: type (*name)(params)
1410 if len(param_tokens) >= 5:
1411 # Look for pattern: type ( * name ) ( params )
1412 for i in range(len(param_tokens) - 4):
1413 if (
1414 param_tokens[i].type == TokenType.LPAREN
1415 and param_tokens[i + 1].type == TokenType.ASTERISK
1416 and param_tokens[i + 2].type == TokenType.IDENTIFIER
1417 and param_tokens[i + 3].type == TokenType.RPAREN
1418 and param_tokens[i + 4].type == TokenType.LPAREN
1419 ):
1420 # Found function pointer pattern
1421 func_name = param_tokens[i + 2].value
1423 # Find the closing parenthesis for the parameter list
1424 paren_count = 1
1425 param_end = i + 5
1426 while param_end < len(param_tokens) and paren_count > 0:
1427 if param_tokens[param_end].type == TokenType.LPAREN:
1428 paren_count += 1
1429 elif param_tokens[param_end].type == TokenType.RPAREN:
1430 paren_count -= 1
1431 param_end += 1
1433 if paren_count == 0:
1434 # Extract the type (everything before the function pointer)
1435 type_tokens = param_tokens[:i]
1436 param_type = " ".join(t.value for t in type_tokens)
1438 # Extract the function pointer part
1439 func_ptr_tokens = param_tokens[i:param_end]
1440 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1442 # Combine type and function pointer
1443 full_type = (param_type + " " + func_ptr_type).strip()
1445 # Fix array bracket spacing
1446 full_type = self._fix_array_bracket_spacing(full_type)
1448 return Field(name=func_name, type=full_type)
1449 else:
1450 # Incomplete function pointer - try to reconstruct
1451 type_tokens = param_tokens[:i]
1452 param_type = " ".join(t.value for t in type_tokens)
1453 func_ptr_tokens = param_tokens[i:]
1454 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1455 full_type = (param_type + " " + func_ptr_type).strip()
1456 full_type = self._fix_array_bracket_spacing(full_type)
1457 return Field(name=func_name, type=full_type)
1459 # Also look for pattern: type ( * name ) ( params ) with spaces
1460 for i in range(len(param_tokens) - 4):
1461 if (
1462 param_tokens[i].type == TokenType.LPAREN
1463 and param_tokens[i + 1].type == TokenType.ASTERISK
1464 and param_tokens[i + 2].type == TokenType.IDENTIFIER
1465 and param_tokens[i + 3].type == TokenType.RPAREN
1466 and param_tokens[i + 4].type == TokenType.LPAREN
1467 ):
1468 # Found function pointer pattern
1469 func_name = param_tokens[i + 2].value
1471 # Find the closing parenthesis for the parameter list
1472 paren_count = 1
1473 param_end = i + 5
1474 while param_end < len(param_tokens) and paren_count > 0:
1475 if param_tokens[param_end].type == TokenType.LPAREN:
1476 paren_count += 1
1477 elif param_tokens[param_end].type == TokenType.RPAREN:
1478 paren_count -= 1
1479 param_end += 1
1481 if paren_count == 0:
1482 # Extract the type (everything before the function pointer)
1483 type_tokens = param_tokens[:i]
1484 param_type = " ".join(t.value for t in type_tokens)
1486 # Extract the function pointer part
1487 func_ptr_tokens = param_tokens[i:param_end]
1488 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1490 # Combine type and function pointer
1491 full_type = (param_type + " " + func_ptr_type).strip()
1493 # Fix array bracket spacing
1494 full_type = self._fix_array_bracket_spacing(full_type)
1496 return Field(name=func_name, type=full_type)
1497 else:
1498 # Incomplete function pointer - try to reconstruct
1499 type_tokens = param_tokens[:i]
1500 param_type = " ".join(t.value for t in type_tokens)
1501 func_ptr_tokens = param_tokens[i:]
1502 func_ptr_type = " ".join(t.value for t in func_ptr_tokens)
1503 full_type = (param_type + " " + func_ptr_type).strip()
1504 full_type = self._fix_array_bracket_spacing(full_type)
1505 return Field(name=func_name, type=full_type)
1507 # For parameters like "int x" or "const char *name" or "char* argv[]"
1508 if len(param_tokens) >= 2:
1509 # Check if the last token is a closing bracket (array parameter)
1510 if param_tokens[-1].type == TokenType.RBRACKET:
1511 # Find the opening bracket to get the array size
1512 bracket_start = None
1513 for i in range(len(param_tokens) - 1, -1, -1):
1514 if param_tokens[i].type == TokenType.LBRACKET:
1515 bracket_start = i
1516 break
1518 if bracket_start is not None:
1519 # Extract the parameter name (last identifier before the opening bracket)
1520 param_name = None
1521 for i in range(bracket_start - 1, -1, -1):
1522 if param_tokens[i].type == TokenType.IDENTIFIER:
1523 param_name = param_tokens[i].value
1524 break
1526 if param_name:
1527 # Extract the type (everything before the parameter name)
1528 type_tokens = param_tokens[:i]
1529 param_type = " ".join(t.value for t in type_tokens)
1531 # Add the array brackets to the type
1532 array_size = ""
1533 if bracket_start + 1 < len(param_tokens) - 1:
1534 # There's content between brackets
1535 array_content = param_tokens[bracket_start + 1:-1]
1536 array_size = " ".join(t.value for t in array_content)
1538 param_type = param_type + "[" + array_size + "]"
1540 # Fix array bracket spacing
1541 param_type = self._fix_array_bracket_spacing(param_type)
1543 return Field(name=param_name, type=param_type)
1544 else:
1545 # Regular parameter: last token is the parameter name
1546 param_name = param_tokens[-1].value
1547 type_tokens = param_tokens[:-1]
1548 param_type = " ".join(t.value for t in type_tokens)
1550 # Fix array bracket spacing and pointer spacing
1551 param_type = self._fix_array_bracket_spacing(param_type)
1552 param_type = self._fix_pointer_spacing(param_type)
1554 # Handle unnamed parameters (just type)
1555 if param_name in [
1556 "void",
1557 "int",
1558 "char",
1559 "float",
1560 "double",
1561 "long",
1562 "short",
1563 "unsigned",
1564 "signed",
1565 ]:
1566 # This is just a type without a name
1567 return Field(name="unnamed", type=param_type + " " + param_name)
1569 # Additional validation before creating Field
1570 if param_name and param_name.strip() and param_type and param_type.strip():
1571 return Field(name=param_name.strip(), type=param_type.strip())
1572 else:
1573 # Fallback for invalid parameters - try to reconstruct the full parameter
1574 full_param = " ".join(t.value for t in param_tokens)
1575 full_param = self._fix_array_bracket_spacing(full_param)
1576 if full_param.strip():
1577 return Field(name="unnamed", type=full_param.strip())
1578 else:
1579 return Field(name="unnamed", type="unknown")
1580 elif len(param_tokens) == 1:
1581 # Single token - might be just type (like "void") or name
1582 token_value = param_tokens[0].value
1583 if token_value in [
1584 "void",
1585 "int",
1586 "char",
1587 "float",
1588 "double",
1589 "long",
1590 "short",
1591 "unsigned",
1592 "signed",
1593 ]:
1594 return Field(name="unnamed", type=token_value)
1595 else:
1596 # If we can't determine the type, use the token value as type
1597 if token_value and token_value.strip():
1598 return Field(name="unnamed", type=token_value.strip())
1599 else:
1600 return Field(name="unnamed", type="unknown")
1602 return None
1604 def _fix_array_bracket_spacing(self, type_str: str) -> str:
1605 """Fix spacing around array brackets in type strings"""
1606 # First clean the type string to remove newlines
1607 type_str = self._clean_type_string(type_str)
1608 # Replace patterns like "type[ size ]" with "type[size]"
1609 import re
1610 # Remove spaces around array brackets
1611 type_str = re.sub(r'\s*\[\s*', '[', type_str)
1612 type_str = re.sub(r'\s*\]\s*', ']', type_str)
1613 return type_str
1615 def _fix_pointer_spacing(self, type_str: str) -> str:
1616 """Fix spacing around pointer asterisks in type strings"""
1617 import re
 1618 # Collapse any run of spaced asterisks ("type * *" -> "type **",
 1619 # "type * * *" -> "type ***"); the lookahead works for any pointer depth
 1620 # in a single pass and leaves a lone "type * name" untouched.
 1621 type_str = re.sub(r'\*\s+(?=\*)', '*', type_str)
 1622 return type_str
1624 def _clean_type_string(self, type_str: str) -> str:
1625 """Clean type string by removing newlines and normalizing whitespace"""
1626 if not type_str:
1627 return type_str
1628 # Replace newlines with spaces and normalize whitespace
1629 cleaned = type_str.replace('\n', ' ')
1630 # Normalize multiple spaces to single space
1631 import re
1632 cleaned = re.sub(r'\s+', ' ', cleaned)
1633 # Strip leading/trailing whitespace
1634 cleaned = cleaned.strip()
1635 return cleaned
1637 def _clean_value_string(self, value_str: str) -> str:
1638 """Clean value string by removing excessive whitespace and newlines"""
1639 if not value_str:
1640 return value_str
1641 # Replace newlines with spaces and normalize whitespace
1642 cleaned = value_str.replace('\n', ' ')
1643 # Normalize multiple spaces to single space
1644 import re
1645 cleaned = re.sub(r'\s+', ' ', cleaned)
1646 # Strip leading/trailing whitespace
1647 cleaned = cleaned.strip()
1648 # Remove excessive spaces around braces and operators
1649 cleaned = re.sub(r'\s*{\s*', '{', cleaned)
1650 cleaned = re.sub(r'\s*}\s*', '}', cleaned)
1651 cleaned = re.sub(r'\s*,\s*', ', ', cleaned)
1652 cleaned = re.sub(r'\s*&\s*', '&', cleaned)
1653 return cleaned
1655 def _get_timestamp(self) -> str:
1656 """Get current timestamp string"""
1657 from datetime import datetime
1659 return datetime.now().isoformat()
1662class Parser:
1663 """Main parser class for Step 1: Parse C code files and generate model.json"""
1665 def __init__(self):
1666 self.c_parser = CParser()
1667 self.logger = logging.getLogger(__name__)
1669 def parse(
1670 self,
1671 source_folders: "List[str]",
1672 output_file: str = "model.json",
1673 recursive_search: bool = True,
1674 config: "Config" = None,
1675 ) -> str:
1676 """Parse C/C++ projects and generate model.json
1678 Args:
1679 source_folders: List of source folder directories within the project
1680 output_file: Path to the output model.json file
1681 recursive_search: Whether to search subdirectories recursively
1682 config: Configuration object for filtering and processing
1684 Returns:
1685 Path to the generated model.json file
1686 """
1687 # Enhanced validation for source_folders
1688 if not isinstance(source_folders, list):
1689 raise TypeError(f"source_folders must be a list of strings, got: {type(source_folders)}")
1691 if not source_folders:
1692 raise ValueError("At least one source folder must be provided")
1694 # Validate all items are strings and not empty
1695 for i, folder in enumerate(source_folders):
1696 if not isinstance(folder, str):
1697 raise TypeError(f"All source folders must be strings, got {type(folder)} at index {i}: {folder}")
1698 if not folder.strip():
1699 raise ValueError(f"Source folder at index {i} cannot be empty or whitespace: {repr(folder)}")
1701 self.logger.info(
1702 f"Step 1: Parsing C/C++ project with {len(source_folders)} source folders"
1703 )
1705 # Get project name from config or use default
1706 project_name = (
1707 getattr(config, "project_name", "C_Project") if config else "C_Project"
1708 )
1710 # Parse each source folder and combine results
1711 all_files = {}
1712 total_structs = 0
1713 total_enums = 0
1714 total_functions = 0
1715 failed_folders = []
1717 for i, source_folder in enumerate(source_folders):
1718 self.logger.info(
1719 f"Parsing source folder {i+1}/{len(source_folders)}: {source_folder}"
1720 )
1722 try:
1723 # Parse the individual source folder
1724 model = self.c_parser.parse_project(
1725 source_folder, recursive_search, config
1726 )
1728 all_files.update(model.files)
1730 # Update totals
1731 total_structs += sum(len(f.structs) for f in model.files.values())
1732 total_enums += sum(len(f.enums) for f in model.files.values())
1733 total_functions += sum(len(f.functions) for f in model.files.values())
1735 self.logger.info(
1736 f"Successfully parsed source folder {source_folder}: {len(model.files)} files"
1737 )
1739 except Exception as e:
1740 self.logger.error(
1741 "Failed to parse source folder %s: %s", source_folder, e
1742 )
1743 failed_folders.append((source_folder, str(e)))
1745 # If this is the only source folder, re-raise the error
1746 if len(source_folders) == 1:
1747 raise
1749 # For multiple source folders, continue with others but log the failure
1750 self.logger.warning(
1751 "Continuing with other source folders despite failure in %s", source_folder
1752 )
1754 # If all source folders failed, raise an error
1755 if failed_folders and len(failed_folders) == len(source_folders):
1756 error_msg = "All source folders failed to parse:\n"
1757 for folder, error in failed_folders:
1758 error_msg += f" - {folder}: {error}\n"
1759 raise RuntimeError(error_msg)
1761 # If some folders failed, log a warning
1762 if failed_folders:
1763 self.logger.warning(
1764 f"Failed to parse {len(failed_folders)} out of {len(source_folders)} source folders"
1765 )
1767 # Create combined project model
1768 combined_model = ProjectModel(
1769 project_name=project_name,
1770 source_folder=(
1771 ",".join(source_folders)
1772 if len(source_folders) > 1
1773 else source_folders[0]
1774 ),
1775 files=all_files,
1776 )
1778 # Update all uses fields across the entire combined project
1779 combined_model.update_uses_fields()
1781 # Save combined model to JSON file
1782 try:
1783 combined_model.save(output_file)
1784 except Exception as e:
1785 raise RuntimeError(f"Failed to save model to {output_file}: {e}") from e
1787 # Step 1.5: Verify model sanity
1788 self.logger.info("Step 1.5: Verifying model sanity...")
1789 from .verifier import ModelVerifier
1791 verifier = ModelVerifier()
1792 is_valid, issues = verifier.verify_model(combined_model)
1794 if not is_valid:
1795 self.logger.warning(
1796 f"Model verification found {len(issues)} issues - model may contain parsing errors"
1797 )
1798 # Continue processing but warn about potential issues
1799 else:
1800 self.logger.info("Model verification passed - all values look sane")
1802 self.logger.info("Step 1 complete! Model saved to: %s", output_file)
1803 self.logger.info(
1804 f"Found {len(all_files)} total files across {len(source_folders)} source folder(s)"
1805 )
1807 # Print summary
1808 self.logger.info(
1809 f"Summary: {total_structs} structs, {total_enums} enums, "
1810 f"{total_functions} functions"
1811 )
1813 return output_file