Coverage for src/c2puml/core/parser_tokenizer.py: 87%

865 statements  

coverage.py v7.10.4, created at 2025-08-20 03:53 +0000

1#!/usr/bin/env python3 

2""" 

3Tokenizer module for C to PlantUML converter - Helper library for tokenizing C/C++ code 

4""" 

5 

6import logging 

7import re 

8from dataclasses import dataclass 

9from enum import Enum 

10from typing import List, Optional, Tuple 

11 

12 

13class TokenType(Enum): 

14 """Token types for C/C++ lexical analysis""" 

15 

16 # Keywords 

17 STRUCT = "STRUCT" 

18 ENUM = "ENUM" 

19 UNION = "UNION" 

20 TYPEDEF = "TYPEDEF" 

21 STATIC = "STATIC" 

22 EXTERN = "EXTERN" 

23 INLINE = "INLINE" 

24 LOCAL_INLINE = "LOCAL_INLINE" 

25 CONST = "CONST" 

26 VOID = "VOID" 

27 

28 # Data types 

29 CHAR = "CHAR" 

30 INT = "INT" 

31 FLOAT = "FLOAT" 

32 DOUBLE = "DOUBLE" 

33 LONG = "LONG" 

34 SHORT = "SHORT" 

35 UNSIGNED = "UNSIGNED" 

36 SIGNED = "SIGNED" 

37 

38 # Operators and punctuation 

39 LBRACE = "LBRACE" # { 

40 RBRACE = "RBRACE" # } 

41 LPAREN = "LPAREN" # ( 

42 RPAREN = "RPAREN" # ) 

43 LBRACKET = "LBRACKET" # [ 

44 RBRACKET = "RBRACKET" # ] 

45 SEMICOLON = "SEMICOLON" # ; 

46 COMMA = "COMMA" # , 

47 ASSIGN = "ASSIGN" # = 

48 ASTERISK = "ASTERISK" # * 

49 AMPERSAND = "AMPERSAND" # & 

50 ARROW = "ARROW" # -> 

51 

52 # Literals and identifiers 

53 IDENTIFIER = "IDENTIFIER" 

54 NUMBER = "NUMBER" 

55 STRING = "STRING" 

56 CHAR_LITERAL = "CHAR_LITERAL" 

57 

58 # Preprocessor 

59 INCLUDE = "INCLUDE" 

60 DEFINE = "DEFINE" 

61 PREPROCESSOR = "PREPROCESSOR" 

62 

63 # Special 

64 COMMENT = "COMMENT" 

65 WHITESPACE = "WHITESPACE" 

66 NEWLINE = "NEWLINE" 

67 EOF = "EOF" 

68 UNKNOWN = "UNKNOWN" 

69 

70 

71@dataclass 

72class Token: 

73 """Represents a single token in C/C++ code""" 

74 

75 type: TokenType 

76 value: str 

77 line: int 

78 column: int 

79 

80 def __repr__(self) -> str: 

81 return f"Token({self.type.name}, '{self.value}', {self.line}:{self.column})" 

82 

83 
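# Usage note (hypothetical example value, not part of this module): Token.line
# is 1-based and Token.column is a 0-based offset within the line, as assigned
# by CTokenizer.tokenize below.
_EXAMPLE_TOKEN = Token(TokenType.IDENTIFIER, "counter", line=3, column=4)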

84class CTokenizer: 

85 """Tokenizer for C/C++ source code""" 

86 

87 # Keywords mapping 

88 KEYWORDS = { 

89 "struct": TokenType.STRUCT, 

90 "enum": TokenType.ENUM, 

91 "union": TokenType.UNION, 

92 "typedef": TokenType.TYPEDEF, 

93 "static": TokenType.STATIC, 

94 "extern": TokenType.EXTERN, 

95 "inline": TokenType.INLINE, 

96 "local_inline": TokenType.LOCAL_INLINE, 

97 "const": TokenType.CONST, 

98 "void": TokenType.VOID, 

99 "char": TokenType.CHAR, 

100 "int": TokenType.INT, 

101 "float": TokenType.FLOAT, 

102 "double": TokenType.DOUBLE, 

103 "long": TokenType.LONG, 

104 "short": TokenType.SHORT, 

105 "unsigned": TokenType.UNSIGNED, 

106 "signed": TokenType.SIGNED, 

107 } 

108 

109 # Single character tokens 

110 SINGLE_CHAR_TOKENS = { 

111 "{": TokenType.LBRACE, 

112 "}": TokenType.RBRACE, 

113 "(": TokenType.LPAREN, 

114 ")": TokenType.RPAREN, 

115 "[": TokenType.LBRACKET, 

116 "]": TokenType.RBRACKET, 

117 ";": TokenType.SEMICOLON, 

118 ",": TokenType.COMMA, 

119 "=": TokenType.ASSIGN, 

120 "*": TokenType.ASTERISK, 

121 "&": TokenType.AMPERSAND, 

122 } 

123 

124 # Two character tokens 

125 TWO_CHAR_TOKENS = { 

126 "->": TokenType.ARROW, 

127 } 

128 

129 def __init__(self): 

130 self.logger = logging.getLogger(__name__) 

131 

132 # Compiled regex patterns for efficiency 

133 self.patterns = { 

134 "identifier": re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*"), 

135 "number": re.compile( 

136 r"0[xX][0-9a-fA-F]+[uUlL]*|0[bB][01]+[uUlL]*|0[0-7]+[uUlL]*|" 

137 r"\d+\.\d*([eE][+-]?\d+)?[fFlL]*|\d+([eE][+-]?\d+)?[fFlL]*|\d+[uUlL]*" 

138 ), 

139 "string": re.compile(r'"([^"\\]|\\.)*"'), 

140 "char": re.compile(r"'([^'\\]|\\.)'"), 

141 "comment_single": re.compile(r"//.*"), 

142 "comment_multi": re.compile(r"/\*.*?\*/", re.DOTALL), 

143 "preprocessor": re.compile( 

144 r"#(include|define|ifdef|ifndef|if|endif|elif|else|pragma|error|warning)\b.*" 

145 ), 

146 "whitespace": re.compile(r"[ \t]+"), 

147 "newline": re.compile(r"\n"), 

148 } 

149 

150 def tokenize(self, content: str) -> List[Token]: 

151 """Tokenize C/C++ source code content""" 

152 tokens = [] 

153 lines = content.splitlines() 

154 total_lines = len(lines) 

155 line_num = 1 

156 in_multiline_string = False 

157 multiline_string_value = "" 

158 multiline_string_start_line = 0 

159 multiline_string_start_col = 0 

160 in_multiline_comment = False 

161 multiline_comment_value = "" 

162 multiline_comment_start_line = 0 

163 multiline_comment_start_col = 0 

164 

165 for idx, line in enumerate(lines): 

166 if in_multiline_string: 

167 multiline_string_value += "\n" + line 

168 if '"' in line: 

169 # End of multiline string 

170 in_multiline_string = False 

171 tokens.append( 

172 Token( 

173 TokenType.STRING, 

174 multiline_string_value, 

175 multiline_string_start_line, 

176 multiline_string_start_col, 

177 ) 

178 ) 

179 elif in_multiline_comment: 

180 # Continue multi-line comment 

181 multiline_comment_value += "\n" + line 

182 comment_end = line.find("*/") 

183 if comment_end != -1: 

184 # End of multi-line comment 

185 in_multiline_comment = False 

186 multiline_comment_value = multiline_comment_value[ 

187 : multiline_comment_value.rfind("*/") + 2 

188 ] 

189 tokens.append( 

190 Token( 

191 TokenType.COMMENT, 

192 multiline_comment_value, 

193 multiline_comment_start_line, 

194 multiline_comment_start_col, 

195 ) 

196 ) 

197 else: 

198 line_tokens = self._tokenize_line(line, line_num) 

199 # Check if a string starts but does not end on this line 

200 if ( 

201 line_tokens 

202 and line_tokens[-1].type == TokenType.STRING 

203 and not line_tokens[-1].value.endswith('"') 

204 ): 

205 in_multiline_string = True 

206 multiline_string_value = line_tokens[-1].value 

207 multiline_string_start_line = line_tokens[-1].line 

208 multiline_string_start_col = line_tokens[-1].column 

209 tokens.extend(line_tokens[:-1]) 

210 # Check if a multi-line comment starts but does not end on this line 

211 elif ( 

212 line_tokens 

213 and line_tokens[-1].type == TokenType.COMMENT 

214 and line_tokens[-1].value.startswith("/*") 

215 and not line_tokens[-1].value.endswith("*/") 

216 ): 

217 in_multiline_comment = True 

218 multiline_comment_value = line_tokens[-1].value 

219 multiline_comment_start_line = line_tokens[-1].line 

220 multiline_comment_start_col = line_tokens[-1].column 

221 tokens.extend(line_tokens[:-1]) 

222 else: 

223 tokens.extend(line_tokens) 

224 

225 if line_num < total_lines: 

226 tokens.append(Token(TokenType.NEWLINE, "\n", line_num, len(line))) 

227 line_num += 1 

228 

229 if in_multiline_string: 

230 tokens.append( 

231 Token( 

232 TokenType.STRING, 

233 multiline_string_value, 

234 multiline_string_start_line, 

235 multiline_string_start_col, 

236 ) 

237 ) 

238 if in_multiline_comment: 

239 tokens.append( 

240 Token( 

241 TokenType.COMMENT, 

242 multiline_comment_value, 

243 multiline_comment_start_line, 

244 multiline_comment_start_col, 

245 ) 

246 ) 

247 

248 # Post-process tokens to merge multi-line macros 

249 tokens = self._merge_multiline_macros(tokens, lines) 

250 

251 tokens.append( 

252 Token(TokenType.EOF, "", total_lines, len(lines[-1]) if lines else 0) 

253 ) 

254 

255 return tokens 

256 

257 def _tokenize_line(self, line: str, line_num: int) -> List[Token]: 

258 """Tokenize a single line of code""" 

259 tokens = [] 

260 pos = 0 

261 

262 while pos < len(line): 

263 # Skip whitespace but track it 

264 if match := self.patterns["whitespace"].match(line, pos): 

265 tokens.append(Token(TokenType.WHITESPACE, match.group(), line_num, pos)) 

266 pos = match.end() 

267 continue 

268 

269 # Comments 

270 if match := self.patterns["comment_single"].match(line, pos): 

271 tokens.append(Token(TokenType.COMMENT, match.group(), line_num, pos)) 

272 pos = len(line) # Rest of line is comment 

273 continue 

274 

275 # Multi-line comments - check for /* starting at the current position

276 if line[pos:].startswith("/*"): 

277 # Find the end of the comment 

278 comment_end = line.find("*/", pos) 

279 if comment_end != -1: 

280 # Comment ends on this line 

281 comment_text = line[pos : comment_end + 2] 

282 tokens.append(Token(TokenType.COMMENT, comment_text, line_num, pos)) 

283 pos = comment_end + 2 

284 continue 

285 else: 

286 # Comment continues to next line - create a partial comment token 

287 comment_text = line[pos:] 

288 tokens.append(Token(TokenType.COMMENT, comment_text, line_num, pos)) 

289 pos = len(line) 

290 continue 

291 

292 # Preprocessor directives 

293 if match := self.patterns["preprocessor"].match(line, pos): 

294 value = match.group() 

295 if value.startswith("#include"): 

296 tokens.append(Token(TokenType.INCLUDE, value, line_num, pos)) 

297 elif value.startswith("#define"): 

298 tokens.append(Token(TokenType.DEFINE, value, line_num, pos)) 

299 else: 

300 tokens.append(Token(TokenType.PREPROCESSOR, value, line_num, pos)) 

301 pos = len(line) # Rest of line is preprocessor 

302 continue 

303 

304 # String literals 

305 if ( 

306 line[pos] == '"' 

307 or ( 

308 pos > 0 

309 and line[pos - 1] in ["L", "u", "U", "R"] 

310 and line[pos] == '"' 

311 ) 

312 or (pos > 1 and line[pos - 2 : pos] == "u8" and line[pos] == '"') 

313 ): 

314 # Handle string literals with possible prefixes 

315 string_start = pos 

316 if pos > 1 and line[pos - 2 : pos] == "u8":

317 string_start -= 2 

318 elif pos > 0 and line[pos - 1] in ["L", "u", "U", "R"]:

319 string_start -= 1 

320 pos += 1 # Skip opening quote 

321 while pos < len(line): 

322 if line[pos] == '"': 

323 # Found closing quote 

324 string_text = line[string_start : pos + 1] 

325 tokens.append( 

326 Token(TokenType.STRING, string_text, line_num, string_start) 

327 ) 

328 pos += 1 

329 break 

330 elif line[pos] == "\\": 

331 pos += 2 

332 else: 

333 pos += 1 

334 else: 

335 string_text = line[string_start:] 

336 tokens.append( 

337 Token(TokenType.STRING, string_text, line_num, string_start) 

338 ) 

339 pos = len(line) 

340 continue 

341 

342 # Character literals 

343 if match := self.patterns["char"].match(line, pos): 

344 tokens.append( 

345 Token(TokenType.CHAR_LITERAL, match.group(), line_num, pos) 

346 ) 

347 pos = match.end() 

348 continue 

349 

350 # Numbers 

351 if match := self.patterns["number"].match(line, pos): 

352 tokens.append(Token(TokenType.NUMBER, match.group(), line_num, pos)) 

353 pos = match.end() 

354 continue 

355 

356 # Single character tokens 

357 if line[pos] in self.SINGLE_CHAR_TOKENS: 

358 token_type = self.SINGLE_CHAR_TOKENS[line[pos]] 

359 tokens.append(Token(token_type, line[pos], line_num, pos)) 

360 pos += 1 

361 continue 

362 

363 # Multi-character operators (<<, >>, ->): only -> has a dedicated token type; << and >> fall back to UNKNOWN since TokenType defines no OPERATOR member

364 if line[pos : pos + 2] in ["<<", ">>", "->"]: 

365 op = line[pos : pos + 2] 

366 if op == "->": 

367 tokens.append(Token(TokenType.ARROW, op, line_num, pos)) 

368 else: 

369 tokens.append( 

370 Token( 

371 ( 

372 TokenType.OPERATOR 

373 if hasattr(TokenType, "OPERATOR") 

374 else TokenType.UNKNOWN 

375 ), 

376 op, 

377 line_num, 

378 pos, 

379 ) 

380 ) 

381 pos += 2 

382 continue 

383 

384 # Identifiers and keywords 

385 if match := self.patterns["identifier"].match(line, pos): 

386 value = match.group() 

387 token_type = self.KEYWORDS.get(value.lower(), TokenType.IDENTIFIER)  # case-insensitive, so spellings such as LOCAL_INLINE also match

388 tokens.append(Token(token_type, value, line_num, pos)) 

389 pos = match.end() 

390 continue 

391 

392 # Unknown character (always one at a time) 

393 tokens.append(Token(TokenType.UNKNOWN, line[pos], line_num, pos)) 

394 pos += 1 

395 

396 return tokens 

397 

398 def filter_tokens( 

399 self, tokens: List[Token], exclude_types: Optional[List[TokenType]] = None 

400 ) -> List[Token]: 

401 """Filter tokens by type""" 

402 if exclude_types is None: 

403 exclude_types = [ 

404 TokenType.WHITESPACE, 

405 TokenType.COMMENT, 

406 TokenType.NEWLINE, 

407 TokenType.EOF, 

408 ] 

409 

410 return [token for token in tokens if token.type not in exclude_types] 

411 

412 def _merge_multiline_macros( 

413 self, tokens: List[Token], lines: List[str] 

414 ) -> List[Token]: 

415 """Merge multi-line macro tokens that span multiple lines with backslashes""" 

416 merged_tokens = [] 

417 i = 0 

418 

419 while i < len(tokens): 

420 token = tokens[i] 

421 

422 if token.type == TokenType.DEFINE and token.value.rstrip().endswith("\\"): 

423 # Found a multi-line macro, merge with subsequent lines 

424 macro_content = token.value 

425 current_line = token.line 

426 

427 # Continue merging lines until we find one that doesn't end with backslash 

428 while macro_content.rstrip().endswith("\\"): 

429 # Remove the backslash and add a newline 

430 macro_content = macro_content.rstrip()[:-1] + "\n" 

431 current_line += 1 

432 

433 # Find the next line content 

434 if current_line <= len(lines): 

435 next_line = lines[current_line - 1] 

436 macro_content += next_line 

437 else: 

438 break 

439 

440 # Create a new token with the merged content 

441 merged_tokens.append( 

442 Token(TokenType.DEFINE, macro_content, token.line, token.column) 

443 ) 

444 else: 

445 merged_tokens.append(token) 

446 

447 i += 1 

448 

449 return merged_tokens 

450 

451 
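# Usage sketch (the _example_* helper and its sample input are hypothetical,
# not part of this module's API): tokenize raw C source, then drop layout
# tokens with filter_tokens before any structural analysis.
def _example_tokenize_and_filter() -> List[Token]:
    source = "static int counter = 0; /* running total */\n#define MAX 10\n"
    tokenizer = CTokenizer()
    tokens = tokenizer.tokenize(source)
    # filter_tokens defaults to dropping WHITESPACE, COMMENT, NEWLINE and EOF
    return tokenizer.filter_tokens(tokens)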

452class StructureFinder: 

453 """Helper class to find C/C++ structures in token streams""" 

454 

455 def __init__(self, tokens: List[Token]): 

456 self.tokens = tokens 

457 self.pos = 0 

458 self.logger = logging.getLogger(__name__) 

459 

460 def find_structs(self) -> List[Tuple[int, int, str]]: 

461 """Find struct definitions in token stream 

462 

463 Returns: 

464 List of tuples (start_pos, end_pos, struct_name) 

465 """ 

466 structs = [] 

467 self.pos = 0 

468 

469 while self.pos < len(self.tokens): 

470 if self._current_token_is(TokenType.STRUCT): 

471 struct_info = self._parse_struct() 

472 if struct_info: 

473 structs.append(struct_info) 

474 elif self._current_token_is(TokenType.TYPEDEF): 

475 typedef_struct = self._parse_typedef_struct() 

476 if typedef_struct: 

477 structs.append(typedef_struct) 

478 else: 

479 self.pos += 1 

480 

481 return structs 

482 

483 def find_enums(self) -> List[Tuple[int, int, str]]: 

484 """Find enum definitions in token stream""" 

485 enums = [] 

486 self.pos = 0 

487 

488 while self.pos < len(self.tokens): 

489 if self._current_token_is(TokenType.ENUM): 

490 enum_info = self._parse_enum() 

491 if enum_info: 

492 enums.append(enum_info) 

493 elif self._current_token_is(TokenType.TYPEDEF): 

494 typedef_enum = self._parse_typedef_enum() 

495 if typedef_enum: 

496 enums.append(typedef_enum) 

497 else: 

498 self.pos += 1 

499 

500 return enums 

501 

502 def find_functions(self) -> List[Tuple[int, int, str, str, bool, bool]]: 

503 """Find all function declarations and definitions in the token stream 

504 

505 Returns: 

506 List of tuples (start_pos, end_pos, func_name, return_type, is_declaration, is_inline) 

507 """ 

508 functions = [] 

509 self.pos = 0 

510 

511 while self.pos < len(self.tokens): 

512 result = self._parse_function() 

513 if result: 

514 functions.append(result) 

515 

516 return functions 

517 

518 def find_unions(self) -> List[Tuple[int, int, str]]: 

519 """Find union definitions in token stream""" 

520 unions = [] 

521 self.pos = 0 

522 

523 while self.pos < len(self.tokens): 

524 if self._current_token_is(TokenType.UNION): 

525 union_info = self._parse_union() 

526 if union_info: 

527 unions.append(union_info) 

528 elif self._current_token_is(TokenType.TYPEDEF): 

529 typedef_union = self._parse_typedef_union() 

530 if typedef_union: 

531 unions.append(typedef_union) 

532 else: 

533 self.pos += 1 

534 

535 return unions 

536 

537 def _current_token_is(self, token_type: TokenType) -> bool: 

538 """Check if current token is of specified type""" 

539 return self.pos < len(self.tokens) and self.tokens[self.pos].type == token_type 

540 

541 def _peek_token(self, offset: int = 1) -> Optional[Token]: 

542 """Peek at token at current position + offset""" 

543 peek_pos = self.pos + offset 

544 return self.tokens[peek_pos] if peek_pos < len(self.tokens) else None 

545 

546 def _advance(self) -> Optional[Token]: 

547 """Advance to next token and return current""" 

548 if self.pos < len(self.tokens): 

549 token = self.tokens[self.pos] 

550 self.pos += 1 

551 return token 

552 return None 

553 

554 def _find_matching_brace(self, start_pos: int) -> Optional[int]: 

555 """Find matching closing brace starting from open brace position""" 

556 if ( 

557 start_pos >= len(self.tokens) 

558 or self.tokens[start_pos].type != TokenType.LBRACE 

559 ): 

560 return None 

561 

562 depth = 1 

563 pos = start_pos + 1 

564 

565 while pos < len(self.tokens) and depth > 0: 

566 if self.tokens[pos].type == TokenType.LBRACE: 

567 depth += 1 

568 elif self.tokens[pos].type == TokenType.RBRACE: 

569 depth -= 1 

570 pos += 1 

571 

572 return pos - 1 if depth == 0 else None 

573 

574 def _parse_struct(self) -> Optional[Tuple[int, int, str]]: 

575 """Parse struct definition starting at current position""" 

576 start_pos = self.pos 

577 

578 # Consume 'struct' keyword 

579 if not self._current_token_is(TokenType.STRUCT): 

580 return None 

581 self._advance() 

582 

583 # Check if this struct is inside a cast expression by looking backwards 

584 check_pos = start_pos - 1 

585 while check_pos >= 0: 

586 if self.tokens[check_pos].type == TokenType.LPAREN: 

587 # Found opening parenthesis before struct - this is likely a cast expression 

588 return None 

589 elif self.tokens[check_pos].type in [TokenType.STRUCT, TokenType.TYPEDEF]: 

590 # Found another struct or typedef - this is not a cast expression 

591 break 

592 elif self.tokens[check_pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]: 

593 # Found some other token - this is not a cast expression 

594 break 

595 check_pos -= 1 

596 

597 # Skip whitespace 

598 while self.pos < len(self.tokens) and self._current_token_is( 

599 TokenType.WHITESPACE 

600 ): 

601 self.pos += 1 

602 

603 # Check if this is a cast expression: (struct type*) 

604 if self._current_token_is(TokenType.LPAREN): 

605 # Look ahead to see if this is a cast expression 

606 check_pos = self.pos + 1 

607 while check_pos < len(self.tokens): 

608 if self.tokens[check_pos].type == TokenType.RPAREN: 

609 # Found closing parenthesis - this is likely a cast expression 

610 return None 

611 elif self.tokens[check_pos].type == TokenType.LBRACE: 

612 # Found opening brace - this is a struct definition 

613 break 

614 elif self.tokens[check_pos].type == TokenType.SEMICOLON: 

615 # Found semicolon - this is a variable declaration 

616 return None 

617 check_pos += 1 

618 

619 # Get struct tag name (optional for anonymous structs) 

620 struct_tag = "" 

621 if self._current_token_is(TokenType.IDENTIFIER): 

622 struct_tag = self._advance().value 

623 

624 # Look for opening brace or semicolon 

625 while self.pos < len(self.tokens): 

626 if self._current_token_is(TokenType.LBRACE): 

627 # Found opening brace - this is a struct definition 

628 break 

629 elif self._current_token_is(TokenType.SEMICOLON): 

630 # Found semicolon before opening brace - this is a variable declaration 

631 return None 

632 self.pos += 1 

633 

634 if not self._current_token_is(TokenType.LBRACE): 

635 # This is a variable declaration 

636 return None 

637 

638 # Find matching closing brace 

639 brace_pos = self.pos 

640 end_brace_pos = self._find_matching_brace(brace_pos) 

641 

642 if end_brace_pos is None: 

643 return None 

644 

645 # Look for struct name after closing brace 

646 name_pos = end_brace_pos + 1 

647 struct_name = struct_tag # Default to tag name 

648 

649 # Check if this is a typedef struct by looking backwards 

650 is_typedef = False 

651 check_pos = start_pos - 1 

652 while check_pos >= 0: 

653 if self.tokens[check_pos].type == TokenType.TYPEDEF: 

654 is_typedef = True 

655 break 

656 elif self.tokens[check_pos].type in [ 

657 TokenType.STRUCT, 

658 TokenType.LBRACE, 

659 TokenType.RBRACE, 

660 ]: 

661 break 

662 check_pos -= 1 

663 

664 if is_typedef: 

665 # For typedef struct, look for the typedef name after the closing brace 

666 while name_pos < len(self.tokens): 

667 if self.tokens[name_pos].type == TokenType.IDENTIFIER: 

668 struct_name = self.tokens[name_pos].value 

669 break 

670 elif self.tokens[name_pos].type == TokenType.SEMICOLON: 

671 break 

672 name_pos += 1 

673 else: 

674 # Check if there's a variable name after the brace 

675 while name_pos < len(self.tokens): 

676 if self.tokens[name_pos].type == TokenType.IDENTIFIER: 

677 # This is a variable name 

678 struct_name = "" 

679 break 

680 elif self.tokens[name_pos].type == TokenType.SEMICOLON: 

681 break 

682 name_pos += 1 

683 

684 # Find semicolon (for struct definitions) 

685 self.pos = end_brace_pos + 1 

686 while self.pos < len(self.tokens) and not self._current_token_is( 

687 TokenType.SEMICOLON 

688 ): 

689 self.pos += 1 

690 

691 end_pos = self.pos 

692 return (start_pos, end_pos, struct_name) 

693 

694 def _parse_typedef_struct(self) -> Optional[Tuple[int, int, str]]: 

695 """Parse typedef struct definition""" 

696 start_pos = self.pos 

697 

698 # Consume 'typedef' 

699 if not self._current_token_is(TokenType.TYPEDEF): 

700 return None 

701 self._advance() 

702 

703 # Look for 'struct' 

704 if not self._current_token_is(TokenType.STRUCT): 

705 # Not a typedef struct, reset position 

706 self.pos = start_pos + 1 

707 return None 

708 

709 # Skip 'struct' 

710 self._advance() 

711 

712 # Skip whitespace 

713 while self.pos < len(self.tokens) and self._current_token_is( 

714 TokenType.WHITESPACE 

715 ): 

716 self.pos += 1 

717 

718 # Get struct tag name (optional) 

719 struct_tag = "" 

720 if self._current_token_is(TokenType.IDENTIFIER): 

721 struct_tag = self._advance().value 

722 

723 # Skip whitespace 

724 while self.pos < len(self.tokens) and self._current_token_is( 

725 TokenType.WHITESPACE 

726 ): 

727 self.pos += 1 

728 

729 # Check if this is a forward declaration (no braces) 

730 if not self._current_token_is(TokenType.LBRACE): 

731 # This is a forward declaration, skip it 

732 self.pos = start_pos + 1 

733 return None 

734 

735 # Find matching closing brace 

736 end_brace_pos = self._find_matching_brace(self.pos) 

737 if end_brace_pos is None: 

738 self.pos = start_pos + 1 

739 return None 

740 

741 # Look for typedef name after closing brace 

742 typedef_name = "" 

743 name_pos = end_brace_pos + 1 

744 while name_pos < len(self.tokens): 

745 if self.tokens[name_pos].type == TokenType.IDENTIFIER: 

746 typedef_name = self.tokens[name_pos].value 

747 break 

748 elif self.tokens[name_pos].type == TokenType.SEMICOLON: 

749 break 

750 name_pos += 1 

751 

752 # Find semicolon 

753 while ( 

754 name_pos < len(self.tokens) 

755 and not self.tokens[name_pos].type == TokenType.SEMICOLON 

756 ): 

757 name_pos += 1 

758 

759 end_pos = name_pos 

760 return (start_pos, end_pos, typedef_name) 

761 

762 def _parse_enum(self) -> Optional[Tuple[int, int, str]]: 

763 """Parse enum definition starting at current position""" 

764 start_pos = self.pos 

765 

766 # Consume 'enum' keyword 

767 if not self._current_token_is(TokenType.ENUM): 

768 return None 

769 self._advance() 

770 

771 # Skip whitespace 

772 while self.pos < len(self.tokens) and self._current_token_is( 

773 TokenType.WHITESPACE 

774 ): 

775 self.pos += 1 

776 

777 # Get enum tag name (optional for anonymous enums) 

778 enum_tag = "" 

779 if self._current_token_is(TokenType.IDENTIFIER): 

780 enum_tag = self._advance().value 

781 

782 # Find opening brace 

783 while self.pos < len(self.tokens) and not self._current_token_is( 

784 TokenType.LBRACE 

785 ): 

786 self.pos += 1 

787 

788 if not self._current_token_is(TokenType.LBRACE): 

789 return None 

790 

791 # Find matching closing brace 

792 brace_pos = self.pos 

793 end_brace_pos = self._find_matching_brace(brace_pos) 

794 

795 if end_brace_pos is None: 

796 return None 

797 

798 # Look for enum name after closing brace 

799 name_pos = end_brace_pos + 1 

800 enum_name = enum_tag # Default to tag name 

801 

802 # Check if this is a typedef enum by looking backwards 

803 is_typedef = False 

804 check_pos = start_pos - 1 

805 while check_pos >= 0: 

806 if self.tokens[check_pos].type == TokenType.TYPEDEF: 

807 is_typedef = True 

808 break 

809 elif self.tokens[check_pos].type in [ 

810 TokenType.ENUM, 

811 TokenType.LBRACE, 

812 TokenType.RBRACE, 

813 ]: 

814 break 

815 check_pos -= 1 

816 

817 if is_typedef: 

818 # For typedef enum, look for the typedef name after the closing brace 

819 while name_pos < len(self.tokens): 

820 if self.tokens[name_pos].type == TokenType.IDENTIFIER: 

821 enum_name = self.tokens[name_pos].value 

822 break 

823 elif self.tokens[name_pos].type == TokenType.SEMICOLON: 

824 break 

825 name_pos += 1 

826 elif not enum_tag: 

827 # Anonymous enum - check if there's a variable name after the brace 

828 while name_pos < len(self.tokens): 

829 if self.tokens[name_pos].type == TokenType.IDENTIFIER: 

830 # This is a variable name 

831 enum_name = "" 

832 break 

833 elif self.tokens[name_pos].type == TokenType.SEMICOLON: 

834 break 

835 name_pos += 1 

836 

837 # Find semicolon 

838 self.pos = end_brace_pos + 1 

839 while self.pos < len(self.tokens) and not self._current_token_is( 

840 TokenType.SEMICOLON 

841 ): 

842 self.pos += 1 

843 

844 end_pos = self.pos 

845 return (start_pos, end_pos, enum_name) 

846 

847 def _parse_typedef_enum(self) -> Optional[Tuple[int, int, str]]: 

848 """Parse typedef enum definition""" 

849 start_pos = self.pos 

850 

851 # Consume 'typedef' 

852 if not self._current_token_is(TokenType.TYPEDEF): 

853 return None 

854 self._advance() 

855 

856 # Look for 'enum' 

857 if not self._current_token_is(TokenType.ENUM): 

858 # Not a typedef enum, reset position 

859 self.pos = start_pos + 1 

860 return None 

861 

862 # Parse the enum part - this will return the tag name (e.g., StatusEnum_tag) 

863 enum_info = self._parse_enum() 

864 if not enum_info: 

865 self.pos = start_pos + 1 

866 return None 

867 

868 # For typedef enums, we want to return the tag name, not the typedef name 

869 # The typedef name will be handled separately in the parser 

870 return enum_info 

871 

872 def _parse_function(self) -> Optional[Tuple[int, int, str, str, bool, bool]]: 

873 """Parse function declaration/definition 

874 

875 Returns: 

876 Tuple of (start_pos, end_pos, func_name, return_type, is_declaration, is_inline) 

877 """ 

878 start_pos = self.pos 

879 

880 # Look for function pattern: [modifiers] return_type function_name (params) 

881 while self.pos < len(self.tokens): 

882 token = self.tokens[self.pos] 

883 

884 # If we hit a parenthesis, check if this is a function 

885 if token.type == TokenType.LPAREN: 

886 # Look backwards for function name 

887 if ( 

888 self.pos > 0 

889 and self.tokens[self.pos - 1].type == TokenType.IDENTIFIER 

890 ): 

891 func_name = self.tokens[self.pos - 1].value 

892 func_name_pos = self.pos - 1 

893 

894 # Look backwards from function name to find return type 

895 # Start from just before the function name 

896 return_type_end = func_name_pos - 1 

897 return_type_start = return_type_end 

898 

899 # Skip backwards over whitespace and comments 

900 while return_type_start >= 0: 

901 token_type = self.tokens[return_type_start].type 

902 if token_type in [ 

903 TokenType.WHITESPACE, 

904 TokenType.COMMENT, 

905 TokenType.NEWLINE, 

906 ]: 

907 return_type_start -= 1 

908 else: 

909 break 

910 

911 # If we found a non-whitespace token, that's the end of the return type 

912 # Find the start by looking backwards from there 

913 if return_type_start >= 0: 

914 return_type_end = return_type_start 

915 return_type_start = return_type_end 

916 

917 # Storage-class and inline modifiers are treated as part of the return type below

918 

919 # Collect all tokens that are part of the return type (including modifiers) 

920 return_type_tokens = [] 

921 

922 # Look back at most 10 tokens to capture multi-token return types 

923 max_lookback = max(0, func_name_pos - 10) 

924 current_pos = return_type_start 

925 

926 # Collect tokens backwards until we hit a limit or non-return-type token 

927 while current_pos >= max_lookback: 

928 token_type = self.tokens[current_pos].type 

929 if token_type in [ 

930 TokenType.IDENTIFIER, 

931 TokenType.INT, 

932 TokenType.VOID, 

933 TokenType.CHAR, 

934 TokenType.FLOAT, 

935 TokenType.DOUBLE, 

936 TokenType.LONG, 

937 TokenType.SHORT, 

938 TokenType.UNSIGNED, 

939 TokenType.SIGNED, 

940 TokenType.ASTERISK, 

941 TokenType.CONST, 

942 TokenType.STATIC, 

943 TokenType.EXTERN, 

944 TokenType.INLINE, 

945 TokenType.LOCAL_INLINE, 

946 ]: 

947 return_type_tokens.insert(0, self.tokens[current_pos]) 

948 current_pos -= 1 

949 elif token_type in [ 

950 TokenType.WHITESPACE, 

951 TokenType.COMMENT, 

952 TokenType.NEWLINE, 

953 ]: 

954 # Skip whitespace and continue looking 

955 current_pos -= 1 

956 else: 

957 break 

958 

959 # Extract return type 

960 if return_type_tokens: 

961 return_type = " ".join( 

962 t.value for t in return_type_tokens 

963 ).strip() 

964 

965 # Check if function is inline 

966 is_inline = any( 

967 token.type in [TokenType.INLINE, TokenType.LOCAL_INLINE] 

968 for token in return_type_tokens 

969 ) 

970 

971 # Find end of function (either ; for declaration or { for definition) 

972 end_pos = self._find_function_end(self.pos) 

973 if end_pos: 

974 # Determine if this is a declaration or definition 

975 is_declaration = self._is_function_declaration(end_pos) 

976 self.pos = end_pos + 1 

977 return ( 

978 start_pos, 

979 end_pos, 

980 func_name, 

981 return_type, 

982 is_declaration, 

983 is_inline, 

984 ) 

985 

986 self.pos += 1 

987 

988 # Prevent infinite loops - if we've gone too far, this isn't a function 

989 if self.pos - start_pos > 50: 

990 break 

991 

992 # Reset position if no function found 

993 self.pos = start_pos + 1 

994 return None 

995 

996 def _is_function_declaration(self, end_pos: int) -> bool: 

997 """Check if the function at end_pos is a declaration (ends with ;) or definition (ends with })""" 

998 if end_pos >= len(self.tokens): 

999 return False 

1000 

1001 # Look backwards from end_pos to find the last significant token 

1002 pos = end_pos 

1003 while pos >= 0: 

1004 token_type = self.tokens[pos].type 

1005 if token_type not in [ 

1006 TokenType.WHITESPACE, 

1007 TokenType.COMMENT, 

1008 TokenType.NEWLINE, 

1009 ]: 

1010 return token_type == TokenType.SEMICOLON 

1011 pos -= 1 

1012 

1013 return False 

1014 

1015 def _find_function_end(self, start_pos: int) -> Optional[int]: 

1016 """Find end of function declaration or definition""" 

1017 pos = start_pos 

1018 

1019 # Find matching closing parenthesis 

1020 if pos >= len(self.tokens) or self.tokens[pos].type != TokenType.LPAREN: 

1021 return None 

1022 

1023 depth = 1 

1024 pos += 1 

1025 

1026 while pos < len(self.tokens) and depth > 0: 

1027 if self.tokens[pos].type == TokenType.LPAREN: 

1028 depth += 1 

1029 elif self.tokens[pos].type == TokenType.RPAREN: 

1030 depth -= 1 

1031 pos += 1 

1032 

1033 if depth > 0: 

1034 return None 

1035 

1036 # Look for either ; (declaration) or { (definition) 

1037 while pos < len(self.tokens): 

1038 if self.tokens[pos].type == TokenType.SEMICOLON: 

1039 return pos 

1040 elif self.tokens[pos].type == TokenType.LBRACE: 

1041 # Function definition - find matching brace 

1042 end_brace = self._find_matching_brace(pos) 

1043 return end_brace if end_brace is not None else pos

1044 pos += 1 

1045 

1046 return None 

1047 

1048 def _parse_union(self) -> Optional[Tuple[int, int, str]]: 

1049 """Parse union definition""" 

1050 if not self._current_token_is(TokenType.UNION): 

1051 return None 

1052 

1053 start_pos = self.pos 

1054 self._advance() # Consumes 'union' 

1055 

1056 # Skip whitespace 

1057 while self.pos < len(self.tokens) and self._current_token_is( 

1058 TokenType.WHITESPACE 

1059 ): 

1060 self.pos += 1 

1061 

1062 # Get union tag name (optional for anonymous unions) 

1063 union_tag = "" 

1064 if self._current_token_is(TokenType.IDENTIFIER): 

1065 union_tag = self._advance().value 

1066 

1067 # Find opening brace 

1068 while self.pos < len(self.tokens) and not self._current_token_is( 

1069 TokenType.LBRACE 

1070 ): 

1071 self.pos += 1 

1072 

1073 if self.pos >= len(self.tokens): 

1074 return None 

1075 

1076 # Find matching closing brace 

1077 end_pos = self._find_matching_brace(self.pos) 

1078 if end_pos is None: 

1079 return None 

1080 

1081 # Look for union name after closing brace (for typedefs or named unions) 

1082 union_name = union_tag # Default to tag name 

1083 

1084 # Skip to semicolon 

1085 self.pos = end_pos + 1 

1086 while self.pos < len(self.tokens) and not self._current_token_is( 

1087 TokenType.SEMICOLON 

1088 ): 

1089 if self._current_token_is(TokenType.IDENTIFIER): 

1090 union_name = self._advance().value 

1091 break 

1092 self.pos += 1 

1093 

1094 return (start_pos, end_pos, union_name) 

1095 

1096 def _parse_typedef_union(self) -> Optional[Tuple[int, int, str]]: 

1097 """Parse typedef union definition""" 

1098 if not self._current_token_is(TokenType.TYPEDEF): 

1099 return None 

1100 

1101 start_pos = self.pos 

1102 self._advance() # Consumes 'typedef' 

1103 

1104 # Skip whitespace 

1105 while self.pos < len(self.tokens) and self._current_token_is( 

1106 TokenType.WHITESPACE 

1107 ): 

1108 self.pos += 1 

1109 

1110 # Check if next token is 'union' 

1111 if not self._current_token_is(TokenType.UNION): 

1112 return None 

1113 

1114 self._advance() # Consumes 'union' 

1115 

1116 # Skip whitespace 

1117 while self.pos < len(self.tokens) and self._current_token_is( 

1118 TokenType.WHITESPACE 

1119 ): 

1120 self.pos += 1 

1121 

1122 # Get union tag name (optional) 

1123 union_tag = "" 

1124 if self._current_token_is(TokenType.IDENTIFIER): 

1125 union_tag = self._advance().value 

1126 

1127 # Find opening brace 

1128 while self.pos < len(self.tokens) and not self._current_token_is( 

1129 TokenType.LBRACE 

1130 ): 

1131 self.pos += 1 

1132 

1133 if self.pos >= len(self.tokens): 

1134 return None 

1135 

1136 # Find matching closing brace 

1137 end_pos = self._find_matching_brace(self.pos) 

1138 if end_pos is None: 

1139 return None 

1140 

1141 # Look for typedef name after closing brace 

1142 typedef_name = "" 

1143 self.pos = end_pos + 1 

1144 while self.pos < len(self.tokens) and not self._current_token_is( 

1145 TokenType.SEMICOLON 

1146 ): 

1147 if self._current_token_is(TokenType.IDENTIFIER): 

1148 typedef_name = self._advance().value 

1149 break 

1150 self.pos += 1 

1151 

1152 return (start_pos, end_pos, typedef_name) 

1153 

1154 
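# Usage sketch (hypothetical helper, not part of this module's API): the
# finder methods each return (start_pos, end_pos, name) index ranges into the
# token list produced by CTokenizer; layout tokens are tolerated because the
# finder skips whitespace itself.
def _example_find_structures(source: str) -> List[Tuple[int, int, str]]:
    tokenizer = CTokenizer()
    tokens = tokenizer.filter_tokens(tokenizer.tokenize(source))
    finder = StructureFinder(tokens)
    return finder.find_structs() + finder.find_enums() + finder.find_unions()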

1155def extract_token_range(tokens: List[Token], start: int, end: int) -> str: 

1156 """Extract raw text from token range, excluding whitespace, comments, and newlines""" 

1157 if start >= len(tokens) or end >= len(tokens) or start > end: 

1158 return "" 

1159 return " ".join( 

1160 token.value 

1161 for token in tokens[start : end + 1] 

1162 if token.type 

1163 not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE] 

1164 ) 

1165 

1166 
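# Usage sketch (hypothetical helper, not part of this module's API):
# extract_token_range turns the (start, end) positions reported by
# StructureFinder back into a compact one-line signature.
def _example_struct_signatures(tokens: List[Token]) -> List[str]:
    finder = StructureFinder(tokens)
    return [
        f"{name}: {extract_token_range(tokens, start, end)}"
        for start, end, name in finder.find_structs()
    ]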

1167def find_struct_fields( 

1168 tokens: List[Token], struct_start: int, struct_end: int 

1169) -> List[Tuple[str, str]]: 

1170 """Extract field information from struct token range 

1171 Returns: 

1172 List of tuples (field_name, field_type) 

1173 """ 

1174 fields = [] 

1175 pos = struct_start 

1176 while pos <= struct_end and tokens[pos].type != TokenType.LBRACE: 

1177 pos += 1 

1178 if pos > struct_end: 

1179 return fields 

1180 pos += 1 # Skip opening brace 

1181 

1182 # Find the closing brace position of the main struct body 

1183 closing_brace_pos = pos 

1184 brace_count = 1 # Start at 1 because we're already past the opening brace 

1185 while closing_brace_pos <= struct_end: 

1186 if tokens[closing_brace_pos].type == TokenType.LBRACE: 

1187 brace_count += 1 

1188 elif tokens[closing_brace_pos].type == TokenType.RBRACE: 

1189 brace_count -= 1 

1190 if brace_count == 0: 

1191 # This is the closing brace of the main struct body 

1192 break 

1193 closing_brace_pos += 1 

1194 

1195 # Only parse fields up to the closing brace 

1196 while pos < closing_brace_pos and tokens[pos].type != TokenType.RBRACE: 

1197 field_tokens = [] 

1198 # Collect tokens until we find the semicolon that ends this field 

1199 # For nested structures, we need to handle braces properly 

1200 brace_count = 0 

1201 field_start_pos = pos 

1202 

1203 # First pass: collect tokens until we find the semicolon outside of braces 

1204 while pos < closing_brace_pos: 

1205 if tokens[pos].type == TokenType.LBRACE: 

1206 brace_count += 1 

1207 elif tokens[pos].type == TokenType.RBRACE: 

1208 brace_count -= 1 

1209 # Only stop if we're at the main closing brace 

1210 if pos == closing_brace_pos: 

1211 break 

1212 elif tokens[pos].type == TokenType.SEMICOLON and brace_count == 0: 

1213 # This is the semicolon that ends the field 

1214 break 

1215 

1216 if tokens[pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]: 

1217 field_tokens.append(tokens[pos]) 

1218 pos += 1 

1219 

1220 # For nested structures, we need to continue collecting tokens until we find the field name 

1221 # and the semicolon that ends the entire field 

1222 if (len(field_tokens) >= 3 and 

1223 field_tokens[0].type in [TokenType.STRUCT, TokenType.UNION] and 

1224 field_tokens[1].type == TokenType.LBRACE): 

1225 # This might be a nested structure, continue collecting until we find the field name 

1226 temp_pos = pos 

1227 brace_count = 0 # Track nested braces to find the correct field boundary 

1228 while temp_pos < len(tokens): 

1229 if tokens[temp_pos].type == TokenType.LBRACE: 

1230 brace_count += 1 

1231 elif tokens[temp_pos].type == TokenType.RBRACE: 

1232 brace_count -= 1 

1233 elif tokens[temp_pos].type == TokenType.SEMICOLON and brace_count == 0: 

1234 # Found the semicolon that ends the field (not inside nested braces) 

1235 break 

1236 

1237 if tokens[temp_pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]: 

1238 field_tokens.append(tokens[temp_pos]) 

1239 temp_pos += 1 

1240 pos = temp_pos 

1241 

1242 # Parse field from collected tokens 

1243 if len(field_tokens) >= 2: 

1244 # Check if this is a nested struct field 

1245 if ( 

1246 len(field_tokens) >= 3 

1247 and field_tokens[0].type == TokenType.STRUCT 

1248 and field_tokens[1].type == TokenType.LBRACE 

1249 ): 

1250 # This is a nested struct - find the field name after the closing brace 

1251 # Look for the pattern: struct { ... } field_name; 

1252 field_name = None 

1253 # Find the LAST closing brace and then the field name 

1254 # This handles deeply nested structures correctly 

1255 for i in range(len(field_tokens) - 1, -1, -1): 

1256 if field_tokens[i].type == TokenType.RBRACE and i + 1 < len(field_tokens): 

1257 # The field name should be the next identifier after the closing brace 

1258 for j in range(i + 1, len(field_tokens)): 

1259 if field_tokens[j].type == TokenType.IDENTIFIER: 

1260 field_name = field_tokens[j].value 

1261 break 

1262 if field_name: 

1263 break 

1264 

1265 if field_name: 

1266 # Extract the content between braces for the anonymous processor

1267 content = _extract_brace_content(field_tokens) 

1268 if content: 

1269 # Preserve content for anonymous processor using special format 

1270 import base64 

1271 encoded_content = base64.b64encode(content.encode()).decode() 

1272 field_type = f"struct {{ /*ANON:{encoded_content}:{field_name}*/ ... }}" 

1273 else: 

1274 field_type = "struct { ... }" 

1275 

1276 if field_name not in ["[", "]", ";", "}"]: 

1277 fields.append((field_name, field_type)) 

1278 # Skip parsing the nested struct's fields as separate fields 

1279 # Let the normal flow handle semicolon advancement 

1280 else: 

1281 # Anonymous nested struct without a field name 

1282 content = _extract_brace_content(field_tokens) 

1283 if content: 

1284 import base64 

1285 encoded_content = base64.b64encode(content.encode()).decode() 

1286 # Use generic field name for anonymous struct 

1287 generic_name = "__anonymous_struct__" 

1288 field_type = f"struct {{ /*ANON:{encoded_content}:{generic_name}*/ ... }}" 

1289 else: 

1290 generic_name = "__anonymous_struct__" 

1291 field_type = "struct { ... }" 

1292 fields.append((generic_name, field_type)) 

1293 # Check if this is a nested union field 

1294 elif ( 

1295 len(field_tokens) >= 3 

1296 and field_tokens[0].type == TokenType.UNION 

1297 and field_tokens[1].type == TokenType.LBRACE 

1298 ): 

1299 # This is a nested union - find the field name after the closing brace 

1300 # Look for the pattern: union { ... } field_name; 

1301 field_name = None 

1302 # Find the LAST closing brace and then the field name 

1303 # This handles deeply nested structures correctly 

1304 for i in range(len(field_tokens) - 1, -1, -1): 

1305 if field_tokens[i].type == TokenType.RBRACE and i + 1 < len(field_tokens): 

1306 # The field name should be the next identifier after the closing brace 

1307 for j in range(i + 1, len(field_tokens)): 

1308 if field_tokens[j].type == TokenType.IDENTIFIER: 

1309 field_name = field_tokens[j].value 

1310 break 

1311 if field_name: 

1312 break 

1313 

1314 if field_name: 

1315 # Extract the content between braces for anonymous processor 

1316 content = _extract_brace_content(field_tokens) 

1317 if content: 

1318 # Preserve content for anonymous processor using special format 

1319 import base64 

1320 encoded_content = base64.b64encode(content.encode()).decode() 

1321 field_type = f"union {{ /*ANON:{encoded_content}:{field_name}*/ ... }}" 

1322 else: 

1323 field_type = "union { ... }" 

1324 

1325 if field_name not in ["[", "]", ";", "}"]: 

1326 fields.append((field_name, field_type)) 

1327 # Skip parsing the nested union's fields as separate fields 

1328 # Let the normal flow handle semicolon advancement 

1329 else: 

1330 # Anonymous nested union without a field name 

1331 content = _extract_brace_content(field_tokens) 

1332 if content: 

1333 import base64 

1334 encoded_content = base64.b64encode(content.encode()).decode() 

1335 generic_name = "__anonymous_union__" 

1336 field_type = f"union {{ /*ANON:{encoded_content}:{generic_name}*/ ... }}" 

1337 else: 

1338 generic_name = "__anonymous_union__" 

1339 field_type = "union { ... }" 

1340 fields.append((generic_name, field_type)) 

1341 # Function pointer array field: type (*name[size])(params) 

1342 elif ( 

1343 len(field_tokens) >= 8 

1344 and field_tokens[1].type == TokenType.LPAREN 

1345 and field_tokens[2].type == TokenType.ASTERISK 

1346 and any(t.type == TokenType.LBRACKET for t in field_tokens) 

1347 and any(t.type == TokenType.RBRACKET for t in field_tokens) 

1348 ): 

1349 # Find the function pointer name (between * and [) 

1350 # Look for the identifier between * and [ 

1351 name_start = 3 # After the * 

1352 name_end = None 

1353 for i in range(name_start, len(field_tokens)): 

1354 if field_tokens[i].type == TokenType.LBRACKET: 

1355 name_end = i 

1356 break 

1357 

1358 if name_end is not None: 

1359 field_name = " ".join( 

1360 t.value for t in field_tokens[name_start:name_end] 

1361 ) 

1362 

1363 # Format the type properly - preserve spaces between tokens but not around brackets/parentheses 

1364 formatted_tokens = [] 

1365 for j, token in enumerate(field_tokens): 

1366 if token.type in [ 

1367 TokenType.LPAREN, 

1368 TokenType.RPAREN, 

1369 TokenType.LBRACKET, 

1370 TokenType.RBRACKET, 

1371 ]: 

1372 # Don't add spaces around brackets/parentheses 

1373 formatted_tokens.append(token.value) 

1374 elif j > 0 and field_tokens[j - 1].type not in [ 

1375 TokenType.LPAREN, 

1376 TokenType.RPAREN, 

1377 TokenType.LBRACKET, 

1378 TokenType.RBRACKET, 

1379 ]: 

1380 # Add space before token if previous token wasn't a bracket/parenthesis 

1381 formatted_tokens.append(" " + token.value) 

1382 else: 

1383 # No space before token 

1384 formatted_tokens.append(token.value) 

1385 field_type = "".join(formatted_tokens) 

1386 

1387 # Validate and add the field 

1388 if ( 

1389 field_name 

1390 and field_name.strip() 

1391 and field_type.strip() 

1392 and field_name not in ["[", "]", ";", "}"] 

1393 ): 

1394 stripped_name = field_name.strip() 

1395 stripped_type = field_type.strip() 

1396 if stripped_name and stripped_type: 

1397 fields.append((stripped_name, stripped_type)) 

1398 # Function pointer field: type (*name)(params) or type (*name[size])(params) 

1399 elif ( 

1400 len(field_tokens) >= 5 

1401 and field_tokens[1].type == TokenType.LPAREN and field_tokens[2].type == TokenType.ASTERISK 

1402 ): 

1403 # Find the opening parenthesis and asterisk pattern 

1404 func_ptr_start = None 

1405 for i in range(len(field_tokens) - 1): 

1406 if field_tokens[i].type == TokenType.LPAREN and field_tokens[i + 1].type == TokenType.ASTERISK: 

1407 func_ptr_start = i 

1408 break 

1409 

1410 if func_ptr_start is not None: 

1411 # Extract the type (everything before the opening parenthesis) 

1412 type_tokens = field_tokens[:func_ptr_start] 

1413 field_type = " ".join(t.value for t in type_tokens) 

1414 

1415 # Find the closing parenthesis after the function name 

1416 paren_count = 0 

1417 name_end = None 

1418 for i in range(func_ptr_start, len(field_tokens)): 

1419 if field_tokens[i].type == TokenType.LPAREN: 

1420 paren_count += 1 

1421 elif field_tokens[i].type == TokenType.RPAREN: 

1422 paren_count -= 1 

1423 if paren_count == 0 and i > func_ptr_start + 1: 

1424 name_end = i 

1425 break 

1426 

1427 if name_end is not None: 

1428 # Extract function name (between * and closing parenthesis) 

1429 name_tokens = field_tokens[func_ptr_start + 2:name_end] 

1430 field_name = " ".join(t.value for t in name_tokens) 

1431 

1432 # Extract the parameter list as part of the type 

1433 param_tokens = field_tokens[name_end + 1:] 

1434 param_type = " ".join(t.value for t in param_tokens) 

1435 

1436 # Combine type and parameter list (without the function name in the type) 

1437 # The function name is already extracted as field_name, so we don't include it in the type 

1438 func_ptr_start_tokens = field_tokens[func_ptr_start:func_ptr_start + 2] # ( * 

1439 func_ptr_end_tokens = field_tokens[name_end:name_end + 1] # ) 

1440 full_type = field_type + " " + " ".join(t.value for t in func_ptr_start_tokens) + " " + " ".join(t.value for t in func_ptr_end_tokens) + " " + param_type 

1441 

1442 if ( 

1443 field_name 

1444 and field_name.strip() 

1445 and full_type.strip() 

1446 and field_name not in ["[", "]", ";", "}"] 

1447 ): 

1448 stripped_name = field_name.strip() 

1449 stripped_type = full_type.strip() 

1450 if stripped_name and stripped_type: 

1451 fields.append((stripped_name, stripped_type)) 

1452 # Array field: type name [ size ] 

1453 elif ( 

1454 len(field_tokens) >= 4 

1455 and field_tokens[-3].type == TokenType.LBRACKET 

1456 and field_tokens[-1].type == TokenType.RBRACKET 

1457 ): 

1458 field_name = field_tokens[-4].value 

1459 # Format the array type properly - preserve spaces between the base-type tokens

1460 type_tokens = field_tokens[:-4] 

1461 field_type = " ".join(t.value for t in type_tokens) + "[" + field_tokens[-2].value + "]" 

1462 if ( 

1463 field_name 

1464 and field_name.strip() 

1465 and field_type.strip() 

1466 and field_name not in ["[", "]", ";", "}"] 

1467 ): 

1468 # Additional validation to ensure we don't have empty strings 

1469 stripped_name = field_name.strip() 

1470 stripped_type = field_type.strip() 

1471 if stripped_name and stripped_type: 

1472 fields.append((stripped_name, stripped_type)) 

1473 else: 

1474 # Regular field: type name 

1475 # Check if this field declaration contains commas (multiple fields of same type) 

1476 comma_positions = [] 

1477 paren_count = 0 

1478 brace_count = 0 

1479 

1480 # Find comma positions that are outside of parentheses and braces 

1481 for i, token in enumerate(field_tokens): 

1482 if token.type == TokenType.LPAREN: 

1483 paren_count += 1 

1484 elif token.type == TokenType.RPAREN: 

1485 paren_count -= 1 

1486 elif token.type == TokenType.LBRACE: 

1487 brace_count += 1 

1488 elif token.type == TokenType.RBRACE: 

1489 brace_count -= 1 

1490 elif token.type == TokenType.COMMA and paren_count == 0 and brace_count == 0: 

1491 comma_positions.append(i) 

1492 

1493 if comma_positions: 

1494 # Multiple fields of the same type: "int x, y, z;" 

1495 # Extract the type (everything before the first field name) 

1496 first_field_start = None 

1497 for i in range(len(field_tokens)): 

1498 if field_tokens[i].type == TokenType.IDENTIFIER: 

1499 first_field_start = i 

1500 break 

1501 

1502 if first_field_start is not None: 

1503 type_tokens = field_tokens[:first_field_start] 

1504 field_type = " ".join(t.value for t in type_tokens) 

1505 

1506 # Split fields on commas 

1507 field_starts = [first_field_start] + [pos + 1 for pos in comma_positions] 

1508 field_ends = comma_positions + [len(field_tokens)] 

1509 

1510 for start, end in zip(field_starts, field_ends): 

1511 if start < end: 

1512 field_name_tokens = field_tokens[start:end] 

1513 field_name = " ".join(t.value for t in field_name_tokens) 

1514 

1515 if ( 

1516 field_name 

1517 and field_name.strip() 

1518 and field_type.strip() 

1519 and field_name not in ["[", "]", ";", "}"] 

1520 ): 

1521 stripped_name = field_name.strip() 

1522 stripped_type = field_type.strip() 

1523 if stripped_name and stripped_type: 

1524 fields.append((stripped_name, stripped_type)) 

1525 else: 

1526 # Single field: type name 

1527 field_name = field_tokens[-1].value 

1528 field_type = " ".join(t.value for t in field_tokens[:-1]) 

1529 if ( 

1530 field_name not in ["[", "]", ";", "}"] 

1531 and field_name 

1532 and field_name.strip() 

1533 and field_type.strip() 

1534 ): 

1535 # Additional validation to ensure we don't have empty strings 

1536 stripped_name = field_name.strip() 

1537 stripped_type = field_type.strip() 

1538 if stripped_name and stripped_type: 

1539 fields.append((stripped_name, stripped_type)) 

1540 if pos < closing_brace_pos: 

1541 pos += 1 # Skip semicolon 

1542 return fields 

1543 

1544 
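# Usage sketch (hypothetical helper, not part of this module's API):
# find_struct_fields takes the token range reported by StructureFinder and
# yields (field_name, field_type) pairs, e.g. ("count", "int") for "int count;".
def _example_fields_of_first_struct(tokens: List[Token]) -> List[Tuple[str, str]]:
    structs = StructureFinder(tokens).find_structs()
    if not structs:
        return []
    start, end, _name = structs[0]
    return find_struct_fields(tokens, start, end)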

1545def find_enum_values(tokens: List[Token], enum_start: int, enum_end: int) -> List[str]: 

1546 """Extract enum values from enum token range""" 

1547 values = [] 

1548 pos = enum_start 

1549 while pos <= enum_end and tokens[pos].type != TokenType.LBRACE: 

1550 pos += 1 

1551 if pos > enum_end: 

1552 return values 

1553 pos += 1 # Skip opening brace 

1554 current_value = [] 

1555 while pos <= enum_end and tokens[pos].type != TokenType.RBRACE: 

1556 token = tokens[pos] 

1557 if token.type == TokenType.COMMA: 

1558 if current_value: 

1559 filtered_value = [ 

1560 t 

1561 for t in current_value 

1562 if t.type not in [TokenType.WHITESPACE, TokenType.COMMENT] 

1563 ] 

1564 if filtered_value: 

1565 value_str = " ".join(t.value for t in filtered_value).strip() 

1566 if value_str: 

1567 values.append(value_str) 

1568 current_value = [] 

1569 elif token.type not in [TokenType.WHITESPACE, TokenType.COMMENT]: 

1570 current_value.append(token) 

1571 pos += 1 

1572 if current_value: 

1573 filtered_value = [ 

1574 t 

1575 for t in current_value 

1576 if t.type not in [TokenType.WHITESPACE, TokenType.COMMENT] 

1577 ] 

1578 if filtered_value: 

1579 value_str = " ".join(t.value for t in filtered_value).strip() 

1580 if value_str: 

1581 values.append(value_str) 

1582 return values 

1583 

1584 
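# Usage sketch (hypothetical helper, not part of this module's API):
# find_enum_values keeps explicit initializers, so "enum { OK = 0, FAIL };"
# yields ["OK = 0", "FAIL"].
def _example_enum_values(tokens: List[Token]) -> List[List[str]]:
    finder = StructureFinder(tokens)
    return [
        find_enum_values(tokens, start, end)
        for start, end, _name in finder.find_enums()
    ]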

1585def _extract_brace_content(field_tokens: List[Token]) -> str: 

1586 """Extract the content between braces from field tokens. 

1587  

1588 Args: 

1589 field_tokens: List of tokens representing a field with anonymous structure 

1590  

1591 Returns: 

1592 String content between the braces, or empty string if not found 

1593 """ 

1594 content_tokens = [] 

1595 in_braces = False 

1596 brace_count = 0 

1597 

1598 for token in field_tokens: 

1599 if token.type == TokenType.LBRACE: 

1600 if not in_braces: 

1601 in_braces = True 

1602 brace_count = 1 

1603 else: 

1604 brace_count += 1 

1605 content_tokens.append(token) 

1606 elif token.type == TokenType.RBRACE: 

1607 if in_braces: 

1608 brace_count -= 1 

1609 if brace_count == 0: 

1610 # Found the closing brace 

1611 break 

1612 else: 

1613 content_tokens.append(token) 

1614 elif in_braces: 

1615 content_tokens.append(token) 

1616 

1617 # Convert tokens back to text preserving spacing 

1618 if content_tokens: 

1619 result = "" 

1620 for i, token in enumerate(content_tokens): 

1621 result += token.value 

1622 # Add space after most tokens except when next token is punctuation 

1623 if (i < len(content_tokens) - 1 and 

1624 token.type not in [TokenType.WHITESPACE, TokenType.NEWLINE] and 

1625 content_tokens[i + 1].type not in [TokenType.LBRACKET, TokenType.RBRACKET, 

1626 TokenType.SEMICOLON, TokenType.COMMA, 

1627 TokenType.WHITESPACE, TokenType.NEWLINE]): 

1628 result += " " 

1629 return result 

1630 return ""
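
# Usage sketch (hypothetical helper; the marker regex below is an assumption
# about the "/*ANON:<base64>:<name>*/" format emitted by find_struct_fields,
# not an API of this module): recover the body of an anonymous nested
# struct/union from the encoded field type.
def _example_decode_anon_marker(field_type: str) -> Optional[Tuple[str, str]]:
    import base64

    match = re.search(r"/\*ANON:([A-Za-z0-9+/=]+):([^*]+)\*/", field_type)
    if not match:
        return None
    body = base64.b64decode(match.group(1)).decode()
    return match.group(2), body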