Coverage for src/c2puml/core/parser_tokenizer.py: 87%
865 statements
coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
1#!/usr/bin/env python3
2"""
3Tokenizer module for C to PlantUML converter - Helper library for tokenizing C/C++ code
4"""
6import logging
7import re
8from dataclasses import dataclass
9from enum import Enum
10from typing import List, Optional, Tuple
13class TokenType(Enum):
14 """Token types for C/C++ lexical analysis"""
16 # Keywords
17 STRUCT = "STRUCT"
18 ENUM = "ENUM"
19 UNION = "UNION"
20 TYPEDEF = "TYPEDEF"
21 STATIC = "STATIC"
22 EXTERN = "EXTERN"
23 INLINE = "INLINE"
24 LOCAL_INLINE = "LOCAL_INLINE"
25 CONST = "CONST"
26 VOID = "VOID"
28 # Data types
29 CHAR = "CHAR"
30 INT = "INT"
31 FLOAT = "FLOAT"
32 DOUBLE = "DOUBLE"
33 LONG = "LONG"
34 SHORT = "SHORT"
35 UNSIGNED = "UNSIGNED"
36 SIGNED = "SIGNED"
38 # Operators and punctuation
39 LBRACE = "LBRACE" # {
40 RBRACE = "RBRACE" # }
41 LPAREN = "LPAREN" # (
42 RPAREN = "RPAREN" # )
43 LBRACKET = "LBRACKET" # [
44 RBRACKET = "RBRACKET" # ]
45 SEMICOLON = "SEMICOLON" # ;
46 COMMA = "COMMA" # ,
47 ASSIGN = "ASSIGN" # =
48 ASTERISK = "ASTERISK" # *
49 AMPERSAND = "AMPERSAND" # &
50 ARROW = "ARROW" # ->
52 # Literals and identifiers
53 IDENTIFIER = "IDENTIFIER"
54 NUMBER = "NUMBER"
55 STRING = "STRING"
56 CHAR_LITERAL = "CHAR_LITERAL"
58 # Preprocessor
59 INCLUDE = "INCLUDE"
60 DEFINE = "DEFINE"
61 PREPROCESSOR = "PREPROCESSOR"
63 # Special
64 COMMENT = "COMMENT"
65 WHITESPACE = "WHITESPACE"
66 NEWLINE = "NEWLINE"
67 EOF = "EOF"
68 UNKNOWN = "UNKNOWN"
71@dataclass
72class Token:
73 """Represents a single token in C/C++ code"""
75 type: TokenType
76 value: str
77 line: int
78 column: int
80 def __repr__(self) -> str:
81 return f"Token({self.type.name}, '{self.value}', {self.line}:{self.column})"
84class CTokenizer:
85 """Tokenizer for C/C++ source code"""
87 # Keywords mapping
88 KEYWORDS = {
89 "struct": TokenType.STRUCT,
90 "enum": TokenType.ENUM,
91 "union": TokenType.UNION,
92 "typedef": TokenType.TYPEDEF,
93 "static": TokenType.STATIC,
94 "extern": TokenType.EXTERN,
95 "inline": TokenType.INLINE,
96 "local_inline": TokenType.LOCAL_INLINE,
97 "const": TokenType.CONST,
98 "void": TokenType.VOID,
99 "char": TokenType.CHAR,
100 "int": TokenType.INT,
101 "float": TokenType.FLOAT,
102 "double": TokenType.DOUBLE,
103 "long": TokenType.LONG,
104 "short": TokenType.SHORT,
105 "unsigned": TokenType.UNSIGNED,
106 "signed": TokenType.SIGNED,
107 }
109 # Single character tokens
110 SINGLE_CHAR_TOKENS = {
111 "{": TokenType.LBRACE,
112 "}": TokenType.RBRACE,
113 "(": TokenType.LPAREN,
114 ")": TokenType.RPAREN,
115 "[": TokenType.LBRACKET,
116 "]": TokenType.RBRACKET,
117 ";": TokenType.SEMICOLON,
118 ",": TokenType.COMMA,
119 "=": TokenType.ASSIGN,
120 "*": TokenType.ASTERISK,
121 "&": TokenType.AMPERSAND,
122 }
124 # Two character tokens
125 TWO_CHAR_TOKENS = {
126 "->": TokenType.ARROW,
127 }
129 def __init__(self):
130 self.logger = logging.getLogger(__name__)
132 # Compiled regex patterns for efficiency
133 self.patterns = {
134 "identifier": re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*"),
135 "number": re.compile(
136 r"0[xX][0-9a-fA-F]+[uUlL]*|0[bB][01]+[uUlL]*|0[0-7]+[uUlL]*|"
137 r"\d+\.\d*([eE][+-]?\d+)?[fFlL]*|\d+([eE][+-]?\d+)?[fFlL]*|\d+[uUlL]*"
138 ),
139 "string": re.compile(r'"([^"\\]|\\.)*"'),
140 "char": re.compile(r"'([^'\\]|\\.)'"),
141 "comment_single": re.compile(r"//.*"),
142 "comment_multi": re.compile(r"/\*.*?\*/", re.DOTALL),
143 "preprocessor": re.compile(
144 r"#(include|define|ifdef|ifndef|if|endif|elif|else|pragma|error|warning)\b.*"
145 ),
146 "whitespace": re.compile(r"[ \t]+"),
147 "newline": re.compile(r"\n"),
148 }
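# Illustrative examples (not part of the original module) of inputs the
# compiled patterns above are intended to accept:
#   number:        0x1Fu, 0b1010, 0755L, 3.14f, 1e-9, 42L
#   string:        "hello\nworld"
#   char:          'a', '\n'
#   preprocessor:  #include <stdio.h>, #define MAX 10, #pragma once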
150 def tokenize(self, content: str) -> List[Token]:
151 """Tokenize C/C++ source code content"""
152 tokens = []
153 lines = content.splitlines()
154 total_lines = len(lines)
155 line_num = 1
156 in_multiline_string = False
157 multiline_string_value = ""
158 multiline_string_start_line = 0
159 multiline_string_start_col = 0
160 in_multiline_comment = False
161 multiline_comment_value = ""
162 multiline_comment_start_line = 0
163 multiline_comment_start_col = 0
165 for idx, line in enumerate(lines):
166 if in_multiline_string:
167 multiline_string_value += "\n" + line
168 if '"' in line:
169 # End of multiline string
170 in_multiline_string = False
171 tokens.append(
172 Token(
173 TokenType.STRING,
174 multiline_string_value,
175 multiline_string_start_line,
176 multiline_string_start_col,
177 )
178 )
179 elif in_multiline_comment:
180 # Continue multi-line comment
181 multiline_comment_value += "\n" + line
182 comment_end = line.find("*/")
183 if comment_end != -1:
184 # End of multi-line comment
185 in_multiline_comment = False
186 multiline_comment_value = multiline_comment_value[
187 : multiline_comment_value.rfind("*/") + 2
188 ]
189 tokens.append(
190 Token(
191 TokenType.COMMENT,
192 multiline_comment_value,
193 multiline_comment_start_line,
194 multiline_comment_start_col,
195 )
196 )
197 else:
198 line_tokens = self._tokenize_line(line, line_num)
199 # Check if a string starts but does not end on this line
200 if (
201 line_tokens
202 and line_tokens[-1].type == TokenType.STRING
203 and not line_tokens[-1].value.endswith('"')
204 ):
205 in_multiline_string = True
206 multiline_string_value = line_tokens[-1].value
207 multiline_string_start_line = line_tokens[-1].line
208 multiline_string_start_col = line_tokens[-1].column
209 tokens.extend(line_tokens[:-1])
210 # Check if a multi-line comment starts but does not end on this line
211 elif (
212 line_tokens
213 and line_tokens[-1].type == TokenType.COMMENT
214 and line_tokens[-1].value.startswith("/*")
215 and not line_tokens[-1].value.endswith("*/")
216 ):
217 in_multiline_comment = True
218 multiline_comment_value = line_tokens[-1].value
219 multiline_comment_start_line = line_tokens[-1].line
220 multiline_comment_start_col = line_tokens[-1].column
221 tokens.extend(line_tokens[:-1])
222 else:
223 tokens.extend(line_tokens)
225 if line_num < total_lines:
226 tokens.append(Token(TokenType.NEWLINE, "\n", line_num, len(line)))
227 line_num += 1
229 if in_multiline_string:
230 tokens.append(
231 Token(
232 TokenType.STRING,
233 multiline_string_value,
234 multiline_string_start_line,
235 multiline_string_start_col,
236 )
237 )
238 if in_multiline_comment:
239 tokens.append(
240 Token(
241 TokenType.COMMENT,
242 multiline_comment_value,
243 multiline_comment_start_line,
244 multiline_comment_start_col,
245 )
246 )
248 # Post-process tokens to merge multi-line macros
249 tokens = self._merge_multiline_macros(tokens, lines)
251 tokens.append(
252 Token(TokenType.EOF, "", total_lines, len(lines[-1]) if lines else 0)
253 )
255 return tokens
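# Usage sketch (illustrative, not part of the original module): tokenize a
# one-line snippet and drop layout tokens with filter_tokens() defined below.
#
#     tokenizer = CTokenizer()
#     toks = tokenizer.tokenize("int x = 42; // answer")
#     [t.type.name for t in tokenizer.filter_tokens(toks)]
#     # expected: ['INT', 'IDENTIFIER', 'ASSIGN', 'NUMBER', 'SEMICOLON']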
257 def _tokenize_line(self, line: str, line_num: int) -> List[Token]:
258 """Tokenize a single line of code"""
259 tokens = []
260 pos = 0
262 while pos < len(line):
263 # Skip whitespace but track it
264 if match := self.patterns["whitespace"].match(line, pos):
265 tokens.append(Token(TokenType.WHITESPACE, match.group(), line_num, pos))
266 pos = match.end()
267 continue
269 # Comments
270 if match := self.patterns["comment_single"].match(line, pos):
271 tokens.append(Token(TokenType.COMMENT, match.group(), line_num, pos))
272 pos = len(line) # Rest of line is comment
273 continue
275 # Multi-line comments - check for /* at the current position
276 if line[pos:].startswith("/*"):
277 # Find the end of the comment
278 comment_end = line.find("*/", pos)
279 if comment_end != -1:
280 # Comment ends on this line
281 comment_text = line[pos : comment_end + 2]
282 tokens.append(Token(TokenType.COMMENT, comment_text, line_num, pos))
283 pos = comment_end + 2
284 continue
285 else:
286 # Comment continues to next line - create a partial comment token
287 comment_text = line[pos:]
288 tokens.append(Token(TokenType.COMMENT, comment_text, line_num, pos))
289 pos = len(line)
290 continue
292 # Preprocessor directives
293 if match := self.patterns["preprocessor"].match(line, pos):
294 value = match.group()
295 if value.startswith("#include"):
296 tokens.append(Token(TokenType.INCLUDE, value, line_num, pos))
297 elif value.startswith("#define"):
298 tokens.append(Token(TokenType.DEFINE, value, line_num, pos))
299 else:
300 tokens.append(Token(TokenType.PREPROCESSOR, value, line_num, pos))
301 pos = len(line) # Rest of line is preprocessor
302 continue
304 # String literals
305 if (
306 line[pos] == '"'
307 or (
308 pos > 0
309 and line[pos - 1] in ["L", "u", "U", "R"]
310 and line[pos] == '"'
311 )
312 or (pos > 1 and line[pos - 2 : pos] == "u8" and line[pos] == '"')
313 ):
314 # Handle string literals with possible prefixes
315 string_start = pos
316 if line[pos - 2 : pos] == "u8":
317 string_start -= 2
318 elif line[pos - 1] in ["L", "u", "U", "R"]:
319 string_start -= 1
320 pos += 1 # Skip opening quote
321 while pos < len(line):
322 if line[pos] == '"':
323 # Found closing quote
324 string_text = line[string_start : pos + 1]
325 tokens.append(
326 Token(TokenType.STRING, string_text, line_num, string_start)
327 )
328 pos += 1
329 break
330 elif line[pos] == "\\":
331 pos += 2
332 else:
333 pos += 1
334 else:
335 string_text = line[string_start:]
336 tokens.append(
337 Token(TokenType.STRING, string_text, line_num, string_start)
338 )
339 pos = len(line)
340 continue
342 # Character literals
343 if match := self.patterns["char"].match(line, pos):
344 tokens.append(
345 Token(TokenType.CHAR_LITERAL, match.group(), line_num, pos)
346 )
347 pos = match.end()
348 continue
350 # Numbers
351 if match := self.patterns["number"].match(line, pos):
352 tokens.append(Token(TokenType.NUMBER, match.group(), line_num, pos))
353 pos = match.end()
354 continue
356 # Single character tokens
357 if line[pos] in self.SINGLE_CHAR_TOKENS:
358 token_type = self.SINGLE_CHAR_TOKENS[line[pos]]
359 tokens.append(Token(token_type, line[pos], line_num, pos))
360 pos += 1
361 continue
363 # Multi-character operators (<<, >>, ->)
364 if line[pos : pos + 2] in ["<<", ">>", "->"]:
365 op = line[pos : pos + 2]
366 if op == "->":
367 tokens.append(Token(TokenType.ARROW, op, line_num, pos))
368 else:
# NOTE: TokenType defines no OPERATOR member, so << and >> fall back to UNKNOWN
369 tokens.append(
370 Token(
371 (
372 TokenType.OPERATOR
373 if hasattr(TokenType, "OPERATOR")
374 else TokenType.UNKNOWN
375 ),
376 op,
377 line_num,
378 pos,
379 )
380 )
381 pos += 2
382 continue
384 # Identifiers and keywords
385 if match := self.patterns["identifier"].match(line, pos):
386 value = match.group()
387 token_type = self.KEYWORDS.get(value.lower(), TokenType.IDENTIFIER)
388 tokens.append(Token(token_type, value, line_num, pos))
389 pos = match.end()
390 continue
392 # Unknown character (always one at a time)
393 tokens.append(Token(TokenType.UNKNOWN, line[pos], line_num, pos))
394 pos += 1
396 return tokens
398 def filter_tokens(
399 self, tokens: List[Token], exclude_types: Optional[List[TokenType]] = None
400 ) -> List[Token]:
401 """Filter tokens by type"""
402 if exclude_types is None:
403 exclude_types = [
404 TokenType.WHITESPACE,
405 TokenType.COMMENT,
406 TokenType.NEWLINE,
407 TokenType.EOF,
408 ]
410 return [token for token in tokens if token.type not in exclude_types]
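# Illustrative example (assumes `toks` is the output of tokenize()): keep
# comments for documentation extraction while still dropping layout tokens.
#
#     code_and_comments = tokenizer.filter_tokens(
#         toks,
#         exclude_types=[TokenType.WHITESPACE, TokenType.NEWLINE, TokenType.EOF],
#     )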
412 def _merge_multiline_macros(
413 self, tokens: List[Token], lines: List[str]
414 ) -> List[Token]:
415 """Merge multi-line macro tokens that span multiple lines with backslashes"""
416 merged_tokens = []
417 i = 0
419 while i < len(tokens):
420 token = tokens[i]
422 if token.type == TokenType.DEFINE and token.value.rstrip().endswith("\\"):
423 # Found a multi-line macro, merge with subsequent lines
424 macro_content = token.value
425 current_line = token.line
427 # Continue merging lines until we find one that doesn't end with backslash
428 while macro_content.rstrip().endswith("\\"):
429 # Remove the backslash and add a newline
430 macro_content = macro_content.rstrip()[:-1] + "\n"
431 current_line += 1
433 # Find the next line content
434 if current_line <= len(lines):
435 next_line = lines[current_line - 1]
436 macro_content += next_line
437 else:
438 break
440 # Create a new token with the merged content
441 merged_tokens.append(
442 Token(TokenType.DEFINE, macro_content, token.line, token.column)
443 )
444 else:
445 merged_tokens.append(token)
447 i += 1
449 return merged_tokens
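# Example of the merge performed above (illustrative): for the macro
#
#     #define SQUARE(x) \
#         ((x) * (x))
#
# the DEFINE token's value becomes the full two-line text, with the trailing
# backslash replaced by a newline; the continuation line's own tokens are left
# in the stream untouched.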
452class StructureFinder:
453 """Helper class to find C/C++ structures in token streams"""
455 def __init__(self, tokens: List[Token]):
456 self.tokens = tokens
457 self.pos = 0
458 self.logger = logging.getLogger(__name__)
460 def find_structs(self) -> List[Tuple[int, int, str]]:
461 """Find struct definitions in token stream
463 Returns:
464 List of tuples (start_pos, end_pos, struct_name)
465 """
466 structs = []
467 self.pos = 0
469 while self.pos < len(self.tokens):
470 if self._current_token_is(TokenType.STRUCT):
471 struct_info = self._parse_struct()
472 if struct_info:
473 structs.append(struct_info)
474 elif self._current_token_is(TokenType.TYPEDEF):
475 typedef_struct = self._parse_typedef_struct()
476 if typedef_struct:
477 structs.append(typedef_struct)
478 else:
479 self.pos += 1
481 return structs
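# Usage sketch (illustrative): the reported positions index into the token
# list given to the constructor, so the matched text can be recovered with
# extract_token_range() defined at module level.
#
#     finder = StructureFinder(CTokenizer().tokenize(source))
#     for start, end, name in finder.find_structs():
#         print(name, extract_token_range(finder.tokens, start, end))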
483 def find_enums(self) -> List[Tuple[int, int, str]]:
484 """Find enum definitions in token stream"""
485 enums = []
486 self.pos = 0
488 while self.pos < len(self.tokens):
489 if self._current_token_is(TokenType.ENUM):
490 enum_info = self._parse_enum()
491 if enum_info:
492 enums.append(enum_info)
493 elif self._current_token_is(TokenType.TYPEDEF):
494 typedef_enum = self._parse_typedef_enum()
495 if typedef_enum:
496 enums.append(typedef_enum)
497 else:
498 self.pos += 1
500 return enums
502 def find_functions(self) -> List[Tuple[int, int, str, str, bool, bool]]:
503 """Find all function declarations and definitions in the token stream
505 Returns:
506 List of tuples (start_pos, end_pos, func_name, return_type, is_declaration, is_inline)
507 """
508 functions = []
509 self.pos = 0
511 while self.pos < len(self.tokens):
512 result = self._parse_function()
513 if result:
514 functions.append(result)
516 return functions
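# Illustrative example: for
#     static inline int add(int a, int b) { return a + b; }
# a single tuple is expected, with func_name "add", return_type
# "static inline int", is_declaration False and is_inline True.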
518 def find_unions(self) -> List[Tuple[int, int, str]]:
519 """Find union definitions in token stream"""
520 unions = []
521 self.pos = 0
523 while self.pos < len(self.tokens):
524 if self._current_token_is(TokenType.UNION):
525 union_info = self._parse_union()
526 if union_info:
527 unions.append(union_info)
528 elif self._current_token_is(TokenType.TYPEDEF):
529 typedef_union = self._parse_typedef_union()
530 if typedef_union:
531 unions.append(typedef_union)
532 else:
533 self.pos += 1
535 return unions
537 def _current_token_is(self, token_type: TokenType) -> bool:
538 """Check if current token is of specified type"""
539 return self.pos < len(self.tokens) and self.tokens[self.pos].type == token_type
541 def _peek_token(self, offset: int = 1) -> Optional[Token]:
542 """Peek at token at current position + offset"""
543 peek_pos = self.pos + offset
544 return self.tokens[peek_pos] if peek_pos < len(self.tokens) else None
546 def _advance(self) -> Optional[Token]:
547 """Advance to next token and return current"""
548 if self.pos < len(self.tokens):
549 token = self.tokens[self.pos]
550 self.pos += 1
551 return token
552 return None
554 def _find_matching_brace(self, start_pos: int) -> Optional[int]:
555 """Find matching closing brace starting from open brace position"""
556 if (
557 start_pos >= len(self.tokens)
558 or self.tokens[start_pos].type != TokenType.LBRACE
559 ):
560 return None
562 depth = 1
563 pos = start_pos + 1
565 while pos < len(self.tokens) and depth > 0:
566 if self.tokens[pos].type == TokenType.LBRACE:
567 depth += 1
568 elif self.tokens[pos].type == TokenType.RBRACE:
569 depth -= 1
570 pos += 1
572 return pos - 1 if depth == 0 else None
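# Illustrative note: braces are matched by depth counting, so for the tokens of
# "{ { a; } b; }" a call starting at the first LBRACE returns the index of the
# final RBRACE, skipping the inner pair.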
574 def _parse_struct(self) -> Optional[Tuple[int, int, str]]:
575 """Parse struct definition starting at current position"""
576 start_pos = self.pos
578 # Consume 'struct' keyword
579 if not self._current_token_is(TokenType.STRUCT):
580 return None
581 self._advance()
583 # Check if this struct is inside a cast expression by looking backwards
584 check_pos = start_pos - 1
585 while check_pos >= 0:
586 if self.tokens[check_pos].type == TokenType.LPAREN:
587 # Found opening parenthesis before struct - this is likely a cast expression
588 return None
589 elif self.tokens[check_pos].type in [TokenType.STRUCT, TokenType.TYPEDEF]:
590 # Found another struct or typedef - this is not a cast expression
591 break
592 elif self.tokens[check_pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
593 # Found some other token - this is not a cast expression
594 break
595 check_pos -= 1
597 # Skip whitespace
598 while self.pos < len(self.tokens) and self._current_token_is(
599 TokenType.WHITESPACE
600 ):
601 self.pos += 1
603 # Check if this is a cast expression: (struct type*)
604 if self._current_token_is(TokenType.LPAREN):
605 # Look ahead to see if this is a cast expression
606 check_pos = self.pos + 1
607 while check_pos < len(self.tokens):
608 if self.tokens[check_pos].type == TokenType.RPAREN:
609 # Found closing parenthesis - this is likely a cast expression
610 return None
611 elif self.tokens[check_pos].type == TokenType.LBRACE:
612 # Found opening brace - this is a struct definition
613 break
614 elif self.tokens[check_pos].type == TokenType.SEMICOLON:
615 # Found semicolon - this is a variable declaration
616 return None
617 check_pos += 1
619 # Get struct tag name (optional for anonymous structs)
620 struct_tag = ""
621 if self._current_token_is(TokenType.IDENTIFIER):
622 struct_tag = self._advance().value
624 # Look for opening brace or semicolon
625 while self.pos < len(self.tokens):
626 if self._current_token_is(TokenType.LBRACE):
627 # Found opening brace - this is a struct definition
628 break
629 elif self._current_token_is(TokenType.SEMICOLON):
630 # Found semicolon before opening brace - this is a variable declaration
631 return None
632 self.pos += 1
634 if not self._current_token_is(TokenType.LBRACE):
635 # This is a variable declaration
636 return None
638 # Find matching closing brace
639 brace_pos = self.pos
640 end_brace_pos = self._find_matching_brace(brace_pos)
642 if end_brace_pos is None:
643 return None
645 # Look for struct name after closing brace
646 name_pos = end_brace_pos + 1
647 struct_name = struct_tag # Default to tag name
649 # Check if this is a typedef struct by looking backwards
650 is_typedef = False
651 check_pos = start_pos - 1
652 while check_pos >= 0:
653 if self.tokens[check_pos].type == TokenType.TYPEDEF:
654 is_typedef = True
655 break
656 elif self.tokens[check_pos].type in [
657 TokenType.STRUCT,
658 TokenType.LBRACE,
659 TokenType.RBRACE,
660 ]:
661 break
662 check_pos -= 1
664 if is_typedef:
665 # For typedef struct, look for the typedef name after the closing brace
666 while name_pos < len(self.tokens):
667 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
668 struct_name = self.tokens[name_pos].value
669 break
670 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
671 break
672 name_pos += 1
673 else:
674 # Check if there's a variable name after the brace
675 while name_pos < len(self.tokens):
676 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
677 # This is a variable name
678 struct_name = ""
679 break
680 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
681 break
682 name_pos += 1
684 # Find semicolon (for struct definitions)
685 self.pos = end_brace_pos + 1
686 while self.pos < len(self.tokens) and not self._current_token_is(
687 TokenType.SEMICOLON
688 ):
689 self.pos += 1
691 end_pos = self.pos
692 return (start_pos, end_pos, struct_name)
694 def _parse_typedef_struct(self) -> Optional[Tuple[int, int, str]]:
695 """Parse typedef struct definition"""
696 start_pos = self.pos
698 # Consume 'typedef'
699 if not self._current_token_is(TokenType.TYPEDEF):
700 return None
701 self._advance()
703 # Look for 'struct'
704 if not self._current_token_is(TokenType.STRUCT):
705 # Not a typedef struct, reset position
706 self.pos = start_pos + 1
707 return None
709 # Skip 'struct'
710 self._advance()
712 # Skip whitespace
713 while self.pos < len(self.tokens) and self._current_token_is(
714 TokenType.WHITESPACE
715 ):
716 self.pos += 1
718 # Get struct tag name (optional)
719 struct_tag = ""
720 if self._current_token_is(TokenType.IDENTIFIER):
721 struct_tag = self._advance().value
723 # Skip whitespace
724 while self.pos < len(self.tokens) and self._current_token_is(
725 TokenType.WHITESPACE
726 ):
727 self.pos += 1
729 # Check if this is a forward declaration (no braces)
730 if not self._current_token_is(TokenType.LBRACE):
731 # This is a forward declaration, skip it
732 self.pos = start_pos + 1
733 return None
735 # Find matching closing brace
736 end_brace_pos = self._find_matching_brace(self.pos)
737 if end_brace_pos is None:
738 self.pos = start_pos + 1
739 return None
741 # Look for typedef name after closing brace
742 typedef_name = ""
743 name_pos = end_brace_pos + 1
744 while name_pos < len(self.tokens):
745 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
746 typedef_name = self.tokens[name_pos].value
747 break
748 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
749 break
750 name_pos += 1
752 # Find semicolon
753 while (
754 name_pos < len(self.tokens)
755 and not self.tokens[name_pos].type == TokenType.SEMICOLON
756 ):
757 name_pos += 1
759 end_pos = name_pos
760 return (start_pos, end_pos, typedef_name)
762 def _parse_enum(self) -> Optional[Tuple[int, int, str]]:
763 """Parse enum definition starting at current position"""
764 start_pos = self.pos
766 # Consume 'enum' keyword
767 if not self._current_token_is(TokenType.ENUM):
768 return None
769 self._advance()
771 # Skip whitespace
772 while self.pos < len(self.tokens) and self._current_token_is(
773 TokenType.WHITESPACE
774 ):
775 self.pos += 1
777 # Get enum tag name (optional for anonymous enums)
778 enum_tag = ""
779 if self._current_token_is(TokenType.IDENTIFIER):
780 enum_tag = self._advance().value
782 # Find opening brace
783 while self.pos < len(self.tokens) and not self._current_token_is(
784 TokenType.LBRACE
785 ):
786 self.pos += 1
788 if not self._current_token_is(TokenType.LBRACE):
789 return None
791 # Find matching closing brace
792 brace_pos = self.pos
793 end_brace_pos = self._find_matching_brace(brace_pos)
795 if end_brace_pos is None:
796 return None
798 # Look for enum name after closing brace
799 name_pos = end_brace_pos + 1
800 enum_name = enum_tag # Default to tag name
802 # Check if this is a typedef enum by looking backwards
803 is_typedef = False
804 check_pos = start_pos - 1
805 while check_pos >= 0:
806 if self.tokens[check_pos].type == TokenType.TYPEDEF:
807 is_typedef = True
808 break
809 elif self.tokens[check_pos].type in [
810 TokenType.ENUM,
811 TokenType.LBRACE,
812 TokenType.RBRACE,
813 ]:
814 break
815 check_pos -= 1
817 if is_typedef:
818 # For typedef enum, look for the typedef name after the closing brace
819 while name_pos < len(self.tokens):
820 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
821 enum_name = self.tokens[name_pos].value
822 break
823 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
824 break
825 name_pos += 1
826 elif not enum_tag:
827 # Anonymous enum - check if there's a variable name after the brace
828 while name_pos < len(self.tokens):
829 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
830 # This is a variable name
831 enum_name = ""
832 break
833 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
834 break
835 name_pos += 1
837 # Find semicolon
838 self.pos = end_brace_pos + 1
839 while self.pos < len(self.tokens) and not self._current_token_is(
840 TokenType.SEMICOLON
841 ):
842 self.pos += 1
844 end_pos = self.pos
845 return (start_pos, end_pos, enum_name)
847 def _parse_typedef_enum(self) -> Optional[Tuple[int, int, str]]:
848 """Parse typedef enum definition"""
849 start_pos = self.pos
851 # Consume 'typedef'
852 if not self._current_token_is(TokenType.TYPEDEF):
853 return None
854 self._advance()
856 # Look for 'enum'
857 if not self._current_token_is(TokenType.ENUM):
858 # Not a typedef enum, reset position
859 self.pos = start_pos + 1
860 return None
862 # Parse the enum part - this will return the tag name (e.g., StatusEnum_tag)
863 enum_info = self._parse_enum()
864 if not enum_info:
865 self.pos = start_pos + 1
866 return None
868 # For typedef enums, we want to return the tag name, not the typedef name
869 # The typedef name will be handled separately in the parser
870 return enum_info
872 def _parse_function(self) -> Optional[Tuple[int, int, str, str, bool, bool]]:
873 """Parse function declaration/definition
875 Returns:
876 Tuple of (start_pos, end_pos, func_name, return_type, is_declaration, is_inline)
877 """
878 start_pos = self.pos
880 # Look for function pattern: [modifiers] return_type function_name (params)
881 while self.pos < len(self.tokens):
882 token = self.tokens[self.pos]
884 # If we hit a parenthesis, check if this is a function
885 if token.type == TokenType.LPAREN:
886 # Look backwards for function name
887 if (
888 self.pos > 0
889 and self.tokens[self.pos - 1].type == TokenType.IDENTIFIER
890 ):
891 func_name = self.tokens[self.pos - 1].value
892 func_name_pos = self.pos - 1
894 # Look backwards from function name to find return type
895 # Start from just before the function name
896 return_type_end = func_name_pos - 1
897 return_type_start = return_type_end
899 # Skip backwards over whitespace and comments
900 while return_type_start >= 0:
901 token_type = self.tokens[return_type_start].type
902 if token_type in [
903 TokenType.WHITESPACE,
904 TokenType.COMMENT,
905 TokenType.NEWLINE,
906 ]:
907 return_type_start -= 1
908 else:
909 break
911 # If we found a non-whitespace token, that's the end of the return type
912 # Find the start by looking backwards from there
913 if return_type_start >= 0:
914 return_type_end = return_type_start
915 return_type_start = return_type_end
919 # Collect all tokens that are part of the return type (including storage-class and inline modifiers)
920 return_type_tokens = []
922 # Look back at most 10 tokens to capture multi-token return types
923 max_lookback = max(0, func_name_pos - 10)
924 current_pos = return_type_start
926 # Collect tokens backwards until we hit a limit or non-return-type token
927 while current_pos >= max_lookback:
928 token_type = self.tokens[current_pos].type
929 if token_type in [
930 TokenType.IDENTIFIER,
931 TokenType.INT,
932 TokenType.VOID,
933 TokenType.CHAR,
934 TokenType.FLOAT,
935 TokenType.DOUBLE,
936 TokenType.LONG,
937 TokenType.SHORT,
938 TokenType.UNSIGNED,
939 TokenType.SIGNED,
940 TokenType.ASTERISK,
941 TokenType.CONST,
942 TokenType.STATIC,
943 TokenType.EXTERN,
944 TokenType.INLINE,
945 TokenType.LOCAL_INLINE,
946 ]:
947 return_type_tokens.insert(0, self.tokens[current_pos])
948 current_pos -= 1
949 elif token_type in [
950 TokenType.WHITESPACE,
951 TokenType.COMMENT,
952 TokenType.NEWLINE,
953 ]:
954 # Skip whitespace and continue looking
955 current_pos -= 1
956 else:
957 break
959 # Extract return type
960 if return_type_tokens:
961 return_type = " ".join(
962 t.value for t in return_type_tokens
963 ).strip()
965 # Check if function is inline
966 is_inline = any(
967 token.type in [TokenType.INLINE, TokenType.LOCAL_INLINE]
968 for token in return_type_tokens
969 )
971 # Find end of function (either ; for declaration or { for definition)
972 end_pos = self._find_function_end(self.pos)
973 if end_pos:
974 # Determine if this is a declaration or definition
975 is_declaration = self._is_function_declaration(end_pos)
976 self.pos = end_pos + 1
977 return (
978 start_pos,
979 end_pos,
980 func_name,
981 return_type,
982 is_declaration,
983 is_inline,
984 )
986 self.pos += 1
988 # Prevent infinite loops - if we've gone too far, this isn't a function
989 if self.pos - start_pos > 50:
990 break
992 # Reset position if no function found
993 self.pos = start_pos + 1
994 return None
996 def _is_function_declaration(self, end_pos: int) -> bool:
997 """Check if the function at end_pos is a declaration (ends with ;) or definition (ends with })"""
998 if end_pos >= len(self.tokens):
999 return False
1001 # Look backwards from end_pos to find the last significant token
1002 pos = end_pos
1003 while pos >= 0:
1004 token_type = self.tokens[pos].type
1005 if token_type not in [
1006 TokenType.WHITESPACE,
1007 TokenType.COMMENT,
1008 TokenType.NEWLINE,
1009 ]:
1010 return token_type == TokenType.SEMICOLON
1011 pos -= 1
1013 return False
1015 def _find_function_end(self, start_pos: int) -> Optional[int]:
1016 """Find end of function declaration or definition"""
1017 pos = start_pos
1019 # Find matching closing parenthesis
1020 if pos >= len(self.tokens) or self.tokens[pos].type != TokenType.LPAREN:
1021 return None
1023 depth = 1
1024 pos += 1
1026 while pos < len(self.tokens) and depth > 0:
1027 if self.tokens[pos].type == TokenType.LPAREN:
1028 depth += 1
1029 elif self.tokens[pos].type == TokenType.RPAREN:
1030 depth -= 1
1031 pos += 1
1033 if depth > 0:
1034 return None
1036 # Look for either ; (declaration) or { (definition)
1037 while pos < len(self.tokens):
1038 if self.tokens[pos].type == TokenType.SEMICOLON:
1039 return pos
1040 elif self.tokens[pos].type == TokenType.LBRACE:
1041 # Function definition - find matching brace
1042 end_brace = self._find_matching_brace(pos)
1043 return end_brace if end_brace is not None else pos
1044 pos += 1
1046 return None
1048 def _parse_union(self) -> Optional[Tuple[int, int, str]]:
1049 """Parse union definition"""
1050 if not self._current_token_is(TokenType.UNION):
1051 return None
1053 start_pos = self.pos
1054 self._advance() # Consumes 'union'
1056 # Skip whitespace
1057 while self.pos < len(self.tokens) and self._current_token_is(
1058 TokenType.WHITESPACE
1059 ):
1060 self.pos += 1
1062 # Get union tag name (optional for anonymous unions)
1063 union_tag = ""
1064 if self._current_token_is(TokenType.IDENTIFIER):
1065 union_tag = self._advance().value
1067 # Find opening brace
1068 while self.pos < len(self.tokens) and not self._current_token_is(
1069 TokenType.LBRACE
1070 ):
1071 self.pos += 1
1073 if self.pos >= len(self.tokens):
1074 return None
1076 # Find matching closing brace
1077 end_pos = self._find_matching_brace(self.pos)
1078 if end_pos is None:
1079 return None
1081 # Look for union name after closing brace (for typedefs or named unions)
1082 union_name = union_tag # Default to tag name
1084 # Skip to semicolon
1085 self.pos = end_pos + 1
1086 while self.pos < len(self.tokens) and not self._current_token_is(
1087 TokenType.SEMICOLON
1088 ):
1089 if self._current_token_is(TokenType.IDENTIFIER):
1090 union_name = self._advance().value
1091 break
1092 self.pos += 1
1094 return (start_pos, end_pos, union_name)
1096 def _parse_typedef_union(self) -> Optional[Tuple[int, int, str]]:
1097 """Parse typedef union definition"""
1098 if not self._current_token_is(TokenType.TYPEDEF):
1099 return None
1101 start_pos = self.pos
1102 self._advance() # Consumes 'typedef'
1104 # Skip whitespace
1105 while self.pos < len(self.tokens) and self._current_token_is(
1106 TokenType.WHITESPACE
1107 ):
1108 self.pos += 1
1110 # Check if next token is 'union'
1111 if not self._current_token_is(TokenType.UNION):
1112 return None
1114 self._advance() # Consumes 'union'
1116 # Skip whitespace
1117 while self.pos < len(self.tokens) and self._current_token_is(
1118 TokenType.WHITESPACE
1119 ):
1120 self.pos += 1
1122 # Get union tag name (optional)
1123 union_tag = ""
1124 if self._current_token_is(TokenType.IDENTIFIER):
1125 union_tag = self._advance().value
1127 # Find opening brace
1128 while self.pos < len(self.tokens) and not self._current_token_is(
1129 TokenType.LBRACE
1130 ):
1131 self.pos += 1
1133 if self.pos >= len(self.tokens):
1134 return None
1136 # Find matching closing brace
1137 end_pos = self._find_matching_brace(self.pos)
1138 if end_pos is None:
1139 return None
1141 # Look for typedef name after closing brace
1142 typedef_name = ""
1143 self.pos = end_pos + 1
1144 while self.pos < len(self.tokens) and not self._current_token_is(
1145 TokenType.SEMICOLON
1146 ):
1147 if self._current_token_is(TokenType.IDENTIFIER):
1148 typedef_name = self._advance().value
1149 break
1150 self.pos += 1
1152 return (start_pos, end_pos, typedef_name)
1155def extract_token_range(tokens: List[Token], start: int, end: int) -> str:
1156 """Extract raw text from token range, excluding whitespace, comments, and newlines"""
1157 if start >= len(tokens) or end >= len(tokens) or start > end:
1158 return ""
1159 return " ".join(
1160 token.value
1161 for token in tokens[start : end + 1]
1162 if token.type
1163 not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]
1164 )
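# Illustrative example: applied to the (start, end) range that
# StructureFinder.find_structs() reports for "struct point { int x; };", this
# returns the normalized text "struct point { int x ; } ;".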
1167def find_struct_fields(
1168 tokens: List[Token], struct_start: int, struct_end: int
1169) -> List[Tuple[str, str]]:
1170 """Extract field information from struct token range
1171 Returns:
1172 List of tuples (field_name, field_type)
1173 """
1174 fields = []
1175 pos = struct_start
1176 while pos <= struct_end and tokens[pos].type != TokenType.LBRACE:
1177 pos += 1
1178 if pos > struct_end:
1179 return fields
1180 pos += 1 # Skip opening brace
1182 # Find the closing brace position of the main struct body
1183 closing_brace_pos = pos
1184 brace_count = 1 # Start at 1 because we're already past the opening brace
1185 while closing_brace_pos <= struct_end:
1186 if tokens[closing_brace_pos].type == TokenType.LBRACE:
1187 brace_count += 1
1188 elif tokens[closing_brace_pos].type == TokenType.RBRACE:
1189 brace_count -= 1
1190 if brace_count == 0:
1191 # This is the closing brace of the main struct body
1192 break
1193 closing_brace_pos += 1
1195 # Only parse fields up to the closing brace
1196 while pos < closing_brace_pos and tokens[pos].type != TokenType.RBRACE:
1197 field_tokens = []
1198 # Collect tokens until we find the semicolon that ends this field
1199 # For nested structures, we need to handle braces properly
1200 brace_count = 0
1201 field_start_pos = pos
1203 # First pass: collect tokens until we find the semicolon outside of braces
1204 while pos < closing_brace_pos:
1205 if tokens[pos].type == TokenType.LBRACE:
1206 brace_count += 1
1207 elif tokens[pos].type == TokenType.RBRACE:
1208 brace_count -= 1
1209 # Only stop if we're at the main closing brace
1210 if pos == closing_brace_pos:
1211 break
1212 elif tokens[pos].type == TokenType.SEMICOLON and brace_count == 0:
1213 # This is the semicolon that ends the field
1214 break
1216 if tokens[pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
1217 field_tokens.append(tokens[pos])
1218 pos += 1
1220 # For nested structures, we need to continue collecting tokens until we find the field name
1221 # and the semicolon that ends the entire field
1222 if (len(field_tokens) >= 3 and
1223 field_tokens[0].type in [TokenType.STRUCT, TokenType.UNION] and
1224 field_tokens[1].type == TokenType.LBRACE):
1225 # This might be a nested structure, continue collecting until we find the field name
1226 temp_pos = pos
1227 brace_count = 0 # Track nested braces to find the correct field boundary
1228 while temp_pos < len(tokens):
1229 if tokens[temp_pos].type == TokenType.LBRACE:
1230 brace_count += 1
1231 elif tokens[temp_pos].type == TokenType.RBRACE:
1232 brace_count -= 1
1233 elif tokens[temp_pos].type == TokenType.SEMICOLON and brace_count == 0:
1234 # Found the semicolon that ends the field (not inside nested braces)
1235 break
1237 if tokens[temp_pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
1238 field_tokens.append(tokens[temp_pos])
1239 temp_pos += 1
1240 pos = temp_pos
1242 # Parse field from collected tokens
1243 if len(field_tokens) >= 2:
1244 # Check if this is a nested struct field
1245 if (
1246 len(field_tokens) >= 3
1247 and field_tokens[0].type == TokenType.STRUCT
1248 and field_tokens[1].type == TokenType.LBRACE
1249 ):
1250 # This is a nested struct - find the field name after the closing brace
1251 # Look for the pattern: struct { ... } field_name;
1252 field_name = None
1253 # Find the LAST closing brace and then the field name
1254 # This handles deeply nested structures correctly
1255 for i in range(len(field_tokens) - 1, -1, -1):
1256 if field_tokens[i].type == TokenType.RBRACE and i + 1 < len(field_tokens):
1257 # The field name should be the next identifier after the closing brace
1258 for j in range(i + 1, len(field_tokens)):
1259 if field_tokens[j].type == TokenType.IDENTIFIER:
1260 field_name = field_tokens[j].value
1261 break
1262 if field_name:
1263 break
1265 if field_name:
1266 # Extract the content between braces for the anonymous-structure processor
1267 content = _extract_brace_content(field_tokens)
1268 if content:
1269 # Preserve content for anonymous processor using special format
1270 import base64
1271 encoded_content = base64.b64encode(content.encode()).decode()
1272 field_type = f"struct {{ /*ANON:{encoded_content}:{field_name}*/ ... }}"
1273 else:
1274 field_type = "struct { ... }"
1276 if field_name not in ["[", "]", ";", "}"]:
1277 fields.append((field_name, field_type))
1278 # Skip parsing the nested struct's fields as separate fields
1279 # Let the normal flow handle semicolon advancement
1280 else:
1281 # Anonymous nested struct without a field name
1282 content = _extract_brace_content(field_tokens)
1283 if content:
1284 import base64
1285 encoded_content = base64.b64encode(content.encode()).decode()
1286 # Use generic field name for anonymous struct
1287 generic_name = "__anonymous_struct__"
1288 field_type = f"struct {{ /*ANON:{encoded_content}:{generic_name}*/ ... }}"
1289 else:
1290 generic_name = "__anonymous_struct__"
1291 field_type = "struct { ... }"
1292 fields.append((generic_name, field_type))
1293 # Check if this is a nested union field
1294 elif (
1295 len(field_tokens) >= 3
1296 and field_tokens[0].type == TokenType.UNION
1297 and field_tokens[1].type == TokenType.LBRACE
1298 ):
1299 # This is a nested union - find the field name after the closing brace
1300 # Look for the pattern: union { ... } field_name;
1301 field_name = None
1302 # Find the LAST closing brace and then the field name
1303 # This handles deeply nested structures correctly
1304 for i in range(len(field_tokens) - 1, -1, -1):
1305 if field_tokens[i].type == TokenType.RBRACE and i + 1 < len(field_tokens):
1306 # The field name should be the next identifier after the closing brace
1307 for j in range(i + 1, len(field_tokens)):
1308 if field_tokens[j].type == TokenType.IDENTIFIER:
1309 field_name = field_tokens[j].value
1310 break
1311 if field_name:
1312 break
1314 if field_name:
1315 # Extract the content between braces for anonymous processor
1316 content = _extract_brace_content(field_tokens)
1317 if content:
1318 # Preserve content for anonymous processor using special format
1319 import base64
1320 encoded_content = base64.b64encode(content.encode()).decode()
1321 field_type = f"union {{ /*ANON:{encoded_content}:{field_name}*/ ... }}"
1322 else:
1323 field_type = "union { ... }"
1325 if field_name not in ["[", "]", ";", "}"]:
1326 fields.append((field_name, field_type))
1327 # Skip parsing the nested union's fields as separate fields
1328 # Let the normal flow handle semicolon advancement
1329 else:
1330 # Anonymous nested union without a field name
1331 content = _extract_brace_content(field_tokens)
1332 if content:
1333 import base64
1334 encoded_content = base64.b64encode(content.encode()).decode()
1335 generic_name = "__anonymous_union__"
1336 field_type = f"union {{ /*ANON:{encoded_content}:{generic_name}*/ ... }}"
1337 else:
1338 generic_name = "__anonymous_union__"
1339 field_type = "union { ... }"
1340 fields.append((generic_name, field_type))
1341 # Function pointer array field: type (*name[size])(params)
1342 elif (
1343 len(field_tokens) >= 8
1344 and field_tokens[1].type == TokenType.LPAREN
1345 and field_tokens[2].type == TokenType.ASTERISK
1346 and any(t.type == TokenType.LBRACKET for t in field_tokens)
1347 and any(t.type == TokenType.RBRACKET for t in field_tokens)
1348 ):
1349 # Find the function pointer name (between * and [)
1350 # Look for the identifier between * and [
1351 name_start = 3 # After the *
1352 name_end = None
1353 for i in range(name_start, len(field_tokens)):
1354 if field_tokens[i].type == TokenType.LBRACKET:
1355 name_end = i
1356 break
1358 if name_end is not None:
1359 field_name = " ".join(
1360 t.value for t in field_tokens[name_start:name_end]
1361 )
1363 # Format the type properly - preserve spaces between tokens but not around brackets/parentheses
1364 formatted_tokens = []
1365 for j, token in enumerate(field_tokens):
1366 if token.type in [
1367 TokenType.LPAREN,
1368 TokenType.RPAREN,
1369 TokenType.LBRACKET,
1370 TokenType.RBRACKET,
1371 ]:
1372 # Don't add spaces around brackets/parentheses
1373 formatted_tokens.append(token.value)
1374 elif j > 0 and field_tokens[j - 1].type not in [
1375 TokenType.LPAREN,
1376 TokenType.RPAREN,
1377 TokenType.LBRACKET,
1378 TokenType.RBRACKET,
1379 ]:
1380 # Add space before token if previous token wasn't a bracket/parenthesis
1381 formatted_tokens.append(" " + token.value)
1382 else:
1383 # No space before token
1384 formatted_tokens.append(token.value)
1385 field_type = "".join(formatted_tokens)
1387 # Validate and add the field
1388 if (
1389 field_name
1390 and field_name.strip()
1391 and field_type.strip()
1392 and field_name not in ["[", "]", ";", "}"]
1393 ):
1394 stripped_name = field_name.strip()
1395 stripped_type = field_type.strip()
1396 if stripped_name and stripped_type:
1397 fields.append((stripped_name, stripped_type))
1398 # Function pointer field: type (*name)(params) or type (*name[size])(params)
1399 elif (
1400 len(field_tokens) >= 5
1401 and field_tokens[1].type == TokenType.LPAREN and field_tokens[2].type == TokenType.ASTERISK
1402 ):
1403 # Find the opening parenthesis and asterisk pattern
1404 func_ptr_start = None
1405 for i in range(len(field_tokens) - 1):
1406 if field_tokens[i].type == TokenType.LPAREN and field_tokens[i + 1].type == TokenType.ASTERISK:
1407 func_ptr_start = i
1408 break
1410 if func_ptr_start is not None:
1411 # Extract the type (everything before the opening parenthesis)
1412 type_tokens = field_tokens[:func_ptr_start]
1413 field_type = " ".join(t.value for t in type_tokens)
1415 # Find the closing parenthesis after the function name
1416 paren_count = 0
1417 name_end = None
1418 for i in range(func_ptr_start, len(field_tokens)):
1419 if field_tokens[i].type == TokenType.LPAREN:
1420 paren_count += 1
1421 elif field_tokens[i].type == TokenType.RPAREN:
1422 paren_count -= 1
1423 if paren_count == 0 and i > func_ptr_start + 1:
1424 name_end = i
1425 break
1427 if name_end is not None:
1428 # Extract function name (between * and closing parenthesis)
1429 name_tokens = field_tokens[func_ptr_start + 2:name_end]
1430 field_name = " ".join(t.value for t in name_tokens)
1432 # Extract the parameter list as part of the type
1433 param_tokens = field_tokens[name_end + 1:]
1434 param_type = " ".join(t.value for t in param_tokens)
1436 # Combine type and parameter list (without the function name in the type)
1437 # The function name is already extracted as field_name, so we don't include it in the type
1438 func_ptr_start_tokens = field_tokens[func_ptr_start:func_ptr_start + 2] # ( *
1439 func_ptr_end_tokens = field_tokens[name_end:name_end + 1] # )
1440 full_type = field_type + " " + " ".join(t.value for t in func_ptr_start_tokens) + " " + " ".join(t.value for t in func_ptr_end_tokens) + " " + param_type
1442 if (
1443 field_name
1444 and field_name.strip()
1445 and full_type.strip()
1446 and field_name not in ["[", "]", ";", "}"]
1447 ):
1448 stripped_name = field_name.strip()
1449 stripped_type = full_type.strip()
1450 if stripped_name and stripped_type:
1451 fields.append((stripped_name, stripped_type))
1452 # Array field: type name [ size ]
1453 elif (
1454 len(field_tokens) >= 4
1455 and field_tokens[-3].type == TokenType.LBRACKET
1456 and field_tokens[-1].type == TokenType.RBRACKET
1457 ):
1458 field_name = field_tokens[-4].value
1459 # Fix: Properly format array type - preserve spaces between tokens
1460 type_tokens = field_tokens[:-4]
1461 field_type = " ".join(t.value for t in type_tokens) + "[" + field_tokens[-2].value + "]"
1462 if (
1463 field_name
1464 and field_name.strip()
1465 and field_type.strip()
1466 and field_name not in ["[", "]", ";", "}"]
1467 ):
1468 # Additional validation to ensure we don't have empty strings
1469 stripped_name = field_name.strip()
1470 stripped_type = field_type.strip()
1471 if stripped_name and stripped_type:
1472 fields.append((stripped_name, stripped_type))
1473 else:
1474 # Regular field: type name
1475 # Check if this field declaration contains commas (multiple fields of same type)
1476 comma_positions = []
1477 paren_count = 0
1478 brace_count = 0
1480 # Find comma positions that are outside of parentheses and braces
1481 for i, token in enumerate(field_tokens):
1482 if token.type == TokenType.LPAREN:
1483 paren_count += 1
1484 elif token.type == TokenType.RPAREN:
1485 paren_count -= 1
1486 elif token.type == TokenType.LBRACE:
1487 brace_count += 1
1488 elif token.type == TokenType.RBRACE:
1489 brace_count -= 1
1490 elif token.type == TokenType.COMMA and paren_count == 0 and brace_count == 0:
1491 comma_positions.append(i)
1493 if comma_positions:
1494 # Multiple fields of the same type: "int x, y, z;"
1495 # Extract the type (everything before the first field name)
1496 first_field_start = None
1497 for i in range(len(field_tokens)):
1498 if field_tokens[i].type == TokenType.IDENTIFIER:
1499 first_field_start = i
1500 break
1502 if first_field_start is not None:
1503 type_tokens = field_tokens[:first_field_start]
1504 field_type = " ".join(t.value for t in type_tokens)
1506 # Split fields on commas
1507 field_starts = [first_field_start] + [pos + 1 for pos in comma_positions]
1508 field_ends = comma_positions + [len(field_tokens)]
1510 for start, end in zip(field_starts, field_ends):
1511 if start < end:
1512 field_name_tokens = field_tokens[start:end]
1513 field_name = " ".join(t.value for t in field_name_tokens)
1515 if (
1516 field_name
1517 and field_name.strip()
1518 and field_type.strip()
1519 and field_name not in ["[", "]", ";", "}"]
1520 ):
1521 stripped_name = field_name.strip()
1522 stripped_type = field_type.strip()
1523 if stripped_name and stripped_type:
1524 fields.append((stripped_name, stripped_type))
1525 else:
1526 # Single field: type name
1527 field_name = field_tokens[-1].value
1528 field_type = " ".join(t.value for t in field_tokens[:-1])
1529 if (
1530 field_name not in ["[", "]", ";", "}"]
1531 and field_name
1532 and field_name.strip()
1533 and field_type.strip()
1534 ):
1535 # Additional validation to ensure we don't have empty strings
1536 stripped_name = field_name.strip()
1537 stripped_type = field_type.strip()
1538 if stripped_name and stripped_type:
1539 fields.append((stripped_name, stripped_type))
1540 if pos < closing_brace_pos:
1541 pos += 1 # Skip semicolon
1542 return fields
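# Illustrative example: applied to the struct range found above for
# "struct point { int x; };", this returns [("x", "int")]; an array member such
# as "char name[32];" comes back as ("name", "char[32]").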
1545def find_enum_values(tokens: List[Token], enum_start: int, enum_end: int) -> List[str]:
1546 """Extract enum values from enum token range"""
1547 values = []
1548 pos = enum_start
1549 while pos <= enum_end and tokens[pos].type != TokenType.LBRACE:
1550 pos += 1
1551 if pos > enum_end:
1552 return values
1553 pos += 1 # Skip opening brace
1554 current_value = []
1555 while pos <= enum_end and tokens[pos].type != TokenType.RBRACE:
1556 token = tokens[pos]
1557 if token.type == TokenType.COMMA:
1558 if current_value:
1559 filtered_value = [
1560 t
1561 for t in current_value
1562 if t.type not in [TokenType.WHITESPACE, TokenType.COMMENT]
1563 ]
1564 if filtered_value:
1565 value_str = " ".join(t.value for t in filtered_value).strip()
1566 if value_str:
1567 values.append(value_str)
1568 current_value = []
1569 elif token.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
1570 current_value.append(token)
1571 pos += 1
1572 if current_value:
1573 filtered_value = [
1574 t
1575 for t in current_value
1576 if t.type not in [TokenType.WHITESPACE, TokenType.COMMENT]
1577 ]
1578 if filtered_value:
1579 value_str = " ".join(t.value for t in filtered_value).strip()
1580 if value_str:
1581 values.append(value_str)
1582 return values
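# Illustrative example: for "enum color { RED, GREEN = 2, BLUE };" the returned
# values are ["RED", "GREEN = 2", "BLUE"]; explicit initializers stay attached
# to the enumerator they belong to.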
1585def _extract_brace_content(field_tokens: List[Token]) -> str:
1586 """Extract the content between braces from field tokens.
1588 Args:
1589 field_tokens: List of tokens representing a field with anonymous structure
1591 Returns:
1592 String content between the braces, or empty string if not found
1593 """
1594 content_tokens = []
1595 in_braces = False
1596 brace_count = 0
1598 for token in field_tokens:
1599 if token.type == TokenType.LBRACE:
1600 if not in_braces:
1601 in_braces = True
1602 brace_count = 1
1603 else:
1604 brace_count += 1
1605 content_tokens.append(token)
1606 elif token.type == TokenType.RBRACE:
1607 if in_braces:
1608 brace_count -= 1
1609 if brace_count == 0:
1610 # Found the closing brace
1611 break
1612 else:
1613 content_tokens.append(token)
1614 elif in_braces:
1615 content_tokens.append(token)
1617 # Convert tokens back to text preserving spacing
1618 if content_tokens:
1619 result = ""
1620 for i, token in enumerate(content_tokens):
1621 result += token.value
1622 # Add space after most tokens except when next token is punctuation
1623 if (i < len(content_tokens) - 1 and
1624 token.type not in [TokenType.WHITESPACE, TokenType.NEWLINE] and
1625 content_tokens[i + 1].type not in [TokenType.LBRACKET, TokenType.RBRACKET,
1626 TokenType.SEMICOLON, TokenType.COMMA,
1627 TokenType.WHITESPACE, TokenType.NEWLINE]):
1628 result += " "
1629 return result
1630 return ""
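

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative, not part of the original module):
    # tokenize a small typedef struct and list its fields. The sample source and
    # the expected output noted below are assumptions for demonstration only.
    sample = "typedef struct point_tag { int x; int y; } point_t;"
    all_tokens = CTokenizer().tokenize(sample)
    finder = StructureFinder(all_tokens)
    for start, end, name in finder.find_structs():
        # e.g. "point_t" followed by [('x', 'int'), ('y', 'int')]
        print(name)
        print(find_struct_fields(all_tokens, start, end))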