Coverage for src/c2puml/core/parser_tokenizer.py: 87%
865 statements
coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
1#!/usr/bin/env python3
2"""
3Tokenizer module for C to PlantUML converter - Helper library for tokenizing C/C++ code
4"""
6import logging
7import re
8from dataclasses import dataclass
9from enum import Enum
10from typing import List, Optional, Tuple
13class TokenType(Enum):
14 """Token types for C/C++ lexical analysis"""
16 # Keywords
17 STRUCT = "STRUCT"
18 ENUM = "ENUM"
19 UNION = "UNION"
20 TYPEDEF = "TYPEDEF"
21 STATIC = "STATIC"
22 EXTERN = "EXTERN"
23 INLINE = "INLINE"
24 LOCAL_INLINE = "LOCAL_INLINE"
25 CONST = "CONST"
26 VOID = "VOID"
28 # Data types
29 CHAR = "CHAR"
30 INT = "INT"
31 FLOAT = "FLOAT"
32 DOUBLE = "DOUBLE"
33 LONG = "LONG"
34 SHORT = "SHORT"
35 UNSIGNED = "UNSIGNED"
36 SIGNED = "SIGNED"
38 # Operators and punctuation
39 LBRACE = "LBRACE" # {
40 RBRACE = "RBRACE" # }
41 LPAREN = "LPAREN" # (
42 RPAREN = "RPAREN" # )
43 LBRACKET = "LBRACKET" # [
44 RBRACKET = "RBRACKET" # ]
45 SEMICOLON = "SEMICOLON" # ;
46 COMMA = "COMMA" # ,
47 ASSIGN = "ASSIGN" # =
48 ASTERISK = "ASTERISK" # *
49 AMPERSAND = "AMPERSAND" # &
50 ARROW = "ARROW" # ->
52 # Literals and identifiers
53 IDENTIFIER = "IDENTIFIER"
54 NUMBER = "NUMBER"
55 STRING = "STRING"
56 CHAR_LITERAL = "CHAR_LITERAL"
58 # Preprocessor
59 INCLUDE = "INCLUDE"
60 DEFINE = "DEFINE"
61 PREPROCESSOR = "PREPROCESSOR"
63 # Special
64 COMMENT = "COMMENT"
65 WHITESPACE = "WHITESPACE"
66 NEWLINE = "NEWLINE"
67 EOF = "EOF"
68 UNKNOWN = "UNKNOWN"
71@dataclass
72class Token:
73 """Represents a single token in C/C++ code"""
75 type: TokenType
76 value: str
77 line: int
78 column: int
80 def __repr__(self) -> str:
81 return f"Token({self.type.name}, '{self.value}', {self.line}:{self.column})"
84class CTokenizer:
85 """Tokenizer for C/C++ source code"""
87 # Keywords mapping
88 KEYWORDS = {
89 "struct": TokenType.STRUCT,
90 "enum": TokenType.ENUM,
91 "union": TokenType.UNION,
92 "typedef": TokenType.TYPEDEF,
93 "static": TokenType.STATIC,
94 "extern": TokenType.EXTERN,
95 "inline": TokenType.INLINE,
96 "local_inline": TokenType.LOCAL_INLINE,
97 "const": TokenType.CONST,
98 "void": TokenType.VOID,
99 "char": TokenType.CHAR,
100 "int": TokenType.INT,
101 "float": TokenType.FLOAT,
102 "double": TokenType.DOUBLE,
103 "long": TokenType.LONG,
104 "short": TokenType.SHORT,
105 "unsigned": TokenType.UNSIGNED,
106 "signed": TokenType.SIGNED,
107 }
109 # Single character tokens
110 SINGLE_CHAR_TOKENS = {
111 "{": TokenType.LBRACE,
112 "}": TokenType.RBRACE,
113 "(": TokenType.LPAREN,
114 ")": TokenType.RPAREN,
115 "[": TokenType.LBRACKET,
116 "]": TokenType.RBRACKET,
117 ";": TokenType.SEMICOLON,
118 ",": TokenType.COMMA,
119 "=": TokenType.ASSIGN,
120 "*": TokenType.ASTERISK,
121 "&": TokenType.AMPERSAND,
122 }
124 # Two character tokens
125 TWO_CHAR_TOKENS = {
126 "->": TokenType.ARROW,
127 }
129 def __init__(self):
130 self.logger = logging.getLogger(__name__)
132 # Compiled regex patterns for efficiency
133 self.patterns = {
134 "identifier": re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*"),
135 "number": re.compile(
136 r"0[xX][0-9a-fA-F]+[uUlL]*|0[bB][01]+[uUlL]*|0[0-7]+[uUlL]*|"
137 r"\d+\.\d*([eE][+-]?\d+)?[fFlL]*|\d+([eE][+-]?\d+)?[fFlL]*|\d+[uUlL]*"
138 ),
139 "string": re.compile(r'"([^"\\]|\\.)*"'),
140 "char": re.compile(r"'([^'\\]|\\.)'"),
141 "comment_single": re.compile(r"//.*"),
142 "comment_multi": re.compile(r"/\*.*?\*/", re.DOTALL),
143 "preprocessor": re.compile(
144 r"#(include|define|ifdef|ifndef|if|endif|elif|else|pragma|error|warning)\b.*"
145 ),
146 "whitespace": re.compile(r"[ \t]+"),
147 "newline": re.compile(r"\n"),
148 }
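# Illustrative examples (not part of the original module) of inputs the
# compiled patterns above are intended to accept:
#   number:        0x1Fu, 0b1010, 0755L, 3.14f, 1e-9, 42L
#   string:        "hello\nworld"
#   char:          'a', '\n'
#   preprocessor:  #include <stdio.h>, #define MAX 10, #pragma once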
150 def tokenize(self, content: str) -> List[Token]:
151 """Tokenize C/C++ source code content"""
152 tokens = []
153 lines = content.splitlines()
154 total_lines = len(lines)
155 line_num = 1
156 in_multiline_string = False
157 multiline_string_value = ""
158 multiline_string_start_line = 0
159 multiline_string_start_col = 0
160 in_multiline_comment = False
161 multiline_comment_value = ""
162 multiline_comment_start_line = 0
163 multiline_comment_start_col = 0
165 for idx, line in enumerate(lines):
166 if in_multiline_string:
167 multiline_string_value += "\n" + line
168 if '"' in line:
169 # End of multiline string
170 in_multiline_string = False
171 tokens.append(
172 Token(
173 TokenType.STRING,
174 multiline_string_value,
175 multiline_string_start_line,
176 multiline_string_start_col,
177 )
178 )
179 elif in_multiline_comment:
180 # Continue multi-line comment
181 multiline_comment_value += "\n" + line
182 comment_end = line.find("*/")
183 if comment_end != -1:
184 # End of multi-line comment
185 in_multiline_comment = False
186 multiline_comment_value = multiline_comment_value[
187 : multiline_comment_value.rfind("*/") + 2
188 ]
189 tokens.append(
190 Token(
191 TokenType.COMMENT,
192 multiline_comment_value,
193 multiline_comment_start_line,
194 multiline_comment_start_col,
195 )
196 )
197 else:
198 line_tokens = self._tokenize_line(line, line_num)
199 # Check if a string starts but does not end on this line
200 if (
201 line_tokens
202 and line_tokens[-1].type == TokenType.STRING
203 and not line_tokens[-1].value.endswith('"')
204 ):
205 in_multiline_string = True
206 multiline_string_value = line_tokens[-1].value
207 multiline_string_start_line = line_tokens[-1].line
208 multiline_string_start_col = line_tokens[-1].column
209 tokens.extend(line_tokens[:-1])
210 # Check if a multi-line comment starts but does not end on this line
211 elif (
212 line_tokens
213 and line_tokens[-1].type == TokenType.COMMENT
214 and line_tokens[-1].value.startswith("/*")
215 and not line_tokens[-1].value.endswith("*/")
216 ):
217 in_multiline_comment = True
218 multiline_comment_value = line_tokens[-1].value
219 multiline_comment_start_line = line_tokens[-1].line
220 multiline_comment_start_col = line_tokens[-1].column
221 tokens.extend(line_tokens[:-1])
222 else:
223 tokens.extend(line_tokens)
225 if line_num < total_lines:
226 tokens.append(Token(TokenType.NEWLINE, "\n", line_num, len(line)))
227 line_num += 1
229 if in_multiline_string:
230 tokens.append(
231 Token(
232 TokenType.STRING,
233 multiline_string_value,
234 multiline_string_start_line,
235 multiline_string_start_col,
236 )
237 )
238 if in_multiline_comment:
239 tokens.append(
240 Token(
241 TokenType.COMMENT,
242 multiline_comment_value,
243 multiline_comment_start_line,
244 multiline_comment_start_col,
245 )
246 )
248 # Post-process tokens to merge multi-line macros
249 tokens = self._merge_multiline_macros(tokens, lines)
251 tokens.append(
252 Token(TokenType.EOF, "", total_lines, len(lines[-1]) if lines else 0)
253 )
255 return tokens
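# Usage sketch (illustrative, not part of the original module): tokenize a
# one-line snippet and drop layout tokens with filter_tokens() defined below.
#
#     tokenizer = CTokenizer()
#     toks = tokenizer.tokenize("int x = 42; // answer")
#     [t.type.name for t in tokenizer.filter_tokens(toks)]
#     # expected: ['INT', 'IDENTIFIER', 'ASSIGN', 'NUMBER', 'SEMICOLON']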
257 def _tokenize_line(self, line: str, line_num: int) -> List[Token]:
258 """Tokenize a single line of code"""
259 tokens = []
260 pos = 0
262 while pos < len(line):
263 # Skip whitespace but track it
264 if match := self.patterns["whitespace"].match(line, pos):
265 tokens.append(Token(TokenType.WHITESPACE, match.group(), line_num, pos))
266 pos = match.end()
267 continue
269 # Comments
270 if match := self.patterns["comment_single"].match(line, pos):
271 tokens.append(Token(TokenType.COMMENT, match.group(), line_num, pos))
272 pos = len(line) # Rest of line is comment
273 continue
275 # Multi-line comments - check for /* at the current position
276 if line[pos:].startswith("/*"):
277 # Find the end of the comment
278 comment_end = line.find("*/", pos)
279 if comment_end != -1:
280 # Comment ends on this line
281 comment_text = line[pos : comment_end + 2]
282 tokens.append(Token(TokenType.COMMENT, comment_text, line_num, pos))
283 pos = comment_end + 2
284 continue
285 else:
286 # Comment continues to next line - create a partial comment token
287 comment_text = line[pos:]
288 tokens.append(Token(TokenType.COMMENT, comment_text, line_num, pos))
289 pos = len(line)
290 continue
292 # Preprocessor directives
293 if match := self.patterns["preprocessor"].match(line, pos):
294 value = match.group()
295 if value.startswith("#include"):
296 tokens.append(Token(TokenType.INCLUDE, value, line_num, pos))
297 elif value.startswith("#define"):
298 tokens.append(Token(TokenType.DEFINE, value, line_num, pos))
299 else:
300 tokens.append(Token(TokenType.PREPROCESSOR, value, line_num, pos))
301 pos = len(line) # Rest of line is preprocessor
302 continue
304 # String literals
305 if (
306 line[pos] == '"'
307 or (
308 pos > 0
309 and line[pos - 1] in ["L", "u", "U", "R"]
310 and line[pos] == '"'
311 )
312 or (pos > 1 and line[pos - 2 : pos] == "u8" and line[pos] == '"')
313 ):
314 # Handle string literals with possible prefixes
315 string_start = pos
316 if line[pos - 2 : pos] == "u8":
317 string_start -= 2
318 elif line[pos - 1] in ["L", "u", "U", "R"]:
319 string_start -= 1
320 pos += 1 # Skip opening quote
321 while pos < len(line):
322 if line[pos] == '"':
323 # Found closing quote
324 string_text = line[string_start : pos + 1]
325 tokens.append(
326 Token(TokenType.STRING, string_text, line_num, string_start)
327 )
328 pos += 1
329 break
330 elif line[pos] == "\\":
331 pos += 2
332 else:
333 pos += 1
334 else:
335 string_text = line[string_start:]
336 tokens.append(
337 Token(TokenType.STRING, string_text, line_num, string_start)
338 )
339 pos = len(line)
340 continue
342 # Character literals
343 if match := self.patterns["char"].match(line, pos):
344 tokens.append(
345 Token(TokenType.CHAR_LITERAL, match.group(), line_num, pos)
346 )
347 pos = match.end()
348 continue
350 # Numbers
351 if match := self.patterns["number"].match(line, pos):
352 tokens.append(Token(TokenType.NUMBER, match.group(), line_num, pos))
353 pos = match.end()
354 continue
356 # Single character tokens
357 if line[pos] in self.SINGLE_CHAR_TOKENS:
358 token_type = self.SINGLE_CHAR_TOKENS[line[pos]]
359 tokens.append(Token(token_type, line[pos], line_num, pos))
360 pos += 1
361 continue
363 # Multi-character operators (<<, >>, ->)
364 if line[pos : pos + 2] in ["<<", ">>", "->"]:
365 op = line[pos : pos + 2]
366 if op == "->":
367 tokens.append(Token(TokenType.ARROW, op, line_num, pos))
368 else:
# NOTE: TokenType defines no OPERATOR member, so << and >> fall back to UNKNOWN
369 tokens.append(
370 Token(
371 (
372 TokenType.OPERATOR
373 if hasattr(TokenType, "OPERATOR")
374 else TokenType.UNKNOWN
375 ),
376 op,
377 line_num,
378 pos,
379 )
380 )
381 pos += 2
382 continue
384 # Identifiers and keywords
385 if match := self.patterns["identifier"].match(line, pos):
386 value = match.group()
387 token_type = self.KEYWORDS.get(value.lower(), TokenType.IDENTIFIER)
388 tokens.append(Token(token_type, value, line_num, pos))
389 pos = match.end()
390 continue
392 # Unknown character (always one at a time)
393 tokens.append(Token(TokenType.UNKNOWN, line[pos], line_num, pos))
394 pos += 1
396 return tokens
398 def filter_tokens(
399 self, tokens: List[Token], exclude_types: Optional[List[TokenType]] = None
400 ) -> List[Token]:
401 """Filter tokens by type"""
402 if exclude_types is None:
403 exclude_types = [
404 TokenType.WHITESPACE,
405 TokenType.COMMENT,
406 TokenType.NEWLINE,
407 TokenType.EOF,
408 ]
410 return [token for token in tokens if token.type not in exclude_types]
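# Illustrative example (assumes `toks` is the output of tokenize()): keep
# comments for documentation extraction while still dropping layout tokens.
#
#     code_and_comments = tokenizer.filter_tokens(
#         toks,
#         exclude_types=[TokenType.WHITESPACE, TokenType.NEWLINE, TokenType.EOF],
#     )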
412 def _merge_multiline_macros(
413 self, tokens: List[Token], lines: List[str]
414 ) -> List[Token]:
415 """Merge multi-line macro tokens that span multiple lines with backslashes"""
416 merged_tokens = []
417 i = 0
419 while i < len(tokens):
420 token = tokens[i]
422 if token.type == TokenType.DEFINE and token.value.rstrip().endswith("\\"):
423 # Found a multi-line macro, merge with subsequent lines
424 macro_content = token.value
425 current_line = token.line
427 # Continue merging lines until we find one that doesn't end with backslash
428 while macro_content.rstrip().endswith("\\"):
429 # Remove the backslash and add a newline
430 macro_content = macro_content.rstrip()[:-1] + "\n"
431 current_line += 1
433 # Find the next line content
434 if current_line <= len(lines):
435 next_line = lines[current_line - 1]
436 macro_content += next_line
437 else:
438 break
440 # Create a new token with the merged content
441 merged_tokens.append(
442 Token(TokenType.DEFINE, macro_content, token.line, token.column)
443 )
444 else:
445 merged_tokens.append(token)
447 i += 1
449 return merged_tokens
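# Example of the merge performed above (illustrative): for the macro
#
#     #define SQUARE(x) \
#         ((x) * (x))
#
# the DEFINE token's value becomes the full two-line text, with the trailing
# backslash replaced by a newline; the continuation line's own tokens are left
# in the stream untouched.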
452class StructureFinder:
453 """Helper class to find C/C++ structures in token streams"""
455 def __init__(self, tokens: List[Token]):
456 self.tokens = tokens
457 self.pos = 0
458 self.logger = logging.getLogger(__name__)
460 def find_structs(self) -> List[Tuple[int, int, str]]:
461 """Find struct definitions in token stream
463 Returns:
464 List of tuples (start_pos, end_pos, struct_name)
465 """
466 structs = []
467 self.pos = 0
469 while self.pos < len(self.tokens):
470 if self._current_token_is(TokenType.STRUCT):
471 struct_info = self._parse_struct()
472 if struct_info:
473 structs.append(struct_info)
474 elif self._current_token_is(TokenType.TYPEDEF):
475 typedef_struct = self._parse_typedef_struct()
476 if typedef_struct:
477 structs.append(typedef_struct)
478 else:
479 self.pos += 1
481 return structs
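# Usage sketch (illustrative): the reported positions index into the token
# list given to the constructor, so the matched text can be recovered with
# extract_token_range() defined at module level.
#
#     finder = StructureFinder(CTokenizer().tokenize(source))
#     for start, end, name in finder.find_structs():
#         print(name, extract_token_range(finder.tokens, start, end))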
483 def find_enums(self) -> List[Tuple[int, int, str]]:
484 """Find enum definitions in token stream"""
485 enums = []
486 self.pos = 0
488 while self.pos < len(self.tokens):
489 if self._current_token_is(TokenType.ENUM):
490 enum_info = self._parse_enum()
491 if enum_info:
492 enums.append(enum_info)
493 elif self._current_token_is(TokenType.TYPEDEF):
494 typedef_enum = self._parse_typedef_enum()
495 if typedef_enum:
496 enums.append(typedef_enum)
497 else:
498 self.pos += 1
500 return enums
502 def find_functions(self) -> List[Tuple[int, int, str, str, bool, bool]]:
503 """Find all function declarations and definitions in the token stream
505 Returns:
506 List of tuples (start_pos, end_pos, func_name, return_type, is_declaration, is_inline)
507 """
508 functions = []
509 self.pos = 0
511 while self.pos < len(self.tokens):
512 result = self._parse_function()
513 if result:
514 functions.append(result)
516 return functions
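# Illustrative example: for
#     static inline int add(int a, int b) { return a + b; }
# a single tuple is expected, with func_name "add", return_type
# "static inline int", is_declaration False and is_inline True.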
518 def find_unions(self) -> List[Tuple[int, int, str]]:
519 """Find union definitions in token stream"""
520 unions = []
521 self.pos = 0
523 while self.pos < len(self.tokens):
524 if self._current_token_is(TokenType.UNION):
525 union_info = self._parse_union()
526 if union_info:
527 unions.append(union_info)
528 elif self._current_token_is(TokenType.TYPEDEF):
529 typedef_union = self._parse_typedef_union()
530 if typedef_union:
531 unions.append(typedef_union)
532 else:
533 self.pos += 1
535 return unions
537 def _current_token_is(self, token_type: TokenType) -> bool:
538 """Check if current token is of specified type"""
539 return self.pos < len(self.tokens) and self.tokens[self.pos].type == token_type
541 def _peek_token(self, offset: int = 1) -> Optional[Token]:
542 """Peek at token at current position + offset"""
543 peek_pos = self.pos + offset
544 return self.tokens[peek_pos] if peek_pos < len(self.tokens) else None
546 def _advance(self) -> Optional[Token]:
547 """Advance to next token and return current"""
548 if self.pos < len(self.tokens):
549 token = self.tokens[self.pos]
550 self.pos += 1
551 return token
552 return None
554 def _find_matching_brace(self, start_pos: int) -> Optional[int]:
555 """Find matching closing brace starting from open brace position"""
556 if (
557 start_pos >= len(self.tokens)
558 or self.tokens[start_pos].type != TokenType.LBRACE
559 ):
560 return None
562 depth = 1
563 pos = start_pos + 1
565 while pos < len(self.tokens) and depth > 0:
566 if self.tokens[pos].type == TokenType.LBRACE:
567 depth += 1
568 elif self.tokens[pos].type == TokenType.RBRACE:
569 depth -= 1
570 pos += 1
572 return pos - 1 if depth == 0 else None
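# Illustrative note: braces are matched by depth counting, so for the tokens of
# "{ { a; } b; }" a call starting at the first LBRACE returns the index of the
# final RBRACE, skipping the inner pair.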
574 def _parse_struct(self) -> Optional[Tuple[int, int, str]]:
575 """Parse struct definition starting at current position"""
576 start_pos = self.pos
578 # Consume 'struct' keyword
579 if not self._current_token_is(TokenType.STRUCT):
580 return None
581 self._advance()
583 # Check if this struct is inside a cast expression by looking backwards
584 check_pos = start_pos - 1
585 while check_pos >= 0:
586 if self.tokens[check_pos].type == TokenType.LPAREN:
587 # Found opening parenthesis before struct - this is likely a cast expression
588 return None
589 elif self.tokens[check_pos].type in [TokenType.STRUCT, TokenType.TYPEDEF]:
590 # Found another struct or typedef - this is not a cast expression
591 break
592 elif self.tokens[check_pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
593 # Found some other token - this is not a cast expression
594 break
595 check_pos -= 1
597 # Skip whitespace
598 while self.pos < len(self.tokens) and self._current_token_is(
599 TokenType.WHITESPACE
600 ):
601 self.pos += 1
603 # Check if this is a cast expression: (struct type*)
604 if self._current_token_is(TokenType.LPAREN):
605 # Look ahead to see if this is a cast expression
606 check_pos = self.pos + 1
607 while check_pos < len(self.tokens):
608 if self.tokens[check_pos].type == TokenType.RPAREN:
609 # Found closing parenthesis - this is likely a cast expression
610 return None
611 elif self.tokens[check_pos].type == TokenType.LBRACE:
612 # Found opening brace - this is a struct definition
613 break
614 elif self.tokens[check_pos].type == TokenType.SEMICOLON:
615 # Found semicolon - this is a variable declaration
616 return None
617 check_pos += 1
619 # Get struct tag name (optional for anonymous structs)
620 struct_tag = ""
621 if self._current_token_is(TokenType.IDENTIFIER):
622 struct_tag = self._advance().value
624 # Look for opening brace or semicolon
625 while self.pos < len(self.tokens):
626 if self._current_token_is(TokenType.LBRACE):
627 # Found opening brace - this is a struct definition
628 break
629 elif self._current_token_is(TokenType.SEMICOLON):
630 # Found semicolon before opening brace - this is a variable declaration
631 return None
632 self.pos += 1
634 if not self._current_token_is(TokenType.LBRACE):
635 # This is a variable declaration
636 return None
638 # Find matching closing brace
639 brace_pos = self.pos
640 end_brace_pos = self._find_matching_brace(brace_pos)
642 if end_brace_pos is None:
643 return None
645 # Look for struct name after closing brace
646 name_pos = end_brace_pos + 1
647 struct_name = struct_tag # Default to tag name
649 # Check if this is a typedef struct by looking backwards
650 is_typedef = False
651 check_pos = start_pos - 1
652 while check_pos >= 0:
653 if self.tokens[check_pos].type == TokenType.TYPEDEF:
654 is_typedef = True
655 break
656 elif self.tokens[check_pos].type in [
657 TokenType.STRUCT,
658 TokenType.LBRACE,
659 TokenType.RBRACE,
660 ]:
661 break
662 check_pos -= 1
664 if is_typedef:
665 # For typedef struct, look for the typedef name after the closing brace
666 while name_pos < len(self.tokens):
667 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
668 struct_name = self.tokens[name_pos].value
669 break
670 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
671 break
672 name_pos += 1
673 else:
674 # Check if there's a variable name after the brace
675 while name_pos < len(self.tokens):
676 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
677 # This is a variable name
678 struct_name = ""
679 break
680 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
681 break
682 name_pos += 1
684 # Find semicolon (for struct definitions)
685 self.pos = end_brace_pos + 1
686 while self.pos < len(self.tokens) and not self._current_token_is(
687 TokenType.SEMICOLON
688 ):
689 self.pos += 1
691 end_pos = self.pos
692 return (start_pos, end_pos, struct_name)
694 def _parse_typedef_struct(self) -> Optional[Tuple[int, int, str]]:
695 """Parse typedef struct definition"""
696 start_pos = self.pos
698 # Consume 'typedef'
699 if not self._current_token_is(TokenType.TYPEDEF):
700 return None
701 self._advance()
703 # Look for 'struct'
704 if not self._current_token_is(TokenType.STRUCT):
705 # Not a typedef struct, reset position
706 self.pos = start_pos + 1
707 return None
709 # Skip 'struct'
710 self._advance()
712 # Skip whitespace
713 while self.pos < len(self.tokens) and self._current_token_is(
714 TokenType.WHITESPACE
715 ):
716 self.pos += 1
718 # Get struct tag name (optional)
719 struct_tag = ""
720 if self._current_token_is(TokenType.IDENTIFIER):
721 struct_tag = self._advance().value
723 # Skip whitespace
724 while self.pos < len(self.tokens) and self._current_token_is(
725 TokenType.WHITESPACE
726 ):
727 self.pos += 1
729 # Check if this is a forward declaration (no braces)
730 if not self._current_token_is(TokenType.LBRACE):
731 # This is a forward declaration, skip it
732 self.pos = start_pos + 1
733 return None
735 # Find matching closing brace
736 end_brace_pos = self._find_matching_brace(self.pos)
737 if end_brace_pos is None:
738 self.pos = start_pos + 1
739 return None
741 # Look for typedef name after closing brace
742 typedef_name = ""
743 name_pos = end_brace_pos + 1
744 while name_pos < len(self.tokens):
745 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
746 typedef_name = self.tokens[name_pos].value
747 break
748 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
749 break
750 name_pos += 1
752 # Find semicolon
753 while (
754 name_pos < len(self.tokens)
755 and not self.tokens[name_pos].type == TokenType.SEMICOLON
756 ):
757 name_pos += 1
759 end_pos = name_pos
760 return (start_pos, end_pos, typedef_name)
762 def _parse_enum(self) -> Optional[Tuple[int, int, str]]:
763 """Parse enum definition starting at current position"""
764 start_pos = self.pos
766 # Consume 'enum' keyword
767 if not self._current_token_is(TokenType.ENUM):
768 return None
769 self._advance()
771 # Skip whitespace
772 while self.pos < len(self.tokens) and self._current_token_is(
773 TokenType.WHITESPACE
774 ):
775 self.pos += 1
777 # Get enum tag name (optional for anonymous enums)
778 enum_tag = ""
779 if self._current_token_is(TokenType.IDENTIFIER):
780 enum_tag = self._advance().value
782 # Find opening brace
783 while self.pos < len(self.tokens) and not self._current_token_is(
784 TokenType.LBRACE
785 ):
786 self.pos += 1
788 if not self._current_token_is(TokenType.LBRACE):
789 return None
791 # Find matching closing brace
792 brace_pos = self.pos
793 end_brace_pos = self._find_matching_brace(brace_pos)
795 if end_brace_pos is None:
796 return None
798 # Look for enum name after closing brace
799 name_pos = end_brace_pos + 1
800 enum_name = enum_tag # Default to tag name
802 # Check if this is a typedef enum by looking backwards
803 is_typedef = False
804 check_pos = start_pos - 1
805 while check_pos >= 0:
806 if self.tokens[check_pos].type == TokenType.TYPEDEF:
807 is_typedef = True
808 break
809 elif self.tokens[check_pos].type in [
810 TokenType.ENUM,
811 TokenType.LBRACE,
812 TokenType.RBRACE,
813 ]:
814 break
815 check_pos -= 1
817 if is_typedef:
818 # For typedef enum, look for the typedef name after the closing brace
819 while name_pos < len(self.tokens):
820 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
821 enum_name = self.tokens[name_pos].value
822 break
823 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
824 break
825 name_pos += 1
826 elif not enum_tag:
827 # Anonymous enum - check if there's a variable name after the brace
828 while name_pos < len(self.tokens):
829 if self.tokens[name_pos].type == TokenType.IDENTIFIER:
830 # This is a variable name
831 enum_name = ""
832 break
833 elif self.tokens[name_pos].type == TokenType.SEMICOLON:
834 break
835 name_pos += 1
837 # Find semicolon
838 self.pos = end_brace_pos + 1
839 while self.pos < len(self.tokens) and not self._current_token_is(
840 TokenType.SEMICOLON
841 ):
842 self.pos += 1
844 end_pos = self.pos
845 return (start_pos, end_pos, enum_name)
847 def _parse_typedef_enum(self) -> Optional[Tuple[int, int, str]]:
848 """Parse typedef enum definition"""
849 start_pos = self.pos
851 # Consume 'typedef'
852 if not self._current_token_is(TokenType.TYPEDEF):
853 return None
854 self._advance()
856 # Look for 'enum'
857 if not self._current_token_is(TokenType.ENUM):
858 # Not a typedef enum, reset position
859 self.pos = start_pos + 1
860 return None
862 # Parse the enum part - this will return the tag name (e.g., StatusEnum_tag)
863 enum_info = self._parse_enum()
864 if not enum_info:
865 self.pos = start_pos + 1
866 return None
868 # For typedef enums, we want to return the tag name, not the typedef name
869 # The typedef name will be handled separately in the parser
870 return enum_info
872 def _parse_function(self) -> Optional[Tuple[int, int, str, str, bool, bool]]:
873 """Parse function declaration/definition
875 Returns:
876 Tuple of (start_pos, end_pos, func_name, return_type, is_declaration, is_inline)
877 """
878 start_pos = self.pos
880 # Look for function pattern: [modifiers] return_type function_name (params)
881 while self.pos < len(self.tokens):
882 token = self.tokens[self.pos]
884 # If we hit a parenthesis, check if this is a function
885 if token.type == TokenType.LPAREN:
886 # Look backwards for function name
887 if (
888 self.pos > 0
889 and self.tokens[self.pos - 1].type == TokenType.IDENTIFIER
890 ):
891 func_name = self.tokens[self.pos - 1].value
892 func_name_pos = self.pos - 1
894 # Look backwards from function name to find return type
895 # Start from just before the function name
896 return_type_end = func_name_pos - 1
897 return_type_start = return_type_end
899 # Skip backwards over whitespace and comments
900 while return_type_start >= 0:
901 token_type = self.tokens[return_type_start].type
902 if token_type in [
903 TokenType.WHITESPACE,
904 TokenType.COMMENT,
905 TokenType.NEWLINE,
906 ]:
907 return_type_start -= 1
908 else:
909 break
911 # If we found a non-whitespace token, that's the end of the return type
912 # Find the start by looking backwards from there
913 if return_type_start >= 0:
914 return_type_end = return_type_start
915 return_type_start = return_type_end
919 # Collect all tokens that are part of the return type (including storage-class and inline modifiers)
920 return_type_tokens = []
922 # Look back at most 10 tokens to capture multi-token return types
923 max_lookback = max(0, func_name_pos - 10)
924 current_pos = return_type_start
926 # Collect tokens backwards until we hit a limit or non-return-type token
927 while current_pos >= max_lookback:
928 token_type = self.tokens[current_pos].type
929 if token_type in [
930 TokenType.IDENTIFIER,
931 TokenType.INT,
932 TokenType.VOID,
933 TokenType.CHAR,
934 TokenType.FLOAT,
935 TokenType.DOUBLE,
936 TokenType.LONG,
937 TokenType.SHORT,
938 TokenType.UNSIGNED,
939 TokenType.SIGNED,
940 TokenType.ASTERISK,
941 TokenType.CONST,
942 TokenType.STATIC,
943 TokenType.EXTERN,
944 TokenType.INLINE,
945 TokenType.LOCAL_INLINE,
946 ]:
947 return_type_tokens.insert(0, self.tokens[current_pos])
948 current_pos -= 1
949 elif token_type in [
950 TokenType.WHITESPACE,
951 TokenType.COMMENT,
952 TokenType.NEWLINE,
953 ]:
954 # Skip whitespace and continue looking
955 current_pos -= 1
956 else:
957 break
959 # Extract return type
960 if return_type_tokens:
961 return_type = " ".join(
962 t.value for t in return_type_tokens
963 ).strip()
965 # Check if function is inline
966 is_inline = any(
967 token.type in [TokenType.INLINE, TokenType.LOCAL_INLINE]
968 for token in return_type_tokens
969 )
971 # Find end of function (either ; for declaration or { for definition)
972 end_pos = self._find_function_end(self.pos)
973 if end_pos:
974 # Determine if this is a declaration or definition
975 is_declaration = self._is_function_declaration(end_pos)
976 self.pos = end_pos + 1
977 return (
978 start_pos,
979 end_pos,
980 func_name,
981 return_type,
982 is_declaration,
983 is_inline,
984 )
986 self.pos += 1
988 # Prevent infinite loops - if we've gone too far, this isn't a function
989 if self.pos - start_pos > 50:
990 break
992 # Reset position if no function found
993 self.pos = start_pos + 1
994 return None
996 def _is_function_declaration(self, end_pos: int) -> bool:
997 """Check if the function at end_pos is a declaration (ends with ;) or definition (ends with })"""
998 if end_pos >= len(self.tokens):
999 return False
1001 # Look backwards from end_pos to find the last significant token
1002 pos = end_pos
1003 while pos >= 0:
1004 token_type = self.tokens[pos].type
1005 if token_type not in [
1006 TokenType.WHITESPACE,
1007 TokenType.COMMENT,
1008 TokenType.NEWLINE,
1009 ]:
1010 return token_type == TokenType.SEMICOLON
1011 pos -= 1
1013 return False
1015 def _find_function_end(self, start_pos: int) -> Optional[int]:
1016 """Find end of function declaration or definition"""
1017 pos = start_pos
1019 # Find matching closing parenthesis
1020 if pos >= len(self.tokens) or self.tokens[pos].type != TokenType.LPAREN:
1021 return None
1023 depth = 1
1024 pos += 1
1026 while pos < len(self.tokens) and depth > 0:
1027 if self.tokens[pos].type == TokenType.LPAREN:
1028 depth += 1
1029 elif self.tokens[pos].type == TokenType.RPAREN:
1030 depth -= 1
1031 pos += 1
1033 if depth > 0:
1034 return None
1036 # Look for either ; (declaration) or { (definition)
1037 while pos < len(self.tokens):
1038 if self.tokens[pos].type == TokenType.SEMICOLON:
1039 return pos
1040 elif self.tokens[pos].type == TokenType.LBRACE:
1041 # Function definition - find matching brace
1042 end_brace = self._find_matching_brace(pos)
1043 return end_brace if end_brace is not None else pos
1044 pos += 1
1046 return None
1048 def _parse_union(self) -> Optional[Tuple[int, int, str]]:
1049 """Parse union definition"""
1050 if not self._current_token_is(TokenType.UNION):
1051 return None
1053 start_pos = self.pos
1054 self._advance() # Consumes 'union'
1056 # Skip whitespace
1057 while self.pos < len(self.tokens) and self._current_token_is(
1058 TokenType.WHITESPACE
1059 ):
1060 self.pos += 1
1062 # Get union tag name (optional for anonymous unions)
1063 union_tag = ""
1064 if self._current_token_is(TokenType.IDENTIFIER):
1065 union_tag = self._advance().value
1067 # Find opening brace
1068 while self.pos < len(self.tokens) and not self._current_token_is(
1069 TokenType.LBRACE
1070 ):
1071 self.pos += 1
1073 if self.pos >= len(self.tokens):
1074 return None
1076 # Find matching closing brace
1077 end_pos = self._find_matching_brace(self.pos)
1078 if end_pos is None:
1079 return None
1081 # Look for union name after closing brace (for typedefs or named unions)
1082 union_name = union_tag # Default to tag name
1084 # Skip to semicolon
1085 self.pos = end_pos + 1
1086 while self.pos < len(self.tokens) and not self._current_token_is(
1087 TokenType.SEMICOLON
1088 ):
1089 if self._current_token_is(TokenType.IDENTIFIER):
1090 union_name = self._advance().value
1091 break
1092 self.pos += 1
1094 return (start_pos, end_pos, union_name)
1096 def _parse_typedef_union(self) -> Optional[Tuple[int, int, str]]:
1097 """Parse typedef union definition"""
1098 if not self._current_token_is(TokenType.TYPEDEF):
1099 return None
1101 start_pos = self.pos
1102 self._advance() # Consumes 'typedef'
1104 # Skip whitespace
1105 while self.pos < len(self.tokens) and self._current_token_is(
1106 TokenType.WHITESPACE
1107 ):
1108 self.pos += 1
1110 # Check if next token is 'union'
1111 if not self._current_token_is(TokenType.UNION):
1112 return None
1114 self._advance() # Consumes 'union'
1116 # Skip whitespace
1117 while self.pos < len(self.tokens) and self._current_token_is(
1118 TokenType.WHITESPACE
1119 ):
1120 self.pos += 1
1122 # Get union tag name (optional)
1123 union_tag = ""
1124 if self._current_token_is(TokenType.IDENTIFIER):
1125 union_tag = self._advance().value
1127 # Find opening brace
1128 while self.pos < len(self.tokens) and not self._current_token_is(
1129 TokenType.LBRACE
1130 ):
1131 self.pos += 1
1133 if self.pos >= len(self.tokens):
1134 return None
1136 # Find matching closing brace
1137 end_pos = self._find_matching_brace(self.pos)
1138 if end_pos is None:
1139 return None
1141 # Look for typedef name after closing brace
1142 typedef_name = ""
1143 self.pos = end_pos + 1
1144 while self.pos < len(self.tokens) and not self._current_token_is(
1145 TokenType.SEMICOLON
1146 ):
1147 if self._current_token_is(TokenType.IDENTIFIER):
1148 typedef_name = self._advance().value
1149 break
1150 self.pos += 1
1152 return (start_pos, end_pos, typedef_name)
1155def extract_token_range(tokens: List[Token], start: int, end: int) -> str:
1156 """Extract raw text from token range, excluding whitespace, comments, and newlines"""
1157 if start >= len(tokens) or end >= len(tokens) or start > end:
1158 return ""
1159 return " ".join(
1160 token.value
1161 for token in tokens[start : end + 1]
1162 if token.type
1163 not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]
1164 )
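# Illustrative example: applied to the (start, end) range that
# StructureFinder.find_structs() reports for "struct point { int x; };", this
# returns the normalized text "struct point { int x ; } ;".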
1167def find_struct_fields(
1168 tokens: List[Token], struct_start: int, struct_end: int
1169) -> List[Tuple[str, str]]:
1170 """Extract field information from struct token range
1171 Returns:
1172 List of tuples (field_name, field_type)
1173 """
1174 fields = []
1175 pos = struct_start
1176 while pos <= struct_end and tokens[pos].type != TokenType.LBRACE:
1177 pos += 1
1178 if pos > struct_end:
1179 return fields
1180 pos += 1 # Skip opening brace
1182 # Find the closing brace position of the main struct body
1183 closing_brace_pos = pos
1184 brace_count = 1 # Start at 1 because we're already past the opening brace
1185 while closing_brace_pos <= struct_end:
1186 if tokens[closing_brace_pos].type == TokenType.LBRACE:
1187 brace_count += 1
1188 elif tokens[closing_brace_pos].type == TokenType.RBRACE:
1189 brace_count -= 1
1190 if brace_count == 0:
1191 # This is the closing brace of the main struct body
1192 break
1193 closing_brace_pos += 1
1195 # Only parse fields up to the closing brace
1196 while pos < closing_brace_pos and tokens[pos].type != TokenType.RBRACE:
1197 field_tokens = []
1198 # Collect tokens until we find the semicolon that ends this field
1199 # For nested structures, we need to handle braces properly
1200 brace_count = 0
1201 field_start_pos = pos
1203 # First pass: collect tokens until we find the semicolon outside of braces
1204 while pos < closing_brace_pos:
1205 if tokens[pos].type == TokenType.LBRACE:
1206 brace_count += 1
1207 elif tokens[pos].type == TokenType.RBRACE:
1208 brace_count -= 1
1209 # Only stop if we're at the main closing brace
1210 if pos == closing_brace_pos:
1211 break
1212 elif tokens[pos].type == TokenType.SEMICOLON and brace_count == 0:
1213 # This is the semicolon that ends the field
1214 break
1216 if tokens[pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
1217 field_tokens.append(tokens[pos])
1218 pos += 1
1220 # For nested structures, we need to continue collecting tokens until we find the field name
1221 # and the semicolon that ends the entire field
1222 if (len(field_tokens) >= 3 and
1223 field_tokens[0].type in [TokenType.STRUCT, TokenType.UNION] and
1224 field_tokens[1].type == TokenType.LBRACE):
1225 # This might be a nested structure, continue collecting until we find the field name
1226 temp_pos = pos
1227 brace_count = 0 # Track nested braces to find the correct field boundary
1228 while temp_pos < len(tokens):
1229 if tokens[temp_pos].type == TokenType.LBRACE:
1230 brace_count += 1
1231 elif tokens[temp_pos].type == TokenType.RBRACE:
1232 brace_count -= 1
1233 elif tokens[temp_pos].type == TokenType.SEMICOLON and brace_count == 0:
1234 # Found the semicolon that ends the field (not inside nested braces)
1235 break
1237 if tokens[temp_pos].type not in [TokenType.WHITESPACE, TokenType.COMMENT, TokenType.NEWLINE]:
1238 field_tokens.append(tokens[temp_pos])
1239 temp_pos += 1
1240 pos = temp_pos
1242 # Parse field from collected tokens
1243 if len(field_tokens) >= 2:
1244 # Check if this is a nested struct field
1245 if (
1246 len(field_tokens) >= 3
1247 and field_tokens[0].type == TokenType.STRUCT
1248 and field_tokens[1].type == TokenType.LBRACE
1249 ):
1250 # This is a nested struct - find the field name after the closing brace
1251 # Look for the pattern: struct { ... } field_name;
1252 field_name = None
1253 # Find the LAST closing brace and then the field name
1254 # This handles deeply nested structures correctly
1255 for i in range(len(field_tokens) - 1, -1, -1):
1256 if field_tokens[i].type == TokenType.RBRACE and i + 1 < len(field_tokens):
1257 # The field name should be the next identifier after the closing brace
1258 for j in range(i + 1, len(field_tokens)):
1259 if field_tokens[j].type == TokenType.IDENTIFIER:
1260 field_name = field_tokens[j].value
1261 break
1262 if field_name:
1263 break
1265 if field_name:
1266 # Extract the content between braces for the anonymous-structure processor
1267 content = _extract_brace_content(field_tokens)
1268 if content:
1269 # Preserve content for anonymous processor using special format
1270 import base64
1271 encoded_content = base64.b64encode(content.encode()).decode()
1272 field_type = f"struct {{ /*ANON:{encoded_content}:{field_name}*/ ... }}"
1273 else:
1274 field_type = "struct { ... }"
1276 if field_name not in ["[", "]", ";", "}"]:
1277 fields.append((field_name, field_type))
1278 # Skip parsing the nested struct's fields as separate fields
1279 # Let the normal flow handle semicolon advancement
1280 else:
1281 # Anonymous nested struct without a field name
1282 content = _extract_brace_content(field_tokens)
1283 if content:
1284 import base64
1285 encoded_content = base64.b64encode(content.encode()).decode()
1286 # Use generic field name for anonymous struct
1287 generic_name = "__anonymous_struct__"
1288 field_type = f"struct {{ /*ANON:{encoded_content}:{generic_name}*/ ... }}"
1289 else:
1290 generic_name = "__anonymous_struct__"
1291 field_type = "struct { ... }"
1292 fields.append((generic_name, field_type))
1293 # Check if this is a nested union field
1294 elif (
1295 len(field_tokens) >= 3
1296 and field_tokens[0].type == TokenType.UNION
1297 and field_tokens[1].type == TokenType.LBRACE
1298 ):
1299 # This is a nested union - find the field name after the closing brace
1300 # Look for the pattern: union { ... } field_name;
1301 field_name = None
1302 # Find the LAST closing brace and then the field name
1303 # This handles deeply nested structures correctly
1304 for i in range(len(field_tokens) - 1, -1, -1):
1305 if field_tokens[i].type == TokenType.RBRACE and i + 1 < len(field_tokens):
1306 # The field name should be the next identifier after the closing brace
1307 for j in range(i + 1, len(field_tokens)):
1308 if field_tokens[j].type == TokenType.IDENTIFIER:
1309 field_name = field_tokens[j].value
1310 break
1311 if field_name:
1312 break
1314 if field_name:
1315 # Extract the content between braces for anonymous processor
1316 content = _extract_brace_content(field_tokens)
1317 if content:
1318 # Preserve content for anonymous processor using special format
1319 import base64
1320 encoded_content = base64.b64encode(content.encode()).decode()
1321 field_type = f"union {{ /*ANON:{encoded_content}:{field_name}*/ ... }}"
1322 else:
1323 field_type = "union { ... }"
1325 if field_name not in ["[", "]", ";", "}"]:
1326 fields.append((field_name, field_type))
1327 # Skip parsing the nested union's fields as separate fields
1328 # Let the normal flow handle semicolon advancement
1329 else:
1330 # Anonymous nested union without a field name
1331 content = _extract_brace_content(field_tokens)
1332 if content:
1333 import base64
1334 encoded_content = base64.b64encode(content.encode()).decode()
1335 generic_name = "__anonymous_union__"
1336 field_type = f"union {{ /*ANON:{encoded_content}:{generic_name}*/ ... }}"
1337 else:
1338 generic_name = "__anonymous_union__"
1339 field_type = "union { ... }"
1340 fields.append((generic_name, field_type))
1341 # Function pointer array field: type (*name[size])(params)
1342 elif (
1343 len(field_tokens) >= 8
1344 and field_tokens[1].type == TokenType.LPAREN
1345 and field_tokens[2].type == TokenType.ASTERISK
1346 and any(t.type == TokenType.LBRACKET for t in field_tokens)
1347 and any(t.type == TokenType.RBRACKET for t in field_tokens)
1348 ):
1349 # Find the function pointer name (between * and [)
1350 # Look for the identifier between * and [
1351 name_start = 3 # After the *
1352 name_end = None
1353 for i in range(name_start, len(field_tokens)):
1354 if field_tokens[i].type == TokenType.LBRACKET:
1355 name_end = i
1356 break
1358 if name_end is not None:
1359 field_name = " ".join(
1360 t.value for t in field_tokens[name_start:name_end]
1361 )
1363 # Format the type properly - preserve spaces between tokens but not around brackets/parentheses
1364 formatted_tokens = []
1365 for j, token in enumerate(field_tokens):
1366 if token.type in [
1367 TokenType.LPAREN,
1368 TokenType.RPAREN,
1369 TokenType.LBRACKET,
1370 TokenType.RBRACKET,
1371 ]:
1372 # Don't add spaces around brackets/parentheses
1373 formatted_tokens.append(token.value)
1374 elif j > 0 and field_tokens[j - 1].type not in [
1375 TokenType.LPAREN,
1376 TokenType.RPAREN,
1377 TokenType.LBRACKET,
1378 TokenType.RBRACKET,
1379 ]:
1380 # Add space before token if previous token wasn't a bracket/parenthesis
1381 formatted_tokens.append(" " + token.value)
1382 else:
1383 # No space before token
1384 formatted_tokens.append(token.value)
1385 field_type = "".join(formatted_tokens)
1387 # Validate and add the field
1388 if (
1389 field_name
1390 and field_name.strip()
1391 and field_type.strip()
1392 and field_name not in ["[", "]", ";", "}"]
1393 ):
1394 stripped_name = field_name.strip()
1395 stripped_type = field_type.strip()
1396 if stripped_name and stripped_type:
1397 fields.append((stripped_name, stripped_type))
1398 # Function pointer field: type (*name)(params) or type (*name[size])(params)
1399 elif (
1400 len(field_tokens) >= 5
1401 and field_tokens[1].type == TokenType.LPAREN and field_tokens[2].type == TokenType.ASTERISK
1402 ):
1403 # Find the opening parenthesis and asterisk pattern
1404 func_ptr_start = None
1405 for i in range(len(field_tokens) - 1):
1406 if field_tokens[i].type == TokenType.LPAREN and field_tokens[i + 1].type == TokenType.ASTERISK:
1407 func_ptr_start = i
1408 break
1410 if func_ptr_start is not None:
1411 # Extract the type (everything before the opening parenthesis)
1412 type_tokens = field_tokens[:func_ptr_start]
1413 field_type = " ".join(t.value for t in type_tokens)
1415 # Find the closing parenthesis after the function name
1416 paren_count = 0
1417 name_end = None
1418 for i in range(func_ptr_start, len(field_tokens)):
1419 if field_tokens[i].type == TokenType.LPAREN:
1420 paren_count += 1
1421 elif field_tokens[i].type == TokenType.RPAREN:
1422 paren_count -= 1
1423 if paren_count == 0 and i > func_ptr_start + 1:
1424 name_end = i
1425 break
1427 if name_end is not None:
1428 # Extract function name (between * and closing parenthesis)
1429 name_tokens = field_tokens[func_ptr_start + 2:name_end]
1430 field_name = " ".join(t.value for t in name_tokens)
1432 # Extract the parameter list as part of the type
1433 param_tokens = field_tokens[name_end + 1:]
1434 param_type = " ".join(t.value for t in param_tokens)
1436 # Combine type and parameter list (without the function name in the type)
1437 # The function name is already extracted as field_name, so we don't include it in the type
1438 func_ptr_start_tokens = field_tokens[func_ptr_start:func_ptr_start + 2] # ( *
1439 func_ptr_end_tokens = field_tokens[name_end:name_end + 1] # )
1440 full_type = field_type + " " + " ".join(t.value for t in func_ptr_start_tokens) + " " + " ".join(t.value for t in func_ptr_end_tokens) + " " + param_type
1442 if (
1443 field_name
1444 and field_name.strip()
1445 and full_type.strip()
1446 and field_name not in ["[", "]", ";", "}"]
1447 ):
1448 stripped_name = field_name.strip()
1449 stripped_type = full_type.strip()
1450 if stripped_name and stripped_type:
1451 fields.append((stripped_name, stripped_type))
1452 # Array field: type name [ size ]
1453 elif (
1454 len(field_tokens) >= 4
1455 and field_tokens[-3].type == TokenType.LBRACKET
1456 and field_tokens[-1].type == TokenType.RBRACKET
1457 ):
1458 field_name = field_tokens[-4].value
1459 # Fix: Properly format array type - preserve spaces between tokens
1460 type_tokens = field_tokens[:-4]
1461 field_type = " ".join(t.value for t in type_tokens) + "[" + field_tokens[-2].value + "]"
1462 if (
1463 field_name
1464 and field_name.strip()
1465 and field_type.strip()
1466 and field_name not in ["[", "]", ";", "}"]
1467 ):
1468 # Additional validation to ensure we don't have empty strings
1469 stripped_name = field_name.strip()
1470 stripped_type = field_type.strip()
1471 if stripped_name and stripped_type:
1472 fields.append((stripped_name, stripped_type))
1473 else:
1474 # Regular field: type name
1475 # Check if this field declaration contains commas (multiple fields of same type)
1476 comma_positions = []
1477 paren_count = 0
1478 brace_count = 0
1480 # Find comma positions that are outside of parentheses and braces
1481 for i, token in enumerate(field_tokens):
1482 if token.type == TokenType.LPAREN:
1483 paren_count += 1
1484 elif token.type == TokenType.RPAREN:
1485 paren_count -= 1
1486 elif token.type == TokenType.LBRACE:
1487 brace_count += 1
1488 elif token.type == TokenType.RBRACE:
1489 brace_count -= 1
1490 elif token.type == TokenType.COMMA and paren_count == 0 and brace_count == 0:
1491 comma_positions.append(i)
1493 if comma_positions:
1494 # Multiple fields of the same type: "int x, y, z;"
1495 # Extract the type (everything before the first field name)
1496 first_field_start = None
1497 for i in range(len(field_tokens)):
1498 if field_tokens[i].type == TokenType.IDENTIFIER:
1499 first_field_start = i
1500 break
1502 if first_field_start is not None:
1503 type_tokens = field_tokens[:first_field_start]
1504 field_type = " ".join(t.value for t in type_tokens)
1506 # Split fields on commas
1507 field_starts = [first_field_start] + [pos + 1 for pos in comma_positions]
1508 field_ends = comma_positions + [len(field_tokens)]
1510 for start, end in zip(field_starts, field_ends):
1511 if start < end:
1512 field_name_tokens = field_tokens[start:end]
1513 field_name = " ".join(t.value for t in field_name_tokens)
1515 if (
1516 field_name
1517 and field_name.strip()
1518 and field_type.strip()
1519 and field_name not in ["[", "]", ";", "}"]
1520 ):
1521 stripped_name = field_name.strip()
1522 stripped_type = field_type.strip()
1523 if stripped_name and stripped_type:
1524 fields.append((stripped_name, stripped_type))
1525 else:
1526 # Single field: type name
1527 field_name = field_tokens[-1].value
1528 field_type = " ".join(t.value for t in field_tokens[:-1])
1529 if (
1530 field_name not in ["[", "]", ";", "}"]
1531 and field_name
1532 and field_name.strip()
1533 and field_type.strip()
1534 ):
1535 # Additional validation to ensure we don't have empty strings
1536 stripped_name = field_name.strip()
1537 stripped_type = field_type.strip()
1538 if stripped_name and stripped_type:
1539 fields.append((stripped_name, stripped_type))
1540 if pos < closing_brace_pos:
1541 pos += 1 # Skip semicolon
1542 return fields
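# Illustrative example: applied to the struct range found above for
# "struct point { int x; };", this returns [("x", "int")]; an array member such
# as "char name[32];" comes back as ("name", "char[32]").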
1545def find_enum_values(tokens: List[Token], enum_start: int, enum_end: int) -> List[str]:
1546 """Extract enum values from enum token range"""
1547 values = []
1548 pos = enum_start
1549 while pos <= enum_end and tokens[pos].type != TokenType.LBRACE:
1550 pos += 1
1551 if pos > enum_end:
1552 return values
1553 pos += 1 # Skip opening brace
1554 current_value = []
1555 while pos <= enum_end and tokens[pos].type != TokenType.RBRACE:
1556 token = tokens[pos]
1557 if token.type == TokenType.COMMA:
1558 if current_value:
1559 filtered_value = [
1560 t
1561 for t in current_value
1562 if t.type not in [TokenType.WHITESPACE, TokenType.COMMENT]
1563 ]
1564 if filtered_value:
1565 value_str = " ".join(t.value for t in filtered_value).strip()
1566 if value_str:
1567 values.append(value_str)
1568 current_value = []
1569 elif token.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
1570 current_value.append(token)
1571 pos += 1
1572 if current_value:
1573 filtered_value = [
1574 t
1575 for t in current_value
1576 if t.type not in [TokenType.WHITESPACE, TokenType.COMMENT]
1577 ]
1578 if filtered_value:
1579 value_str = " ".join(t.value for t in filtered_value).strip()
1580 if value_str:
1581 values.append(value_str)
1582 return values
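# Illustrative example: for "enum color { RED, GREEN = 2, BLUE };" the returned
# values are ["RED", "GREEN = 2", "BLUE"]; explicit initializers stay attached
# to the enumerator they belong to.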
1585def _extract_brace_content(field_tokens: List[Token]) -> str:
1586 """Extract the content between braces from field tokens.
1588 Args:
1589 field_tokens: List of tokens representing a field with anonymous structure
1591 Returns:
1592 String content between the braces, or empty string if not found
1593 """
1594 content_tokens = []
1595 in_braces = False
1596 brace_count = 0
1598 for token in field_tokens:
1599 if token.type == TokenType.LBRACE:
1600 if not in_braces:
1601 in_braces = True
1602 brace_count = 1
1603 else:
1604 brace_count += 1
1605 content_tokens.append(token)
1606 elif token.type == TokenType.RBRACE:
1607 if in_braces:
1608 brace_count -= 1
1609 if brace_count == 0:
1610 # Found the closing brace
1611 break
1612 else:
1613 content_tokens.append(token)
1614 elif in_braces:
1615 content_tokens.append(token)
1617 # Convert tokens back to text preserving spacing
1618 if content_tokens:
1619 result = ""
1620 for i, token in enumerate(content_tokens):
1621 result += token.value
1622 # Add space after most tokens except when next token is punctuation
1623 if (i < len(content_tokens) - 1 and
1624 token.type not in [TokenType.WHITESPACE, TokenType.NEWLINE] and
1625 content_tokens[i + 1].type not in [TokenType.LBRACKET, TokenType.RBRACKET,
1626 TokenType.SEMICOLON, TokenType.COMMA,
1627 TokenType.WHITESPACE, TokenType.NEWLINE]):
1628 result += " "
1629 return result
1630 return ""
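

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative, not part of the original module):
    # tokenize a small typedef struct and list its fields. The sample source and
    # the expected output noted below are assumptions for demonstration only.
    sample = "typedef struct point_tag { int x; int y; } point_t;"
    all_tokens = CTokenizer().tokenize(sample)
    finder = StructureFinder(all_tokens)
    for start, end, name in finder.find_structs():
        # e.g. "point_t" followed by [('x', 'int'), ('y', 'int')]
        print(name)
        print(find_struct_fields(all_tokens, start, end))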