Coverage for src/c2puml/core/parser_anonymous_processor.py: 75%

545 statements  

« prev     ^ index     » next       coverage.py v7.10.4, created at 2025-08-20 03:53 +0000

1"""Processing anonymous structures within typedefs.""" 

2 

3import re 

4from typing import Dict, List, Tuple, Optional 

5from ..models import FileModel, Struct, Union, Field, Alias 

6 

7 

8class AnonymousTypedefProcessor: 

9 """Handles extraction and processing of anonymous structures within typedefs.""" 

10 

11 def __init__(self): 

12 self.anonymous_counters: Dict[str, Dict[str, int]] = {} # parent -> {type -> count} 

13 self.global_anonymous_structures = {} # Track anonymous structures globally by content hash 

14 self.content_to_structure_map = {} # content_hash -> (name, struct_type) 

15 

def process_file_model(self, file_model: FileModel) -> None:
    """Extract anonymous structures from every typedef in ``file_model``.

    Extraction passes are repeated until a pass adds no new struct, union or
    alias to the model, with a hard cap of 10 passes. Afterwards the field
    references are rewritten once to point at the extracted entities.
    """

    def entity_total() -> int:
        # Number of typedef entities currently known; used to detect convergence.
        return len(file_model.structs) + len(file_model.unions) + len(file_model.aliases)

    pass_limit = 10  # Increased from 5 to 10 for deeper processing
    for _ in range(pass_limit):
        before = entity_total()
        self._process_all_entities(file_model)
        if entity_total() == before:
            # Nothing new was created by this pass: converged.
            break

    # Post-processing: point field references at the extracted entities
    self._update_field_references_to_extracted_entities(file_model)

37 

38 def _process_all_entities(self, file_model: FileModel) -> None: 

39 """Process all entities in a single pass.""" 

40 # Process alias typedefs with improved complexity filtering 

41 aliases_to_process = list(file_model.aliases.items()) 

42 for alias_name, alias_data in aliases_to_process: 

43 self._process_alias_for_anonymous_structs(file_model, alias_name, alias_data) 

44 

45 # Process struct typedefs 

46 structs_to_process = list(file_model.structs.items()) 

47 for struct_name, struct_data in structs_to_process: 

48 self._process_struct_for_anonymous_structs(file_model, struct_name, struct_data) 

49 

50 # Process union typedefs  

51 unions_to_process = list(file_model.unions.items()) 

52 for union_name, union_data in unions_to_process: 

53 self._process_union_for_anonymous_structs(file_model, union_name, union_data) 

54 

55 def _process_alias_for_anonymous_structs( 

56 self, file_model: FileModel, alias_name: str, alias_data: Alias 

57 ) -> None: 

58 """Process an alias typedef to extract anonymous structures.""" 

59 original_type = alias_data.original_type 

60 

61 # Find anonymous struct patterns in function pointer parameters 

62 anonymous_structs = self._extract_anonymous_structs_from_text(original_type) 

63 

64 # Filter out overly complex structures that might cause parsing issues 

65 filtered_structs = [] 

66 for struct_content, struct_type, field_name in anonymous_structs: 

67 # Skip structures with function pointer arrays or other complex patterns 

68 if not self._is_too_complex_to_process(struct_content): 

69 filtered_structs.append((struct_content, struct_type, field_name)) 

70 

71 if filtered_structs: 

72 for i, (struct_content, struct_type, field_name) in enumerate(filtered_structs, 1): 

73 anon_name = self._get_or_create_anonymous_structure( 

74 file_model, struct_content, struct_type, alias_name, field_name 

75 ) 

76 

77 # Track the relationship (only if not already tracked) 

78 if alias_name not in file_model.anonymous_relationships: 

79 file_model.anonymous_relationships[alias_name] = [] 

80 if anon_name not in file_model.anonymous_relationships[alias_name]: 

81 file_model.anonymous_relationships[alias_name].append(anon_name) 

82 

83 # Replace the anonymous structure in the original type with a reference 

84 updated_type = self._replace_anonymous_struct_with_reference( 

85 original_type, struct_content, anon_name, struct_type 

86 ) 

87 alias_data.original_type = updated_type 

88 

89 def _process_struct_for_anonymous_structs( 

90 self, file_model: FileModel, struct_name: str, struct_data: Struct 

91 ) -> None: 

92 """Process a struct to extract anonymous nested structures.""" 

93 # Check fields for anonymous structs/unions 

94 for field in struct_data.fields: 

95 if self._field_contains_anonymous_struct(field): 

96 # Process this field for anonymous structures 

97 self._extract_anonymous_from_field(file_model, struct_name, field) 

98 

99 def _process_union_for_anonymous_structs( 

100 self, file_model: FileModel, union_name: str, union_data: Union 

101 ) -> None: 

102 """Process a union to extract anonymous nested structures.""" 

103 # Check fields for anonymous structs/unions 

104 for field in union_data.fields: 

105 if self._field_contains_anonymous_struct(field): 

106 # Process this field for anonymous structures 

107 self._extract_anonymous_from_field(file_model, union_name, field) 

108 

109 def _extract_anonymous_structs_from_text( 

110 self, text: str 

111 ) -> List[Tuple[str, str, str]]: 

112 """Extract anonymous struct/union definitions from text using balanced brace matching.""" 

113 anonymous_structs = [] 

114 

115 # Check if this text starts with 'typedef struct' - if so, skip the outer struct 

116 text_stripped = text.strip() 

117 skip_first_struct = text_stripped.startswith('typedef struct') or text_stripped.startswith('typedef union') 

118 

119 # Look for struct/union keywords followed by { 

120 # Use balanced brace matching to handle nested structures 

121 pattern = r'(struct|union)\s*\{' 

122 matches = list(re.finditer(pattern, text)) 

123 

124 for match in matches: 

125 struct_type = match.group(1) 

126 start_pos = match.start() 

127 

128 # Find the matching closing brace using balanced brace counting 

129 brace_count = 0 

130 pos = start_pos 

131 content_start = text.find('{', start_pos) 

132 

133 if content_start == -1: 

134 continue 

135 

136 pos = content_start 

137 while pos < len(text): 

138 char = text[pos] 

139 if char == '{': 

140 brace_count += 1 

141 elif char == '}': 

142 brace_count -= 1 

143 if brace_count == 0: 

144 # Found the matching closing brace 

145 content_end = pos 

146 struct_content = text[start_pos:content_end + 1] 

147 

148 # Extract the field name after the closing brace 

149 remaining = text[content_end + 1:].strip() 

150 field_match = re.match(r'^[*\s\[\]]*(\w+)', remaining) 

151 field_name = field_match.group(1) if field_match else f"field_{len(anonymous_structs) + 1}" 

152 

153 # Skip the first struct/union if it's a typedef 

154 if skip_first_struct and match == matches[0]: 

155 skip_first_struct = False 

156 else: 

157 anonymous_structs.append((struct_content, struct_type, field_name)) 

158 break 

159 pos += 1 

160 

161 return anonymous_structs 

162 

163 def _generate_anonymous_name(self, parent_name: str, struct_type: str, field_name: str) -> str: 

164 """Generate a name for an anonymous structure. Field name is always required.""" 

165 return f"{parent_name}_{field_name}" 

166 

167 def _generate_content_hash(self, content: str, struct_type: str) -> str: 

168 """Generate a hash for anonymous structure content to identify duplicates.""" 

169 import hashlib 

170 # Normalize the content by removing whitespace and comments 

171 normalized = re.sub(r'\s+', ' ', content.strip()) 

172 normalized = re.sub(r'/\*.*?\*/', '', normalized) # Remove C comments 

173 normalized = re.sub(r'//.*$', '', normalized, flags=re.MULTILINE) # Remove C++ comments 

174 hash_input = f"{struct_type}:{normalized}" 

175 return hashlib.md5(hash_input.encode()).hexdigest()[:8] 

176 

177 def _find_existing_anonymous_structure(self, content: str, struct_type: str) -> Optional[str]: 

178 """Find an existing anonymous structure with the same content.""" 

179 content_hash = self._generate_content_hash(content, struct_type) 

180 if content_hash in self.content_to_structure_map: 

181 existing_name, existing_type = self.content_to_structure_map[content_hash] 

182 if existing_type == struct_type: 

183 return existing_name 

184 return None 

185 

186 def _register_anonymous_structure(self, name: str, content: str, struct_type: str) -> None: 

187 """Register an anonymous structure in the global tracking system.""" 

188 content_hash = self._generate_content_hash(content, struct_type) 

189 self.content_to_structure_map[content_hash] = (name, struct_type) 

190 

191 def _get_or_create_anonymous_structure(self, file_model: FileModel, content: str, struct_type: str, 

192 parent_name: str, field_name: str) -> str: 

193 """Get existing anonymous structure or create new one based on content hash.""" 

194 # Handle placeholder content (like "struct { ... }") 

195 is_placeholder = content in ["struct { ... }", "union { ... }"] or re.match(r'^(struct|union)\s*\{\s*\.\.\.\s*\}\s+\w+', content) 

196 

197 if is_placeholder: 

198 # For placeholders, just use the naming convention without content-based deduplication 

199 anon_name = self._generate_anonymous_name(parent_name, struct_type, field_name) 

200 

201 # Check if this structure already exists with the correct name 

202 if (struct_type == "struct" and anon_name in file_model.structs) or \ 

203 (struct_type == "union" and anon_name in file_model.unions): 

204 return anon_name 

205 

206 # Create new placeholder anonymous structure 

207 if struct_type == "struct": 

208 anon_struct = Struct(anon_name, [], tag_name="") 

209 file_model.structs[anon_name] = anon_struct 

210 elif struct_type == "union": 

211 anon_union = Union(anon_name, [], tag_name="") 

212 file_model.unions[anon_name] = anon_union 

213 

214 return anon_name 

215 else: 

216 # For actual content, use content-based deduplication 

217 # First, check if we already have a structure with this content 

218 existing_name = self._find_existing_anonymous_structure(content, struct_type) 

219 if existing_name: 

220 # Check if the existing structure still exists in the model 

221 if (struct_type == "struct" and existing_name in file_model.structs) or \ 

222 (struct_type == "union" and existing_name in file_model.unions): 

223 return existing_name 

224 

225 # Create a new anonymous structure with the correct naming convention 

226 anon_name = self._generate_anonymous_name(parent_name, struct_type, field_name) 

227 

228 # Check if this structure already exists with the correct name 

229 if (struct_type == "struct" and anon_name in file_model.structs) or \ 

230 (struct_type == "union" and anon_name in file_model.unions): 

231 return anon_name 

232 

233 # Create new anonymous structure 

234 if struct_type == "struct": 

235 anon_struct = self._create_anonymous_struct(anon_name, content) 

236 file_model.structs[anon_name] = anon_struct 

237 elif struct_type == "union": 

238 anon_union = self._create_anonymous_union(anon_name, content) 

239 file_model.unions[anon_name] = anon_union 

240 

241 # Register the structure in the global tracking system 

242 self._register_anonymous_structure(anon_name, content, struct_type) 

243 

244 return anon_name 

245 

def _create_anonymous_struct(self, name: str, content: str) -> Struct:
    """Build an untagged ``Struct`` called ``name`` whose fields are parsed from ``content``."""
    parsed_members = self._parse_struct_fields(content)
    anonymous = Struct(name, parsed_members, tag_name="")
    return anonymous

250 

def _create_anonymous_union(self, name: str, content: str) -> Union:
    """Build an untagged ``Union`` called ``name`` whose fields are parsed from ``content``."""
    parsed_members = self._parse_struct_fields(content)
    anonymous = Union(name, parsed_members, tag_name="")
    return anonymous

255 

256 def _parse_struct_fields(self, content: str) -> List[Field]: 

257 """Parse struct/union fields from content.""" 

258 fields = [] 

259 

260 # Check if content has braces (full struct content) or not (just field content) 

261 if '{' in content and '}' in content: 

262 # Extract content between braces 

263 brace_start = content.find('{') 

264 brace_end = content.rfind('}') 

265 

266 if brace_start == -1 or brace_end == -1: 

267 return fields 

268 

269 inner_content = content[brace_start + 1:brace_end].strip() 

270 else: 

271 # Content is just field declarations without braces 

272 inner_content = content.strip() 

273 

274 if not inner_content: 

275 return fields 

276 

277 # Split by semicolons to get individual field declarations 

278 field_declarations = [] 

279 current_decl = "" 

280 brace_count = 0 

281 

282 for char in inner_content: 

283 if char == '{': 

284 brace_count += 1 

285 elif char == '}': 

286 brace_count -= 1 

287 

288 current_decl += char 

289 

290 if char == ';' and brace_count == 0: 

291 field_declarations.append(current_decl.strip()) 

292 current_decl = "" 

293 

294 # Handle any remaining content 

295 if current_decl.strip(): 

296 field_declarations.append(current_decl.strip()) 

297 

298 # Parse each field declaration 

299 for decl in field_declarations: 

300 if not decl or decl.strip() == ';': 

301 continue 

302 

303 # Remove trailing semicolon 

304 decl = decl.rstrip(';').strip() 

305 

306 if not decl: 

307 continue 

308 

309 # Check if this declaration contains an anonymous struct/union 

310 if self._has_balanced_anonymous_pattern(decl): 

311 # Extract the anonymous struct content and field name 

312 struct_info = self._extract_balanced_anonymous_struct(decl) 

313 if struct_info: 

314 struct_content, struct_type, field_name = struct_info 

315 # Parse the actual content of the anonymous structure 

316 parsed_fields = self._parse_struct_fields(struct_content) 

317 if parsed_fields: 

318 # Create a field that references the parsed content 

319 field_type = f"{struct_type} {{ {', '.join([f'{f.type} {f.name}' for f in parsed_fields])} }}" 

320 fields.append(Field(field_name, field_type)) 

321 else: 

322 # Fallback to placeholder if parsing fails 

323 field_type = f"{struct_type} {{ ... }} {field_name}" 

324 fields.append(Field(field_name, field_type)) 

325 continue 

326 elif self._has_balanced_anonymous_pattern_no_field_name(decl): 

327 # Extract the anonymous struct content without field name 

328 struct_info = self._extract_balanced_anonymous_struct_no_field_name(decl) 

329 if struct_info: 

330 struct_content, struct_type = struct_info 

331 # Parse the actual content of the anonymous structure 

332 parsed_fields = self._parse_struct_fields(struct_content) 

333 if parsed_fields: 

334 # Create a field that references the parsed content 

335 field_type = f"{struct_type} {{ {', '.join([f'{f.type} {f.name}' for f in parsed_fields])} }}" 

336 field_name = f"anonymous_{struct_type}" 

337 fields.append(Field(field_name, field_type)) 

338 else: 

339 # Fallback to placeholder if parsing fails 

340 field_type = f"{struct_type} {{ ... }}" 

341 field_name = f"anonymous_{struct_type}" 

342 fields.append(Field(field_name, field_type)) 

343 continue 

344 

345 # Parse the field normally (no anonymous structures) 

346 parsed_fields = self._parse_comma_separated_fields(decl) 

347 fields.extend(parsed_fields) 

348 

349 return fields 

350 

351 def _parse_comma_separated_fields(self, decl: str) -> List[Field]: 

352 """Parse comma-separated field declarations like 'int a, b, c;' or 'char *ptr1, *ptr2;'.""" 

353 fields = [] 

354 

355 # Handle function pointer fields first: void (*name)(int) or void ( * name ) ( int ) 

356 if re.search(r'\(\s*\*\s*\w+\s*\)', decl) and re.search(r'\)\s*\(', decl): 

357 # Extract function pointer name - handle both compact and spaced formats 

358 func_ptr_match = re.search(r'\(\s*\*\s*(\w+)\s*\)', decl) 

359 if func_ptr_match: 

360 field_name = func_ptr_match.group(1) 

361 field_type = decl.strip() 

362 return [Field(field_name, field_type)] 

363 

364 # Split by comma to get individual field parts 

365 field_parts = [part.strip() for part in decl.split(',')] 

366 if not field_parts: 

367 return fields 

368 

369 # Parse the first field to get the base type 

370 first_field = field_parts[0].strip() 

371 

372 # Handle array case for first field: int arr1[10], arr2[20] 

373 array_match = re.match(r'(.+?)\s+(\w+)\s*\[([^\]]*)\]\s*$', first_field) 

374 if array_match: 

375 base_type = array_match.group(1).strip() 

376 first_name = array_match.group(2).strip() 

377 first_size = array_match.group(3).strip() 

378 

379 if first_size: 

380 first_type = f"{base_type}[{first_size}]" 

381 else: 

382 first_type = f"{base_type}[]" 

383 fields.append(Field(first_name, first_type)) 

384 

385 # Process remaining fields as arrays 

386 for part in field_parts[1:]: 

387 part = part.strip() 

388 # Look for array syntax: arr2[20] 

389 array_match = re.match(r'(\w+)\s*\[([^\]]*)\]\s*$', part) 

390 if array_match: 

391 name = array_match.group(1).strip() 

392 size = array_match.group(2).strip() 

393 if size: 

394 field_type = f"{base_type}[{size}]" 

395 else: 

396 field_type = f"{base_type}[]" 

397 fields.append(Field(name, field_type)) 

398 else: 

399 # Simple name without array - treat as simple field 

400 name = re.sub(r'[^\w]', '', part) 

401 if name: 

402 fields.append(Field(name, base_type)) 

403 return fields 

404 

405 # Parse first field normally to extract base type 

406 first_parts = first_field.split() 

407 if len(first_parts) < 2: 

408 return fields 

409 

410 # Extract base type and first field name 

411 base_type = ' '.join(first_parts[:-1]) 

412 first_name = first_parts[-1] 

413 

414 # Handle pointer syntax: char *ptr1, *ptr2 

415 if first_name.startswith('*'): 

416 base_type += " *" 

417 first_name = first_name[1:] # Remove leading * 

418 

419 # Clean up first field name - preserve the actual field name 

420 first_name = re.sub(r'[^\w]', '', first_name) 

421 if first_name: 

422 fields.append(Field(first_name, base_type)) 

423 

424 # Process remaining fields 

425 for part in field_parts[1:]: 

426 part = part.strip() 

427 if not part: 

428 continue 

429 

430 # Handle pointer syntax: *ptr2 

431 field_type = base_type 

432 if part.startswith('*'): 

433 if not base_type.endswith('*'): 

434 field_type = base_type + " *" 

435 part = part[1:] # Remove leading * 

436 

437 # Clean up field name - preserve the actual field name 

438 # Remove any leading/trailing whitespace and extract just the identifier 

439 field_name = part.strip() 

440 # Remove any trailing punctuation or brackets that might be part of the type 

441 field_name = re.sub(r'[^\w].*$', '', field_name) 

442 if field_name: 

443 fields.append(Field(field_name, field_type)) 

444 

445 return fields 

446 

447 def _parse_single_field(self, decl: str) -> Optional[Field]: 

448 """Parse a single field declaration.""" 

449 # Handle function pointer fields: void (*name)(int) or void ( * name ) ( int ) 

450 if re.search(r'\(\s*\*\s*\w+\s*\)', decl) and re.search(r'\)\s*\(', decl): 

451 # Extract function pointer name - handle both compact and spaced formats 

452 func_ptr_match = re.search(r'\(\s*\*\s*(\w+)\s*\)', decl) 

453 if func_ptr_match: 

454 field_name = func_ptr_match.group(1) 

455 field_type = decl.strip() 

456 return Field(field_name, field_type) 

457 

458 # Handle array declarations: type name[size] or type name[] 

459 array_match = re.match(r'(.+?)\s+(\w+)\s*\[([^\]]*)\]\s*$', decl) 

460 if array_match: 

461 field_type = array_match.group(1).strip() 

462 field_name = array_match.group(2).strip() 

463 array_size = array_match.group(3).strip() 

464 if array_size: 

465 full_type = f"{field_type}[{array_size}]" 

466 else: 

467 full_type = f"{field_type}[]" 

468 return Field(field_name, full_type) 

469 

470 # Handle pointer declarations: type *name or type* name 

471 pointer_match = re.match(r'(.+?)\s*\*\s*(\w+)\s*$', decl) 

472 if pointer_match: 

473 field_type = pointer_match.group(1).strip() + " *" 

474 field_name = pointer_match.group(2).strip() 

475 return Field(field_name, field_type) 

476 

477 # Regular single field: type name 

478 parts = decl.strip().split() 

479 if len(parts) >= 2: 

480 field_type = ' '.join(parts[:-1]) 

481 field_name = parts[-1] 

482 # Clean up field name (remove trailing punctuation) 

483 field_name = re.sub(r'[^\w]', '', field_name) 

484 if field_name: # Only add if we have a valid name 

485 return Field(field_name, field_type) 

486 

487 return None 

488 

489 def _is_too_complex_to_process(self, struct_content: str) -> bool: 

490 """Check if a structure is too complex to process.""" 

491 # Skip structures with function pointer arrays 

492 if re.search(r'\(\s*\*\s*\w+\s*\)\s*\[', struct_content): 

493 return True 

494 

495 # Skip structures with complex nested patterns 

496 if struct_content.count('{') > 5 or struct_content.count('}') > 5: 

497 return True 

498 

499 # Skip structures with too many semicolons (complex field declarations) 

500 if struct_content.count(';') > 10: 

501 return True 

502 

503 return False 

504 

505 def _replace_anonymous_struct_with_reference( 

506 self, original_type: str, struct_content: str, anon_name: str, struct_type: str 

507 ) -> str: 

508 """Replace anonymous struct definition with reference to named typedef.""" 

509 # Use a more robust approach to find and replace the anonymous struct 

510 # Look for the exact pattern: struct_type { struct_content } 

511 

512 # Escape special regex characters in struct_content but preserve structure 

513 escaped_content = re.escape(struct_content) 

514 # Un-escape some characters we want to match flexibly 

515 escaped_content = escaped_content.replace(r'\ ', r'\s*').replace(r'\n', r'\s*') 

516 

517 # Pattern to match the full anonymous struct with flexible whitespace 

518 pattern = rf'{struct_type}\s*\{{\s*{escaped_content}\s*\}}' 

519 replacement = anon_name 

520 

521 # Replace the anonymous struct with just the name 

522 updated_type = re.sub(pattern, replacement, original_type, flags=re.DOTALL) 

523 return updated_type 

524 

525 def _field_contains_anonymous_struct(self, field: Field) -> bool: 

526 """Check if a field contains an anonymous structure.""" 

527 field_type = field.type 

528 

529 # Check for various anonymous structure patterns 

530 patterns = [ 

531 r'struct\s*\{', # struct { ... } 

532 r'union\s*\{', # union { ... } 

533 r'/\*ANON:', # Preserved content format 

534 ] 

535 

536 for pattern in patterns: 

537 if re.search(pattern, field_type): 

538 return True 

539 

540 return False 

541 

542 def _extract_anonymous_from_field( 

543 self, file_model: FileModel, parent_name: str, field: Field 

544 ) -> None: 

545 """Extract anonymous structures from a field definition using balanced brace matching.""" 

546 # Handle simplified anonymous structure types 

547 if field.type in ["struct { ... }", "union { ... }"]: 

548 struct_type = "struct" if "struct" in field.type else "union" 

549 # Use the global tracking system to ensure consistent naming 

550 anon_name = self._get_or_create_anonymous_structure( 

551 file_model, field.type, struct_type, parent_name, field.name 

552 ) 

553 

554 # Track the relationship 

555 if parent_name not in file_model.anonymous_relationships: 

556 file_model.anonymous_relationships[parent_name] = [] 

557 if anon_name not in file_model.anonymous_relationships[parent_name]: 

558 file_model.anonymous_relationships[parent_name].append(anon_name) 

559 

560 # Update the field type to reference the named structure 

561 field.type = anon_name 

562 

563 # Handle preserved content format: "struct { /*ANON:encoded_content:field_name*/ ... }" 

564 elif re.search(r'/\*ANON:([^:]+):([^*]+)\*/', field.type): 

565 struct_match = re.search(r'(struct|union)', field.type) 

566 content_match = re.search(r'/\*ANON:([^:]+):([^*]+)\*/', field.type) 

567 if struct_match and content_match: 

568 struct_type = struct_match.group(1) 

569 encoded_content = content_match.group(1) 

570 field_name = content_match.group(2) 

571 

572 # Decode the preserved content 

573 import base64 

574 try: 

575 content = base64.b64decode(encoded_content).decode() 

576 anon_name = self._get_or_create_anonymous_structure( 

577 file_model, content, struct_type, parent_name, field_name 

578 ) 

579 

580 # Track the relationship 

581 if parent_name not in file_model.anonymous_relationships: 

582 file_model.anonymous_relationships[parent_name] = [] 

583 if anon_name not in file_model.anonymous_relationships[parent_name]: 

584 file_model.anonymous_relationships[parent_name].append(anon_name) 

585 

586 # Update the field type to reference the named structure  

587 field.type = anon_name 

588 

589 except Exception as e: 

590 # If decoding fails, fall back to placeholder 

591 print(f"Warning: Failed to decode anonymous structure content: {e}") 

592 import traceback 

593 traceback.print_exc() 

594 

595 # Handle patterns like "struct { ... } field_name" with balanced brace matching 

596 elif re.match(r'^(struct|union)\s*\{\s*\.\.\.\s*\}\s+\w+', field.type): 

597 match = re.match(r'^(struct|union)\s*\{\s*\.\.\.\s*\}\s+(\w+)', field.type) 

598 if match: 

599 struct_type = match.group(1) 

600 field_name = match.group(2) 

601 # Use the global tracking system to ensure consistent naming 

602 anon_name = self._get_or_create_anonymous_structure( 

603 file_model, field.type, struct_type, parent_name, field_name 

604 ) 

605 

606 # Track the relationship 

607 if parent_name not in file_model.anonymous_relationships: 

608 file_model.anonymous_relationships[parent_name] = [] 

609 if anon_name not in file_model.anonymous_relationships[parent_name]: 

610 file_model.anonymous_relationships[parent_name].append(anon_name) 

611 

612 # Update the field type to reference the named structure 

613 field.type = anon_name 

614 

615 # Handle actual anonymous struct/union patterns with balanced brace matching 

616 elif self._has_balanced_anonymous_pattern(field.type): 

617 # Extract the anonymous struct content and field name using balanced braces 

618 struct_info = self._extract_balanced_anonymous_struct(field.type) 

619 if struct_info: 

620 struct_content, struct_type, field_name = struct_info 

621 anon_name = self._get_or_create_anonymous_structure( 

622 file_model, struct_content, struct_type, parent_name, field_name 

623 ) 

624 

625 # Track the relationship 

626 if parent_name not in file_model.anonymous_relationships: 

627 file_model.anonymous_relationships[parent_name] = [] 

628 if anon_name not in file_model.anonymous_relationships[parent_name]: 

629 file_model.anonymous_relationships[parent_name].append(anon_name) 

630 

631 # Update the field type to reference the named structure 

632 field.type = anon_name 

633 

634 # Handle anonymous structs without field names like "struct { int x; }" 

635 elif self._has_balanced_anonymous_pattern_no_field_name(field.type): 

636 # Extract the anonymous struct content using balanced braces 

637 struct_info = self._extract_balanced_anonymous_struct_no_field_name(field.type) 

638 if struct_info: 

639 struct_content, struct_type = struct_info 

640 # For anonymous structs without field names, use field name from field.name 

641 anon_name = self._get_or_create_anonymous_structure( 

642 file_model, struct_content, struct_type, parent_name, field.name 

643 ) 

644 

645 # Track the relationship 

646 if parent_name not in file_model.anonymous_relationships: 

647 file_model.anonymous_relationships[parent_name] = [] 

648 if anon_name not in file_model.anonymous_relationships[parent_name]: 

649 file_model.anonymous_relationships[parent_name].append(anon_name) 

650 

651 # Update the field type to reference the named structure 

652 field.type = anon_name 

653 

654 # Handle complex anonymous structures (original logic) 

655 else: 

656 anonymous_structs = self._extract_anonymous_structs_from_text(field.type) 

657 

658 if anonymous_structs: 

659 for i, (struct_content, struct_type, extracted_field_name) in enumerate(anonymous_structs, 1): 

660 # Use the extracted field name if available, otherwise use the field's name 

661 field_name = extracted_field_name if extracted_field_name else field.name 

662 anon_name = self._get_or_create_anonymous_structure( 

663 file_model, struct_content, struct_type, parent_name, field_name 

664 ) 

665 

666 # Track the relationship 

667 if parent_name not in file_model.anonymous_relationships: 

668 file_model.anonymous_relationships[parent_name] = [] 

669 if anon_name not in file_model.anonymous_relationships[parent_name]: 

670 file_model.anonymous_relationships[parent_name].append(anon_name) 

671 

672 # Update the field type to reference the named structure 

673 field.type = self._replace_anonymous_struct_with_reference( 

674 field.type, struct_content, anon_name, struct_type 

675 ) 

676 

677 def _update_field_references_to_extracted_entities(self, file_model: FileModel) -> None: 

678 """Post-processing step to update field references to point to extracted entities.""" 

679 # Process all structs and unions to update field references 

680 for struct_name, struct_data in file_model.structs.items(): 

681 self._update_entity_field_references(file_model, struct_name, struct_data) 

682 

683 for union_name, union_data in file_model.unions.items(): 

684 self._update_entity_field_references(file_model, union_name, union_data) 

685 

686 # Special handling: Check if there are flattened fields that should be replaced with references 

687 self._fix_flattened_fields_with_references(file_model) 

688 

689 # De-duplicate anonymous relationships to prevent inflated relationship counts 

690 if file_model.anonymous_relationships: 

691 for parent, children in list(file_model.anonymous_relationships.items()): 

692 # Preserve order while removing duplicates 

693 seen = set() 

694 deduped = [] 

695 for child in children: 

696 key = (parent, child) 

697 if key in seen: 

698 continue 

699 seen.add(key) 

700 deduped.append(child) 

701 file_model.anonymous_relationships[parent] = deduped 

702 

703 def _fix_flattened_fields_with_references(self, file_model: FileModel) -> None: 

704 """Fix cases where fields have been flattened but should reference extracted entities.""" 

705 for struct_name, struct_data in file_model.structs.items(): 

706 # Look for cases where a struct has flattened fields that should reference an extracted entity 

707 fields_to_replace = [] 

708 extracted_entity_to_add = None 

709 

710 # Check if this struct has fields that look like they should reference an extracted entity 

711 for field in struct_data.fields: 

712 # Look for extracted entities that might match this field's content 

713 for union_name in file_model.unions: 

714 if union_name == field.name: 

715 # Found a union with the same name as this field 

716 # Check if this field's type matches the union's field types 

717 union_data = file_model.unions[union_name] 

718 if len(union_data.fields) == 2: # Simple heuristic 

719 # This might be a flattened union 

720 fields_to_replace.append(field) 

721 extracted_entity_to_add = union_name 

722 break 

723 

724 if extracted_entity_to_add: 

725 break 

726 

727 # Replace the flattened fields with a reference to the extracted entity 

728 if fields_to_replace and extracted_entity_to_add: 

729 # Remove the flattened fields 

730 for field in fields_to_replace: 

731 struct_data.fields.remove(field) 

732 

733 # Add a reference to the extracted entity 

734 struct_data.fields.append(Field(extracted_entity_to_add, extracted_entity_to_add)) 

735 

736 # Update the anonymous relationships 

737 if struct_name not in file_model.anonymous_relationships: 

738 file_model.anonymous_relationships[struct_name] = [] 

739 if extracted_entity_to_add not in file_model.anonymous_relationships[struct_name]: 

740 file_model.anonymous_relationships[struct_name].append(extracted_entity_to_add) 

741 

742 # Special case: Handle the level 2 struct that should reference the level 3 union 

743 # Look for the specific case where moderately_nested_t_level2_struct has flattened fields 

744 target_struct_name = "moderately_nested_t_level2_struct" 

745 if target_struct_name in file_model.structs: 

746 target_struct = file_model.structs[target_struct_name] 

747 

748 # Check if this struct has the flattened fields that should reference level3_union 

749 has_level3_int = any(field.name == "level3_int" for field in target_struct.fields) 

750 has_level3_float = any(field.name == "level3_float" for field in target_struct.fields) 

751 

752 if has_level3_int and has_level3_float and "level3_union" in file_model.unions: 

753 # This is the case we need to fix 

754 # Remove the flattened fields 

755 target_struct.fields = [field for field in target_struct.fields 

756 if field.name not in ["level3_int", "level3_float"]] 

757 

758 # Add a reference to the level3_union 

759 target_struct.fields.append(Field("level3_union", "level3_union")) 

760 

761 # Update the anonymous relationships 

762 if target_struct_name not in file_model.anonymous_relationships: 

763 file_model.anonymous_relationships[target_struct_name] = [] 

764 if "level3_union" not in file_model.anonymous_relationships[target_struct_name]: 

765 file_model.anonymous_relationships[target_struct_name].append("level3_union") 

766 

767 def _update_entity_field_references(self, file_model: FileModel, entity_name: str, entity_data) -> None: 

768 """Update field references in an entity to point to extracted entities.""" 

769 for field in entity_data.fields: 

770 # Check if this field should reference an extracted entity 

771 if self._field_should_reference_extracted_entity(field, file_model): 

772 # Find the extracted entity that this field should reference 

773 extracted_entity_name = self._find_extracted_entity_for_field(field, file_model) 

774 if extracted_entity_name: 

775 # Update the field type to reference the extracted entity 

776 field.type = extracted_entity_name 

777 

778 def _field_should_reference_extracted_entity(self, field: Field, file_model: FileModel) -> bool: 

779 """Check if a field should reference an extracted entity.""" 

780 # Check if there's an extracted entity that matches this field's content 

781 # This is a heuristic based on the field name and available extracted entities 

782 

783 # Look for extracted entities that might match this field 

784 for union_name in file_model.unions: 

785 if union_name == field.name or union_name.endswith(f"_{field.name}"): 

786 return True 

787 

788 for struct_name in file_model.structs: 

789 if struct_name == field.name or struct_name.endswith(f"_{field.name}"): 

790 return True 

791 

792 return False 

793 

794 def _find_extracted_entity_for_field(self, field: Field, file_model: FileModel) -> Optional[str]: 

795 """Find the extracted entity that a field should reference.""" 

796 # Look for extracted entities that match this field 

797 for union_name in file_model.unions: 

798 if union_name == field.name or union_name.endswith(f"_{field.name}"): 

799 return union_name 

800 

801 for struct_name in file_model.structs: 

802 if struct_name == field.name or struct_name.endswith(f"_{struct_name}"): 

803 return struct_name 

804 

805 return None 

806 

807 def _has_balanced_anonymous_pattern(self, text: str) -> bool: 

808 """Check if text contains an anonymous struct/union pattern with balanced braces.""" 

809 # Look for struct/union followed by balanced braces and a field name 

810 pattern = r'(struct|union)\s*\{' 

811 matches = list(re.finditer(pattern, text)) 

812 

813 for match in matches: 

814 start_pos = match.start() 

815 brace_count = 0 

816 pos = text.find('{', start_pos) 

817 

818 if pos == -1: 

819 continue 

820 

821 # Count braces to find the matching closing brace 

822 while pos < len(text): 

823 char = text[pos] 

824 if char == '{': 

825 brace_count += 1 

826 elif char == '}': 

827 brace_count -= 1 

828 if brace_count == 0: 

829 # Check if there's a field name after the closing brace 

830 remaining = text[pos + 1:].strip() 

831 if re.match(r'^\w+', remaining): 

832 return True 

833 break 

834 pos += 1 

835 

836 return False 

837 

838 def _has_balanced_anonymous_pattern_no_field_name(self, text: str) -> bool: 

839 """Check if text contains an anonymous struct/union pattern without field name.""" 

840 # Look for struct/union followed by balanced braces but no field name 

841 pattern = r'(struct|union)\s*\{' 

842 matches = list(re.finditer(pattern, text)) 

843 

844 for match in matches: 

845 start_pos = match.start() 

846 brace_count = 0 

847 pos = text.find('{', start_pos) 

848 

849 if pos == -1: 

850 continue 

851 

852 # Count braces to find the matching closing brace 

853 while pos < len(text): 

854 char = text[pos] 

855 if char == '{': 

856 brace_count += 1 

857 elif char == '}': 

858 brace_count -= 1 

859 if brace_count == 0: 

860 # Check if there's no field name after the closing brace 

861 remaining = text[pos + 1:].strip() 

862 if not re.match(r'^\w+', remaining): 

863 return True 

864 break 

865 pos += 1 

866 

867 return False 

868 

869 def _extract_balanced_anonymous_struct(self, text: str) -> Optional[Tuple[str, str, str]]: 

870 """Extract anonymous struct/union with balanced braces and field name.""" 

871 pattern = r'(struct|union)\s*\{' 

872 matches = list(re.finditer(pattern, text)) 

873 

874 for match in matches: 

875 struct_type = match.group(1) 

876 start_pos = match.start() 

877 brace_count = 0 

878 pos = text.find('{', start_pos) 

879 

880 if pos == -1: 

881 continue 

882 

883 # Count braces to find the matching closing brace 

884 while pos < len(text): 

885 char = text[pos] 

886 if char == '{': 

887 brace_count += 1 

888 elif char == '}': 

889 brace_count -= 1 

890 if brace_count == 0: 

891 # Extract the struct content 

892 struct_content = text[start_pos:pos + 1] 

893 

894 # Extract the field name 

895 remaining = text[pos + 1:].strip() 

896 # Handle field names that might have modifiers like * or [] 

897 # Look for the actual field name after any modifiers 

898 field_match = re.match(r'^[*\s\[\]]*(\w+)', remaining) 

899 if field_match: 

900 field_name = field_match.group(1) 

901 return struct_content, struct_type, field_name 

902 break 

903 pos += 1 

904 

905 return None 

906 

907 def _extract_balanced_anonymous_struct_no_field_name(self, text: str) -> Optional[Tuple[str, str]]: 

908 """Extract anonymous struct/union with balanced braces but no field name.""" 

909 pattern = r'(struct|union)\s*\{' 

910 matches = list(re.finditer(pattern, text)) 

911 

912 for match in matches: 

913 struct_type = match.group(1) 

914 start_pos = match.start() 

915 brace_count = 0 

916 pos = text.find('{', start_pos) 

917 

918 if pos == -1: 

919 continue 

920 

921 # Count braces to find the matching closing brace 

922 while pos < len(text): 

923 char = text[pos] 

924 if char == '{': 

925 brace_count += 1 

926 elif char == '}': 

927 brace_count -= 1 

928 if brace_count == 0: 

929 # Extract the struct content 

930 struct_content = text[start_pos:pos + 1] 

931 

932 # Check that there's no field name after the closing brace 

933 remaining = text[pos + 1:].strip() 

934 if not re.match(r'^\w+', remaining): 

935 return struct_content, struct_type 

936 break 

937 pos += 1 

938 

939 return None