Coverage for src/c2puml/core/parser_anonymous_processor.py: 75%
545 statements
« prev ^ index » next coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
« prev ^ index » next coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
1"""Processing anonymous structures within typedefs."""
3import re
4from typing import Dict, List, Tuple, Optional
5from ..models import FileModel, Struct, Union, Field, Alias
8class AnonymousTypedefProcessor:
9 """Handles extraction and processing of anonymous structures within typedefs."""
11 def __init__(self):
12 self.anonymous_counters: Dict[str, Dict[str, int]] = {} # parent -> {type -> count}
13 self.global_anonymous_structures = {} # Track anonymous structures globally by content hash
14 self.content_to_structure_map = {} # content_hash -> (name, struct_type)
def process_file_model(self, file_model: FileModel) -> None:
    """Extract anonymous structures from every typedef of ``file_model``.

    Runs up to ten extraction passes and stops early once a pass leaves the
    combined number of structs, unions and aliases unchanged. Afterwards the
    field references are rewritten to point at the extracted entities.
    """
    max_iterations = 10  # upper bound on passes for deeply nested typedefs

    def typedef_entity_count() -> int:
        # Structs, unions and aliases together serve as the convergence measure.
        return len(file_model.structs) + len(file_model.unions) + len(file_model.aliases)

    for _ in range(max_iterations):
        count_before = typedef_entity_count()
        self._process_all_entities(file_model)
        if typedef_entity_count() == count_before:
            # Nothing new was created by this pass: converged.
            break

    # Post-processing: point fields at the entities that were extracted above.
    self._update_field_references_to_extracted_entities(file_model)
def _process_all_entities(self, file_model: FileModel) -> None:
    """Run one extraction pass over aliases, then structs, then unions."""
    passes = (
        ("aliases", self._process_alias_for_anonymous_structs),
        ("structs", self._process_struct_for_anonymous_structs),
        ("unions", self._process_union_for_anonymous_structs),
    )
    for attribute, handler in passes:
        # Snapshot the items first: handlers may add entries to these tables.
        snapshot = list(getattr(file_model, attribute).items())
        for entity_name, entity in snapshot:
            handler(file_model, entity_name, entity)
def _process_alias_for_anonymous_structs(
    self, file_model: FileModel, alias_name: str, alias_data: Alias
) -> None:
    """Extract anonymous structs/unions embedded in an alias typedef.

    Each anonymous definition found in ``alias_data.original_type`` (and not
    rejected by ``_is_too_complex_to_process``) is turned into a named entity,
    recorded under ``alias_name`` in ``file_model.anonymous_relationships``
    and replaced by a reference in the alias text.

    Replacements are applied cumulatively to the current alias text, so an
    earlier substitution is not lost when a later one is applied.
    """
    candidates = self._extract_anonymous_structs_from_text(alias_data.original_type)

    for struct_content, struct_type, field_name in candidates:
        # Skip structures with function pointer arrays or other complex patterns.
        if self._is_too_complex_to_process(struct_content):
            continue

        anon_name = self._get_or_create_anonymous_structure(
            file_model, struct_content, struct_type, alias_name, field_name
        )

        # Track the relationship (only if not already tracked).
        if alias_name not in file_model.anonymous_relationships:
            file_model.anonymous_relationships[alias_name] = []
        if anon_name not in file_model.anonymous_relationships[alias_name]:
            file_model.anonymous_relationships[alias_name].append(anon_name)

        # Rewrite the *current* text rather than the text captured on entry,
        # otherwise every iteration would undo the previous replacement.
        alias_data.original_type = self._replace_anonymous_struct_with_reference(
            alias_data.original_type, struct_content, anon_name, struct_type
        )
def _process_struct_for_anonymous_structs(
    self, file_model: FileModel, struct_name: str, struct_data: Struct
) -> None:
    """Extract nested anonymous structures from the fields of a struct."""
    for member in struct_data.fields:
        if not self._field_contains_anonymous_struct(member):
            continue
        # The member embeds an anonymous struct/union: pull it out.
        self._extract_anonymous_from_field(file_model, struct_name, member)
def _process_union_for_anonymous_structs(
    self, file_model: FileModel, union_name: str, union_data: Union
) -> None:
    """Extract nested anonymous structures from the fields of a union."""
    for member in union_data.fields:
        if not self._field_contains_anonymous_struct(member):
            continue
        # The member embeds an anonymous struct/union: pull it out.
        self._extract_anonymous_from_field(file_model, union_name, member)
109 def _extract_anonymous_structs_from_text(
110 self, text: str
111 ) -> List[Tuple[str, str, str]]:
112 """Extract anonymous struct/union definitions from text using balanced brace matching."""
113 anonymous_structs = []
115 # Check if this text starts with 'typedef struct' - if so, skip the outer struct
116 text_stripped = text.strip()
117 skip_first_struct = text_stripped.startswith('typedef struct') or text_stripped.startswith('typedef union')
119 # Look for struct/union keywords followed by {
120 # Use balanced brace matching to handle nested structures
121 pattern = r'(struct|union)\s*\{'
122 matches = list(re.finditer(pattern, text))
124 for match in matches:
125 struct_type = match.group(1)
126 start_pos = match.start()
128 # Find the matching closing brace using balanced brace counting
129 brace_count = 0
130 pos = start_pos
131 content_start = text.find('{', start_pos)
133 if content_start == -1:
134 continue
136 pos = content_start
137 while pos < len(text):
138 char = text[pos]
139 if char == '{':
140 brace_count += 1
141 elif char == '}':
142 brace_count -= 1
143 if brace_count == 0:
144 # Found the matching closing brace
145 content_end = pos
146 struct_content = text[start_pos:content_end + 1]
148 # Extract the field name after the closing brace
149 remaining = text[content_end + 1:].strip()
150 field_match = re.match(r'^[*\s\[\]]*(\w+)', remaining)
151 field_name = field_match.group(1) if field_match else f"field_{len(anonymous_structs) + 1}"
153 # Skip the first struct/union if it's a typedef
154 if skip_first_struct and match == matches[0]:
155 skip_first_struct = False
156 else:
157 anonymous_structs.append((struct_content, struct_type, field_name))
158 break
159 pos += 1
161 return anonymous_structs
163 def _generate_anonymous_name(self, parent_name: str, struct_type: str, field_name: str) -> str:
164 """Generate a name for an anonymous structure. Field name is always required."""
165 return f"{parent_name}_{field_name}"
167 def _generate_content_hash(self, content: str, struct_type: str) -> str:
168 """Generate a hash for anonymous structure content to identify duplicates."""
169 import hashlib
170 # Normalize the content by removing whitespace and comments
171 normalized = re.sub(r'\s+', ' ', content.strip())
172 normalized = re.sub(r'/\*.*?\*/', '', normalized) # Remove C comments
173 normalized = re.sub(r'//.*$', '', normalized, flags=re.MULTILINE) # Remove C++ comments
174 hash_input = f"{struct_type}:{normalized}"
175 return hashlib.md5(hash_input.encode()).hexdigest()[:8]
def _find_existing_anonymous_structure(self, content: str, struct_type: str) -> Optional[str]:
    """Return the name registered for identical content of the same kind, else ``None``."""
    fingerprint = self._generate_content_hash(content, struct_type)
    entry = self.content_to_structure_map.get(fingerprint)
    if entry is None:
        return None
    registered_name, registered_type = entry
    return registered_name if registered_type == struct_type else None
def _register_anonymous_structure(self, name: str, content: str, struct_type: str) -> None:
    """Remember ``name`` as the entity holding ``content`` so identical content can reuse it."""
    entry = (name, struct_type)
    self.content_to_structure_map[self._generate_content_hash(content, struct_type)] = entry
def _get_or_create_anonymous_structure(self, file_model: FileModel, content: str, struct_type: str,
                                       parent_name: str, field_name: str) -> str:
    """Return the name of the entity representing ``content``, creating it if needed.

    Placeholder content (``struct { ... }`` / ``union { ... }``, optionally
    followed by a field name) is never de-duplicated by content: an empty
    entity named ``<parent_name>_<field_name>`` is created unless one of that
    name and kind already exists. Real content is first looked up by content
    hash; if no usable match exists, a new entity is parsed from ``content``,
    stored in ``file_model`` and registered for later look-ups.
    """
    def is_defined(candidate: str) -> bool:
        # An entity only counts when it exists in the table for its own kind.
        if struct_type == "struct":
            return candidate in file_model.structs
        if struct_type == "union":
            return candidate in file_model.unions
        return False

    placeholder = content in ["struct { ... }", "union { ... }"] or re.match(r'^(struct|union)\s*\{\s*\.\.\.\s*\}\s+\w+', content)

    if not placeholder:
        # Reuse an entity with identical content, provided it is still in the model.
        reusable = self._find_existing_anonymous_structure(content, struct_type)
        if reusable and is_defined(reusable):
            return reusable

    anon_name = self._generate_anonymous_name(parent_name, struct_type, field_name)
    if is_defined(anon_name):
        # An entity with the conventional name already exists.
        return anon_name

    if placeholder:
        # Nothing to parse: create an empty entity.
        if struct_type == "struct":
            file_model.structs[anon_name] = Struct(anon_name, [], tag_name="")
        elif struct_type == "union":
            file_model.unions[anon_name] = Union(anon_name, [], tag_name="")
        return anon_name

    if struct_type == "struct":
        file_model.structs[anon_name] = self._create_anonymous_struct(anon_name, content)
    elif struct_type == "union":
        file_model.unions[anon_name] = self._create_anonymous_union(anon_name, content)

    # Make the new entity discoverable by content hash.
    self._register_anonymous_structure(anon_name, content, struct_type)
    return anon_name
def _create_anonymous_struct(self, name: str, content: str) -> Struct:
    """Build a ``Struct`` called ``name`` whose fields are parsed from ``content``."""
    return Struct(name, self._parse_struct_fields(content), tag_name="")
def _create_anonymous_union(self, name: str, content: str) -> Union:
    """Build a ``Union`` called ``name`` whose fields are parsed from ``content``."""
    return Union(name, self._parse_struct_fields(content), tag_name="")
256 def _parse_struct_fields(self, content: str) -> List[Field]:
257 """Parse struct/union member declarations from ``content`` into ``Field`` objects.

 When ``content`` contains both '{' and '}', only the text between the first
 '{' and the last '}' is parsed; otherwise the whole string is treated as bare
 member declarations. The text is split on ';' at brace depth zero, and any
 trailing text without a terminating ';' is kept as a last declaration.

 A declaration accepted by ``_has_balanced_anonymous_pattern`` or
 ``_has_balanced_anonymous_pattern_no_field_name`` is parsed recursively and
 becomes a single field whose type is the text
 ``"<struct|union> { <type> <name>, ... }"``. If the recursion yields no
 fields, a ``{ ... }`` placeholder type is used instead. Anonymous members
 without a field name are named ``anonymous_<struct|union>``. All other
 declarations are handed to ``_parse_comma_separated_fields``.

 Returns an empty list when there is nothing to parse.
 """
258 fields = []
260 # Decide whether content is a full definition (has braces) or only the member declarations
261 if '{' in content and '}' in content:
262 # Take everything between the first '{' and the last '}'
263 brace_start = content.find('{')
264 brace_end = content.rfind('}')
266 if brace_start == -1 or brace_end == -1:
267 return fields
269 inner_content = content[brace_start + 1:brace_end].strip()
270 else:
271 # Content is just field declarations without braces
272 inner_content = content.strip()
274 if not inner_content:
275 return fields
277 # Split on semicolons that are not inside nested braces to get individual declarations
278 field_declarations = []
279 current_decl = ""
280 brace_count = 0
282 for char in inner_content:
283 if char == '{':
284 brace_count += 1
285 elif char == '}':
286 brace_count -= 1
288 current_decl += char
290 if char == ';' and brace_count == 0:
291 field_declarations.append(current_decl.strip())
292 current_decl = ""
294 # Keep trailing text that was not terminated by a semicolon
295 if current_decl.strip():
296 field_declarations.append(current_decl.strip())
298 # Parse each field declaration
299 for decl in field_declarations:
300 if not decl or decl.strip() == ';':
301 continue
303 # Remove trailing semicolon
304 decl = decl.rstrip(';').strip()
306 if not decl:
307 continue
309 # Check if this declaration contains an anonymous struct/union followed by a field name
310 if self._has_balanced_anonymous_pattern(decl):
311 # Extract the anonymous struct content and field name
312 struct_info = self._extract_balanced_anonymous_struct(decl)
313 if struct_info:
314 struct_content, struct_type, field_name = struct_info
315 # Recurse to parse the members of the anonymous structure
316 parsed_fields = self._parse_struct_fields(struct_content)
317 if parsed_fields:
318 # Summarise the parsed members as "<kind> { type name, ... }"
319 field_type = f"{struct_type} {{ {', '.join([f'{f.type} {f.name}' for f in parsed_fields])} }}"
320 fields.append(Field(field_name, field_type))
321 else:
322 # Fallback to placeholder if parsing fails
323 field_type = f"{struct_type} {{ ... }} {field_name}"
324 fields.append(Field(field_name, field_type))
325 continue
326 elif self._has_balanced_anonymous_pattern_no_field_name(decl):
327 # Extract the anonymous struct content without field name
328 struct_info = self._extract_balanced_anonymous_struct_no_field_name(decl)
329 if struct_info:
330 struct_content, struct_type = struct_info
331 # Recurse to parse the members of the anonymous structure
332 parsed_fields = self._parse_struct_fields(struct_content)
333 if parsed_fields:
334 # Summarise the parsed members as "<kind> { type name, ... }"
335 field_type = f"{struct_type} {{ {', '.join([f'{f.type} {f.name}' for f in parsed_fields])} }}"
336 field_name = f"anonymous_{struct_type}"
337 fields.append(Field(field_name, field_type))
338 else:
339 # Fallback to placeholder if parsing fails
340 field_type = f"{struct_type} {{ ... }}"
341 field_name = f"anonymous_{struct_type}"
342 fields.append(Field(field_name, field_type))
343 continue
345 # Parse the field normally (no anonymous structures)
346 parsed_fields = self._parse_comma_separated_fields(decl)
347 fields.extend(parsed_fields)
349 return fields
def _parse_comma_separated_fields(self, decl: str) -> List[Field]:
    """Parse one declaration that may declare several comma-separated fields.

    Handles ``int a, b, c``, ``char *p1, *p2``, ``int arr1[10], arr2[20]``
    and function pointers such as ``void (*name)(int)``, which are returned
    as a single field whose type is the whole declaration.

    Pointer stars belong to the individual declarator, not to the shared base
    type: ``char *p, q`` yields ``p: char *`` and ``q: char``. Several stars
    on one declarator (``**pp``) are all kept. A ``*`` glued to the type token
    (``char*``) is left as part of the base type.

    Returns an empty list when no ``type name`` pair can be recognised.
    """
    fields = []

    # Function pointer fields first: void (*name)(int) or void ( * name ) ( int )
    if re.search(r'\(\s*\*\s*\w+\s*\)', decl) and re.search(r'\)\s*\(', decl):
        func_ptr_match = re.search(r'\(\s*\*\s*(\w+)\s*\)', decl)
        if func_ptr_match:
            return [Field(func_ptr_match.group(1), decl.strip())]

    # str.split always returns at least one element, so field_parts[0] exists.
    field_parts = [part.strip() for part in decl.split(',')]
    first_field = field_parts[0]

    # Array case for the first field: int arr1[10], arr2[20]
    array_match = re.match(r'(.+?)\s+(\w+)\s*\[([^\]]*)\]\s*$', first_field)
    if array_match:
        base_type = array_match.group(1).strip()
        first_size = array_match.group(3).strip()
        # An empty size renders as "type[]".
        fields.append(Field(array_match.group(2).strip(), f"{base_type}[{first_size}]"))

        for part in field_parts[1:]:
            sibling_array = re.match(r'(\w+)\s*\[([^\]]*)\]\s*$', part)
            if sibling_array:
                size = sibling_array.group(2).strip()
                fields.append(Field(sibling_array.group(1).strip(), f"{base_type}[{size}]"))
            else:
                # Simple name without array syntax: plain field of the base type.
                name = re.sub(r'[^\w]', '', part)
                if name:
                    fields.append(Field(name, base_type))
        return fields

    first_parts = first_field.split()
    if len(first_parts) < 2:
        return fields

    def split_pointer(declarator: str) -> Tuple[int, str]:
        # Count the leading '*' of a declarator and return (count, rest).
        prefix = re.match(r'(?:\*\s*)*', declarator).group(0)
        return prefix.count('*'), declarator[len(prefix):]

    type_tokens = first_parts[:-1]
    first_stars, first_name = split_pointer(first_parts[-1])
    first_type = ' '.join(type_tokens)
    if first_stars:
        first_type = f"{first_type} {'*' * first_stars}"
    first_name = re.sub(r'[^\w]', '', first_name)
    if first_name:
        fields.append(Field(first_name, first_type))

    # Stand-alone '*' tokens at the end of the type ("char * p") belong to the
    # first declarator only, so they are not part of the shared base type.
    shared_tokens = list(type_tokens)
    while len(shared_tokens) > 1 and set(shared_tokens[-1]) == {'*'}:
        shared_tokens.pop()
    base_type = ' '.join(shared_tokens)

    for part in field_parts[1:]:
        if not part:
            continue

        stars, remainder = split_pointer(part)
        if not stars:
            field_type = base_type
        elif base_type.endswith('*'):
            # The glued base ("char*") already carries one pointer level.
            field_type = base_type + '*' * (stars - 1)
        else:
            field_type = f"{base_type} {'*' * stars}"

        # Keep only the identifier; drop trailing brackets or punctuation.
        field_name = re.sub(r'[^\w].*$', '', remainder.strip())
        if field_name:
            fields.append(Field(field_name, field_type))

    return fields
def _parse_single_field(self, decl: str) -> Optional[Field]:
    """Parse a declaration of exactly one field, or return ``None``.

    Recognised forms, tried in this order: function pointer
    (``void (*name)(int)``), array (``type name[size]`` / ``type name[]``),
    pointer (``type *name`` / ``type* name``) and plain ``type name``.
    """
    # Function pointer: the whole declaration is kept as the type.
    looks_like_func_ptr = re.search(r'\(\s*\*\s*\w+\s*\)', decl) and re.search(r'\)\s*\(', decl)
    if looks_like_func_ptr:
        name_hit = re.search(r'\(\s*\*\s*(\w+)\s*\)', decl)
        if name_hit:
            return Field(name_hit.group(1), decl.strip())

    # Array: type name[size] or type name[]
    as_array = re.match(r'(.+?)\s+(\w+)\s*\[([^\]]*)\]\s*$', decl)
    if as_array:
        element_type = as_array.group(1).strip()
        extent = as_array.group(3).strip()
        # An empty extent renders as "type[]".
        return Field(as_array.group(2).strip(), f"{element_type}[{extent}]")

    # Pointer: type *name or type* name
    as_pointer = re.match(r'(.+?)\s*\*\s*(\w+)\s*$', decl)
    if as_pointer:
        return Field(as_pointer.group(2).strip(), as_pointer.group(1).strip() + " *")

    # Plain field: the last token is the name, everything before it the type.
    tokens = decl.strip().split()
    if len(tokens) < 2:
        return None
    identifier = re.sub(r'[^\w]', '', tokens[-1])  # strip trailing punctuation
    if not identifier:
        return None
    return Field(identifier, ' '.join(tokens[:-1]))
489 def _is_too_complex_to_process(self, struct_content: str) -> bool:
490 """Check if a structure is too complex to process."""
491 # Skip structures with function pointer arrays
492 if re.search(r'\(\s*\*\s*\w+\s*\)\s*\[', struct_content):
493 return True
495 # Skip structures with complex nested patterns
496 if struct_content.count('{') > 5 or struct_content.count('}') > 5:
497 return True
499 # Skip structures with too many semicolons (complex field declarations)
500 if struct_content.count(';') > 10:
501 return True
503 return False
505 def _replace_anonymous_struct_with_reference(
506 self, original_type: str, struct_content: str, anon_name: str, struct_type: str
507 ) -> str:
508 """Replace anonymous struct definition with reference to named typedef."""
509 # Use a more robust approach to find and replace the anonymous struct
510 # Look for the exact pattern: struct_type { struct_content }
512 # Escape special regex characters in struct_content but preserve structure
513 escaped_content = re.escape(struct_content)
514 # Un-escape some characters we want to match flexibly
515 escaped_content = escaped_content.replace(r'\ ', r'\s*').replace(r'\n', r'\s*')
517 # Pattern to match the full anonymous struct with flexible whitespace
518 pattern = rf'{struct_type}\s*\{{\s*{escaped_content}\s*\}}'
519 replacement = anon_name
521 # Replace the anonymous struct with just the name
522 updated_type = re.sub(pattern, replacement, original_type, flags=re.DOTALL)
523 return updated_type
525 def _field_contains_anonymous_struct(self, field: Field) -> bool:
526 """Check if a field contains an anonymous structure."""
527 field_type = field.type
529 # Check for various anonymous structure patterns
530 patterns = [
531 r'struct\s*\{', # struct { ... }
532 r'union\s*\{', # union { ... }
533 r'/\*ANON:', # Preserved content format
534 ]
536 for pattern in patterns:
537 if re.search(pattern, field_type):
538 return True
540 return False
def _extract_anonymous_from_field(
    self, file_model: FileModel, parent_name: str, field: Field
) -> None:
    """Turn the anonymous struct/union embedded in ``field`` into a named entity.

    The field type is matched against several textual shapes in a fixed
    order: bare placeholder, preserved ``/*ANON:<base64>:<name>*/`` content,
    placeholder followed by a field name, balanced definition with a field
    name, balanced definition without a field name, and finally a generic
    scan of the type text. For the first matching shape the entity is
    created or reused, recorded under ``parent_name`` in
    ``file_model.anonymous_relationships`` and referenced from ``field.type``.
    """
    def adopt(content: str, kind: str, member_name: str) -> str:
        # Create or reuse the named entity and record it as a child of the parent.
        entity_name = self._get_or_create_anonymous_structure(
            file_model, content, kind, parent_name, member_name
        )
        if parent_name not in file_model.anonymous_relationships:
            file_model.anonymous_relationships[parent_name] = []
        if entity_name not in file_model.anonymous_relationships[parent_name]:
            file_model.anonymous_relationships[parent_name].append(entity_name)
        return entity_name

    declared = field.type

    # Simplified placeholder types.
    if declared in ("struct { ... }", "union { ... }"):
        kind = "struct" if "struct" in declared else "union"
        field.type = adopt(declared, kind, field.name)
        return

    # Preserved content format: "struct { /*ANON:encoded_content:field_name*/ ... }"
    preserved = re.search(r'/\*ANON:([^:]+):([^*]+)\*/', declared)
    if preserved:
        keyword = re.search(r'(struct|union)', declared)
        if keyword:
            import base64
            try:
                decoded = base64.b64decode(preserved.group(1)).decode()
                field.type = adopt(decoded, keyword.group(1), preserved.group(2))
            except Exception as e:
                # Decoding failed: the field keeps its placeholder type.
                print(f"Warning: Failed to decode anonymous structure content: {e}")
                import traceback
                traceback.print_exc()
        return

    # Placeholder followed by a field name: "struct { ... } field_name"
    named_placeholder = re.match(r'^(struct|union)\s*\{\s*\.\.\.\s*\}\s+(\w+)', declared)
    if named_placeholder:
        field.type = adopt(declared, named_placeholder.group(1), named_placeholder.group(2))
        return

    # Real definition with a field name.
    if self._has_balanced_anonymous_pattern(declared):
        struct_info = self._extract_balanced_anonymous_struct(declared)
        if struct_info:
            content, kind, member_name = struct_info
            field.type = adopt(content, kind, member_name)
        return

    # Real definition without a field name, e.g. "struct { int x; }":
    # the name of the field itself is used.
    if self._has_balanced_anonymous_pattern_no_field_name(declared):
        struct_info = self._extract_balanced_anonymous_struct_no_field_name(declared)
        if struct_info:
            content, kind = struct_info
            field.type = adopt(content, kind, field.name)
        return

    # Anything else: scan the type text for anonymous definitions.
    for content, kind, extracted_name in self._extract_anonymous_structs_from_text(declared):
        member_name = extracted_name if extracted_name else field.name
        entity_name = adopt(content, kind, member_name)
        # Replace the definition inside the (possibly already rewritten) type text.
        field.type = self._replace_anonymous_struct_with_reference(
            field.type, content, entity_name, kind
        )
def _update_field_references_to_extracted_entities(self, file_model: FileModel) -> None:
    """Post-process ``file_model`` so fields refer to the extracted entities.

    Updates the field references of every struct and union, repairs flattened
    fields, and finally removes duplicate children from each entry of
    ``file_model.anonymous_relationships`` while keeping their order.
    """
    for registry in (file_model.structs, file_model.unions):
        for entity_name, entity in registry.items():
            self._update_entity_field_references(file_model, entity_name, entity)

    # Flattened fields that should be a single reference are fixed separately.
    self._fix_flattened_fields_with_references(file_model)

    if not file_model.anonymous_relationships:
        return

    # De-duplicate to prevent inflated relationship counts; dict keys keep
    # first-seen order.
    for parent, children in list(file_model.anonymous_relationships.items()):
        file_model.anonymous_relationships[parent] = list(dict.fromkeys(children))
def _fix_flattened_fields_with_references(self, file_model: FileModel) -> None:
    """Replace flattened fields by a reference to the matching extracted union.

    For each struct, the first field whose name equals the name of a union
    with exactly two fields is removed and a field referencing that union is
    appended instead. One hard-coded case is handled in addition: the struct
    ``moderately_nested_t_level2_struct`` gets its ``level3_int`` and
    ``level3_float`` fields replaced by a reference to ``level3_union``.
    """
    def record(parent: str, child: str) -> None:
        # Register child under parent unless it is already listed.
        if parent not in file_model.anonymous_relationships:
            file_model.anonymous_relationships[parent] = []
        if child not in file_model.anonymous_relationships[parent]:
            file_model.anonymous_relationships[parent].append(child)

    for struct_name, struct_data in file_model.structs.items():
        flattened = []
        union_ref = None

        for member in struct_data.fields:
            # Simple heuristic: a same-named union with exactly two fields.
            if member.name in file_model.unions and len(file_model.unions[member.name].fields) == 2:
                flattened.append(member)
                union_ref = member.name
            if union_ref:
                break

        if not (flattened and union_ref):
            continue

        # Swap the flattened fields for one reference to the extracted union.
        for member in flattened:
            struct_data.fields.remove(member)
        struct_data.fields.append(Field(union_ref, union_ref))
        record(struct_name, union_ref)

    # Special case: the level 2 struct that should reference the level 3 union.
    target_struct_name = "moderately_nested_t_level2_struct"
    if target_struct_name not in file_model.structs:
        return
    target_struct = file_model.structs[target_struct_name]

    has_level3_int = any(member.name == "level3_int" for member in target_struct.fields)
    has_level3_float = any(member.name == "level3_float" for member in target_struct.fields)
    if not (has_level3_int and has_level3_float and "level3_union" in file_model.unions):
        return

    target_struct.fields = [
        member for member in target_struct.fields
        if member.name not in ["level3_int", "level3_float"]
    ]
    target_struct.fields.append(Field("level3_union", "level3_union"))
    record(target_struct_name, "level3_union")
def _update_entity_field_references(self, file_model: FileModel, entity_name: str, entity_data) -> None:
    """Retarget the fields of ``entity_data`` at matching extracted entities.

    ``entity_name`` is accepted but not used by this method.
    """
    for member in entity_data.fields:
        if not self._field_should_reference_extracted_entity(member, file_model):
            continue
        target = self._find_extracted_entity_for_field(member, file_model)
        if target:
            # The field type becomes the name of the extracted entity.
            member.type = target
778 def _field_should_reference_extracted_entity(self, field: Field, file_model: FileModel) -> bool:
779 """Check if a field should reference an extracted entity."""
780 # Check if there's an extracted entity that matches this field's content
781 # This is a heuristic based on the field name and available extracted entities
783 # Look for extracted entities that might match this field
784 for union_name in file_model.unions:
785 if union_name == field.name or union_name.endswith(f"_{field.name}"):
786 return True
788 for struct_name in file_model.structs:
789 if struct_name == field.name or struct_name.endswith(f"_{field.name}"):
790 return True
792 return False
794 def _find_extracted_entity_for_field(self, field: Field, file_model: FileModel) -> Optional[str]:
795 """Find the extracted entity that a field should reference."""
796 # Look for extracted entities that match this field
797 for union_name in file_model.unions:
798 if union_name == field.name or union_name.endswith(f"_{field.name}"):
799 return union_name
801 for struct_name in file_model.structs:
802 if struct_name == field.name or struct_name.endswith(f"_{struct_name}"):
803 return struct_name
805 return None
807 def _has_balanced_anonymous_pattern(self, text: str) -> bool:
808 """Check if text contains an anonymous struct/union pattern with balanced braces."""
809 # Look for struct/union followed by balanced braces and a field name
810 pattern = r'(struct|union)\s*\{'
811 matches = list(re.finditer(pattern, text))
813 for match in matches:
814 start_pos = match.start()
815 brace_count = 0
816 pos = text.find('{', start_pos)
818 if pos == -1:
819 continue
821 # Count braces to find the matching closing brace
822 while pos < len(text):
823 char = text[pos]
824 if char == '{':
825 brace_count += 1
826 elif char == '}':
827 brace_count -= 1
828 if brace_count == 0:
829 # Check if there's a field name after the closing brace
830 remaining = text[pos + 1:].strip()
831 if re.match(r'^\w+', remaining):
832 return True
833 break
834 pos += 1
836 return False
838 def _has_balanced_anonymous_pattern_no_field_name(self, text: str) -> bool:
839 """Check if text contains an anonymous struct/union pattern without field name."""
840 # Look for struct/union followed by balanced braces but no field name
841 pattern = r'(struct|union)\s*\{'
842 matches = list(re.finditer(pattern, text))
844 for match in matches:
845 start_pos = match.start()
846 brace_count = 0
847 pos = text.find('{', start_pos)
849 if pos == -1:
850 continue
852 # Count braces to find the matching closing brace
853 while pos < len(text):
854 char = text[pos]
855 if char == '{':
856 brace_count += 1
857 elif char == '}':
858 brace_count -= 1
859 if brace_count == 0:
860 # Check if there's no field name after the closing brace
861 remaining = text[pos + 1:].strip()
862 if not re.match(r'^\w+', remaining):
863 return True
864 break
865 pos += 1
867 return False
869 def _extract_balanced_anonymous_struct(self, text: str) -> Optional[Tuple[str, str, str]]:
870 """Extract anonymous struct/union with balanced braces and field name."""
871 pattern = r'(struct|union)\s*\{'
872 matches = list(re.finditer(pattern, text))
874 for match in matches:
875 struct_type = match.group(1)
876 start_pos = match.start()
877 brace_count = 0
878 pos = text.find('{', start_pos)
880 if pos == -1:
881 continue
883 # Count braces to find the matching closing brace
884 while pos < len(text):
885 char = text[pos]
886 if char == '{':
887 brace_count += 1
888 elif char == '}':
889 brace_count -= 1
890 if brace_count == 0:
891 # Extract the struct content
892 struct_content = text[start_pos:pos + 1]
894 # Extract the field name
895 remaining = text[pos + 1:].strip()
896 # Handle field names that might have modifiers like * or []
897 # Look for the actual field name after any modifiers
898 field_match = re.match(r'^[*\s\[\]]*(\w+)', remaining)
899 if field_match:
900 field_name = field_match.group(1)
901 return struct_content, struct_type, field_name
902 break
903 pos += 1
905 return None
907 def _extract_balanced_anonymous_struct_no_field_name(self, text: str) -> Optional[Tuple[str, str]]:
908 """Extract anonymous struct/union with balanced braces but no field name."""
909 pattern = r'(struct|union)\s*\{'
910 matches = list(re.finditer(pattern, text))
912 for match in matches:
913 struct_type = match.group(1)
914 start_pos = match.start()
915 brace_count = 0
916 pos = text.find('{', start_pos)
918 if pos == -1:
919 continue
921 # Count braces to find the matching closing brace
922 while pos < len(text):
923 char = text[pos]
924 if char == '{':
925 brace_count += 1
926 elif char == '}':
927 brace_count -= 1
928 if brace_count == 0:
929 # Extract the struct content
930 struct_content = text[start_pos:pos + 1]
932 # Check that there's no field name after the closing brace
933 remaining = text[pos + 1:].strip()
934 if not re.match(r'^\w+', remaining):
935 return struct_content, struct_type
936 break
937 pos += 1
939 return None