Coverage for src/c2puml/utils.py: 43%
47 statements
« prev ^ index » next coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
« prev ^ index » next coverage.py v7.10.4, created at 2025-08-20 03:53 +0000
1#!/usr/bin/env python3
2"""
3Utility functions for C to PlantUML converter
4"""
6import logging
7from pathlib import Path
8from typing import Dict, Optional
10# Try to import chardet, fallback to basic encoding detection if not available
11try:
12 import chardet
14 CHARDET_AVAILABLE = True
15except ImportError:
16 CHARDET_AVAILABLE = False
def detect_file_encoding(file_path: Path) -> str:
    """Detect the text encoding of a file, falling back to UTF-8.

    When ``chardet`` is available, the first 1 KiB of the file is sampled and
    the detected encoding is returned if its confidence exceeds 0.7. If the
    detector is unavailable or inconclusive, the file is test-decoded with a
    short list of fallback codecs and the first one that succeeds is returned.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of the detected encoding. ``"utf-8"`` is returned when the
        file cannot be read or detection fails for any reason.
    """
    try:
        if CHARDET_AVAILABLE:
            with open(file_path, "rb") as f:
                raw_data = f.read(1024)  # Only the first 1 KiB is sampled

            if raw_data:
                result = chardet.detect(raw_data) or {}
                detected = result.get("encoding")
                # A missing/None confidence is treated as "no confidence" so
                # that we fall through to the decoding fallbacks instead of
                # raising and skipping them.
                confidence = result.get("confidence") or 0.0
                if detected and confidence > 0.7:
                    # Only a prefix of the file was inspected: an ASCII
                    # verdict says nothing about later bytes. UTF-8 decodes
                    # every ASCII file identically, so report the superset.
                    if detected.lower() == "ascii":
                        return "utf-8"
                    return detected

        # Fallback encodings in order of preference. Latin-1 maps every byte
        # value, so it always succeeds; any candidate listed after it would
        # be unreachable.
        fallback_encodings = ("utf-8", "latin-1")

        for encoding in fallback_encodings:
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    f.read(1024)  # Test read
                return encoding
            except UnicodeError:  # Also covers UnicodeDecodeError
                continue

        # Final fallback (defensive; keeps the return type total)
        return "utf-8"

    except Exception as e:
        # Deliberately broad: detection is best-effort and must never make
        # the caller fail, e.g. on a missing or unreadable file.
        logging.warning(f"Failed to detect encoding for {file_path}: {e}")
        return "utf-8"
53# Backward compatibility functions for existing tests
def get_acceptable_encodings() -> list:
    """
    Return the encodings treated as acceptable on every supported platform.

    Returns:
        A new list of encoding names: the Unicode encodings first, then the
        Windows code pages, then the Western single-byte encodings.
    """
    unicode_names = ("utf-8", "utf-8-sig", "utf-16", "utf-16le", "utf-16be")
    windows_names = ("windows-1252", "windows-1254", "cp1252", "cp1254")
    single_byte_names = ("iso-8859-1", "latin-1", "ascii")

    # Build a fresh list on every call so callers may mutate the result.
    return [*unicode_names, *windows_names, *single_byte_names]
def is_acceptable_encoding(encoding: str) -> bool:
    """
    Tell whether an encoding name is in the acceptable-encodings list.

    The comparison ignores letter case.

    Args:
        encoding: The encoding name to check.

    Returns:
        True if the name matches an entry of get_acceptable_encodings(),
        False otherwise.
    """
    wanted = encoding.lower()
    lowered_names = [name.lower() for name in get_acceptable_encodings()]
    if wanted in lowered_names:
        return True
    return False
def normalize_encoding(encoding: str) -> str:
    """
    Map an encoding name to a single canonical, lower-case spelling.

    Args:
        encoding: The encoding name to normalize.

    Returns:
        "windows-1252", "windows-1254" or "iso-8859-1" for the known aliases
        of those encodings; otherwise the input name in lower case.
    """
    # Alias -> canonical name for the encodings that have several spellings.
    canonical_names = {
        "windows-1252": "windows-1252",
        "cp1252": "windows-1252",
        "windows-1254": "windows-1254",
        "cp1254": "windows-1254",
        "iso-8859-1": "iso-8859-1",
        "latin-1": "iso-8859-1",
    }

    lowered = encoding.lower()
    return canonical_names.get(lowered, lowered)
def get_platform_default_encoding() -> str:
    """
    Return the encoding assumed by default on the running platform.

    Returns:
        "windows-1252" when sys.platform starts with "win", "utf-8" on
        every other platform.
    """
    import sys

    on_windows = sys.platform.startswith("win")
    # windows-1252 is the common Windows default; utf-8 the Unix/Linux one.
    return "windows-1252" if on_windows else "utf-8"