Coverage for src/c2puml/utils.py: 43%

47 statements  

« prev     ^ index     » next       coverage.py v7.10.4, created at 2025-08-20 03:53 +0000

1#!/usr/bin/env python3 

2""" 

3Utility functions for C to PlantUML converter 

4""" 

5 

6import logging 

7from pathlib import Path 

8from typing import Dict, Optional 

9 

10# Try to import chardet; fall back to basic encoding detection if it is not available 

11try: 

12 import chardet 

13 

14 CHARDET_AVAILABLE = True 

15except ImportError: 

16 CHARDET_AVAILABLE = False 

17 

18 

def detect_file_encoding(file_path: Path) -> str:
    """Detect the text encoding of a file, with safe fallbacks.

    When chardet is available, the first 1KB of the file is sampled and the
    detected encoding is returned if its confidence exceeds 0.7 and Python
    has a codec registered for that name. Otherwise a short list of
    fallback encodings is tried against the beginning of the file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of an encoding to use for reading the file. "utf-8" is
        returned when detection fails for any reason (a warning is logged).
    """
    import codecs

    try:
        if CHARDET_AVAILABLE:
            # Sample only the first 1KB; enough for detection and cheap.
            with open(file_path, "rb") as f:
                raw_data = f.read(1024)

            if raw_data:
                result = chardet.detect(raw_data)
                detected = result.get("encoding") if result else None
                confidence = (result.get("confidence") if result else None) or 0.0
                if detected and confidence > 0.7:
                    # chardet can report encodings Python has no codec for;
                    # returning such a name would make a later
                    # open(..., encoding=...) raise LookupError.
                    try:
                        codecs.lookup(detected)
                    except LookupError:
                        pass
                    else:
                        return detected

        # Fallback encodings in order of preference. latin-1 maps every byte
        # value, so it always decodes and terminates the search; listing
        # further encodings after it would be unreachable.
        for encoding in ("utf-8", "latin-1"):
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    f.read(1024)  # Test read
            except UnicodeError:
                continue
            return encoding

        # Final fallback
        return "utf-8"

    except Exception as e:
        # Best effort by design: never propagate, just report and default.
        logging.warning("Failed to detect encoding for %s: %s", file_path, e)
        return "utf-8"

48 

49 

50 

51 

52 

53# Backward compatibility functions for existing tests 

def get_acceptable_encodings() -> list:
    """
    Return the encoding names treated as acceptable on every platform.

    Returns:
        A new list of encoding names: the Unicode encodings first, then the
        Windows code pages, then the single-byte legacy encodings.
    """
    unicode_names = ["utf-8", "utf-8-sig", "utf-16", "utf-16le", "utf-16be"]
    windows_names = ["windows-1252", "windows-1254", "cp1252", "cp1254"]
    legacy_names = ["iso-8859-1", "latin-1", "ascii"]
    return unicode_names + windows_names + legacy_names

75 

76 

def is_acceptable_encoding(encoding: str) -> bool:
    """
    Tell whether an encoding name is in the acceptable cross-platform set.

    The comparison ignores letter case.

    Args:
        encoding: The encoding name to check.

    Returns:
        True if the name matches one of get_acceptable_encodings(),
        False otherwise.
    """
    wanted = encoding.lower()
    for candidate in get_acceptable_encodings():
        if candidate.lower() == wanted:
            return True
    return False

88 

89 

def normalize_encoding(encoding: str) -> str:
    """
    Map an encoding name onto one canonical, lower-case spelling.

    Args:
        encoding: The encoding name to normalize.

    Returns:
        The canonical name for the known Windows and Latin-1 aliases, or
        the lower-cased input for any other name.
    """
    # Alias -> canonical spelling; keys are already lower-case.
    canonical_by_alias = {
        "windows-1252": "windows-1252",
        "cp1252": "windows-1252",
        "windows-1254": "windows-1254",
        "cp1254": "windows-1254",
        "iso-8859-1": "iso-8859-1",
        "latin-1": "iso-8859-1",
    }
    lowered = encoding.lower()
    return canonical_by_alias.get(lowered, lowered)

111 

112 

def get_platform_default_encoding() -> str:
    """
    Report the encoding assumed as the default on the running platform.

    Returns:
        "windows-1252" when sys.platform starts with "win", "utf-8" on
        every other platform.
    """
    import sys

    on_windows = sys.platform.startswith("win")
    # windows-1252 is the common Windows default; utf-8 the common Unix one.
    return "windows-1252" if on_windows else "utf-8"