Coverage for src/c2puml/utils.py: 43%

1#!/usr/bin/env python3

2"""

3Utility functions for C to PlantUML converter

4"""

6import logging

7from pathlib import Path

8from typing import Dict, Optional

# Try to import chardet; fall back to basic encoding detection if it is not available

11try:

12 import chardet

14 CHARDET_AVAILABLE = True

15except ImportError:

16 CHARDET_AVAILABLE = False

def detect_file_encoding(file_path: Path) -> str:
    """Detect the text encoding of a file, falling back conservatively.

    Detection proceeds in two stages:

    1. When ``chardet`` is available, the first 1 KB of the file is sniffed
       and chardet's answer is used if its confidence exceeds 0.7.  An
       ``"ascii"`` verdict is reported as ``"utf-8"``: only a prefix was
       inspected, so bytes further into the file may be non-ASCII, and
       UTF-8 decodes every pure-ASCII file identically.
    2. Otherwise the start of the file is test-decoded as UTF-8; if that
       fails, ``"latin-1"`` is returned, because it can decode any byte
       sequence.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of the encoding to use when opening the file.  ``"utf-8"``
        is returned (and a warning is logged) if the file cannot be read or
        detection raises an unexpected error.
    """
    try:
        if CHARDET_AVAILABLE:
            with open(file_path, "rb") as f:
                raw_data = f.read(1024)  # a 1 KB prefix is used for sniffing
            if raw_data:
                result = chardet.detect(raw_data) or {}
                detected = result.get("encoding")
                confidence = result.get("confidence") or 0.0
                # Guard against a missing encoding name so the function
                # never returns None despite its ``-> str`` contract.
                if detected and confidence > 0.7:
                    # ASCII is a strict subset of UTF-8.  Returning "ascii"
                    # based on a prefix would make callers fail on the first
                    # non-ASCII byte later in the file.
                    if detected.lower() == "ascii":
                        return "utf-8"
                    return detected

        # Fallback probing: prefer UTF-8 when the start of the file decodes.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                f.read(1024)  # test read; raises if the prefix is not UTF-8
        except UnicodeError:
            # latin-1 maps every byte to a character, so it always decodes.
            # Any encoding tried after it could never be selected.
            return "latin-1"
        return "utf-8"

    except Exception as e:
        # Deliberate best-effort: unreadable files or detector failures must
        # not abort the caller, so log and use the most common encoding.
        logging.warning(f"Failed to detect encoding for {file_path}: {e}")
        return "utf-8"

# Backward-compatibility functions for existing tests

def get_acceptable_encodings() -> list:
    """
    Return the encoding names treated as acceptable on every platform.

    Returns:
        A freshly built list of encoding names: the UTF-8/UTF-16 family,
        the Windows code pages 1252 and 1254 (in both their ``windows-*``
        and ``cp*`` spellings), ISO-8859-1/Latin-1 and ASCII.
    """
    unicode_family = ["utf-8", "utf-8-sig", "utf-16", "utf-16le", "utf-16be"]
    windows_code_pages = ["windows-1252", "windows-1254", "cp1252", "cp1254"]
    single_byte_legacy = ["iso-8859-1", "latin-1", "ascii"]
    return unicode_family + windows_code_pages + single_byte_legacy

def is_acceptable_encoding(encoding: str) -> bool:
    """
    Tell whether an encoding name is on the cross-platform accept list.

    The comparison is case-insensitive.

    Args:
        encoding: The encoding name to check.

    Returns:
        True if the lower-cased name matches an entry returned by
        ``get_acceptable_encodings()``, False otherwise.
    """
    wanted = encoding.lower()
    return any(
        candidate.lower() == wanted for candidate in get_acceptable_encodings()
    )

def normalize_encoding(encoding: str) -> str:
    """
    Map an encoding name to one canonical spelling.

    The name is lower-cased first.  The aliases ``cp1252``, ``cp1254`` and
    ``latin-1`` are folded into ``windows-1252``, ``windows-1254`` and
    ``iso-8859-1`` respectively; any other name is returned lower-cased.

    Args:
        encoding: The encoding name to normalize.

    Returns:
        The canonical, lower-case encoding name.
    """
    canonical_names = {
        "windows-1252": "windows-1252",
        "cp1252": "windows-1252",
        "windows-1254": "windows-1254",
        "cp1254": "windows-1254",
        "iso-8859-1": "iso-8859-1",
        "latin-1": "iso-8859-1",
    }
    lowered = encoding.lower()
    return canonical_names.get(lowered, lowered)

111

112

def get_platform_default_encoding() -> str:
    """
    Return the encoding assumed as the default on the running platform.

    Returns:
        ``"windows-1252"`` when ``sys.platform`` starts with ``"win"``,
        otherwise ``"utf-8"``.
    """
    import sys

    on_windows = sys.platform.startswith("win")
    # windows-1252 is the common Windows default; utf-8 the common Unix/Linux one.
    return "windows-1252" if on_windows else "utf-8"