Coverage for lmcat\processing_pipeline.py: 83%

71 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-29 16:42 -0700

1from importlib.util import spec_from_file_location, module_from_spec 

2import sys 

3from pathlib import Path 

4from typing import Literal 

5import re 

6import warnings 

7 

8from lmcat.processors import ( 

9 ProcessorName, 

10 DeciderName, 

11 ProcessorFunc, 

12 DeciderFunc, 

13 PROCESSORS, 

14 DECIDERS, 

15) 

16 

# Policy used by `ProcessingPipeline.process_file` when more than one
# processor applies to the same file:
#  - "warn":     emit a warning, then use the first matching processor
#  - "except":   raise a `ValueError`
#  - "do_first": use the first matching processor
#  - "do_last":  use the last matching processor
#  - "skip":     use no processor (the raw file text is returned)
OnMultipleProcessors = Literal["warn", "except", "do_first", "do_last", "skip"]

18 

19 

20def _compile_glob(pattern: str) -> re.Pattern: 

21 """Convert a glob pattern to a regex pattern. 

22 

23 # Parameters: 

24 - `pattern : str` 

25 Glob pattern to compile 

26 

27 # Returns: 

28 - `re.Pattern` 

29 Compiled regex pattern 

30 """ 

31 regex_str: str = pattern.replace(".", r"\.").replace("*", ".*").replace("?", ".") 

32 return re.compile(f"^{regex_str}$") 

33 

34 

def load_plugins(plugins_file: Path) -> None:
    """Load plugins from a Python file.

    The file is executed as a module registered under the name
    `lmcat_plugins`. Loading is best-effort: a missing file or a file for
    which no import spec/loader can be built is silently ignored, and any
    exception raised while loading is reported on stderr rather than
    propagated.

    # Parameters:
     - `plugins_file : Path`
        Path to plugins file
    """
    if not plugins_file.exists():
        return

    module_name: str = "lmcat_plugins"
    module = None

    try:
        # Load module
        spec = spec_from_file_location(module_name, plugins_file)
        if spec is None or spec.loader is None:
            return

        module = module_from_spec(spec)
        # Add to sys.modules so imports work properly
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
    except Exception as e:
        # Do not leave a half-initialised module registered: a later
        # `import lmcat_plugins` would otherwise silently pick up the broken
        # object. Only remove the entry we registered ourselves.
        if module is not None and sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        print(f"Error loading plugins: {e}", file=sys.stderr)

57 

58 

59class ProcessingPipeline: 

60 """Manages the processing pipeline for files. 

61 

62 # Attributes: 

63 - `glob_process : dict[str, ProcessorName]` 

64 Maps glob patterns to processor names 

65 - `decider_process : dict[DeciderName, ProcessorName]` 

66 Maps decider names to processor names 

67 - `_compiled_globs : dict[str, re.Pattern]` 

68 Cached compiled glob patterns for performance 

69 """ 

70 

71 def __init__( 

72 self, 

73 plugins_file: Path | None, 

74 decider_process_keys: dict[DeciderName, ProcessorName], 

75 glob_process_keys: dict[str, ProcessorName], 

76 on_multiple_processors: OnMultipleProcessors, 

77 ): 

78 # store the vars 

79 self.plugins_file: Path | None = plugins_file 

80 self.decider_process_keys: dict[DeciderName, ProcessorName] = ( 

81 decider_process_keys 

82 ) 

83 self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys 

84 self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors 

85 

86 # load the plugins file 

87 if self.plugins_file is not None: 

88 load_plugins(self.plugins_file) 

89 

90 # try to get the glob and decider processor functions 

91 try: 

92 self.decider_process: dict[DeciderFunc, ProcessorFunc] = { 

93 DECIDERS[decider_name]: PROCESSORS[processor_name] 

94 for decider_name, processor_name in self.decider_process_keys.items() 

95 } 

96 except KeyError as e: 

97 raise ValueError( 

98 f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }" 

99 ) from e 

100 

101 try: 

102 self.glob_process: dict[re.Pattern, ProcessorFunc] = { 

103 _compile_glob(glob_pattern): PROCESSORS[processor_name] 

104 for glob_pattern, processor_name in self.glob_process_keys.items() 

105 } 

106 except KeyError as e: 

107 raise ValueError( 

108 f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }" 

109 ) from e 

110 

111 def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]: 

112 """Get all applicable processors for a given path. 

113 

114 # Parameters: 

115 - `path : Path` 

116 Path to get processors for 

117 

118 # Returns: 

119 - `list[ProcessorFunc]` 

120 List of applicable path processors 

121 """ 

122 processors: list[ProcessorFunc] = [] 

123 

124 # Check deciders 

125 for decider, processor in self.decider_process.items(): 

126 if decider(path): 

127 processors.append(processor) 

128 

129 # Check glob patterns 

130 for glob_pattern, processor in self.glob_process.items(): 

131 if glob_pattern.match(path.name): 

132 processors.append(processor) 

133 

134 return processors 

135 

136 def process_file(self, path: Path) -> tuple[str, str | None]: 

137 """Process a file through the pipeline. 

138 

139 # Parameters: 

140 - `path : Path` 

141 Path to process the content of 

142 

143 # Returns: 

144 - `tuple[str, str]` 

145 Processed content and the processor name 

146 if no processor is found, will be `(path.read_text(), None)` 

147 """ 

148 # Get all applicable processors 

149 processors: list[ProcessorFunc] = self.get_processors_for_path(path) 

150 

151 # Early return if no processors 

152 selected_processor: ProcessorFunc | None 

153 

154 if len(processors) == 0: 

155 selected_processor = None 

156 elif len(processors) == 1: 

157 # Apply single processor 

158 selected_processor = processors[0] 

159 else: 

160 match self.on_multiple_processors: 

161 case "warn": 

162 warnings.warn(f"Multiple processors for {path.name}: {processors}") 

163 selected_processor = processors[0] 

164 case "except": 

165 raise ValueError( 

166 f"Multiple processors for {path.name}: {processors}" 

167 ) 

168 case "do_first": 

169 selected_processor = processors[0] 

170 case "do_last": 

171 selected_processor = processors[-1] 

172 case "skip": 

173 selected_processor = None 

174 case _: 

175 raise ValueError( 

176 f"Invalid on_multiple_processors: {self.on_multiple_processors = }" 

177 ) 

178 

179 # Process the file and return 

180 if selected_processor is None: 

181 return path.read_text(encoding="utf-8", errors="surrogateescape"), None 

182 else: 

183 return selected_processor(path), selected_processor.__name__