lmcat.processing_pipeline
1from importlib.util import spec_from_file_location, module_from_spec 2import sys 3from pathlib import Path 4from typing import Literal 5import re 6import warnings 7 8from lmcat.processors import ( 9 ProcessorName, 10 DeciderName, 11 ProcessorFunc, 12 DeciderFunc, 13 PROCESSORS, 14 DECIDERS, 15) 16 17OnMultipleProcessors = Literal["warn", "except", "do_first", "do_last", "skip"] 18 19 20def _compile_glob(pattern: str) -> re.Pattern: 21 """Convert a glob pattern to a regex pattern. 22 23 # Parameters: 24 - `pattern : str` 25 Glob pattern to compile 26 27 # Returns: 28 - `re.Pattern` 29 Compiled regex pattern 30 """ 31 regex_str: str = pattern.replace(".", r"\.").replace("*", ".*").replace("?", ".") 32 return re.compile(f"^{regex_str}$") 33 34 35def load_plugins(plugins_file: Path) -> None: 36 """Load plugins from a Python file. 37 38 # Parameters: 39 - `plugins_file : Path` 40 Path to plugins file 41 """ 42 if not plugins_file.exists(): 43 return 44 45 try: 46 # Load module 47 spec = spec_from_file_location("lmcat_plugins", plugins_file) 48 if spec is None or spec.loader is None: 49 return 50 51 module = module_from_spec(spec) 52 # Add to sys.modules so imports work properly 53 sys.modules["lmcat_plugins"] = module 54 spec.loader.exec_module(module) 55 except Exception as e: 56 print(f"Error loading plugins: {e}", file=sys.stderr) 57 58 59class ProcessingPipeline: 60 """Manages the processing pipeline for files. 
61 62 # Attributes: 63 - `glob_process : dict[str, ProcessorName]` 64 Maps glob patterns to processor names 65 - `decider_process : dict[DeciderName, ProcessorName]` 66 Maps decider names to processor names 67 - `_compiled_globs : dict[str, re.Pattern]` 68 Cached compiled glob patterns for performance 69 """ 70 71 def __init__( 72 self, 73 plugins_file: Path | None, 74 decider_process_keys: dict[DeciderName, ProcessorName], 75 glob_process_keys: dict[str, ProcessorName], 76 on_multiple_processors: OnMultipleProcessors, 77 ): 78 # store the vars 79 self.plugins_file: Path | None = plugins_file 80 self.decider_process_keys: dict[DeciderName, ProcessorName] = ( 81 decider_process_keys 82 ) 83 self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys 84 self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors 85 86 # load the plugins file 87 if self.plugins_file is not None: 88 load_plugins(self.plugins_file) 89 90 # try to get the glob and decider processor functions 91 try: 92 self.decider_process: dict[DeciderFunc, ProcessorFunc] = { 93 DECIDERS[decider_name]: PROCESSORS[processor_name] 94 for decider_name, processor_name in self.decider_process_keys.items() 95 } 96 except KeyError as e: 97 raise ValueError( 98 f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }" 99 ) from e 100 101 try: 102 self.glob_process: dict[re.Pattern, ProcessorFunc] = { 103 _compile_glob(glob_pattern): PROCESSORS[processor_name] 104 for glob_pattern, processor_name in self.glob_process_keys.items() 105 } 106 except KeyError as e: 107 raise ValueError( 108 f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }" 109 ) from e 110 111 def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]: 112 """Get all applicable processors for a given path. 
113 114 # Parameters: 115 - `path : Path` 116 Path to get processors for 117 118 # Returns: 119 - `list[ProcessorFunc]` 120 List of applicable path processors 121 """ 122 processors: list[ProcessorFunc] = [] 123 124 # Check deciders 125 for decider, processor in self.decider_process.items(): 126 if decider(path): 127 processors.append(processor) 128 129 # Check glob patterns 130 for glob_pattern, processor in self.glob_process.items(): 131 if glob_pattern.match(path.name): 132 processors.append(processor) 133 134 return processors 135 136 def process_file(self, path: Path) -> tuple[str, str | None]: 137 """Process a file through the pipeline. 138 139 # Parameters: 140 - `path : Path` 141 Path to process the content of 142 143 # Returns: 144 - `tuple[str, str]` 145 Processed content and the processor name 146 if no processor is found, will be `(path.read_text(), None)` 147 """ 148 # Get all applicable processors 149 processors: list[ProcessorFunc] = self.get_processors_for_path(path) 150 151 # Early return if no processors 152 selected_processor: ProcessorFunc | None 153 154 if len(processors) == 0: 155 selected_processor = None 156 elif len(processors) == 1: 157 # Apply single processor 158 selected_processor = processors[0] 159 else: 160 match self.on_multiple_processors: 161 case "warn": 162 warnings.warn(f"Multiple processors for {path.name}: {processors}") 163 selected_processor = processors[0] 164 case "except": 165 raise ValueError( 166 f"Multiple processors for {path.name}: {processors}" 167 ) 168 case "do_first": 169 selected_processor = processors[0] 170 case "do_last": 171 selected_processor = processors[-1] 172 case "skip": 173 selected_processor = None 174 case _: 175 raise ValueError( 176 f"Invalid on_multiple_processors: {self.on_multiple_processors = }" 177 ) 178 179 # Process the file and return 180 if selected_processor is None: 181 return path.read_text(encoding="utf-8", errors="surrogateescape"), None 182 else: 183 return 
selected_processor(path), selected_processor.__name__
OnMultipleProcessors =
typing.Literal['warn', 'except', 'do_first', 'do_last', 'skip']
def
load_plugins(plugins_file: pathlib.Path) -> None:
36def load_plugins(plugins_file: Path) -> None: 37 """Load plugins from a Python file. 38 39 # Parameters: 40 - `plugins_file : Path` 41 Path to plugins file 42 """ 43 if not plugins_file.exists(): 44 return 45 46 try: 47 # Load module 48 spec = spec_from_file_location("lmcat_plugins", plugins_file) 49 if spec is None or spec.loader is None: 50 return 51 52 module = module_from_spec(spec) 53 # Add to sys.modules so imports work properly 54 sys.modules["lmcat_plugins"] = module 55 spec.loader.exec_module(module) 56 except Exception as e: 57 print(f"Error loading plugins: {e}", file=sys.stderr)
Load plugins from a Python file.
Parameters:
plugins_file : Path
Path to plugins file
class
ProcessingPipeline:
60class ProcessingPipeline: 61 """Manages the processing pipeline for files. 62 63 # Attributes: 64 - `glob_process : dict[str, ProcessorName]` 65 Maps glob patterns to processor names 66 - `decider_process : dict[DeciderName, ProcessorName]` 67 Maps decider names to processor names 68 - `_compiled_globs : dict[str, re.Pattern]` 69 Cached compiled glob patterns for performance 70 """ 71 72 def __init__( 73 self, 74 plugins_file: Path | None, 75 decider_process_keys: dict[DeciderName, ProcessorName], 76 glob_process_keys: dict[str, ProcessorName], 77 on_multiple_processors: OnMultipleProcessors, 78 ): 79 # store the vars 80 self.plugins_file: Path | None = plugins_file 81 self.decider_process_keys: dict[DeciderName, ProcessorName] = ( 82 decider_process_keys 83 ) 84 self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys 85 self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors 86 87 # load the plugins file 88 if self.plugins_file is not None: 89 load_plugins(self.plugins_file) 90 91 # try to get the glob and decider processor functions 92 try: 93 self.decider_process: dict[DeciderFunc, ProcessorFunc] = { 94 DECIDERS[decider_name]: PROCESSORS[processor_name] 95 for decider_name, processor_name in self.decider_process_keys.items() 96 } 97 except KeyError as e: 98 raise ValueError( 99 f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }" 100 ) from e 101 102 try: 103 self.glob_process: dict[re.Pattern, ProcessorFunc] = { 104 _compile_glob(glob_pattern): PROCESSORS[processor_name] 105 for glob_pattern, processor_name in self.glob_process_keys.items() 106 } 107 except KeyError as e: 108 raise ValueError( 109 f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }" 110 ) from e 111 112 def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]: 113 """Get all applicable processors for a given path. 
114 115 # Parameters: 116 - `path : Path` 117 Path to get processors for 118 119 # Returns: 120 - `list[ProcessorFunc]` 121 List of applicable path processors 122 """ 123 processors: list[ProcessorFunc] = [] 124 125 # Check deciders 126 for decider, processor in self.decider_process.items(): 127 if decider(path): 128 processors.append(processor) 129 130 # Check glob patterns 131 for glob_pattern, processor in self.glob_process.items(): 132 if glob_pattern.match(path.name): 133 processors.append(processor) 134 135 return processors 136 137 def process_file(self, path: Path) -> tuple[str, str | None]: 138 """Process a file through the pipeline. 139 140 # Parameters: 141 - `path : Path` 142 Path to process the content of 143 144 # Returns: 145 - `tuple[str, str]` 146 Processed content and the processor name 147 if no processor is found, will be `(path.read_text(), None)` 148 """ 149 # Get all applicable processors 150 processors: list[ProcessorFunc] = self.get_processors_for_path(path) 151 152 # Early return if no processors 153 selected_processor: ProcessorFunc | None 154 155 if len(processors) == 0: 156 selected_processor = None 157 elif len(processors) == 1: 158 # Apply single processor 159 selected_processor = processors[0] 160 else: 161 match self.on_multiple_processors: 162 case "warn": 163 warnings.warn(f"Multiple processors for {path.name}: {processors}") 164 selected_processor = processors[0] 165 case "except": 166 raise ValueError( 167 f"Multiple processors for {path.name}: {processors}" 168 ) 169 case "do_first": 170 selected_processor = processors[0] 171 case "do_last": 172 selected_processor = processors[-1] 173 case "skip": 174 selected_processor = None 175 case _: 176 raise ValueError( 177 f"Invalid on_multiple_processors: {self.on_multiple_processors = }" 178 ) 179 180 # Process the file and return 181 if selected_processor is None: 182 return path.read_text(encoding="utf-8", errors="surrogateescape"), None 183 else: 184 return 
selected_processor(path), selected_processor.__name__
Manages the processing pipeline for files.
Attributes:
glob_process : dict[str, ProcessorName]
    Maps glob patterns to processor names
decider_process : dict[DeciderName, ProcessorName]
    Maps decider names to processor names
_compiled_globs : dict[str, re.Pattern]
    Cached compiled glob patterns for performance
ProcessingPipeline( plugins_file: pathlib.Path | None, decider_process_keys: dict[str, str], glob_process_keys: dict[str, str], on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip'])
72 def __init__( 73 self, 74 plugins_file: Path | None, 75 decider_process_keys: dict[DeciderName, ProcessorName], 76 glob_process_keys: dict[str, ProcessorName], 77 on_multiple_processors: OnMultipleProcessors, 78 ): 79 # store the vars 80 self.plugins_file: Path | None = plugins_file 81 self.decider_process_keys: dict[DeciderName, ProcessorName] = ( 82 decider_process_keys 83 ) 84 self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys 85 self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors 86 87 # load the plugins file 88 if self.plugins_file is not None: 89 load_plugins(self.plugins_file) 90 91 # try to get the glob and decider processor functions 92 try: 93 self.decider_process: dict[DeciderFunc, ProcessorFunc] = { 94 DECIDERS[decider_name]: PROCESSORS[processor_name] 95 for decider_name, processor_name in self.decider_process_keys.items() 96 } 97 except KeyError as e: 98 raise ValueError( 99 f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }" 100 ) from e 101 102 try: 103 self.glob_process: dict[re.Pattern, ProcessorFunc] = { 104 _compile_glob(glob_pattern): PROCESSORS[processor_name] 105 for glob_pattern, processor_name in self.glob_process_keys.items() 106 } 107 except KeyError as e: 108 raise ValueError( 109 f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }" 110 ) from e
def
get_processors_for_path(self, path: pathlib.Path) -> list[typing.Callable[[pathlib.Path], str]]:
112 def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]: 113 """Get all applicable processors for a given path. 114 115 # Parameters: 116 - `path : Path` 117 Path to get processors for 118 119 # Returns: 120 - `list[ProcessorFunc]` 121 List of applicable path processors 122 """ 123 processors: list[ProcessorFunc] = [] 124 125 # Check deciders 126 for decider, processor in self.decider_process.items(): 127 if decider(path): 128 processors.append(processor) 129 130 # Check glob patterns 131 for glob_pattern, processor in self.glob_process.items(): 132 if glob_pattern.match(path.name): 133 processors.append(processor) 134 135 return processors
Get all applicable processors for a given path.
Parameters:
path : Path
Path to get processors for
Returns:
list[ProcessorFunc]
List of applicable path processors
def
process_file(self, path: pathlib.Path) -> tuple[str, str | None]:
137 def process_file(self, path: Path) -> tuple[str, str | None]: 138 """Process a file through the pipeline. 139 140 # Parameters: 141 - `path : Path` 142 Path to process the content of 143 144 # Returns: 145 - `tuple[str, str]` 146 Processed content and the processor name 147 if no processor is found, will be `(path.read_text(), None)` 148 """ 149 # Get all applicable processors 150 processors: list[ProcessorFunc] = self.get_processors_for_path(path) 151 152 # Early return if no processors 153 selected_processor: ProcessorFunc | None 154 155 if len(processors) == 0: 156 selected_processor = None 157 elif len(processors) == 1: 158 # Apply single processor 159 selected_processor = processors[0] 160 else: 161 match self.on_multiple_processors: 162 case "warn": 163 warnings.warn(f"Multiple processors for {path.name}: {processors}") 164 selected_processor = processors[0] 165 case "except": 166 raise ValueError( 167 f"Multiple processors for {path.name}: {processors}" 168 ) 169 case "do_first": 170 selected_processor = processors[0] 171 case "do_last": 172 selected_processor = processors[-1] 173 case "skip": 174 selected_processor = None 175 case _: 176 raise ValueError( 177 f"Invalid on_multiple_processors: {self.on_multiple_processors = }" 178 ) 179 180 # Process the file and return 181 if selected_processor is None: 182 return path.read_text(encoding="utf-8", errors="surrogateescape"), None 183 else: 184 return selected_processor(path), selected_processor.__name__
Process a file through the pipeline.
Parameters:
path : Path
Path to process the content of
Returns:
tuple[str, str | None]
Processed content and the processor name; if no processor is found, will be `(path.read_text(), None)`