Coverage for lmcat\processing_pipeline.py: 83%
71 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-29 16:42 -0700
1from importlib.util import spec_from_file_location, module_from_spec
2import sys
3from pathlib import Path
4from typing import Literal
5import re
6import warnings
8from lmcat.processors import (
9 ProcessorName,
10 DeciderName,
11 ProcessorFunc,
12 DeciderFunc,
13 PROCESSORS,
14 DECIDERS,
15)
# Conflict-resolution strategy when more than one processor matches a file:
# "warn" (use first, emit warning), "except" (raise), "do_first"/"do_last"
# (pick first/last match silently), "skip" (apply no processor at all).
OnMultipleProcessors = Literal["warn", "except", "do_first", "do_last", "skip"]
20def _compile_glob(pattern: str) -> re.Pattern:
21 """Convert a glob pattern to a regex pattern.
23 # Parameters:
24 - `pattern : str`
25 Glob pattern to compile
27 # Returns:
28 - `re.Pattern`
29 Compiled regex pattern
30 """
31 regex_str: str = pattern.replace(".", r"\.").replace("*", ".*").replace("?", ".")
32 return re.compile(f"^{regex_str}$")
def load_plugins(plugins_file: Path) -> None:
	"""Import user-supplied plugin code from `plugins_file`.

	A missing file is silently ignored. Any exception raised while
	importing the module is reported on stderr instead of propagating,
	so a broken plugins file never aborts the caller.

	# Parameters:
	 - `plugins_file : Path`
	   Path to plugins file
	"""
	if not plugins_file.exists():
		return

	try:
		module_spec = spec_from_file_location("lmcat_plugins", plugins_file)
		if module_spec is None or module_spec.loader is None:
			return
		plugin_module = module_from_spec(module_spec)
		# register under sys.modules before executing so that imports
		# performed inside the plugin file resolve properly
		sys.modules["lmcat_plugins"] = plugin_module
		module_spec.loader.exec_module(plugin_module)
	except Exception as e:
		print(f"Error loading plugins: {e}", file=sys.stderr)
class ProcessingPipeline:
	"""Resolves and applies per-file content processors.

	Holds two lookup tables built from name-based config at construction
	time: decider functions mapped to processor functions, and compiled
	glob patterns mapped to processor functions.

	# Attributes:
	 - `decider_process_keys : dict[DeciderName, ProcessorName]`
	   Config mapping of decider names to processor names
	 - `glob_process_keys : dict[str, ProcessorName]`
	   Config mapping of glob patterns to processor names
	 - `decider_process : dict[DeciderFunc, ProcessorFunc]`
	   Resolved decider function -> processor function
	 - `glob_process : dict[re.Pattern, ProcessorFunc]`
	   Resolved compiled glob -> processor function
	"""

	def __init__(
		self,
		plugins_file: Path | None,
		decider_process_keys: dict[DeciderName, ProcessorName],
		glob_process_keys: dict[str, ProcessorName],
		on_multiple_processors: OnMultipleProcessors,
	):
		# keep the raw config around for error messages and introspection
		self.plugins_file: Path | None = plugins_file
		self.decider_process_keys: dict[DeciderName, ProcessorName] = (
			decider_process_keys
		)
		self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys
		self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors

		# plugins may register additional deciders/processors, so load
		# them before resolving any names below
		if self.plugins_file is not None:
			load_plugins(self.plugins_file)

		# resolve decider names to functions; unknown names fail loudly
		self.decider_process: dict[DeciderFunc, ProcessorFunc] = {}
		try:
			for decider_name, processor_name in self.decider_process_keys.items():
				self.decider_process[DECIDERS[decider_name]] = PROCESSORS[
					processor_name
				]
		except KeyError as e:
			raise ValueError(
				f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }"
			) from e

		# compile glob patterns and resolve their processor names
		self.glob_process: dict[re.Pattern, ProcessorFunc] = {}
		try:
			for glob_pattern, processor_name in self.glob_process_keys.items():
				self.glob_process[_compile_glob(glob_pattern)] = PROCESSORS[
					processor_name
				]
		except KeyError as e:
			raise ValueError(
				f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }"
			) from e

	def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]:
		"""Collect every processor whose decider or glob matches `path`.

		Decider matches come first, then glob matches; glob patterns are
		tested against `path.name` only, not the full path.

		# Parameters:
		 - `path : Path`
		   Path to get processors for

		# Returns:
		 - `list[ProcessorFunc]`
		   List of applicable path processors
		"""
		matched: list[ProcessorFunc] = [
			processor
			for decider, processor in self.decider_process.items()
			if decider(path)
		]
		matched.extend(
			processor
			for pattern, processor in self.glob_process.items()
			if pattern.match(path.name)
		)
		return matched

	def process_file(self, path: Path) -> tuple[str, str | None]:
		"""Process a file through the pipeline.

		When several processors match, `self.on_multiple_processors`
		decides how the conflict is resolved.

		# Parameters:
		 - `path : Path`
		   Path to process the content of

		# Returns:
		 - `tuple[str, str | None]`
		   Processed content and the processor name;
		   if no processor is found, will be `(path.read_text(), None)`

		# Raises:
		 - `ValueError` on multiple matches with the "except" strategy,
		   or on an unrecognized `on_multiple_processors` value
		"""
		processors: list[ProcessorFunc] = self.get_processors_for_path(path)

		chosen: ProcessorFunc | None
		if not processors:
			chosen = None
		elif len(processors) == 1:
			chosen = processors[0]
		elif self.on_multiple_processors == "warn":
			warnings.warn(f"Multiple processors for {path.name}: {processors}")
			chosen = processors[0]
		elif self.on_multiple_processors == "except":
			raise ValueError(f"Multiple processors for {path.name}: {processors}")
		elif self.on_multiple_processors == "do_first":
			chosen = processors[0]
		elif self.on_multiple_processors == "do_last":
			chosen = processors[-1]
		elif self.on_multiple_processors == "skip":
			chosen = None
		else:
			# defensive: should be unreachable given the Literal type
			raise ValueError(
				f"Invalid on_multiple_processors: {self.on_multiple_processors = }"
			)

		if chosen is None:
			# surrogateescape keeps undecodable bytes round-trippable
			return path.read_text(encoding="utf-8", errors="surrogateescape"), None
		return chosen(path), chosen.__name__