docs for lmcat v0.1.2
View Source on GitHub

lmcat.processing_pipeline


  1from importlib.util import spec_from_file_location, module_from_spec
  2import sys
  3from pathlib import Path
  4from typing import Literal
  5import re
  6import warnings
  7
  8from lmcat.processors import (
  9	ProcessorName,
 10	DeciderName,
 11	ProcessorFunc,
 12	DeciderFunc,
 13	PROCESSORS,
 14	DECIDERS,
 15)
 16
 17OnMultipleProcessors = Literal["warn", "except", "do_first", "do_last", "skip"]
 18
 19
 20def _compile_glob(pattern: str) -> re.Pattern:
 21	"""Convert a glob pattern to a regex pattern.
 22
 23	# Parameters:
 24		- `pattern : str`
 25		Glob pattern to compile
 26
 27	# Returns:
 28		- `re.Pattern`
 29		Compiled regex pattern
 30	"""
 31	regex_str: str = pattern.replace(".", r"\.").replace("*", ".*").replace("?", ".")
 32	return re.compile(f"^{regex_str}$")
 33
 34
 35def load_plugins(plugins_file: Path) -> None:
 36	"""Load plugins from a Python file.
 37
 38	# Parameters:
 39	 - `plugins_file : Path`
 40	    Path to plugins file
 41	"""
 42	if not plugins_file.exists():
 43		return
 44
 45	try:
 46		# Load module
 47		spec = spec_from_file_location("lmcat_plugins", plugins_file)
 48		if spec is None or spec.loader is None:
 49			return
 50
 51		module = module_from_spec(spec)
 52		# Add to sys.modules so imports work properly
 53		sys.modules["lmcat_plugins"] = module
 54		spec.loader.exec_module(module)
 55	except Exception as e:
 56		print(f"Error loading plugins: {e}", file=sys.stderr)
 57
 58
class ProcessingPipeline:
	"""Manages the processing pipeline for files.

	Maps file paths to processor functions, either via decider functions
	(predicates on the path) or via glob patterns matched against the
	file name.

	# Attributes:
	 - `glob_process : dict[re.Pattern, ProcessorFunc]`
		Maps compiled glob patterns to processor functions
	 - `decider_process : dict[DeciderFunc, ProcessorFunc]`
		Maps decider functions to processor functions
	"""

	def __init__(
		self,
		plugins_file: Path | None,
		decider_process_keys: dict[DeciderName, ProcessorName],
		glob_process_keys: dict[str, ProcessorName],
		on_multiple_processors: OnMultipleProcessors,
	):
		"""Store configuration, load plugins, and resolve processor names.

		# Parameters:
		 - `plugins_file : Path | None`
			Optional Python file loaded via `load_plugins` before name
			resolution, so plugin-registered deciders/processors are found
		 - `decider_process_keys : dict[DeciderName, ProcessorName]`
			Decider name -> processor name, resolved against `DECIDERS`
			and `PROCESSORS`
		 - `glob_process_keys : dict[str, ProcessorName]`
			Glob pattern -> processor name, resolved against `PROCESSORS`
		 - `on_multiple_processors : OnMultipleProcessors`
			Conflict policy used by `process_file` when several
			processors match a path

		# Raises:
		 - `ValueError` : if any decider or processor name is unknown
		"""
		# store the vars
		self.plugins_file: Path | None = plugins_file
		self.decider_process_keys: dict[DeciderName, ProcessorName] = (
			decider_process_keys
		)
		self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys
		self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors

		# load the plugins file
		if self.plugins_file is not None:
			load_plugins(self.plugins_file)

		# try to get the glob and decider processor functions
		# (done after plugin loading so plugin-registered names resolve)
		try:
			self.decider_process: dict[DeciderFunc, ProcessorFunc] = {
				DECIDERS[decider_name]: PROCESSORS[processor_name]
				for decider_name, processor_name in self.decider_process_keys.items()
			}
		except KeyError as e:
			raise ValueError(
				f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }"
			) from e

		try:
			self.glob_process: dict[re.Pattern, ProcessorFunc] = {
				_compile_glob(glob_pattern): PROCESSORS[processor_name]
				for glob_pattern, processor_name in self.glob_process_keys.items()
			}
		except KeyError as e:
			raise ValueError(
				f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }"
			) from e

	def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]:
		"""Get all applicable processors for a given path.

		Deciders are checked first, then glob patterns (matched against
		`path.name` only, not the full path); result order follows the
		insertion order of the underlying dicts.

		# Parameters:
		 - `path : Path`
			Path to get processors for

		# Returns:
		 - `list[ProcessorFunc]`
			List of applicable path processors
		"""
		processors: list[ProcessorFunc] = []

		# Check deciders
		for decider, processor in self.decider_process.items():
			if decider(path):
				processors.append(processor)

		# Check glob patterns
		for glob_pattern, processor in self.glob_process.items():
			if glob_pattern.match(path.name):
				processors.append(processor)

		return processors

	def process_file(self, path: Path) -> tuple[str, str | None]:
		"""Process a file through the pipeline.

		# Parameters:
		 - `path : Path`
			Path to process the content of

		# Returns:
		 - `tuple[str, str | None]`
			Processed content and the processor name
			if no processor is found, will be `(path.read_text(), None)`

		# Raises:
		 - `ValueError` : if multiple processors match and
			`on_multiple_processors == "except"`, or on an invalid
			`on_multiple_processors` value
		"""
		# Get all applicable processors
		processors: list[ProcessorFunc] = self.get_processors_for_path(path)

		# Early return if no processors
		selected_processor: ProcessorFunc | None

		if len(processors) == 0:
			selected_processor = None
		elif len(processors) == 1:
			# Apply single processor
			selected_processor = processors[0]
		else:
			# multiple matches: resolve per the configured policy
			match self.on_multiple_processors:
				case "warn":
					# warn, then fall back to the first match
					warnings.warn(f"Multiple processors for {path.name}: {processors}")
					selected_processor = processors[0]
				case "except":
					raise ValueError(
						f"Multiple processors for {path.name}: {processors}"
					)
				case "do_first":
					selected_processor = processors[0]
				case "do_last":
					selected_processor = processors[-1]
				case "skip":
					# skip processing entirely; raw file contents are returned
					selected_processor = None
				case _:
					# unreachable for values satisfying OnMultipleProcessors;
					# defensive guard against unchecked configuration
					raise ValueError(
						f"Invalid on_multiple_processors: {self.on_multiple_processors = }"
					)

		# Process the file and return
		if selected_processor is None:
			# surrogateescape so undecodable bytes round-trip instead of raising
			return path.read_text(encoding="utf-8", errors="surrogateescape"), None
		else:
			return selected_processor(path), selected_processor.__name__

OnMultipleProcessors = typing.Literal['warn', 'except', 'do_first', 'do_last', 'skip']
def load_plugins(plugins_file: pathlib.Path) -> None:
36def load_plugins(plugins_file: Path) -> None:
37	"""Load plugins from a Python file.
38
39	# Parameters:
40	 - `plugins_file : Path`
41	    Path to plugins file
42	"""
43	if not plugins_file.exists():
44		return
45
46	try:
47		# Load module
48		spec = spec_from_file_location("lmcat_plugins", plugins_file)
49		if spec is None or spec.loader is None:
50			return
51
52		module = module_from_spec(spec)
53		# Add to sys.modules so imports work properly
54		sys.modules["lmcat_plugins"] = module
55		spec.loader.exec_module(module)
56	except Exception as e:
57		print(f"Error loading plugins: {e}", file=sys.stderr)

Load plugins from a Python file.

Parameters:

  • plugins_file : Path Path to plugins file
class ProcessingPipeline:
 60class ProcessingPipeline:
 61	"""Manages the processing pipeline for files.
 62
 63	# Attributes:
 64	 - `glob_process : dict[str, ProcessorName]`
 65		Maps glob patterns to processor names
 66	 - `decider_process : dict[DeciderName, ProcessorName]`
 67		Maps decider names to processor names
 68	 - `_compiled_globs : dict[str, re.Pattern]`
 69		Cached compiled glob patterns for performance
 70	"""
 71
 72	def __init__(
 73		self,
 74		plugins_file: Path | None,
 75		decider_process_keys: dict[DeciderName, ProcessorName],
 76		glob_process_keys: dict[str, ProcessorName],
 77		on_multiple_processors: OnMultipleProcessors,
 78	):
 79		# store the vars
 80		self.plugins_file: Path | None = plugins_file
 81		self.decider_process_keys: dict[DeciderName, ProcessorName] = (
 82			decider_process_keys
 83		)
 84		self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys
 85		self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors
 86
 87		# load the plugins file
 88		if self.plugins_file is not None:
 89			load_plugins(self.plugins_file)
 90
 91		# try to get the glob and decider processor functions
 92		try:
 93			self.decider_process: dict[DeciderFunc, ProcessorFunc] = {
 94				DECIDERS[decider_name]: PROCESSORS[processor_name]
 95				for decider_name, processor_name in self.decider_process_keys.items()
 96			}
 97		except KeyError as e:
 98			raise ValueError(
 99				f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }"
100			) from e
101
102		try:
103			self.glob_process: dict[re.Pattern, ProcessorFunc] = {
104				_compile_glob(glob_pattern): PROCESSORS[processor_name]
105				for glob_pattern, processor_name in self.glob_process_keys.items()
106			}
107		except KeyError as e:
108			raise ValueError(
109				f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }"
110			) from e
111
112	def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]:
113		"""Get all applicable processors for a given path.
114
115		# Parameters:
116		 - `path : Path`
117			Path to get processors for
118
119		# Returns:
120		 - `list[ProcessorFunc]`
121			List of applicable path processors
122		"""
123		processors: list[ProcessorFunc] = []
124
125		# Check deciders
126		for decider, processor in self.decider_process.items():
127			if decider(path):
128				processors.append(processor)
129
130		# Check glob patterns
131		for glob_pattern, processor in self.glob_process.items():
132			if glob_pattern.match(path.name):
133				processors.append(processor)
134
135		return processors
136
137	def process_file(self, path: Path) -> tuple[str, str | None]:
138		"""Process a file through the pipeline.
139
140		# Parameters:
141		 - `path : Path`
142			Path to process the content of
143
144		# Returns:
145		 - `tuple[str, str]`
146			Processed content and the processor name
147			if no processor is found, will be `(path.read_text(), None)`
148		"""
149		# Get all applicable processors
150		processors: list[ProcessorFunc] = self.get_processors_for_path(path)
151
152		# Early return if no processors
153		selected_processor: ProcessorFunc | None
154
155		if len(processors) == 0:
156			selected_processor = None
157		elif len(processors) == 1:
158			# Apply single processor
159			selected_processor = processors[0]
160		else:
161			match self.on_multiple_processors:
162				case "warn":
163					warnings.warn(f"Multiple processors for {path.name}: {processors}")
164					selected_processor = processors[0]
165				case "except":
166					raise ValueError(
167						f"Multiple processors for {path.name}: {processors}"
168					)
169				case "do_first":
170					selected_processor = processors[0]
171				case "do_last":
172					selected_processor = processors[-1]
173				case "skip":
174					selected_processor = None
175				case _:
176					raise ValueError(
177						f"Invalid on_multiple_processors: {self.on_multiple_processors = }"
178					)
179
180		# Process the file and return
181		if selected_processor is None:
182			return path.read_text(encoding="utf-8", errors="surrogateescape"), None
183		else:
184			return selected_processor(path), selected_processor.__name__

Manages the processing pipeline for files.

Attributes:

  • glob_process : dict[re.Pattern, ProcessorFunc] Maps compiled glob patterns to processor functions
  • decider_process : dict[DeciderFunc, ProcessorFunc] Maps decider functions to processor functions
ProcessingPipeline( plugins_file: pathlib.Path | None, decider_process_keys: dict[str, str], glob_process_keys: dict[str, str], on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip'])
 72	def __init__(
 73		self,
 74		plugins_file: Path | None,
 75		decider_process_keys: dict[DeciderName, ProcessorName],
 76		glob_process_keys: dict[str, ProcessorName],
 77		on_multiple_processors: OnMultipleProcessors,
 78	):
 79		# store the vars
 80		self.plugins_file: Path | None = plugins_file
 81		self.decider_process_keys: dict[DeciderName, ProcessorName] = (
 82			decider_process_keys
 83		)
 84		self.glob_process_keys: dict[str, ProcessorName] = glob_process_keys
 85		self.on_multiple_processors: OnMultipleProcessors = on_multiple_processors
 86
 87		# load the plugins file
 88		if self.plugins_file is not None:
 89			load_plugins(self.plugins_file)
 90
 91		# try to get the glob and decider processor functions
 92		try:
 93			self.decider_process: dict[DeciderFunc, ProcessorFunc] = {
 94				DECIDERS[decider_name]: PROCESSORS[processor_name]
 95				for decider_name, processor_name in self.decider_process_keys.items()
 96			}
 97		except KeyError as e:
 98			raise ValueError(
 99				f"Invalid decider or decider processor:\n{e}\n{DECIDERS.keys() = }\n{PROCESSORS.keys() = }\n{self.decider_process_keys = }"
100			) from e
101
102		try:
103			self.glob_process: dict[re.Pattern, ProcessorFunc] = {
104				_compile_glob(glob_pattern): PROCESSORS[processor_name]
105				for glob_pattern, processor_name in self.glob_process_keys.items()
106			}
107		except KeyError as e:
108			raise ValueError(
109				f"Invalid glob processor:\n{e}\n{PROCESSORS.keys() = }\n{self.glob_process_keys = }"
110			) from e
plugins_file: pathlib.Path | None
decider_process_keys: dict[str, str]
glob_process_keys: dict[str, str]
on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip']
def get_processors_for_path(self, path: pathlib.Path) -> list[typing.Callable[[pathlib.Path], str]]:
112	def get_processors_for_path(self, path: Path) -> list[ProcessorFunc]:
113		"""Get all applicable processors for a given path.
114
115		# Parameters:
116		 - `path : Path`
117			Path to get processors for
118
119		# Returns:
120		 - `list[ProcessorFunc]`
121			List of applicable path processors
122		"""
123		processors: list[ProcessorFunc] = []
124
125		# Check deciders
126		for decider, processor in self.decider_process.items():
127			if decider(path):
128				processors.append(processor)
129
130		# Check glob patterns
131		for glob_pattern, processor in self.glob_process.items():
132			if glob_pattern.match(path.name):
133				processors.append(processor)
134
135		return processors

Get all applicable processors for a given path.

Parameters:

  • path : Path Path to get processors for

Returns:

  • list[ProcessorFunc] List of applicable path processors
def process_file(self, path: pathlib.Path) -> tuple[str, str | None]:
137	def process_file(self, path: Path) -> tuple[str, str | None]:
138		"""Process a file through the pipeline.
139
140		# Parameters:
141		 - `path : Path`
142			Path to process the content of
143
144		# Returns:
145		 - `tuple[str, str]`
146			Processed content and the processor name
147			if no processor is found, will be `(path.read_text(), None)`
148		"""
149		# Get all applicable processors
150		processors: list[ProcessorFunc] = self.get_processors_for_path(path)
151
152		# Early return if no processors
153		selected_processor: ProcessorFunc | None
154
155		if len(processors) == 0:
156			selected_processor = None
157		elif len(processors) == 1:
158			# Apply single processor
159			selected_processor = processors[0]
160		else:
161			match self.on_multiple_processors:
162				case "warn":
163					warnings.warn(f"Multiple processors for {path.name}: {processors}")
164					selected_processor = processors[0]
165				case "except":
166					raise ValueError(
167						f"Multiple processors for {path.name}: {processors}"
168					)
169				case "do_first":
170					selected_processor = processors[0]
171				case "do_last":
172					selected_processor = processors[-1]
173				case "skip":
174					selected_processor = None
175				case _:
176					raise ValueError(
177						f"Invalid on_multiple_processors: {self.on_multiple_processors = }"
178					)
179
180		# Process the file and return
181		if selected_processor is None:
182			return path.read_text(encoding="utf-8", errors="surrogateescape"), None
183		else:
184			return selected_processor(path), selected_processor.__name__

Process a file through the pipeline.

Parameters:

  • path : Path Path to process the content of

Returns:

  • tuple[str, str | None] Processed content and the processor name; if no processor is found, will be (path.read_text(), None)