docs for lmcat v0.1.2
View Source on GitHub

lmcat.lmcat


  1import argparse
  2import io
  3import json
  4
  5# from dataclasses import dataclass, field
  6from pathlib import Path
  7import sys
  8
  9from lmcat.processing_pipeline import ProcessingPipeline
 10
 11
 12# Handle Python 3.11+ vs older Python for TOML parsing
 13try:
 14	import tomllib
 15except ImportError:
 16	try:
 17		import tomli as tomllib  # type: ignore
 18	except ImportError:
 19		tomllib = None  # type: ignore[assignment]
 20
 21import igittigitt  # noqa: E402
 22
 23from muutils.json_serialize import (
 24	SerializableDataclass,
 25	serializable_dataclass,
 26	serializable_field,
 27)
 28from muutils.misc import shorten_numerical_to_str  # noqa: E402
 29
 30
 31from lmcat.file_stats import FileStats, TokenizerWrapper, TreeEntry, TOKENIZERS_PRESENT
 32from lmcat.processing_pipeline import OnMultipleProcessors
 33
 34
 35@serializable_dataclass(kw_only=True)
 36class LMCatConfig(SerializableDataclass):
 37	"""Configuration dataclass for lmcat"""
 38
 39	content_divider: str = serializable_field(default="``````")
 40	tree_only: bool = serializable_field(default=False)
 41
 42	# ignoring
 43	ignore_patterns: list[str] = serializable_field(default_factory=list)
 44	ignore_patterns_files: list[Path] = serializable_field(
 45		default_factory=lambda: [Path(".gitignore"), Path(".lmignore")],
 46		serialization_fn=lambda x: [p.as_posix() for p in x],
 47		deserialize_fn=lambda x: [Path(p) for p in x],
 48	)
 49
 50	# this file will be imported, and any functions in it decorated
 51	# with one of the `register_*` decorators will be added to the functions
 52	# available for use in the processing pipeline
 53	# `--allow-plugins` is a command-line-only option and must be passed for this to take effect
 54	plugins_file: Path | None = serializable_field(
 55		default=None,
 56		serialization_fn=lambda x: x.as_posix() if x else None,
 57		deserialize_fn=lambda x: Path(x) if x else None,
 58	)
 59	allow_plugins: bool = serializable_field(
 60		default=False,
 61		deserialize_fn=lambda x: False,  # this can only be overridden through the command line
 62	)
 63
 64	# processing pipeline
 65	glob_process: dict[str, str] = serializable_field(default_factory=dict)
 66	decider_process: dict[str, str] = serializable_field(default_factory=dict)
 67	on_multiple_processors: OnMultipleProcessors = serializable_field(
 68		default="except",
 69		assert_type=False,
 70	)
 71
 72	# tokenization
 73	tokenizer: str = serializable_field(
 74		default="gpt2" if TOKENIZERS_PRESENT else "whitespace-split"
 75	)
 76	"Tokenizer to use for tokenizing the output. Defaults to `gpt2`; the name is passed to `tokenizers.Tokenizer.from_pretrained()`. If a tokenizer is specified but `tokenizers` is not installed, an exception is raised; the `whitespace-split` fallback avoids this when `tokenizers` is not installed."
 77
 78	# tree formatting
 79	tree_divider: str = serializable_field(default="│   ")
 80	tree_file_divider: str = serializable_field(default="├── ")
 81	tree_indent: str = serializable_field(default=" ")
 82
 83	# output location
 84	output: str | None = serializable_field(default=None)
 85
 86	def get_tokenizer_obj(self) -> TokenizerWrapper:
 87		"""Get the tokenizer object"""
 88		return TokenizerWrapper(self.tokenizer)
 89
 90	def get_processing_pipeline(self) -> ProcessingPipeline:
 91		"""Get the processing pipeline object"""
 92		plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
 93		return ProcessingPipeline(
 94			plugins_file=plugins_file,
 95			decider_process_keys=self.decider_process,
 96			glob_process_keys=self.glob_process,
 97			on_multiple_processors=self.on_multiple_processors,
 98		)
 99
100	@classmethod
101	def read(cls, root_dir: Path) -> "LMCatConfig":
102		"""Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
103		pyproject_path: Path = root_dir / "pyproject.toml"
104		lmcat_toml_path: Path = root_dir / "lmcat.toml"
105		lmcat_json_path: Path = root_dir / "lmcat.json"
106
107		if (
108			sum(
109				int(p.is_file())
110				for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
111			)
112			> 1
113		):
114			raise ValueError(
115				"Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
116			)
117
118		# Try pyproject.toml first
119		if tomllib is not None and pyproject_path.is_file():
120			with pyproject_path.open("rb") as f:
121				pyproject_data = tomllib.load(f)
122			if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
123				return cls.load(pyproject_data["tool"]["lmcat"])
124
125		# Then try lmcat.toml
126		if tomllib is not None and lmcat_toml_path.is_file():
127			with lmcat_toml_path.open("rb") as f:
128				toml_data = tomllib.load(f)
129			return cls.load(toml_data)
130
131		# Finally try lmcat.json
132		if lmcat_json_path.is_file():
133			with lmcat_json_path.open("r", encoding="utf-8") as f:
134				json_data = json.load(f)
135			return cls.load(json_data)
136
137		# Fallback to defaults
138		return cls()
139
140
141class IgnoreHandler:
142	"""Handles all ignore pattern matching using igittigitt"""
143
144	def __init__(self, root_dir: Path, config: LMCatConfig):
145		self.root_dir: Path = root_dir
146		self.config: LMCatConfig = config
147
148		# set up parser
149		self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()
150
151		# first from the files
152		for ignore_file in self.config.ignore_patterns_files:
153			self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)
154
155		# then from the config itself
156		for pattern in self.config.ignore_patterns:
157			self.parser.add_rule(pattern=pattern, base_path=self.root_dir)
158
159	def is_ignored(self, path: Path) -> bool:
160		"""Check if a path should be ignored"""
161		# The ignore files themselves are always skipped, so they never appear in the output
162		if path.name in {".gitignore", ".lmignore"}:
163			return True
164
165		# Use igittigitt's matching
166		return self.parser.match(path)
167
168
169def sorted_entries(directory: Path) -> list[Path]:
170	"""Return directory contents sorted: directories first, then files"""
171	subdirs: list[Path] = sorted(
172		[p for p in directory.iterdir() if p.is_dir()], key=lambda x: x.name
173	)
174	files: list[Path] = sorted(
175		[p for p in directory.iterdir() if p.is_file()], key=lambda x: x.name
176	)
177	return subdirs + files
178
179
180def walk_dir(
181	directory: Path,
182	ignore_handler: IgnoreHandler,
183	config: LMCatConfig,
184	tokenizer: TokenizerWrapper,
185	prefix: str = "",
186) -> tuple[list[TreeEntry], list[Path]]:
187	"""Recursively walk a directory, building tree lines and collecting file paths"""
188	tree_output: list[TreeEntry] = []
189	collected_files: list[Path] = []
190
191	entries: list[Path] = sorted_entries(directory)
192	for i, entry in enumerate(entries):
193		if ignore_handler.is_ignored(entry):
194			continue
195
196		is_last: bool = i == len(entries) - 1
197		connector: str = (
198			config.tree_file_divider
199			if not is_last
200			else config.tree_file_divider.replace("├", "└")
201		)
202
203		if entry.is_dir():
204			tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", None))
205			extension: str = config.tree_divider if not is_last else config.tree_indent
206			sub_output: list[TreeEntry]
207			sub_files: list[Path]
208			sub_output, sub_files = walk_dir(
209				directory=entry,
210				ignore_handler=ignore_handler,
211				config=config,
212				tokenizer=tokenizer,
213				prefix=prefix + extension,
214			)
215			tree_output.extend(sub_output)
216			collected_files.extend(sub_files)
217		else:
218			stats: FileStats = FileStats.from_file(entry, tokenizer)
219			tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", stats))
220			collected_files.append(entry)
221
222	return tree_output, collected_files
223
224
225def format_tree_with_stats(
226	entries: list[TreeEntry], show_tokens: bool = False
227) -> list[str]:
228	"""Format tree entries with aligned statistics
229
230	# Parameters:
231	 - `entries : list[TreeEntry]`
232		List of tree entries with optional stats
233	 - `show_tokens : bool`
234		Whether to show token counts
235
236	# Returns:
237	 - `list[str]`
238		Formatted tree lines with aligned stats
239	"""
240	# Find max widths for alignment
241	max_line_len: int = max(len(entry.line) for entry in entries)
242	max_lines: int = max(
243		(len(f"{entry.stats.lines:,}") if entry.stats else 0) for entry in entries
244	)
245	max_chars: int = max(
246		(len(f"{entry.stats.chars:,}") if entry.stats else 0) for entry in entries
247	)
248	max_tokens: int = (
249		max(
250			(
251				len(f"{entry.stats.tokens:,}")
252				if entry.stats and entry.stats.tokens
253				else 0
254			)
255			for entry in entries
256		)
257		if show_tokens
258		else 0
259	)
260
261	formatted: list[str] = []
262	for entry in entries:
263		line: str = entry.line.ljust(max_line_len + 2)
264		if entry.stats:
265			lines_str: str = f"{entry.stats.lines:,}L".rjust(max_lines + 1)
266			chars_str: str = f"{entry.stats.chars:,}C".rjust(max_chars + 1)
267			stats_str: str = f"[{lines_str} {chars_str}"
268			if show_tokens and entry.stats.tokens is not None:
269				tokens_str: str = f"{entry.stats.tokens:,}T".rjust(max_tokens + 1)
270				stats_str += f" {tokens_str}"
271			stats_str += "]"
272			formatted.append(f"{line}{stats_str}")
273		else:
274			formatted.append(line)
275
276	return formatted
277
278
279def walk_and_collect(
280	root_dir: Path,
281	config: LMCatConfig,
282) -> tuple[list[str], list[Path]]:
283	"""Walk filesystem from root_dir and gather tree listing plus file paths"""
284	if config is None:
285		config = LMCatConfig()
286
287	tokenizer: TokenizerWrapper = config.get_tokenizer_obj()
288
289	ignore_handler = IgnoreHandler(root_dir, config)
290	base_name = root_dir.resolve().name
291
292	# Start with root directory name
293	tree_output = [TreeEntry(base_name)]
294
295	# Walk the directory tree
296	sub_output, sub_files = walk_dir(
297		directory=root_dir,
298		ignore_handler=ignore_handler,
299		config=config,
300		tokenizer=tokenizer,
301		prefix="",
302	)
303	tree_output.extend(sub_output)
304
305	# Format tree with stats
306	formatted_tree = format_tree_with_stats(
307		tree_output, show_tokens=tokenizer is not None
308	)
309
310	return formatted_tree, sub_files
311
312
313def assemble_summary(
314	root_dir: Path,
315	config: LMCatConfig,
316) -> str:
317	"""Assemble the summary output and return"""
318
319	processing_pipeline: ProcessingPipeline = config.get_processing_pipeline()
320
321	tree_output: list[str]
322	collected_files: list[Path]
323	tree_output, collected_files = walk_and_collect(
324		root_dir=root_dir,
325		config=config,
326	)
327
328	output: list[str] = []
329	output.append("# File Tree")
330	output.append("\n```")
331	output.extend(tree_output)
332	output.append("```\n")
333
334	# Add file contents if not suppressed
335	if not config.tree_only:
336		output.append("# File Contents")
337
338		for fpath in collected_files:
339			# get the path
340			relpath_posix: str = fpath.relative_to(root_dir).as_posix()
341
342			# process the contents
343			f_contents: str
344			p_name: str | None
345			f_contents, p_name = processing_pipeline.process_file(fpath)
346			processed_with: str = f'processed_with="{p_name}"' if p_name else ""
347
348			# start of file marker
349			pathspec_start: str = f'{{ path="{relpath_posix}" {processed_with} }}'
350			pathspec_end: str = f'{{ end_of_file="{relpath_posix}" }}'
351			output.append("")
352			output.append(config.content_divider + pathspec_start)
353
354			# process the actual contents of the file with the pipeline, and append
355			output.append(f_contents)
356
357			# add the end of file marker
358			output.append(config.content_divider + pathspec_end)
359
360	output_joined: str = "\n".join(output)
361
362	stats_dict_ints: dict[str, int] = {
363		"files": len(collected_files),
364		"lines": len(output_joined.splitlines()),
365		"chars": len(output_joined),
366	}
367
368	tokenizer: TokenizerWrapper = config.get_tokenizer_obj()
369
370	n_tokens: int = tokenizer.n_tokens(output_joined)
371	stats_dict_ints[f"`{tokenizer.name}` tokens"] = n_tokens
372
373	stats_header: list[str] = ["# Stats"]
374	for key, val in stats_dict_ints.items():
375		val_str: str = str(val)
376		val_short: str = shorten_numerical_to_str(val)
377		if val_str != val_short:
378			stats_header.append(f"- {val} ({val_short}) {key}")
379		else:
380			stats_header.append(f"- {val} {key}")
381
382	output_complete: str = "\n".join(stats_header) + "\n\n" + output_joined
383
384	return output_complete
385
386
387def main() -> None:
388	"""Main entry point for the script"""
389	arg_parser = argparse.ArgumentParser(
390		description="lmcat - list tree and content, combining .gitignore + .lmignore",
391		add_help=False,
392	)
393	arg_parser.add_argument(
394		"-t",
395		"--tree-only",
396		action="store_true",
397		default=False,
398		help="Only print the tree, not the file contents.",
399	)
400	arg_parser.add_argument(
401		"-o",
402		"--output",
403		action="store",
404		default=None,
405		help="Output file to write the tree and contents to.",
406	)
407	arg_parser.add_argument(
408		"-h", "--help", action="help", help="Show this help message and exit."
409	)
410	arg_parser.add_argument(
411		"--print-cfg",
412		action="store_true",
413		default=False,
414		help="Print the configuration as json and exit.",
415	)
416	arg_parser.add_argument(
417		"--allow-plugins",
418		action="store_true",
419		default=False,
420		help="Allow plugins to be loaded from the plugins file. WARNING: this will execute arbitrary code found in the file pointed to by `config.plugins_file`, and **is a security risk**.",
421	)
422
423	args: argparse.Namespace = arg_parser.parse_known_args()[0]
424	root_dir: Path = Path(".").resolve()
425	config: LMCatConfig = LMCatConfig.read(root_dir)
426
427	# CLI overrides
428	config.output = args.output
429	config.tree_only = args.tree_only
430	config.allow_plugins = args.allow_plugins
431
432	# print cfg and exit if requested
433	if args.print_cfg:
434		print(json.dumps(config.serialize(), indent="\t"))
435		return
436
437	# assemble summary
438	summary: str = assemble_summary(root_dir=root_dir, config=config)
439
440	# Write output
441	if config.output:
442		output_path: Path = Path(args.output)
443		output_path.parent.mkdir(parents=True, exist_ok=True)
444		output_path.write_text(summary, encoding="utf-8")
445	else:
446		if sys.platform == "win32":
447			sys.stdout = io.TextIOWrapper(
448				sys.stdout.buffer, encoding="utf-8", errors="replace"
449			)
450			sys.stderr = io.TextIOWrapper(
451				sys.stderr.buffer, encoding="utf-8", errors="replace"
452			)
453
454		print(summary)
455
456
457if __name__ == "__main__":
458	main()

@serializable_dataclass(kw_only=True)
class LMCatConfig(muutils.json_serialize.serializable_dataclass.SerializableDataclass):
 36@serializable_dataclass(kw_only=True)
 37class LMCatConfig(SerializableDataclass):
 38	"""Configuration dataclass for lmcat"""
 39
 40	content_divider: str = serializable_field(default="``````")
 41	tree_only: bool = serializable_field(default=False)
 42
 43	# ignoring
 44	ignore_patterns: list[str] = serializable_field(default_factory=list)
 45	ignore_patterns_files: list[Path] = serializable_field(
 46		default_factory=lambda: [Path(".gitignore"), Path(".lmignore")],
 47		serialization_fn=lambda x: [p.as_posix() for p in x],
 48		deserialize_fn=lambda x: [Path(p) for p in x],
 49	)
 50
 51	# this file will be imported, and any functions in it decorated
 52	# with one of the `register_*` decorators will be added to the functions
 53	# available for use in the processing pipeline
 54	# `--allow-plugins` is a command-line-only option and must be passed for this to take effect
 55	plugins_file: Path | None = serializable_field(
 56		default=None,
 57		serialization_fn=lambda x: x.as_posix() if x else None,
 58		deserialize_fn=lambda x: Path(x) if x else None,
 59	)
 60	allow_plugins: bool = serializable_field(
 61		default=False,
 62		deserialize_fn=lambda x: False,  # this can only be overridden through the command line
 63	)
 64
 65	# processing pipeline
 66	glob_process: dict[str, str] = serializable_field(default_factory=dict)
 67	decider_process: dict[str, str] = serializable_field(default_factory=dict)
 68	on_multiple_processors: OnMultipleProcessors = serializable_field(
 69		default="except",
 70		assert_type=False,
 71	)
 72
 73	# tokenization
 74	tokenizer: str = serializable_field(
 75		default="gpt2" if TOKENIZERS_PRESENT else "whitespace-split"
 76	)
 77	"Tokenizer to use for tokenizing the output. Defaults to `gpt2`; the name is passed to `tokenizers.Tokenizer.from_pretrained()`. If a tokenizer is specified but `tokenizers` is not installed, an exception is raised; the `whitespace-split` fallback avoids this when `tokenizers` is not installed."
 78
 79	# tree formatting
 80	tree_divider: str = serializable_field(default="│   ")
 81	tree_file_divider: str = serializable_field(default="├── ")
 82	tree_indent: str = serializable_field(default=" ")
 83
 84	# output location
 85	output: str | None = serializable_field(default=None)
 86
 87	def get_tokenizer_obj(self) -> TokenizerWrapper:
 88		"""Get the tokenizer object"""
 89		return TokenizerWrapper(self.tokenizer)
 90
 91	def get_processing_pipeline(self) -> ProcessingPipeline:
 92		"""Get the processing pipeline object"""
 93		plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
 94		return ProcessingPipeline(
 95			plugins_file=plugins_file,
 96			decider_process_keys=self.decider_process,
 97			glob_process_keys=self.glob_process,
 98			on_multiple_processors=self.on_multiple_processors,
 99		)
100
101	@classmethod
102	def read(cls, root_dir: Path) -> "LMCatConfig":
103		"""Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
104		pyproject_path: Path = root_dir / "pyproject.toml"
105		lmcat_toml_path: Path = root_dir / "lmcat.toml"
106		lmcat_json_path: Path = root_dir / "lmcat.json"
107
108		if (
109			sum(
110				int(p.is_file())
111				for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
112			)
113			> 1
114		):
115			raise ValueError(
116				"Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
117			)
118
119		# Try pyproject.toml first
120		if tomllib is not None and pyproject_path.is_file():
121			with pyproject_path.open("rb") as f:
122				pyproject_data = tomllib.load(f)
123			if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
124				return cls.load(pyproject_data["tool"]["lmcat"])
125
126		# Then try lmcat.toml
127		if tomllib is not None and lmcat_toml_path.is_file():
128			with lmcat_toml_path.open("rb") as f:
129				toml_data = tomllib.load(f)
130			return cls.load(toml_data)
131
132		# Finally try lmcat.json
133		if lmcat_json_path.is_file():
134			with lmcat_json_path.open("r", encoding="utf-8") as f:
135				json_data = json.load(f)
136			return cls.load(json_data)
137
138		# Fallback to defaults
139		return cls()

Configuration dataclass for lmcat

LMCatConfig( *, content_divider: str = '``````', tree_only: bool = False, ignore_patterns: list[str] = <factory>, ignore_patterns_files: list[pathlib.Path] = <factory>, plugins_file: pathlib.Path | None = None, allow_plugins: bool = False, glob_process: dict[str, str] = <factory>, decider_process: dict[str, str] = <factory>, on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip'] = 'except', tokenizer: str = 'gpt2', tree_divider: str = '│   ', tree_file_divider: str = '├── ', tree_indent: str = ' ', output: str | None = None)
content_divider: str = '``````'
tree_only: bool = False
ignore_patterns: list[str]
ignore_patterns_files: list[pathlib.Path]
plugins_file: pathlib.Path | None = None
allow_plugins: bool = False
glob_process: dict[str, str]
decider_process: dict[str, str]
on_multiple_processors: Literal['warn', 'except', 'do_first', 'do_last', 'skip'] = 'except'
tokenizer: str = 'gpt2'

Tokenizer to use for tokenizing the output. Defaults to gpt2; the name is passed to tokenizers.Tokenizer.from_pretrained(). If a tokenizer is specified but tokenizers is not installed, an exception is raised; the whitespace-split fallback avoids this when tokenizers is not installed.

tree_divider: str = '│   '
tree_file_divider: str = '├── '
tree_indent: str = ' '
output: str | None = None
def get_tokenizer_obj(self) -> lmcat.file_stats.TokenizerWrapper:
87	def get_tokenizer_obj(self) -> TokenizerWrapper:
88		"""Get the tokenizer object"""
89		return TokenizerWrapper(self.tokenizer)

Get the tokenizer object
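A minimal usage sketch (not from the package's own docs): the wrapper returned here is what `assemble_summary` uses for token counting via `.n_tokens()` and `.name`. The `whitespace-split` tokenizer is chosen so the snippet runs without the optional `tokenizers` dependency.

```python
from lmcat.lmcat import LMCatConfig

# use the whitespace-split fallback so no optional `tokenizers` install is needed
config = LMCatConfig(tokenizer="whitespace-split")
tok = config.get_tokenizer_obj()

# .name and .n_tokens() are the attributes this module relies on in assemble_summary
print(tok.name)
print(tok.n_tokens("hello lmcat world"))  # whitespace-separated token count
```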

def get_processing_pipeline(self) -> lmcat.processing_pipeline.ProcessingPipeline:
91	def get_processing_pipeline(self) -> ProcessingPipeline:
92		"""Get the processing pipeline object"""
93		plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
94		return ProcessingPipeline(
95			plugins_file=plugins_file,
96			decider_process_keys=self.decider_process,
97			glob_process_keys=self.glob_process,
98			on_multiple_processors=self.on_multiple_processors,
99		)

Get the processing pipeline object
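A hedged sketch of driving the pipeline directly: `process_file()` returning a `(contents, processor_name)` pair is exactly how `assemble_summary` uses it below. The `pyproject.toml` path is only illustrative and assumes such a file exists in the working directory.

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig

config = LMCatConfig(tokenizer="whitespace-split")
pipeline = config.get_processing_pipeline()

# with no glob_process/decider_process entries configured, the file should pass
# through unmodified and the processor name should be None
contents, processor_name = pipeline.process_file(Path("pyproject.toml"))
print(processor_name, len(contents))
```

Note that `plugins_file` is only forwarded to the pipeline when `allow_plugins` is true, which can only be enabled from the command line.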

@classmethod
def read(cls, root_dir: pathlib.Path) -> LMCatConfig:
101	@classmethod
102	def read(cls, root_dir: Path) -> "LMCatConfig":
103		"""Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
104		pyproject_path: Path = root_dir / "pyproject.toml"
105		lmcat_toml_path: Path = root_dir / "lmcat.toml"
106		lmcat_json_path: Path = root_dir / "lmcat.json"
107
108		if (
109			sum(
110				int(p.is_file())
111				for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
112			)
113			> 1
114		):
115			raise ValueError(
116				"Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
117			)
118
119		# Try pyproject.toml first
120		if tomllib is not None and pyproject_path.is_file():
121			with pyproject_path.open("rb") as f:
122				pyproject_data = tomllib.load(f)
123			if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
124				return cls.load(pyproject_data["tool"]["lmcat"])
125
126		# Then try lmcat.toml
127		if tomllib is not None and lmcat_toml_path.is_file():
128			with lmcat_toml_path.open("rb") as f:
129				toml_data = tomllib.load(f)
130			return cls.load(toml_data)
131
132		# Finally try lmcat.json
133		if lmcat_json_path.is_file():
134			with lmcat_json_path.open("r", encoding="utf-8") as f:
135				json_data = json.load(f)
136			return cls.load(json_data)
137
138		# Fallback to defaults
139		return cls()

Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json.
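A self-contained sketch of the lookup, using a scratch directory and an `lmcat.toml` (a `[tool.lmcat]` table in `pyproject.toml` or an `lmcat.json` works the same way). The keys are the dataclass fields above; reading TOML requires Python 3.11+ or `tomli`, otherwise TOML files are skipped and defaults are returned.

```python
import tempfile
from pathlib import Path
from lmcat.lmcat import LMCatConfig

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    # any LMCatConfig field can appear as a top-level key in lmcat.toml
    (root / "lmcat.toml").write_text(
        'tree_only = true\nignore_patterns = ["*.log"]\ntokenizer = "whitespace-split"\n',
        encoding="utf-8",
    )
    config = LMCatConfig.read(root)
    print(config.tree_only, config.ignore_patterns)  # True ['*.log']
```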

def serialize(self) -> dict[str, typing.Any]:
704        def serialize(self) -> dict[str, Any]:
705            result: dict[str, Any] = {
706                "__format__": f"{self.__class__.__name__}(SerializableDataclass)"
707            }
708            # for each field in the class
709            for field in dataclasses.fields(self):  # type: ignore[arg-type]
710                # need it to be our special SerializableField
711                if not isinstance(field, SerializableField):
712                    raise NotSerializableFieldException(
713                        f"Field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__} is not a `SerializableField`, "
714                        f"but a {type(field)} "
715                        "this state should be inaccessible, please report this bug!"
716                    )
717
718                # try to save it
719                if field.serialize:
720                    try:
721                        # get the val
722                        value = getattr(self, field.name)
723                        # if it is a serializable dataclass, serialize it
724                        if isinstance(value, SerializableDataclass):
725                            value = value.serialize()
726                        # if the value has a serialization function, use that
727                        if hasattr(value, "serialize") and callable(value.serialize):
728                            value = value.serialize()
729                        # if the field has a serialization function, use that
730                        # it would be nice to be able to override a class's `.serialize()`, but that could lead to some inconsistencies!
731                        elif field.serialization_fn:
732                            value = field.serialization_fn(value)
733
734                        # store the value in the result
735                        result[field.name] = value
736                    except Exception as e:
737                        raise FieldSerializationError(
738                            "\n".join(
739                                [
740                                    f"Error serializing field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__}",
741                                    f"{field = }",
742                                    f"{value = }",
743                                    f"{self = }",
744                                ]
745                            )
746                        ) from e
747
748            # store each property if we can get it
749            for prop in self._properties_to_serialize:
750                if hasattr(cls, prop):
751                    value = getattr(self, prop)
752                    result[prop] = value
753                else:
754                    raise AttributeError(
755                        f"Cannot serialize property '{prop}' on class {self.__class__.__module__}.{self.__class__.__name__}"
756                        + f"but it is in {self._properties_to_serialize = }"
757                        + f"\n{self = }"
758                    )
759
760            return result

returns the class as a dict, implemented by using @serializable_dataclass decorator

@classmethod
def load(cls, data: Union[dict[str, Any], ~T]) -> Type[~T]:
767        @classmethod  # type: ignore[misc]
768        def load(cls, data: dict[str, Any] | T) -> Type[T]:
769            # HACK: this is kind of ugly, but it fixes a lot of issues for when we do recursive loading with ZANJ
770            if isinstance(data, cls):
771                return data
772
773            assert isinstance(
774                data, typing.Mapping
775            ), f"When loading {cls.__name__ = } expected a Mapping, but got {type(data) = }:\n{data = }"
776
777            cls_type_hints: dict[str, Any] = get_cls_type_hints(cls)
778
779            # initialize dict for keeping what we will pass to the constructor
780            ctor_kwargs: dict[str, Any] = dict()
781
782            # iterate over the fields of the class
783            for field in dataclasses.fields(cls):
784                # check if the field is a SerializableField
785                assert isinstance(
786                    field, SerializableField
787                ), f"Field '{field.name}' on class {cls.__name__} is not a SerializableField, but a {type(field)}. this state should be inaccessible, please report this bug!\nhttps://github.com/mivanit/muutils/issues/new"
788
789                # check if the field is in the data and if it should be initialized
790                if (field.name in data) and field.init:
791                    # get the value, we will be processing it
792                    value: Any = data[field.name]
793
794                    # get the type hint for the field
795                    field_type_hint: Any = cls_type_hints.get(field.name, None)
796
797                    # we rely on the init of `SerializableField` to check that only one of `loading_fn` and `deserialize_fn` is set
798                    if field.deserialize_fn:
799                        # if it has a deserialization function, use that
800                        value = field.deserialize_fn(value)
801                    elif field.loading_fn:
802                        # if it has a loading function, use that
803                        value = field.loading_fn(data)
804                    elif (
805                        field_type_hint is not None
806                        and hasattr(field_type_hint, "load")
807                        and callable(field_type_hint.load)
808                    ):
809                        # if no loading function but has a type hint with a load method, use that
810                        if isinstance(value, dict):
811                            value = field_type_hint.load(value)
812                        else:
813                            raise FieldLoadingError(
814                                f"Cannot load value into {field_type_hint}, expected {type(value) = } to be a dict\n{value = }"
815                            )
816                    else:
817                        # assume no loading needs to happen, keep `value` as-is
818                        pass
819
820                    # store the value in the constructor kwargs
821                    ctor_kwargs[field.name] = value
822
823            # create a new instance of the class with the constructor kwargs
824            output: cls = cls(**ctor_kwargs)
825
826            # validate the types of the fields if needed
827            if on_typecheck_mismatch != ErrorMode.IGNORE:
828                fields_valid: dict[str, bool] = (
829                    SerializableDataclass__validate_fields_types__dict(
830                        output,
831                        on_typecheck_error=on_typecheck_error,
832                    )
833                )
834
835                # if there are any fields that are not valid, raise an error
836                if not all(fields_valid.values()):
837                    msg: str = (
838                        f"Type mismatch in fields of {cls.__name__}:\n"
839                        + "\n".join(
840                            [
841                                f"{k}:\texpected {cls_type_hints[k] = }, but got value {getattr(output, k) = }, {type(getattr(output, k)) = }"
842                                for k, v in fields_valid.items()
843                                if not v
844                            ]
845                        )
846                    )
847
848                    on_typecheck_mismatch.process(
849                        msg, except_cls=FieldTypeMismatchError
850                    )
851
852            # return the new instance
853            return output

takes in an appropriately structured dict and returns an instance of the class, implemented by using @serializable_dataclass decorator
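A round-trip sketch tying `serialize()` and `load()` together: `serialize()` is what `--print-cfg` dumps as JSON, and `load()` rebuilds an equivalent config from that dict. Note that `allow_plugins` always deserializes to `False` by design.

```python
from lmcat.lmcat import LMCatConfig

config = LMCatConfig(tree_only=True, ignore_patterns=["*.lock"], tokenizer="whitespace-split")
data = config.serialize()          # plain dict, JSON-serializable
restored = LMCatConfig.load(data)  # reconstruct a config from the dict
print(restored.tree_only, restored.ignore_patterns)  # True ['*.lock']
```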

def validate_fields_types( self: muutils.json_serialize.serializable_dataclass.SerializableDataclass, on_typecheck_error: muutils.errormode.ErrorMode = ErrorMode.Except) -> bool:
304def SerializableDataclass__validate_fields_types(
305    self: SerializableDataclass,
306    on_typecheck_error: ErrorMode = _DEFAULT_ON_TYPECHECK_ERROR,
307) -> bool:
308    """validate the types of all the fields on a `SerializableDataclass`. calls `SerializableDataclass__validate_field_type` for each field"""
309    return all(
310        SerializableDataclass__validate_fields_types__dict(
311            self, on_typecheck_error=on_typecheck_error
312        ).values()
313    )

validate the types of all the fields on a SerializableDataclass. calls SerializableDataclass__validate_field_type for each field

Inherited Members
muutils.json_serialize.serializable_dataclass.SerializableDataclass
validate_field_type
diff
update_from_nested_dict
class IgnoreHandler:
142class IgnoreHandler:
143	"""Handles all ignore pattern matching using igittigitt"""
144
145	def __init__(self, root_dir: Path, config: LMCatConfig):
146		self.root_dir: Path = root_dir
147		self.config: LMCatConfig = config
148
149		# set up parser
150		self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()
151
152		# first from the files
153		for ignore_file in self.config.ignore_patterns_files:
154			self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)
155
156		# then from the config itself
157		for pattern in self.config.ignore_patterns:
158			self.parser.add_rule(pattern=pattern, base_path=self.root_dir)
159
160	def is_ignored(self, path: Path) -> bool:
161		"""Check if a path should be ignored"""
162		# The ignore files themselves are always skipped, so they never appear in the output
163		if path.name in {".gitignore", ".lmignore"}:
164			return True
165
166		# Use igittigitt's matching
167		return self.parser.match(path)

Handles all ignore pattern matching using igittigitt

IgnoreHandler(root_dir: pathlib.Path, config: LMCatConfig)
145	def __init__(self, root_dir: Path, config: LMCatConfig):
146		self.root_dir: Path = root_dir
147		self.config: LMCatConfig = config
148
149		# set up parser
150		self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()
151
152		# first from the files
153		for ignore_file in self.config.ignore_patterns_files:
154			self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)
155
156		# then from the config itself
157		for pattern in self.config.ignore_patterns:
158			self.parser.add_rule(pattern=pattern, base_path=self.root_dir)
root_dir: pathlib.Path
config: LMCatConfig
parser: igittigitt.igittigitt.IgnoreParser
def is_ignored(self, path: pathlib.Path) -> bool:
160	def is_ignored(self, path: Path) -> bool:
161		"""Check if a path should be ignored"""
162		# The ignore files themselves are always skipped, so they never appear in the output
163		if path.name in {".gitignore", ".lmignore"}:
164			return True
165
166		# Use igittigitt's matching
167		return self.parser.match(path)

Check if a path should be ignored
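A short sketch of the ignore logic, assuming it runs from a project root; the exact pattern semantics come from `igittigitt`, so treat the expected results as illustrative.

```python
from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig

root = Path(".").resolve()
config = LMCatConfig(ignore_patterns=["*.lock"], tokenizer="whitespace-split")
handler = IgnoreHandler(root, config)

# rules come from .gitignore/.lmignore files under root plus config.ignore_patterns
print(handler.is_ignored(root / "poetry.lock"))  # should be True (matches "*.lock")
print(handler.is_ignored(root / ".gitignore"))   # True: the ignore files are always skipped
```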

def sorted_entries(directory: pathlib.Path) -> list[pathlib.Path]:
170def sorted_entries(directory: Path) -> list[Path]:
171	"""Return directory contents sorted: directories first, then files"""
172	subdirs: list[Path] = sorted(
173		[p for p in directory.iterdir() if p.is_dir()], key=lambda x: x.name
174	)
175	files: list[Path] = sorted(
176		[p for p in directory.iterdir() if p.is_file()], key=lambda x: x.name
177	)
178	return subdirs + files

Return directory contents sorted: directories first, then files
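For example, listing the current directory in the order the tree is rendered:

```python
from pathlib import Path
from lmcat.lmcat import sorted_entries

# directories first, then files, each group sorted by name
for p in sorted_entries(Path(".")):
    print("dir " if p.is_dir() else "file", p.name)
```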

def walk_dir( directory: pathlib.Path, ignore_handler: IgnoreHandler, config: LMCatConfig, tokenizer: lmcat.file_stats.TokenizerWrapper, prefix: str = '') -> tuple[list[lmcat.file_stats.TreeEntry], list[pathlib.Path]]:
181def walk_dir(
182	directory: Path,
183	ignore_handler: IgnoreHandler,
184	config: LMCatConfig,
185	tokenizer: TokenizerWrapper,
186	prefix: str = "",
187) -> tuple[list[TreeEntry], list[Path]]:
188	"""Recursively walk a directory, building tree lines and collecting file paths"""
189	tree_output: list[TreeEntry] = []
190	collected_files: list[Path] = []
191
192	entries: list[Path] = sorted_entries(directory)
193	for i, entry in enumerate(entries):
194		if ignore_handler.is_ignored(entry):
195			continue
196
197		is_last: bool = i == len(entries) - 1
198		connector: str = (
199			config.tree_file_divider
200			if not is_last
201			else config.tree_file_divider.replace("├", "└")
202		)
203
204		if entry.is_dir():
205			tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", None))
206			extension: str = config.tree_divider if not is_last else config.tree_indent
207			sub_output: list[TreeEntry]
208			sub_files: list[Path]
209			sub_output, sub_files = walk_dir(
210				directory=entry,
211				ignore_handler=ignore_handler,
212				config=config,
213				tokenizer=tokenizer,
214				prefix=prefix + extension,
215			)
216			tree_output.extend(sub_output)
217			collected_files.extend(sub_files)
218		else:
219			stats: FileStats = FileStats.from_file(entry, tokenizer)
220			tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", stats))
221			collected_files.append(entry)
222
223	return tree_output, collected_files

Recursively walk a directory, building tree lines and collecting file paths
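A sketch of calling it directly on the current directory; `walk_and_collect` further below is the usual entry point and wraps this function.

```python
from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig, walk_dir

root = Path(".").resolve()
config = LMCatConfig(tokenizer="whitespace-split")
entries, files = walk_dir(
    directory=root,
    ignore_handler=IgnoreHandler(root, config),
    config=config,
    tokenizer=config.get_tokenizer_obj(),
)
print(len(entries), "tree rows,", len(files), "files collected")
```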

def format_tree_with_stats( entries: list[lmcat.file_stats.TreeEntry], show_tokens: bool = False) -> list[str]:
226def format_tree_with_stats(
227	entries: list[TreeEntry], show_tokens: bool = False
228) -> list[str]:
229	"""Format tree entries with aligned statistics
230
231	# Parameters:
232	 - `entries : list[TreeEntry]`
233		List of tree entries with optional stats
234	 - `show_tokens : bool`
235		Whether to show token counts
236
237	# Returns:
238	 - `list[str]`
239		Formatted tree lines with aligned stats
240	"""
241	# Find max widths for alignment
242	max_line_len: int = max(len(entry.line) for entry in entries)
243	max_lines: int = max(
244		(len(f"{entry.stats.lines:,}") if entry.stats else 0) for entry in entries
245	)
246	max_chars: int = max(
247		(len(f"{entry.stats.chars:,}") if entry.stats else 0) for entry in entries
248	)
249	max_tokens: int = (
250		max(
251			(
252				len(f"{entry.stats.tokens:,}")
253				if entry.stats and entry.stats.tokens
254				else 0
255			)
256			for entry in entries
257		)
258		if show_tokens
259		else 0
260	)
261
262	formatted: list[str] = []
263	for entry in entries:
264		line: str = entry.line.ljust(max_line_len + 2)
265		if entry.stats:
266			lines_str: str = f"{entry.stats.lines:,}L".rjust(max_lines + 1)
267			chars_str: str = f"{entry.stats.chars:,}C".rjust(max_chars + 1)
268			stats_str: str = f"[{lines_str} {chars_str}"
269			if show_tokens and entry.stats.tokens is not None:
270				tokens_str: str = f"{entry.stats.tokens:,}T".rjust(max_tokens + 1)
271				stats_str += f" {tokens_str}"
272			stats_str += "]"
273			formatted.append(f"{line}{stats_str}")
274		else:
275			formatted.append(line)
276
277	return formatted

Format tree entries with aligned statistics

Parameters:

  • entries : list[TreeEntry] List of tree entries with optional stats
  • show_tokens : bool Whether to show token counts

Returns:

  • list[str] Formatted tree lines with aligned stats
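A hand-built sketch of the alignment, assuming a `pyproject.toml` exists in the working directory (directory rows carry `stats=None`, file rows get a `FileStats`):

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig, format_tree_with_stats
from lmcat.file_stats import FileStats, TreeEntry

tokenizer = LMCatConfig(tokenizer="whitespace-split").get_tokenizer_obj()
entries = [
    TreeEntry("myproject", None),  # directory row, no stats
    TreeEntry("└── pyproject.toml", FileStats.from_file(Path("pyproject.toml"), tokenizer)),
]
for line in format_tree_with_stats(entries, show_tokens=True):
    print(line)  # e.g. "└── pyproject.toml   [40L 1,032C 180T]" (numbers illustrative)
```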
def walk_and_collect( root_dir: pathlib.Path, config: LMCatConfig) -> tuple[list[str], list[pathlib.Path]]:
280def walk_and_collect(
281	root_dir: Path,
282	config: LMCatConfig,
283) -> tuple[list[str], list[Path]]:
284	"""Walk filesystem from root_dir and gather tree listing plus file paths"""
285	if config is None:
286		config = LMCatConfig()
287
288	tokenizer: TokenizerWrapper = config.get_tokenizer_obj()
289
290	ignore_handler = IgnoreHandler(root_dir, config)
291	base_name = root_dir.resolve().name
292
293	# Start with root directory name
294	tree_output = [TreeEntry(base_name)]
295
296	# Walk the directory tree
297	sub_output, sub_files = walk_dir(
298		directory=root_dir,
299		ignore_handler=ignore_handler,
300		config=config,
301		tokenizer=tokenizer,
302		prefix="",
303	)
304	tree_output.extend(sub_output)
305
306	# Format tree with stats
307	formatted_tree = format_tree_with_stats(
308		tree_output, show_tokens=tokenizer is not None
309	)
310
311	return formatted_tree, sub_files

Walk filesystem from root_dir and gather tree listing plus file paths
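A sketch of the typical call, run from a project root:

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig, walk_and_collect

config = LMCatConfig(tokenizer="whitespace-split", ignore_patterns=["*.lock"])
tree_lines, files = walk_and_collect(Path(".").resolve(), config)
print("\n".join(tree_lines))        # aligned tree with per-file stats
print(len(files), "files collected")
```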

def assemble_summary(root_dir: pathlib.Path, config: LMCatConfig) -> str:
314def assemble_summary(
315	root_dir: Path,
316	config: LMCatConfig,
317) -> str:
318	"""Assemble the summary output and return"""
319
320	processing_pipeline: ProcessingPipeline = config.get_processing_pipeline()
321
322	tree_output: list[str]
323	collected_files: list[Path]
324	tree_output, collected_files = walk_and_collect(
325		root_dir=root_dir,
326		config=config,
327	)
328
329	output: list[str] = []
330	output.append("# File Tree")
331	output.append("\n```")
332	output.extend(tree_output)
333	output.append("```\n")
334
335	# Add file contents if not suppressed
336	if not config.tree_only:
337		output.append("# File Contents")
338
339		for fpath in collected_files:
340			# get the path
341			relpath_posix: str = fpath.relative_to(root_dir).as_posix()
342
343			# process the contents
344			f_contents: str
345			p_name: str | None
346			f_contents, p_name = processing_pipeline.process_file(fpath)
347			processed_with: str = f'processed_with="{p_name}"' if p_name else ""
348
349			# start of file marker
350			pathspec_start: str = f'{{ path="{relpath_posix}" {processed_with} }}'
351			pathspec_end: str = f'{{ end_of_file="{relpath_posix}" }}'
352			output.append("")
353			output.append(config.content_divider + pathspec_start)
354
355			# process the actual contents of the file with the pipeline, and append
356			output.append(f_contents)
357
358			# add the end of file marker
359			output.append(config.content_divider + pathspec_end)
360
361	output_joined: str = "\n".join(output)
362
363	stats_dict_ints: dict[str, int] = {
364		"files": len(collected_files),
365		"lines": len(output_joined.splitlines()),
366		"chars": len(output_joined),
367	}
368
369	tokenizer: TokenizerWrapper = config.get_tokenizer_obj()
370
371	n_tokens: int = tokenizer.n_tokens(output_joined)
372	stats_dict_ints[f"`{tokenizer.name}` tokens"] = n_tokens
373
374	stats_header: list[str] = ["# Stats"]
375	for key, val in stats_dict_ints.items():
376		val_str: str = str(val)
377		val_short: str = shorten_numerical_to_str(val)
378		if val_str != val_short:
379			stats_header.append(f"- {val} ({val_short}) {key}")
380		else:
381			stats_header.append(f"- {val} {key}")
382
383	output_complete: str = "\n".join(stats_header) + "\n\n" + output_joined
384
385	return output_complete

Assemble the summary output and return
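A sketch of building the full summary programmatically (the same document `main()` prints or writes); `tree_only=True` keeps the output short by omitting the file-contents section:

```python
from pathlib import Path
from lmcat.lmcat import LMCatConfig, assemble_summary

config = LMCatConfig(tree_only=True, tokenizer="whitespace-split")
summary = assemble_summary(root_dir=Path(".").resolve(), config=config)
print(summary)  # "# Stats" header, then the file tree (and contents when tree_only=False)
```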

def main() -> None:
388def main() -> None:
389	"""Main entry point for the script"""
390	arg_parser = argparse.ArgumentParser(
391		description="lmcat - list tree and content, combining .gitignore + .lmignore",
392		add_help=False,
393	)
394	arg_parser.add_argument(
395		"-t",
396		"--tree-only",
397		action="store_true",
398		default=False,
399		help="Only print the tree, not the file contents.",
400	)
401	arg_parser.add_argument(
402		"-o",
403		"--output",
404		action="store",
405		default=None,
406		help="Output file to write the tree and contents to.",
407	)
408	arg_parser.add_argument(
409		"-h", "--help", action="help", help="Show this help message and exit."
410	)
411	arg_parser.add_argument(
412		"--print-cfg",
413		action="store_true",
414		default=False,
415		help="Print the configuration as json and exit.",
416	)
417	arg_parser.add_argument(
418		"--allow-plugins",
419		action="store_true",
420		default=False,
421		help="Allow plugins to be loaded from the plugins file. WARNING: this will execute arbitrary code found in the file pointed to by `config.plugins_file`, and **is a security risk**.",
422	)
423
424	args: argparse.Namespace = arg_parser.parse_known_args()[0]
425	root_dir: Path = Path(".").resolve()
426	config: LMCatConfig = LMCatConfig.read(root_dir)
427
428	# CLI overrides
429	config.output = args.output
430	config.tree_only = args.tree_only
431	config.allow_plugins = args.allow_plugins
432
433	# print cfg and exit if requested
434	if args.print_cfg:
435		print(json.dumps(config.serialize(), indent="\t"))
436		return
437
438	# assemble summary
439	summary: str = assemble_summary(root_dir=root_dir, config=config)
440
441	# Write output
442	if config.output:
443		output_path: Path = Path(args.output)
444		output_path.parent.mkdir(parents=True, exist_ok=True)
445		output_path.write_text(summary, encoding="utf-8")
446	else:
447		if sys.platform == "win32":
448			sys.stdout = io.TextIOWrapper(
449				sys.stdout.buffer, encoding="utf-8", errors="replace"
450			)
451			sys.stderr = io.TextIOWrapper(
452				sys.stderr.buffer, encoding="utf-8", errors="replace"
453			)
454
455		print(summary)

Main entry point for the script
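A hedged sketch of driving the CLI programmatically by patching `sys.argv`; the flags are the ones defined above, and `--print-cfg` is used here so nothing is written to disk:

```python
import sys
from lmcat.lmcat import main

# equivalent to running the lmcat CLI with --print-cfg from the current directory
sys.argv = ["lmcat", "--print-cfg"]
main()  # prints the merged configuration as JSON and returns
```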