lmcat.lmcat
import argparse
import io
import json

# from dataclasses import dataclass, field
from pathlib import Path
import sys

from lmcat.processing_pipeline import ProcessingPipeline


# Handle Python 3.11+ vs older Python for TOML parsing
try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib  # type: ignore
    except ImportError:
        tomllib = None  # type: ignore[assignment]

import igittigitt  # noqa: E402

from muutils.json_serialize import (
    SerializableDataclass,
    serializable_dataclass,
    serializable_field,
)
from muutils.misc import shorten_numerical_to_str  # noqa: E402


from lmcat.file_stats import FileStats, TokenizerWrapper, TreeEntry, TOKENIZERS_PRESENT
from lmcat.processing_pipeline import OnMultipleProcessors


@serializable_dataclass(kw_only=True)
class LMCatConfig(SerializableDataclass):
    """Configuration dataclass for lmcat"""

    content_divider: str = serializable_field(default="``````")
    tree_only: bool = serializable_field(default=False)

    # ignoring
    ignore_patterns: list[str] = serializable_field(default_factory=list)
    ignore_patterns_files: list[Path] = serializable_field(
        default_factory=lambda: [Path(".gitignore"), Path(".lmignore")],
        serialization_fn=lambda x: [p.as_posix() for p in x],
        deserialize_fn=lambda x: [Path(p) for p in x],
    )

    # this file will be imported, and if the functions in it are decorated
    # with one of the `register_*` decorators, they will be added to the functions
    # which can be used in the processing pipeline
    # --allow-plugins is a command line only option and must be set to true for this to work
    plugins_file: Path | None = serializable_field(
        default=None,
        serialization_fn=lambda x: x.as_posix() if x else None,
        deserialize_fn=lambda x: Path(x) if x else None,
    )
    allow_plugins: bool = serializable_field(
        default=False,
        deserialize_fn=lambda x: False,  # this can only be overridden through the command line
    )

    # processing pipeline
    glob_process: dict[str, str] = serializable_field(default_factory=dict)
    decider_process: dict[str, str] = serializable_field(default_factory=dict)
    on_multiple_processors: OnMultipleProcessors = serializable_field(
        default="except",
        assert_type=False,
    )

    # tokenization
    tokenizer: str = serializable_field(
        default="gpt2" if TOKENIZERS_PRESENT else "whitespace-split"
    )
    "Tokenizer to use for tokenizing the output. `gpt2` by default. Passed to `tokenizers.Tokenizer.from_pretrained()`. If specified and `tokenizers` is not installed, an exception is raised. The `whitespace-split` fallback avoids this when `tokenizers` is not installed."

    # tree formatting
    tree_divider: str = serializable_field(default="│ ")
    tree_file_divider: str = serializable_field(default="├── ")
    tree_indent: str = serializable_field(default=" ")

    # output location
    output: str | None = serializable_field(default=None)

    def get_tokenizer_obj(self) -> TokenizerWrapper:
        """Get the tokenizer object"""
        return TokenizerWrapper(self.tokenizer)

    def get_processing_pipeline(self) -> ProcessingPipeline:
        """Get the processing pipeline object"""
        plugins_file: Path | None = self.plugins_file if self.allow_plugins else None
        return ProcessingPipeline(
            plugins_file=plugins_file,
            decider_process_keys=self.decider_process,
            glob_process_keys=self.glob_process,
            on_multiple_processors=self.on_multiple_processors,
        )

    @classmethod
    def read(cls, root_dir: Path) -> "LMCatConfig":
        """Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json."""
        pyproject_path: Path = root_dir / "pyproject.toml"
        lmcat_toml_path: Path = root_dir / "lmcat.toml"
        lmcat_json_path: Path = root_dir / "lmcat.json"

        if (
            sum(
                int(p.is_file())
                for p in (pyproject_path, lmcat_toml_path, lmcat_json_path)
            )
            > 1
        ):
            raise ValueError(
                "Multiple configuration files found. Please only use one of pyproject.toml, lmcat.toml, or lmcat.json."
            )

        # Try pyproject.toml first
        if tomllib is not None and pyproject_path.is_file():
            with pyproject_path.open("rb") as f:
                pyproject_data = tomllib.load(f)
            if "tool" in pyproject_data and "lmcat" in pyproject_data["tool"]:
                return cls.load(pyproject_data["tool"]["lmcat"])

        # Then try lmcat.toml
        if tomllib is not None and lmcat_toml_path.is_file():
            with lmcat_toml_path.open("rb") as f:
                toml_data = tomllib.load(f)
            return cls.load(toml_data)

        # Finally try lmcat.json
        if lmcat_json_path.is_file():
            with lmcat_json_path.open("r", encoding="utf-8") as f:
                json_data = json.load(f)
            return cls.load(json_data)

        # Fallback to defaults
        return cls()


class IgnoreHandler:
    """Handles all ignore pattern matching using igittigitt"""

    def __init__(self, root_dir: Path, config: LMCatConfig):
        self.root_dir: Path = root_dir
        self.config: LMCatConfig = config

        # set up parser
        self.parser: igittigitt.IgnoreParser = igittigitt.IgnoreParser()

        # first from the files
        for ignore_file in self.config.ignore_patterns_files:
            self.parser.parse_rule_files(self.root_dir, filename=ignore_file.name)

        # then from the config itself
        for pattern in self.config.ignore_patterns:
            self.parser.add_rule(pattern=pattern, base_path=self.root_dir)

    def is_ignored(self, path: Path) -> bool:
        """Check if a path should be ignored"""
        # the gitignore/lmignore files themselves are always skipped
        if path.name in {".gitignore", ".lmignore"}:
            return True

        # Use igittigitt's matching
        return self.parser.match(path)


def sorted_entries(directory: Path) -> list[Path]:
    """Return directory contents sorted: directories first, then files"""
    subdirs: list[Path] = sorted(
        [p for p in directory.iterdir() if p.is_dir()], key=lambda x: x.name
    )
    files: list[Path] = sorted(
        [p for p in directory.iterdir() if p.is_file()], key=lambda x: x.name
    )
    return subdirs + files


def walk_dir(
    directory: Path,
    ignore_handler: IgnoreHandler,
    config: LMCatConfig,
    tokenizer: TokenizerWrapper,
    prefix: str = "",
) -> tuple[list[TreeEntry], list[Path]]:
    """Recursively walk a directory, building tree lines and collecting file paths"""
    tree_output: list[TreeEntry] = []
    collected_files: list[Path] = []

    entries: list[Path] = sorted_entries(directory)
    for i, entry in enumerate(entries):
        if ignore_handler.is_ignored(entry):
            continue

        is_last: bool = i == len(entries) - 1
        connector: str = (
            config.tree_file_divider
            if not is_last
            else config.tree_file_divider.replace("├", "└")
        )

        if entry.is_dir():
            tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", None))
            extension: str = config.tree_divider if not is_last else config.tree_indent
            sub_output: list[TreeEntry]
            sub_files: list[Path]
            sub_output, sub_files = walk_dir(
                directory=entry,
                ignore_handler=ignore_handler,
                config=config,
                tokenizer=tokenizer,
                prefix=prefix + extension,
            )
            tree_output.extend(sub_output)
            collected_files.extend(sub_files)
        else:
            stats: FileStats = FileStats.from_file(entry, tokenizer)
            tree_output.append(TreeEntry(f"{prefix}{connector}{entry.name}", stats))
            collected_files.append(entry)

    return tree_output, collected_files


def format_tree_with_stats(
    entries: list[TreeEntry], show_tokens: bool = False
) -> list[str]:
    """Format tree entries with aligned statistics

    # Parameters:
     - `entries : list[TreeEntry]`
        List of tree entries with optional stats
     - `show_tokens : bool`
        Whether to show token counts

    # Returns:
     - `list[str]`
        Formatted tree lines with aligned stats
    """
    # Find max widths for alignment
    max_line_len: int = max(len(entry.line) for entry in entries)
    max_lines: int = max(
        (len(f"{entry.stats.lines:,}") if entry.stats else 0) for entry in entries
    )
    max_chars: int = max(
        (len(f"{entry.stats.chars:,}") if entry.stats else 0) for entry in entries
    )
    max_tokens: int = (
        max(
            (
                len(f"{entry.stats.tokens:,}")
                if entry.stats and entry.stats.tokens
                else 0
            )
            for entry in entries
        )
        if show_tokens
        else 0
    )

    formatted: list[str] = []
    for entry in entries:
        line: str = entry.line.ljust(max_line_len + 2)
        if entry.stats:
            lines_str: str = f"{entry.stats.lines:,}L".rjust(max_lines + 1)
            chars_str: str = f"{entry.stats.chars:,}C".rjust(max_chars + 1)
            stats_str: str = f"[{lines_str} {chars_str}"
            if show_tokens and entry.stats.tokens is not None:
                tokens_str: str = f"{entry.stats.tokens:,}T".rjust(max_tokens + 1)
                stats_str += f" {tokens_str}"
            stats_str += "]"
            formatted.append(f"{line}{stats_str}")
        else:
            formatted.append(line)

    return formatted


def walk_and_collect(
    root_dir: Path,
    config: LMCatConfig,
) -> tuple[list[str], list[Path]]:
    """Walk filesystem from root_dir and gather tree listing plus file paths"""
    if config is None:
        config = LMCatConfig()

    tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

    ignore_handler = IgnoreHandler(root_dir, config)
    base_name = root_dir.resolve().name

    # Start with root directory name
    tree_output = [TreeEntry(base_name)]

    # Walk the directory tree
    sub_output, sub_files = walk_dir(
        directory=root_dir,
        ignore_handler=ignore_handler,
        config=config,
        tokenizer=tokenizer,
        prefix="",
    )
    tree_output.extend(sub_output)

    # Format tree with stats
    formatted_tree = format_tree_with_stats(
        tree_output, show_tokens=tokenizer is not None
    )

    return formatted_tree, sub_files


def assemble_summary(
    root_dir: Path,
    config: LMCatConfig,
) -> str:
    """Assemble the summary output and return"""

    processing_pipeline: ProcessingPipeline = config.get_processing_pipeline()

    tree_output: list[str]
    collected_files: list[Path]
    tree_output, collected_files = walk_and_collect(
        root_dir=root_dir,
        config=config,
    )

    output: list[str] = []
    output.append("# File Tree")
    output.append("\n```")
    output.extend(tree_output)
    output.append("```\n")

    # Add file contents if not suppressed
    if not config.tree_only:
        output.append("# File Contents")

        for fpath in collected_files:
            # get the path
            relpath_posix: str = fpath.relative_to(root_dir).as_posix()

            # process the contents
            f_contents: str
            p_name: str | None
            f_contents, p_name = processing_pipeline.process_file(fpath)
            processed_with: str = f'processed_with="{p_name}"' if p_name else ""

            # start of file marker
            pathspec_start: str = f'{{ path="{relpath_posix}" {processed_with} }}'
            pathspec_end: str = f'{{ end_of_file="{relpath_posix}" }}'
            output.append("")
            output.append(config.content_divider + pathspec_start)

            # process the actual contents of the file with the pipeline, and append
            output.append(f_contents)

            # add the end of file marker
            output.append(config.content_divider + pathspec_end)

    output_joined: str = "\n".join(output)

    stats_dict_ints: dict[str, int] = {
        "files": len(collected_files),
        "lines": len(output_joined.splitlines()),
        "chars": len(output_joined),
    }

    tokenizer: TokenizerWrapper = config.get_tokenizer_obj()

    n_tokens: int = tokenizer.n_tokens(output_joined)
    stats_dict_ints[f"`{tokenizer.name}` tokens"] = n_tokens

    stats_header: list[str] = ["# Stats"]
    for key, val in stats_dict_ints.items():
        val_str: str = str(val)
        val_short: str = shorten_numerical_to_str(val)
        if val_str != val_short:
            stats_header.append(f"- {val} ({val_short}) {key}")
        else:
            stats_header.append(f"- {val} {key}")

    output_complete: str = "\n".join(stats_header) + "\n\n" + output_joined

    return output_complete


def main() -> None:
    """Main entry point for the script"""
    arg_parser = argparse.ArgumentParser(
        description="lmcat - list tree and content, combining .gitignore + .lmignore",
        add_help=False,
    )
    arg_parser.add_argument(
        "-t",
        "--tree-only",
        action="store_true",
        default=False,
        help="Only print the tree, not the file contents.",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        action="store",
        default=None,
        help="Output file to write the tree and contents to.",
    )
    arg_parser.add_argument(
        "-h", "--help", action="help", help="Show this help message and exit."
    )
    arg_parser.add_argument(
        "--print-cfg",
        action="store_true",
        default=False,
        help="Print the configuration as json and exit.",
    )
    arg_parser.add_argument(
        "--allow-plugins",
        action="store_true",
        default=False,
        help="Allow plugins to be loaded from the plugins file. WARNING: this will execute arbitrary code found in the file pointed to by `config.plugins_file`, and **is a security risk**.",
    )

    args: argparse.Namespace = arg_parser.parse_known_args()[0]
    root_dir: Path = Path(".").resolve()
    config: LMCatConfig = LMCatConfig.read(root_dir)

    # CLI overrides
    config.output = args.output
    config.tree_only = args.tree_only
    config.allow_plugins = args.allow_plugins

    # print cfg and exit if requested
    if args.print_cfg:
        print(json.dumps(config.serialize(), indent="\t"))
        return

    # assemble summary
    summary: str = assemble_summary(root_dir=root_dir, config=config)

    # Write output
    if config.output:
        output_path: Path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(summary, encoding="utf-8")
    else:
        if sys.platform == "win32":
            sys.stdout = io.TextIOWrapper(
                sys.stdout.buffer, encoding="utf-8", errors="replace"
            )
            sys.stderr = io.TextIOWrapper(
                sys.stderr.buffer, encoding="utf-8", errors="replace"
            )

        print(summary)


if __name__ == "__main__":
    main()
@serializable_dataclass(kw_only=True)
class LMCatConfig(SerializableDataclass):
Configuration dataclass for lmcat
tokenizer: str
Tokenizer to use for tokenizing the output. gpt2 by default. Passed to tokenizers.Tokenizer.from_pretrained(). If a tokenizer is specified and the tokenizers package is not installed, an exception is raised; the whitespace-split fallback avoids this when tokenizers is not installed.
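
A config does not have to come from a file; a minimal sketch of building one from a plain dict (the keys mirror what a [tool.lmcat] table in pyproject.toml would contain, and the values shown here are only illustrative):

from lmcat.lmcat import LMCatConfig

# keys correspond to LMCatConfig fields; keys that are not fields are ignored by load()
config = LMCatConfig.load(
    {
        "tree_only": False,
        "ignore_patterns": ["*.log", "temp/"],
        "tokenizer": "whitespace-split",  # avoids requiring the `tokenizers` package
    }
)
print(config.tokenizer)    # "whitespace-split"
print(config.serialize())  # back to a plain, JSON-serializable dict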
def get_tokenizer_obj(self) -> TokenizerWrapper:
Get the tokenizer object
def get_processing_pipeline(self) -> ProcessingPipeline:
Get the processing pipeline object
@classmethod
def read(cls, root_dir: Path) -> "LMCatConfig":
Attempt to read config from pyproject.toml, lmcat.toml, or lmcat.json.
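
A short usage sketch, assuming it is called from a project root that may or may not contain one of the three config files:

from pathlib import Path
from lmcat.lmcat import LMCatConfig

config = LMCatConfig.read(Path(".").resolve())
# falls back to a default LMCatConfig() when no pyproject.toml / lmcat.toml / lmcat.json is found
print(config.tokenizer, config.tree_only)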
def serialize(self) -> dict[str, Any]:
    result: dict[str, Any] = {
        "__format__": f"{self.__class__.__name__}(SerializableDataclass)"
    }
    # for each field in the class
    for field in dataclasses.fields(self):  # type: ignore[arg-type]
        # need it to be our special SerializableField
        if not isinstance(field, SerializableField):
            raise NotSerializableFieldException(
                f"Field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__} is not a `SerializableField`, "
                f"but a {type(field)} "
                "this state should be inaccessible, please report this bug!"
            )

        # try to save it
        if field.serialize:
            try:
                # get the val
                value = getattr(self, field.name)
                # if it is a serializable dataclass, serialize it
                if isinstance(value, SerializableDataclass):
                    value = value.serialize()
                # if the value has a serialization function, use that
                if hasattr(value, "serialize") and callable(value.serialize):
                    value = value.serialize()
                # if the field has a serialization function, use that
                # it would be nice to be able to override a class's `.serialize()`, but that could lead to some inconsistencies!
                elif field.serialization_fn:
                    value = field.serialization_fn(value)

                # store the value in the result
                result[field.name] = value
            except Exception as e:
                raise FieldSerializationError(
                    "\n".join(
                        [
                            f"Error serializing field '{field.name}' on class {self.__class__.__module__}.{self.__class__.__name__}",
                            f"{field = }",
                            f"{value = }",
                            f"{self = }",
                        ]
                    )
                ) from e

    # store each property if we can get it
    for prop in self._properties_to_serialize:
        if hasattr(cls, prop):
            value = getattr(self, prop)
            result[prop] = value
        else:
            raise AttributeError(
                f"Cannot serialize property '{prop}' on class {self.__class__.__module__}.{self.__class__.__name__}"
                + f"but it is in {self._properties_to_serialize = }"
                + f"\n{self = }"
            )

    return result
Returns the class as a dict, implemented by using the @serializable_dataclass decorator.
@classmethod  # type: ignore[misc]
def load(cls, data: dict[str, Any] | T) -> Type[T]:
    # HACK: this is kind of ugly, but it fixes a lot of issues for when we do recursive loading with ZANJ
    if isinstance(data, cls):
        return data

    assert isinstance(
        data, typing.Mapping
    ), f"When loading {cls.__name__ = } expected a Mapping, but got {type(data) = }:\n{data = }"

    cls_type_hints: dict[str, Any] = get_cls_type_hints(cls)

    # initialize dict for keeping what we will pass to the constructor
    ctor_kwargs: dict[str, Any] = dict()

    # iterate over the fields of the class
    for field in dataclasses.fields(cls):
        # check if the field is a SerializableField
        assert isinstance(
            field, SerializableField
        ), f"Field '{field.name}' on class {cls.__name__} is not a SerializableField, but a {type(field)}. this state should be inaccessible, please report this bug!\nhttps://github.com/mivanit/muutils/issues/new"

        # check if the field is in the data and if it should be initialized
        if (field.name in data) and field.init:
            # get the value, we will be processing it
            value: Any = data[field.name]

            # get the type hint for the field
            field_type_hint: Any = cls_type_hints.get(field.name, None)

            # we rely on the init of `SerializableField` to check that only one of `loading_fn` and `deserialize_fn` is set
            if field.deserialize_fn:
                # if it has a deserialization function, use that
                value = field.deserialize_fn(value)
            elif field.loading_fn:
                # if it has a loading function, use that
                value = field.loading_fn(data)
            elif (
                field_type_hint is not None
                and hasattr(field_type_hint, "load")
                and callable(field_type_hint.load)
            ):
                # if no loading function but has a type hint with a load method, use that
                if isinstance(value, dict):
                    value = field_type_hint.load(value)
                else:
                    raise FieldLoadingError(
                        f"Cannot load value into {field_type_hint}, expected {type(value) = } to be a dict\n{value = }"
                    )
            else:
                # assume no loading needs to happen, keep `value` as-is
                pass

            # store the value in the constructor kwargs
            ctor_kwargs[field.name] = value

    # create a new instance of the class with the constructor kwargs
    output: cls = cls(**ctor_kwargs)

    # validate the types of the fields if needed
    if on_typecheck_mismatch != ErrorMode.IGNORE:
        fields_valid: dict[str, bool] = (
            SerializableDataclass__validate_fields_types__dict(
                output,
                on_typecheck_error=on_typecheck_error,
            )
        )

        # if there are any fields that are not valid, raise an error
        if not all(fields_valid.values()):
            msg: str = (
                f"Type mismatch in fields of {cls.__name__}:\n"
                + "\n".join(
                    [
                        f"{k}:\texpected {cls_type_hints[k] = }, but got value {getattr(output, k) = }, {type(getattr(output, k)) = }"
                        for k, v in fields_valid.items()
                        if not v
                    ]
                )
            )

            on_typecheck_mismatch.process(
                msg, except_cls=FieldTypeMismatchError
            )

    # return the new instance
    return output
Takes in an appropriately structured dict and returns an instance of the class, implemented by using the @serializable_dataclass decorator.
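
Because LMCatConfig inherits these from muutils' SerializableDataclass, a config round-trips through plain dicts and JSON; a minimal sketch:

import json
from lmcat.lmcat import LMCatConfig

cfg = LMCatConfig(tree_only=True, ignore_patterns=["*.bin"])
as_text = json.dumps(cfg.serialize())            # roughly what --print-cfg emits
restored = LMCatConfig.load(json.loads(as_text))
assert restored.tree_only
assert restored.ignore_patterns == ["*.bin"]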
def SerializableDataclass__validate_fields_types(
    self: SerializableDataclass,
    on_typecheck_error: ErrorMode = _DEFAULT_ON_TYPECHECK_ERROR,
) -> bool:
    """validate the types of all the fields on a `SerializableDataclass`. calls `SerializableDataclass__validate_field_type` for each field"""
    return all(
        SerializableDataclass__validate_fields_types__dict(
            self, on_typecheck_error=on_typecheck_error
        ).values()
    )
Validate the types of all the fields on a SerializableDataclass. Calls SerializableDataclass__validate_field_type for each field.
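
This is exposed as a method on the dataclass, so a loaded config can be checked directly; a one-line sketch:

from lmcat.lmcat import LMCatConfig

assert LMCatConfig().validate_fields_types()  # True when every field matches its annotation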
Inherited Members
- muutils.json_serialize.serializable_dataclass.SerializableDataclass
    - validate_field_type
    - diff
    - update_from_nested_dict
class IgnoreHandler:
Handles all ignore pattern matching using igittigitt
def __init__(self, root_dir: Path, config: LMCatConfig):
def is_ignored(self, path: Path) -> bool:
Check if a path should be ignored
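
A usage sketch, assuming it is run from a project root (the *.tmp pattern below is only illustrative):

from pathlib import Path
from lmcat.lmcat import IgnoreHandler, LMCatConfig

root = Path(".").resolve()
handler = IgnoreHandler(root, LMCatConfig(ignore_patterns=["*.tmp"]))
print(handler.is_ignored(root / "scratch.tmp"))  # expected True: matches the config pattern
print(handler.is_ignored(root / ".gitignore"))   # True: the ignore files themselves are skipped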
def sorted_entries(directory: Path) -> list[Path]:
Return directory contents sorted: directories first, then files
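
For example, listing the current directory prints subdirectories before files, each group sorted by name:

from pathlib import Path
from lmcat.lmcat import sorted_entries

for p in sorted_entries(Path(".")):
    print("dir " if p.is_dir() else "file", p.name)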
def walk_dir(
    directory: Path,
    ignore_handler: IgnoreHandler,
    config: LMCatConfig,
    tokenizer: TokenizerWrapper,
    prefix: str = "",
) -> tuple[list[TreeEntry], list[Path]]:
Recursively walk a directory, building tree lines and collecting file paths
def format_tree_with_stats(
    entries: list[TreeEntry], show_tokens: bool = False
) -> list[str]:
Format tree entries with aligned statistics

Parameters:
- entries : list[TreeEntry]
  List of tree entries with optional stats
- show_tokens : bool
  Whether to show token counts

Returns:
- list[str]
  Formatted tree lines with aligned stats
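
A sketch of the alignment behaviour with hand-built entries. Note that constructing FileStats directly assumes it is a plain dataclass with lines, chars, and tokens fields; in normal use it comes from FileStats.from_file():

from lmcat.file_stats import FileStats, TreeEntry
from lmcat.lmcat import format_tree_with_stats

entries = [
    TreeEntry("project"),  # no stats for the root line
    TreeEntry("├── main.py", FileStats(lines=120, chars=3400, tokens=900)),
    TreeEntry("└── README.md", FileStats(lines=40, chars=1800, tokens=450)),
]
for line in format_tree_with_stats(entries, show_tokens=True):
    print(line)  # stats are right-aligned at the end of each line, e.g. "[ 120L 3,400C 900T]"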
def walk_and_collect(
    root_dir: Path,
    config: LMCatConfig,
) -> tuple[list[str], list[Path]]:
Walk filesystem from root_dir and gather tree listing plus file paths
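
A sketch of generating just the tree listing for the current project:

from pathlib import Path
from lmcat.lmcat import LMCatConfig, walk_and_collect

root = Path(".").resolve()
tree_lines, files = walk_and_collect(root, LMCatConfig())
print("\n".join(tree_lines))
print(f"{len(files)} files collected")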
def assemble_summary(
    root_dir: Path,
    config: LMCatConfig,
) -> str:
Assemble the summary output and return
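
A sketch of producing the full summary programmatically instead of through the CLI (the output filename is only illustrative):

from pathlib import Path
from lmcat.lmcat import LMCatConfig, assemble_summary

root = Path(".").resolve()
config = LMCatConfig.read(root)
summary = assemble_summary(root_dir=root, config=config)
Path("lmcat_output.md").write_text(summary, encoding="utf-8")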
def main() -> None:
Main entry point for the script
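
Since main() reads its options from sys.argv via argparse, the CLI behaviour can be sketched from Python as well (the "lmcat" program name below is only a placeholder for argv[0]):

import sys
from lmcat.lmcat import main

# roughly equivalent to running: lmcat --tree-only -o tree.md
sys.argv = ["lmcat", "--tree-only", "-o", "tree.md"]
main()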