Coverage for lmcat\file_stats.py: 83%
42 statements
coverage.py v7.6.10, created at 2025-01-29 16:42 -0700
from dataclasses import dataclass
from pathlib import Path
from typing import NamedTuple, Optional

# Handle Python 3.11+ vs older Python for TOML parsing
try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib  # type: ignore
    except ImportError:
        tomllib = None  # type: ignore[assignment]

# tokenizers (optional dep)
TOKENIZERS_PRESENT: bool = False
try:
    import tokenizers  # type: ignore[import-untyped]

    TOKENIZERS_PRESENT = True
except ImportError:
    pass
class TokenizerWrapper:
    """tokenizer wrapper. stores name and provides `n_tokens` method.

    uses splitting by whitespace as a fallback -- `whitespace-split`"""

    def __init__(self, name: str = "whitespace-split") -> None:
        self.name: str = name
        self.use_fallback: bool = name == "whitespace-split"
        self.tokenizer: Optional[tokenizers.Tokenizer] = (
            None if self.use_fallback else tokenizers.Tokenizer.from_pretrained(name)
        )

    def n_tokens(self, text: str) -> int:
        """Return number of tokens in text"""
        if self.use_fallback:
            return len(text.split())
        else:
            assert self.tokenizer is not None
            return len(self.tokenizer.encode(text).tokens)
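A brief usage sketch of `TokenizerWrapper` (illustrative, not part of the module source): the default constructor uses the whitespace fallback, while the "gpt2" name below is an assumed example that requires the optional `tokenizers` dependency and access to the Hugging Face hub.

    # Illustrative sketch: whitespace fallback vs. a pretrained tokenizer.
    fallback = TokenizerWrapper()
    print(fallback.n_tokens("three word example"))  # 3, split on whitespace

    if TOKENIZERS_PRESENT:
        gpt2 = TokenizerWrapper("gpt2")  # "gpt2" is an assumed example name
        print(gpt2.n_tokens("three word example"))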
@dataclass
class FileStats:
    """Statistics for a single file"""

    lines: int
    chars: int
    tokens: Optional[int] = None

    @classmethod
    def from_file(
        cls,
        path: Path,
        tokenizer: TokenizerWrapper,
    ) -> "FileStats":
        """Get statistics for a single file

        # Parameters:
        - `path : Path`
            Path to the file to analyze
        - `tokenizer : TokenizerWrapper`
            Tokenizer wrapper to use for counting tokens

        # Returns:
        - `FileStats`
            Statistics for the file
        """
        with path.open("r", encoding="utf-8", errors="ignore") as f:
            content: str = f.read()
            lines: int = len(content.splitlines())
            chars: int = len(content)
            tokens: int = tokenizer.n_tokens(content)
            return FileStats(lines=lines, chars=chars, tokens=tokens)
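A short sketch of `FileStats.from_file` (illustrative, not part of the module source); the README.md path is a hypothetical example.

    # Illustrative sketch: gather line/char/token counts for one file.
    stats = FileStats.from_file(Path("README.md"), tokenizer=TokenizerWrapper())
    print(stats.lines, stats.chars, stats.tokens)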
class TreeEntry(NamedTuple):
    """Entry in the tree output with optional stats"""

    line: str
    stats: Optional[FileStats] = None
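A sketch of how a `TreeEntry` can pair a rendered tree line with its stats (illustrative; the tree formatting and numbers below are made up, not lmcat's actual output).

    # Illustrative sketch: a file entry with stats and a directory entry without.
    file_entry = TreeEntry(
        line="├── file_stats.py",
        stats=FileStats(lines=84, chars=2048, tokens=512),  # made-up numbers
    )
    dir_entry = TreeEntry(line="└── docs/")  # stats defaults to None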