docs for lmcat v0.1.2
View Source on GitHub

lmcat.file_stats


 1from dataclasses import dataclass
 2from pathlib import Path
 3from typing import NamedTuple, Optional
 4
 5# Handle Python 3.11+ vs older Python for TOML parsing
 6try:
 7	import tomllib
 8except ImportError:
 9	try:
10		import tomli as tomllib  # type: ignore
11	except ImportError:
12		tomllib = None  # type: ignore[assignment]
13
14
15# tokenizers (optional dep)
16TOKENIZERS_PRESENT: bool = False
17try:
18	import tokenizers  # type: ignore[import-untyped]
19
20	TOKENIZERS_PRESENT = True
21except ImportError:
22	pass
23
24
class TokenizerWrapper:
	"""tokenizer wrapper. stores name and provides `n_tokens` method.

	uses splitting by whitespace as a fallback -- `whitespace-split`"""

	def __init__(self, name: str = "whitespace-split") -> None:
		# the sentinel name "whitespace-split" selects the whitespace fallback
		self.name: str = name
		self.use_fallback: bool = name == "whitespace-split"
		# NOTE: the annotation is a string on purpose. An attribute target's
		# annotation is evaluated at runtime (PEP 526), so an unquoted
		# `tokenizers.Tokenizer` raises NameError whenever the optional
		# `tokenizers` package is absent -- even in fallback mode.
		self.tokenizer: "Optional[tokenizers.Tokenizer]" = None
		if not self.use_fallback:
			# fail with a clear message instead of a NameError when a real
			# tokenizer is requested but the optional dependency is missing
			if not TOKENIZERS_PRESENT:
				raise ImportError(
					f"tokenizer '{name}' requested, but the `tokenizers` package is not installed"
				)
			self.tokenizer = tokenizers.Tokenizer.from_pretrained(name)

	def n_tokens(self, text: str) -> int:
		"""Return number of tokens in text"""
		if self.use_fallback:
			# fallback: count whitespace-separated chunks
			return len(text.split())
		else:
			assert self.tokenizer is not None
			return len(self.tokenizer.encode(text).tokens)
44
45
@dataclass
class FileStats:
	"""Statistics for a single file: line, character, and token counts"""

	# number of lines, as counted by `str.splitlines`
	lines: int
	# number of characters in the decoded text
	chars: int
	# number of tokens, when a tokenizer was applied
	tokens: Optional[int] = None

	@classmethod
	def from_file(
		cls,
		path: Path,
		# quoted forward reference: keeps the module importable even if
		# declaration order ever changes, and matches the `"FileStats"` style
		tokenizer: "TokenizerWrapper",
	) -> "FileStats":
		"""Get statistics for a single file

		# Parameters:
		- `path : Path`
			Path to the file to analyze
		- `tokenizer : TokenizerWrapper`
			Tokenizer wrapper used to count tokens (whitespace splitting
			when no real tokenizer is loaded)

		# Returns:
		- `FileStats`
			Statistics for the file
		"""
		# undecodable bytes are ignored rather than raising, so stats are
		# best-effort for files that are not valid UTF-8
		with path.open("r", encoding="utf-8", errors="ignore") as f:
			content: str = f.read()
		return cls(
			lines=len(content.splitlines()),
			chars=len(content),
			tokens=tokenizer.n_tokens(content),
		)
78
79
class TreeEntry(NamedTuple):
	"""One rendered row of the tree output, optionally paired with its stats"""

	# the rendered text for this tree row
	line: str
	# statistics for this entry, when any were computed; otherwise `None`
	stats: Optional[FileStats] = None

TOKENIZERS_PRESENT: bool = True
class TokenizerWrapper:
26class TokenizerWrapper:
27	"""tokenizer wrapper. stores name and provides `n_tokens` method.
28
29	uses splitting by whitespace as a fallback -- `whitespace-split`"""
30
31	def __init__(self, name: str = "whitespace-split") -> None:
32		self.name: str = name
33		self.use_fallback: bool = name == "whitespace-split"
34		self.tokenizer: Optional[tokenizers.Tokenizer] = (
35			None if self.use_fallback else tokenizers.Tokenizer.from_pretrained(name)
36		)
37
38	def n_tokens(self, text: str) -> int:
39		"""Return number of tokens in text"""
40		if self.use_fallback:
41			return len(text.split())
42		else:
43			assert self.tokenizer is not None
44			return len(self.tokenizer.encode(text).tokens)

tokenizer wrapper. stores name and provides n_tokens method.

uses splitting by whitespace as a fallback -- whitespace-split

TokenizerWrapper(name: str = 'whitespace-split')
31	def __init__(self, name: str = "whitespace-split") -> None:
32		self.name: str = name
33		self.use_fallback: bool = name == "whitespace-split"
34		self.tokenizer: Optional[tokenizers.Tokenizer] = (
35			None if self.use_fallback else tokenizers.Tokenizer.from_pretrained(name)
36		)
name: str
use_fallback: bool
tokenizer: Optional[tokenizers.Tokenizer]
def n_tokens(self, text: str) -> int:
38	def n_tokens(self, text: str) -> int:
39		"""Return number of tokens in text"""
40		if self.use_fallback:
41			return len(text.split())
42		else:
43			assert self.tokenizer is not None
44			return len(self.tokenizer.encode(text).tokens)

Return number of tokens in text

@dataclass
class FileStats:
47@dataclass
48class FileStats:
49	"""Statistics for a single file"""
50
51	lines: int
52	chars: int
53	tokens: Optional[int] = None
54
55	@classmethod
56	def from_file(
57		cls,
58		path: Path,
59		tokenizer: TokenizerWrapper,
60	) -> "FileStats":
61		"""Get statistics for a single file
62
63		# Parameters:
64		- `path : Path`
65			Path to the file to analyze
66		- `tokenizer : TokenizerWrapper`
67			Tokenizer wrapper used to count tokens
68
69		# Returns:
70		- `FileStats`
71			Statistics for the file
72		"""
73		with path.open("r", encoding="utf-8", errors="ignore") as f:
74			content: str = f.read()
75			lines: int = len(content.splitlines())
76			chars: int = len(content)
77			tokens: int = tokenizer.n_tokens(content)
78			return FileStats(lines=lines, chars=chars, tokens=tokens)

Statistics for a single file

FileStats(lines: int, chars: int, tokens: Optional[int] = None)
lines: int
chars: int
tokens: Optional[int] = None
@classmethod
def from_file( cls, path: pathlib.Path, tokenizer: TokenizerWrapper) -> FileStats:
55	@classmethod
56	def from_file(
57		cls,
58		path: Path,
59		tokenizer: TokenizerWrapper,
60	) -> "FileStats":
61		"""Get statistics for a single file
62
63		# Parameters:
64		- `path : Path`
65			Path to the file to analyze
66		- `tokenizer : TokenizerWrapper`
67			Tokenizer wrapper used to count tokens
68
69		# Returns:
70		- `FileStats`
71			Statistics for the file
72		"""
73		with path.open("r", encoding="utf-8", errors="ignore") as f:
74			content: str = f.read()
75			lines: int = len(content.splitlines())
76			chars: int = len(content)
77			tokens: int = tokenizer.n_tokens(content)
78			return FileStats(lines=lines, chars=chars, tokens=tokens)

Get statistics for a single file

Parameters:

  • path : Path Path to the file to analyze
  • tokenizer : TokenizerWrapper Tokenizer wrapper used to count tokens

Returns:

  • FileStats Statistics for the file

class TreeEntry(typing.NamedTuple):
81class TreeEntry(NamedTuple):
82	"""Entry in the tree output with optional stats"""
83
84	line: str
85	stats: Optional[FileStats] = None

Entry in the tree output with optional stats

TreeEntry(line: str, stats: Optional[FileStats] = None)

Create new instance of TreeEntry(line, stats)

line: str

Alias for field number 0

stats: Optional[FileStats]

Alias for field number 1

Inherited Members
builtins.tuple
index
count