Coverage for tests\test_lmcat_3.py: 98%
99 statements
coverage.py v7.6.10, created at 2025-01-29 16:42 -0700
from pathlib import Path
import pytest
from typing import Any

from lmcat.file_stats import FileStats, TokenizerWrapper
from lmcat.lmcat import LMCatConfig
from lmcat.processing_pipeline import OnMultipleProcessors
from lmcat.processors import register_processor, register_decider

# Use same temp path as other tests
TEMP_PATH: Path = Path("tests/_temp")

def test_tokenizer_wrapper_gpt2():
	"""Test TokenizerWrapper with GPT2 tokenizer if available"""
	try:
		tokenizer = TokenizerWrapper("gpt2")
		assert tokenizer.name == "gpt2"
		assert not tokenizer.use_fallback

		# Test token counting
		assert tokenizer.n_tokens("Hello world") == 2
		assert tokenizer.n_tokens("Hello  world") == 3  # multiple spaces: the extra space becomes its own token
	except ImportError:
		pytest.skip("tokenizers package not installed")
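
# For reference, a minimal sketch of what the GPT-2 branch above presumably does
# under the hood: counting BPE tokens via the `tokenizers` package. This is an
# assumption about TokenizerWrapper for illustration, not lmcat's actual code;
# `_gpt2_token_count_sketch` is a hypothetical helper.
def _gpt2_token_count_sketch(text: str) -> int:
	"""Count GPT-2 BPE tokens for `text` (needs the optional `tokenizers` package)."""
	from tokenizers import Tokenizer

	tok = Tokenizer.from_pretrained("gpt2")
	return len(tok.encode(text).ids)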

def test_tokenizer_wrapper_fallback():
	"""Test TokenizerWrapper fallback whitespace tokenization"""
	tokenizer = TokenizerWrapper("whitespace-split")
	assert tokenizer.name == "whitespace-split"
	assert tokenizer.use_fallback

	assert tokenizer.n_tokens("Hello world") == 2
	assert tokenizer.n_tokens("Hello   world") == 2  # runs of whitespace collapse when splitting
	assert tokenizer.n_tokens("abc") == 1
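
# A minimal sketch of the fallback behaviour exercised above, assuming the
# whitespace-split path simply counts the pieces returned by str.split(). This
# is an assumption for illustration; `_whitespace_token_count_sketch` is a
# hypothetical helper, not part of lmcat.
def _whitespace_token_count_sketch(text: str) -> int:
	"""Whitespace-split token count: 'Hello   world' -> 2, 'abc' -> 1."""
	return len(text.split())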

def test_processing_pipeline_multiple_matches():
	"""Test different behaviors when multiple processors match"""
	test_dir = TEMP_PATH / "pipeline_test"
	test_dir.mkdir(parents=True, exist_ok=True)
	test_file = test_dir / "test.txt"
	test_file.write_text("original content")

	# Register test processors
	@register_processor
	def processor1(path: Path) -> str:
		return "processor1 output"

	@register_processor
	def processor2(path: Path) -> str:
		return "processor2 output"

	@register_decider
	def always_true(path: Path) -> bool:
		return True

	# Test different OnMultipleProcessors behaviors
	configs: dict[OnMultipleProcessors, Any] = {
		"do_first": "processor1 output",
		"do_last": "processor2 output",
		"skip": "original content",
	}

	for mode, expected in configs.items():
		print(f"{mode = }, {expected = }")
		config = LMCatConfig(
			decider_process={"always_true": "processor1"},
			glob_process={"*.txt": "processor2"},
			on_multiple_processors=mode,
		)
		pipeline = config.get_processing_pipeline()
		result, p_used = pipeline.process_file(test_file)
		if mode == "skip":
			assert p_used is None
		elif mode == "do_first":
			assert p_used == "processor1"
		elif mode == "do_last":
			assert p_used == "processor2"

		assert result == expected

	# Test "except" mode raises error
	config_except = LMCatConfig(
		decider_process={"always_true": "processor1"},
		glob_process={"*.txt": "processor2"},
		on_multiple_processors="except",
	)
	pipeline = config_except.get_processing_pipeline()
	with pytest.raises(ValueError):
		pipeline.process_file(test_file)
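
# A minimal sketch of the conflict resolution exercised above, assuming the
# pipeline reduces its list of matching processor names according to
# `on_multiple_processors`. `_resolve_multiple_sketch` is a hypothetical helper
# for illustration, not lmcat's own implementation.
def _resolve_multiple_sketch(
	matches: list[str],
	mode: OnMultipleProcessors,
) -> str | None:
	"""Pick a single processor name from `matches` given `mode`, or None to skip."""
	if not matches:
		return None
	if len(matches) == 1:
		return matches[0]
	if mode == "do_first":
		return matches[0]
	if mode == "do_last":
		return matches[-1]
	if mode == "skip":
		return None
	# "except": refuse to choose when more than one processor matches
	raise ValueError(f"multiple processors matched: {matches}")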

def test_filestats_large_file():
	"""Test FileStats handling of large files"""
	test_dir = TEMP_PATH / "large_file_stats"
	test_dir.mkdir(parents=True, exist_ok=True)
	large_file = test_dir / "large.txt"

	# Create 5MB file
	chunk = "x" * 1024  # 1KB chunk
	with large_file.open("w") as f:
		for _ in range(5 * 1024):  # Write 5MB
			f.write(chunk)

	tokenizer = TokenizerWrapper()
	stats = FileStats.from_file(large_file, tokenizer)

	assert stats.lines == 1
	assert stats.chars == 5 * 1024 * 1024
	assert stats.tokens is not None
	assert stats.tokens > 0
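
# A minimal sketch of the stats checked above, assuming FileStats counts lines,
# characters, and tokenizer tokens over the decoded text roughly like this.
# `_file_stats_sketch` is a hypothetical helper for illustration, not
# FileStats.from_file itself.
def _file_stats_sketch(path: Path, tokenizer: TokenizerWrapper) -> tuple[int, int, int]:
	"""Return (lines, chars, tokens) for a text file, decoding UTF-8 lossily."""
	text = path.read_text(encoding="utf-8", errors="replace")
	return (len(text.splitlines()), len(text), tokenizer.n_tokens(text))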

def test_config_plugins():
	"""Test plugin loading functionality"""
	test_dir = TEMP_PATH / "plugins_test"
	test_dir.mkdir(parents=True, exist_ok=True)

	# Create test plugin file
	plugin_file = test_dir / "test_plugin.py"
	plugin_file.write_text("""
from pathlib import Path
from lmcat.processors import register_processor, register_decider

@register_processor
def custom_processor(path: Path) -> str:
	return "custom processed"

@register_decider
def custom_decider(path: Path) -> bool:
	return path.suffix == '.custom'
""")

	# Test with plugins enabled
	config = LMCatConfig(
		plugins_file=plugin_file,
		allow_plugins=True,
		decider_process={"custom_decider": "custom_processor"},
	)

	pipeline = config.get_processing_pipeline()

	# Create test file
	test_file = test_dir / "test.custom"
	test_file.write_text("original content")

	# Test custom processor
	result, processor_name = pipeline.process_file(test_file)
	assert result == "custom processed"
	assert processor_name == "custom_processor"
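
# A sketch of what `allow_plugins=True` with a `plugins_file` presumably
# triggers: importing the plugin file as a module so that its @register_*
# decorators run and the new names become available to the pipeline. This is an
# assumed mechanism for illustration; `_load_plugin_sketch` is a hypothetical
# helper, not lmcat's actual plugin loader.
def _load_plugin_sketch(plugin_path: Path) -> None:
	"""Execute a plugin file so its processor/decider registrations take effect."""
	import importlib.util

	spec = importlib.util.spec_from_file_location(plugin_path.stem, plugin_path)
	assert spec is not None and spec.loader is not None
	module = importlib.util.module_from_spec(spec)
	spec.loader.exec_module(module)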

def test_error_files():
	"""Test handling of files with various error conditions"""
	test_dir = TEMP_PATH / "error_files"
	test_dir.mkdir(parents=True, exist_ok=True)

	# Create a directory that looks like a file
	dir_file = test_dir / "dir.txt"
	dir_file.mkdir()

	# Create an empty file
	empty_file = test_dir / "empty.txt"
	empty_file.touch()

	# Create file with invalid UTF-8
	invalid_utf8 = test_dir / "invalid.txt"
	invalid_utf8.write_bytes(b"Hello\xff\xfeWorld")

	tokenizer = TokenizerWrapper()

	# Test empty file
	stats = FileStats.from_file(empty_file, tokenizer)
	assert stats.lines == 0
	assert stats.chars == 0
	assert stats.tokens == 0

	# Test invalid UTF-8 file
	stats = FileStats.from_file(invalid_utf8, tokenizer)
	assert stats.lines >= 0  # Should handle without crashing
	assert stats.chars >= 0
	assert stats.tokens is not None
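
# A short note on the invalid-UTF-8 case above: a lossy decode never raises, it
# substitutes U+FFFD for undecodable bytes, which is presumably why FileStats
# can still report non-negative counts here. `_decode_lossy_sketch` is a
# hypothetical helper for illustration.
def _decode_lossy_sketch(raw: bytes) -> str:
	"""Lossy UTF-8 decode: undecodable bytes become U+FFFD instead of raising."""
	return raw.decode("utf-8", errors="replace")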