Coverage for tests\test_lmcat_3.py: 98% (99 statements)

coverage.py v7.6.10, created at 2025-01-29 16:42 -0700

from pathlib import Path
import pytest
from typing import Any

from lmcat.file_stats import FileStats, TokenizerWrapper
from lmcat.lmcat import LMCatConfig
from lmcat.processing_pipeline import OnMultipleProcessors
from lmcat.processors import register_processor, register_decider

# Use same temp path as other tests
TEMP_PATH: Path = Path("tests/_temp")


def test_tokenizer_wrapper_gpt2():
	"""Test TokenizerWrapper with GPT2 tokenizer if available"""
	try:
		tokenizer = TokenizerWrapper("gpt2")
		assert tokenizer.name == "gpt2"
		assert not tokenizer.use_fallback

		# Test token counting
		assert tokenizer.n_tokens("Hello world") == 2
		assert tokenizer.n_tokens("Hello   world") == 4  # Multiple spaces
	except ImportError:
		pytest.skip("tokenizers package not installed")


def test_tokenizer_wrapper_fallback():
	"""Test TokenizerWrapper fallback whitespace tokenization"""
	tokenizer = TokenizerWrapper("whitespace-split")
	assert tokenizer.name == "whitespace-split"
	assert tokenizer.use_fallback

	assert tokenizer.n_tokens("Hello world") == 2
	assert tokenizer.n_tokens("Hello   world") == 2
	assert tokenizer.n_tokens("abc") == 1


def test_processing_pipeline_multiple_matches():
	"""Test different behaviors when multiple processors match"""
	test_dir = TEMP_PATH / "pipeline_test"
	test_dir.mkdir(parents=True, exist_ok=True)
	test_file = test_dir / "test.txt"
	test_file.write_text("original content")

	# Register test processors
	@register_processor
	def processor1(path: Path) -> str:
		return "processor1 output"

	@register_processor
	def processor2(path: Path) -> str:
		return "processor2 output"

	@register_decider
	def always_true(path: Path) -> bool:
		return True

	# Test different OnMultipleProcessors behaviors
	configs: dict[OnMultipleProcessors, Any] = {
		"do_first": "processor1 output",
		"do_last": "processor2 output",
		"skip": "original content",
	}
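	# note: both the always_true decider (mapped to processor1) and the "*.txt" glob
	# (mapped to processor2) match test_file, so every mode below exercises the
	# multiple-processor resolution logic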

	for mode, expected in configs.items():
		print(f"{mode = }, {expected = }")
		config = LMCatConfig(
			decider_process={"always_true": "processor1"},
			glob_process={"*.txt": "processor2"},
			on_multiple_processors=mode,
		)
		pipeline = config.get_processing_pipeline()
		result, p_used = pipeline.process_file(test_file)
		if mode == "skip":
			assert p_used is None
		elif mode == "do_first":
			assert p_used == "processor1"
		elif mode == "do_last":
			assert p_used == "processor2"

		assert result == expected

	# Test "except" mode raises error
	config_except = LMCatConfig(
		decider_process={"always_true": "processor1"},
		glob_process={"*.txt": "processor2"},
		on_multiple_processors="except",
	)
	pipeline = config_except.get_processing_pipeline()
	with pytest.raises(ValueError):
		pipeline.process_file(test_file)


def test_filestats_large_file():
	"""Test FileStats handling of large files"""
	test_dir = TEMP_PATH / "large_file_stats"
	test_dir.mkdir(parents=True, exist_ok=True)
	large_file = test_dir / "large.txt"

	# Create 5MB file
	chunk = "x" * 1024  # 1KB chunk
	with large_file.open("w") as f:
		for _ in range(5 * 1024):  # Write 5MB
			f.write(chunk)
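	# no newlines are written, so the file should register as a single line of
	# 5 * 1024 * 1024 characters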

	tokenizer = TokenizerWrapper()
	stats = FileStats.from_file(large_file, tokenizer)

	assert stats.lines == 1
	assert stats.chars == 5 * 1024 * 1024
	assert stats.tokens is not None
	assert stats.tokens > 0


def test_config_plugins():
	"""Test plugin loading functionality"""
	test_dir = TEMP_PATH / "plugins_test"
	test_dir.mkdir(parents=True, exist_ok=True)

	# Create test plugin file
	plugin_file = test_dir / "test_plugin.py"
	plugin_file.write_text("""
from pathlib import Path
from lmcat.processors import register_processor, register_decider

@register_processor
def custom_processor(path: Path) -> str:
	return "custom processed"

@register_decider
def custom_decider(path: Path) -> bool:
	return path.suffix == '.custom'
""")

	# Test with plugins enabled
	config = LMCatConfig(
		plugins_file=plugin_file,
		allow_plugins=True,
		decider_process={"custom_decider": "custom_processor"},
	)

	pipeline = config.get_processing_pipeline()

	# Create test file
	test_file = test_dir / "test.custom"
	test_file.write_text("original content")

	# Test custom processor
	result, processor_name = pipeline.process_file(test_file)
	assert result == "custom processed"
	assert processor_name == "custom_processor"


def test_error_files():
	"""Test handling of files with various error conditions"""
	test_dir = TEMP_PATH / "error_files"
	test_dir.mkdir(parents=True, exist_ok=True)

	# Create a directory that looks like a file
	dir_file = test_dir / "dir.txt"
	dir_file.mkdir()

	# Create an empty file
	empty_file = test_dir / "empty.txt"
	empty_file.touch()

	# Create file with invalid UTF-8
	invalid_utf8 = test_dir / "invalid.txt"
	invalid_utf8.write_bytes(b"Hello\xff\xfeWorld")
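	# 0xff/0xfe bytes never occur in valid UTF-8 (they form a UTF-16 BOM), so this
	# file cannot be decoded as UTF-8 without error handling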

171 

172 tokenizer = TokenizerWrapper() 

173 

174 # Test empty file 

175 stats = FileStats.from_file(empty_file, tokenizer) 

176 assert stats.lines == 0 

177 assert stats.chars == 0 

178 assert stats.tokens == 0 

179 

180 # Test invalid UTF-8 file 

181 stats = FileStats.from_file(invalid_utf8, tokenizer) 

182 assert stats.lines >= 0 # Should handle without crashing 

183 assert stats.chars >= 0 

184 assert stats.tokens is not None