Coverage for src/m6rclib/metaphor_lexer.py: 100%

91 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-12 19:59 +0000

# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

14 

from typing import Dict, Final, List

from .metaphor_token import Token, TokenType

18 

class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:

    - Keywords (Action:, Context:, Role:, etc)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword parsing.
    """

    # Number of spaces that make up one indentation level.
    INDENT_SPACES = 4

    # Mapping of keywords to their token types.
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer and tokenize the input immediately.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        # True while consecutive lines are being accumulated into one text block.
        self.in_text_block: bool = False
        # True while inside a ``` fenced code block; keywords are not parsed there.
        self.in_fenced_code: bool = False
        # 1-based column of the current indentation level.
        self.indent_column: int = 1
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """
        Return the next token from the token list.

        Returns:
            Token: The next pending token, or an END_OF_FILE token once the
            list is exhausted.
        """
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _tokenize(self) -> None:
        """
        Tokenize the input file into appropriate tokens.
        Processes each line for indentation, keywords, and text content.
        """
        if not self.input:
            return

        lines: List[str] = self.input.splitlines()
        for line in lines:
            self._process_line(line)
            self.current_line += 1

        # Handle remaining outdents at end of file
        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Handle any remaining outdents needed at the end of file."""
        # Emit one OUTDENT per open indentation level until back at column 1.
        while self.indent_column > 1:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input="",
                    filename=self.filename,
                    line=self.current_line,
                    column=self.indent_column
                )
            )
            self.indent_column -= self.INDENT_SPACES

    def _process_line(self, line: str) -> None:
        """
        Process a single line of input.

        Args:
            line: The line to process
        """
        # Only spaces count as indentation; tabs are handled separately below.
        stripped_line = line.lstrip(' ')
        start_column = len(line) - len(stripped_line) + 1

        # Blank lines produce no tokens.
        if not stripped_line:
            return

        # Is this line a comment?
        # NOTE(review): this check runs before the fence-state check, so lines
        # starting with '#' inside a fenced code block are also dropped — confirm
        # this is the intended behavior.
        if stripped_line.startswith('#'):
            return

        # Does this line start with a tab character?  Emit a TAB token so the
        # parser can report it, then continue with the rest of the line.
        if stripped_line.startswith('\t'):
            self._handle_tab_character(stripped_line, start_column)
            stripped_line = stripped_line[1:]
            if not stripped_line:
                return

        # Does this line start with a code fence?  Toggle the fenced-code state.
        if stripped_line.startswith('```'):
            self.in_fenced_code = not self.in_fenced_code

        # If we're not in a fenced code block then look for keywords.
        if not self.in_fenced_code:
            words = stripped_line.split(maxsplit=1)
            # Normalize case so e.g. "ACTION:" still matches "Action:".
            first_word = words[0].capitalize()

            if first_word in self.KEYWORDS:
                self._handle_keyword_line(line, words, first_word, start_column)
                return

        # Treat this as a text block.
        self._handle_text_line(line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Emit a TAB token for a tab character found in the input.

        Args:
            line: The line containing the tab
            column: The column number at which the tab was found
        """
        # Fixed docstring: this method returns None; it previously (incorrectly)
        # documented a True/False return value.
        self.tokens.append(
            Token(
                type=TokenType.TAB,
                value="[Tab]",
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Handle a line that starts with a keyword.

        Args:
            line: The complete line
            words: The line split into words
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)

        # Create keyword token
        self.tokens.append(
            Token(
                type=self.KEYWORDS[keyword],
                value=keyword,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

        # Handle any text after the keyword
        if len(words) > 1:
            self.tokens.append(
                Token(
                    type=TokenType.KEYWORD_TEXT,
                    value=words[1],
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    # NOTE(review): assumes exactly one space separates the
                    # keyword from its text; with multiple spaces this column
                    # is slightly off — confirm whether that matters to callers.
                    column=start_column + len(keyword) + 1
                )
            )

        # A keyword line always terminates any in-progress text block.
        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Handle a line that contains text content.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        # Adjust indentation for continued text blocks: deeper indentation within
        # an open text block is preserved as literal text, not a new INDENT.
        if self.in_text_block:
            if start_column > self.indent_column:
                start_column = self.indent_column
            elif start_column < self.indent_column:
                self._process_indentation(line, start_column)
        else:
            self._process_indentation(line, start_column)

        # Slice from the effective start column so extra leading spaces inside a
        # text block stay part of the text value.
        text_content = line[start_column - 1:]
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value=text_content,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )
        self.in_text_block = True

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Process the indentation of the current line.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        indent_offset = start_column - self.indent_column

        if indent_offset > 0:
            self._handle_indent(line, start_column, indent_offset)
        elif indent_offset < 0:
            self._handle_outdent(line, start_column, indent_offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Emits one INDENT token per INDENT_SPACES of increase, or a single
        BAD_INDENT token if the increase is not a whole number of levels.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation (positive)
        """
        if indent_offset % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_INDENT,
                    value="[Bad Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            # Note: indent_column is deliberately left unchanged on a bad indent.
            return

        while indent_offset > 0:
            self.tokens.append(
                Token(
                    type=TokenType.INDENT,
                    value="[Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset -= self.INDENT_SPACES

        self.indent_column = start_column

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Emits one OUTDENT token per INDENT_SPACES of decrease, or a single
        BAD_OUTDENT token if the decrease is not a whole number of levels.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation (negative)
        """
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_OUTDENT,
                    value="[Bad Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            # Note: indent_column is deliberately left unchanged on a bad outdent.
            return

        while indent_offset < 0:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset += self.INDENT_SPACES

        self.indent_column = start_column