Coverage for src/m6rclib/metaphor_lexer.py: 100%

91 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-19 11:15 +0000

# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Final, List

from .metaphor_token import Token, TokenType

class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:
    - Keywords (Action:, Context:, Role:, etc)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword parsing.
    """

    # Number of spaces that make up one level of indentation.
    INDENT_SPACES = 4

    # Mapping of keywords to their token types.  Lookups use the
    # capitalized first word of a line, so keyword matching is
    # case-insensitive.
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer and eagerly tokenize the input.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.in_text_block: bool = False    # Are we inside a run of TEXT lines?
        self.in_fenced_code: bool = False   # Are we inside a ``` fenced block?
        self.indent_column: int = 1         # 1-based column of the current indent level
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """
        Return the next token from the token list.

        Returns:
            The next queued token, or an END_OF_FILE token once the
            queue is exhausted.
        """
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _tokenize(self) -> None:
        """
        Tokenize the input file into appropriate tokens.
        Processes each line for indentation, keywords, and text content.
        """
        if not self.input:
            return

        lines: List[str] = self.input.splitlines()
        for line in lines:
            self._process_line(line)
            self.current_line += 1

        # Handle remaining outdents at end of file
        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Handle any remaining outdents needed at the end of file."""
        # Emit one OUTDENT per indent level until we are back at column 1.
        while self.indent_column > 1:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input="",
                    filename=self.filename,
                    line=self.current_line,
                    column=self.indent_column
                )
            )
            self.indent_column -= self.INDENT_SPACES

    def _process_line(self, line: str) -> None:
        """
        Process a single line of input.

        Args:
            line: The line to process
        """
        stripped_line = line.lstrip(' ')
        start_column = len(line) - len(stripped_line) + 1

        # Blank lines produce no tokens.
        if not stripped_line:
            return

        # Is this line a comment?
        if stripped_line.startswith('#'):
            return

        # Does this line start with a tab character?  Emit a TAB error
        # token and skip past the tab before further processing.
        if stripped_line.startswith('\t'):
            self._handle_tab_character(stripped_line, start_column)
            stripped_line = stripped_line[1:]
            if not stripped_line:
                return

        # Does this line start with a code fence?
        if stripped_line.startswith('```'):
            self.in_fenced_code = not self.in_fenced_code

        # If we're not in a fenced code block then look for keywords.
        if not self.in_fenced_code:
            words = stripped_line.split(maxsplit=1)

            # Bug fix: a line that is only whitespace after the leading tab
            # was stripped (e.g. "\t \t") splits to an empty list, and
            # indexing words[0] previously raised IndexError.  There is no
            # keyword or text content on such a line, so treat it as blank.
            if not words:
                return

            first_word = words[0].capitalize()

            if first_word in self.KEYWORDS:
                self._handle_keyword_line(line, words, first_word, start_column)
                return

        # Treat this as a text block.
        self._handle_text_line(line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Handle tab characters in the input by emitting a TAB error token.

        Args:
            line: The line to check
            column: The current column number
        """
        self.tokens.append(
            Token(
                type=TokenType.TAB,
                value="[Tab]",
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Handle a line that starts with a keyword.

        Args:
            line: The complete line
            words: The line split into words
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)

        # Create keyword token
        self.tokens.append(
            Token(
                type=self.KEYWORDS[keyword],
                value=keyword,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

        # Handle any text after the keyword
        if len(words) > 1:
            self.tokens.append(
                Token(
                    type=TokenType.KEYWORD_TEXT,
                    value=words[1],
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    # Keyword text begins one space past the keyword itself.
                    column=start_column + len(keyword) + 1
                )
            )

        # A keyword line always terminates any text block in progress.
        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Handle a line that contains text content.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        # Adjust indentation for continued text blocks: deeper indentation
        # within an ongoing block is preserved as literal text, while a
        # shallower line triggers normal outdent processing.
        if self.in_text_block:
            if start_column > self.indent_column:
                start_column = self.indent_column
            elif start_column < self.indent_column:
                self._process_indentation(line, start_column)
        else:
            self._process_indentation(line, start_column)

        # Slice from the effective indent so extra indentation inside a
        # text block remains part of the token's value.
        text_content = line[start_column - 1:]
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value=text_content,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )
        self.in_text_block = True

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Process the indentation of the current line, emitting INDENT or
        OUTDENT tokens as the level changes.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        indent_offset = start_column - self.indent_column

        if indent_offset > 0:
            self._handle_indent(line, start_column, indent_offset)
        elif indent_offset < 0:
            self._handle_outdent(line, start_column, indent_offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        # Indentation must move in whole INDENT_SPACES steps; anything
        # else is reported as a BAD_INDENT and the level is left unchanged.
        if indent_offset % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_INDENT,
                    value="[Bad Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        # Emit one INDENT token per level gained.
        while indent_offset > 0:
            self.tokens.append(
                Token(
                    type=TokenType.INDENT,
                    value="[Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset -= self.INDENT_SPACES

        self.indent_column = start_column

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation (negative)
        """
        # As with indents, partial steps are an error; the level is left
        # unchanged so subsequent lines are judged against the old indent.
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_OUTDENT,
                    value="[Bad Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        # Emit one OUTDENT token per level lost.
        while indent_offset < 0:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset += self.INDENT_SPACES

        self.indent_column = start_column