Coverage for src/m6rclib/metaphor_lexer.py: 100%

88 statements  

Report generated by coverage.py v7.6.1 at 2024-11-12 12:10 +0000

1# Copyright 2024 M6R Ltd. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15from typing import Dict, List, Final 

16 

17from .metaphor_token import Token, TokenType 

18 

class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:

    - Keywords (Action:, Context:, Role:, etc)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword
    parsing.  The entire input is tokenized eagerly in ``__init__``; callers
    then drain the token stream via :meth:`get_next_token`.
    """

    # Number of spaces that make up one level of indentation.
    INDENT_SPACES: Final[int] = 4

    # Mapping of keywords to their token types.
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer and eagerly tokenize the input.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed (used in tokens
                for error reporting)
        """
        self.in_text_block: bool = False
        # Column (1-based) of the current indentation level.
        self.indent_column: int = 1
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token from the token list.

        Once all tokens have been consumed, returns an END_OF_FILE token
        positioned at the line after the last input line.
        """
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _tokenize(self) -> None:
        """
        Tokenize the input file into appropriate tokens.

        Processes each line for indentation, keywords, and text content,
        then closes any still-open indentation levels.
        """
        if not self.input:
            return

        lines: List[str] = self.input.splitlines()
        for line in lines:
            self._process_line(line)
            self.current_line += 1

        # Handle remaining outdents at end of file
        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Handle any remaining outdents needed at the end of file."""
        # Emit one OUTDENT per open indentation level until back at column 1.
        while self.indent_column > 1:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input="",
                    filename=self.filename,
                    line=self.current_line,
                    column=self.indent_column
                )
            )
            self.indent_column -= self.INDENT_SPACES

    def _process_line(self, line: str) -> None:
        """
        Process a single line of input.

        Blank lines and comment lines (first non-space character is '#') are
        skipped entirely.  A tab immediately following the leading spaces is
        reported as a TAB token before the rest of the line is processed.

        Args:
            line: The line to process
        """
        stripped_line = line.lstrip(' ')
        # 1-based column at which the line's content begins.
        start_column = len(line) - len(stripped_line) + 1

        if not stripped_line:
            return

        # Is this line a comment?
        if stripped_line.startswith('#'):
            return

        # Does this line start with a tab character?
        if stripped_line.startswith('\t'):
            self._handle_tab_character(stripped_line, start_column)
            stripped_line = stripped_line[1:]
            if not stripped_line:
                return

        self._handle_line_content(line, stripped_line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Emit a TAB token for a tab character found at the start of a line's
        content.  Tabs are not valid Metaphor indentation; the parser reports
        them via this token.

        Args:
            line: The line containing the tab (leading spaces removed)
            column: The column at which the tab was found
        """
        self.tokens.append(
            Token(
                type=TokenType.TAB,
                value="[Tab]",
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _handle_line_content(self, full_line: str, stripped_line: str, start_column: int) -> None:
        """
        Process the content of a line after initial cleaning.

        Keyword matching is case-insensitive: the first word is capitalized
        before lookup, so "action:" and "ACTION:" both match "Action:".

        Args:
            full_line: The complete line
            stripped_line: The line with leading whitespace removed
            start_column: The starting column of the content
        """
        words = stripped_line.split(maxsplit=1)
        first_word = words[0].capitalize()

        if first_word in self.KEYWORDS:
            self._handle_keyword_line(full_line, words, first_word, start_column)
        else:
            self._handle_text_line(full_line, start_column)

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Handle a line that starts with a keyword.

        Emits the keyword token, then a KEYWORD_TEXT token for any text that
        follows the keyword on the same line.  A keyword line always ends any
        in-progress text block.

        Args:
            line: The complete line
            words: The line split into words (at most two elements)
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)

        # Create keyword token
        self.tokens.append(
            Token(
                type=self.KEYWORDS[keyword],
                value=keyword,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

        # Handle any text after the keyword
        if len(words) > 1:
            self.tokens.append(
                Token(
                    type=TokenType.KEYWORD_TEXT,
                    value=words[1],
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    # Text starts one space past the keyword.
                    column=start_column + len(keyword) + 1
                )
            )

        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Handle a line that contains text content.

        Inside a text block, deeper indentation is treated as part of the
        text (the extra spaces are kept in the token value) rather than as a
        structural indent; shallower indentation triggers normal outdent
        processing.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        # Adjust indentation for continued text blocks
        if self.in_text_block:
            if start_column > self.indent_column:
                # Clamp to the block's indent so the extra spaces stay in the text.
                start_column = self.indent_column
            elif start_column < self.indent_column:
                self._process_indentation(line, start_column)
        else:
            self._process_indentation(line, start_column)

        text_content = line[start_column - 1:]
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value=text_content,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )
        self.in_text_block = True

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Process the indentation of the current line, emitting INDENT or
        OUTDENT tokens as the indentation level changes.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        indent_offset = start_column - self.indent_column

        if indent_offset > 0:
            self._handle_indent(line, start_column, indent_offset)
        elif indent_offset < 0:
            self._handle_outdent(line, start_column, indent_offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Emits one INDENT token per INDENT_SPACES-sized step.  If the change
        is not a whole multiple of INDENT_SPACES, a single BAD_INDENT token
        is emitted instead and the indentation level is left unchanged.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The (positive) change in indentation
        """
        if indent_offset % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_INDENT,
                    value="[Bad Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        while indent_offset > 0:
            self.tokens.append(
                Token(
                    type=TokenType.INDENT,
                    value="[Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset -= self.INDENT_SPACES

        self.indent_column = start_column

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Emits one OUTDENT token per INDENT_SPACES-sized step.  If the change
        is not a whole multiple of INDENT_SPACES, a single BAD_OUTDENT token
        is emitted instead and the indentation level is left unchanged.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The (negative) change in indentation
        """
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_OUTDENT,
                    value="[Bad Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        while indent_offset < 0:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset += self.INDENT_SPACES

        self.indent_column = start_column