Coverage for src/m6rclib/metaphor_lexer.py: 100%
95 statements
coverage.py v7.6.1, created at 2025-03-02 17:40 +0000
# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Final

from .metaphor_token import Token, TokenType

class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:
    - Keywords (Action:, Context:, Role:, etc.)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword parsing.
    """

    # Constants for language elements
    INDENT_SPACES = 4

    # Mapping of keywords to their token types
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }
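
    # Illustrative sketch of an input this lexer accepts (the sample document
    # itself is an assumption, not taken from the module). Given:
    #
    #     Role: Reviewer
    #         You review code.
    #
    # the lexer emits roughly: ROLE("Role:"), KEYWORD_TEXT("Reviewer"),
    # INDENT, TEXT("You review code."), OUTDENT, and then END_OF_FILE from
    # each further get_next_token() call.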

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.in_text_block: bool = False
        self.in_fenced_code: bool = False
        self.indent_column: int = 1
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token from the token list."""
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)
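
    # Note: tokens are consumed destructively (popped from the front of the
    # list); once the list is empty, every further call returns a fresh
    # END_OF_FILE token, so callers can loop on get_next_token() alone --
    # see the __main__ sketch at the bottom of this file.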

    def _tokenize(self) -> None:
        """
        Tokenize the input file into appropriate tokens.
        Processes each line for indentation, keywords, and text content.
        """
        if not self.input:
            return

        lines: List[str] = self.input.splitlines()
        for line in lines:
            self._process_line(line)
            self.current_line += 1

        # Handle remaining outdents at end of file
        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Handle any remaining outdents needed at the end of file."""
        while self.indent_column > 1:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input="",
                    filename=self.filename,
                    line=self.current_line,
                    column=self.indent_column
                )
            )
            self.indent_column -= self.INDENT_SPACES
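
    # Example (illustrative): a document that ends while two levels deep
    # (indent_column == 9) emits two OUTDENT tokens here, stepping the
    # column back 9 -> 5 -> 1.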

    def _process_line(self, line: str) -> None:
        """
        Process a single line of input.

        Args:
            line: The line to process
        """
        stripped_line = line.lstrip(' ')
        start_column = len(line) - len(stripped_line) + 1

        if not stripped_line:
            if self.in_fenced_code:
                self._handle_blank_line(start_column)

            return

        # Is this line a comment?
        if stripped_line.startswith('#'):
            return

        # Does this line start with a tab character?
        if stripped_line.startswith('\t'):
            self._handle_tab_character(stripped_line, start_column)
            stripped_line = stripped_line[1:]
            if not stripped_line:
                return

        # Does this line start with a code fence?
        if stripped_line.startswith('```'):
            self.in_fenced_code = not self.in_fenced_code

        # If we're not in a fenced code block then look for keywords.
        if not self.in_fenced_code:
            words = stripped_line.split(maxsplit=1)
            first_word = words[0].capitalize()

            if first_word in self.KEYWORDS:
                self._handle_keyword_line(line, words, first_word, start_column)
                return

        # Treat this as a text block.
        self._handle_text_line(line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Handle tab characters in the input.

        Args:
            line: The line to check
            column: The current column number
        """
        self.tokens.append(
            Token(
                type=TokenType.TAB,
                value="[Tab]",
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Handle a line that starts with a keyword.

        Args:
            line: The complete line
            words: The line split into words
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)

        # Create keyword token
        self.tokens.append(
            Token(
                type=self.KEYWORDS[keyword],
                value=keyword,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

        # Handle any text after the keyword
        if len(words) > 1:
            self.tokens.append(
                Token(
                    type=TokenType.KEYWORD_TEXT,
                    value=words[1],
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column + len(keyword) + 1
                )
            )

        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Handle a line that contains text content.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        # Adjust indentation for continued text blocks
        if self.in_text_block:
            if start_column > self.indent_column:
                start_column = self.indent_column
            elif start_column < self.indent_column:
                self._process_indentation(line, start_column)
        else:
            self._process_indentation(line, start_column)

        text_content = line[start_column - 1:]
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value=text_content,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )
        self.in_text_block = True
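
    # Worked example (illustrative): after a keyword plus one text line
    # indented four spaces, indent_column is 5. A continuation line starting
    # at column 9 is clamped back to column 5, so its extra leading spaces
    # stay inside the TEXT token's value instead of emitting a spurious
    # INDENT.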

    def _handle_blank_line(self, start_column: int) -> None:
        """
        Emit an empty TEXT token for a blank line inside a fenced code block.

        Args:
            start_column: The starting column of the (blank) line
        """
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value="",
                input="",
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Process the indentation of the current line.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        indent_offset = start_column - self.indent_column

        if indent_offset > 0:
            self._handle_indent(line, start_column, indent_offset)
        elif indent_offset < 0:
            self._handle_outdent(line, start_column, indent_offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        if indent_offset % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_INDENT,
                    value="[Bad Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        while indent_offset > 0:
            self.tokens.append(
                Token(
                    type=TokenType.INDENT,
                    value="[Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset -= self.INDENT_SPACES

        self.indent_column = start_column
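
    # Example (illustrative): jumping from column 1 to column 9 (offset 8)
    # emits two INDENT tokens; an offset such as 3, which is not a multiple
    # of INDENT_SPACES, emits a single BAD_INDENT instead.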

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_OUTDENT,
                    value="[Bad Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        while indent_offset < 0:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset += self.INDENT_SPACES

        self.indent_column = start_column
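

if __name__ == "__main__":
    # A minimal usage sketch, not part of the measured module. The sample
    # document and filename are assumptions, and it assumes Token exposes
    # .type and .value attributes matching its constructor arguments.
    # Run as a module so the relative import resolves:
    #     python -m m6rclib.metaphor_lexer
    sample = (
        "Role: Assistant\n"
        "    You answer questions.\n"
        "Action: Summarize\n"
        "    Keep it short.\n"
    )
    lexer = MetaphorLexer(sample, "demo.m6r")
    token = lexer.get_next_token()
    while token.type != TokenType.END_OF_FILE:
        print(token.type, repr(token.value))
        token = lexer.get_next_token()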