Coverage for src/m6rclib/metaphor_lexer.py: 100%
91 statements
« prev ^ index » next coverage.py v7.6.1, created at 2025-01-22 17:09 +0000
1# Copyright 2024 M6R Ltd.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15from typing import Dict, List, Final
17from .metaphor_token import Token, TokenType
class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:
    - Keywords (Action:, Context:, Role:, etc)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword parsing.
    """

    # Number of spaces that make up a single indentation level.
    INDENT_SPACES = 4

    # Mapping of keywords to their token types.
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer and tokenize the given input.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.filename: str = filename
        self.input: str = input_text
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.indent_column: int = 1
        self.in_text_block: bool = False
        self.in_fenced_code: bool = False
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token, or an END_OF_FILE token once exhausted."""
        if not self.tokens:
            return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

        return self.tokens.pop(0)

    def _append_token(self, token_type: TokenType, value: str, input_line: str, column: int) -> None:
        """Append a token at the lexer's current line within the current file."""
        self.tokens.append(
            Token(
                type=token_type,
                value=value,
                input=input_line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _tokenize(self) -> None:
        """
        Convert the whole input into tokens, one line at a time.

        Each line is examined for indentation, keywords, and text content;
        any indentation still open at end of input is closed with outdents.
        """
        if not self.input:
            return

        for line in self.input.splitlines():
            self._process_line(line)
            self.current_line += 1

        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Emit OUTDENT tokens for every indentation level still open at EOF."""
        while self.indent_column > 1:
            self._append_token(TokenType.OUTDENT, "[Outdent]", "", self.indent_column)
            self.indent_column -= self.INDENT_SPACES

    def _process_line(self, line: str) -> None:
        """
        Classify and tokenize a single line of input.

        Args:
            line: The line to process
        """
        content = line.lstrip(' ')
        start_column = len(line) - len(content) + 1

        # Blank lines and comment lines produce no tokens.
        if not content:
            return

        if content.startswith('#'):
            return

        # A leading tab is reported as its own token and then skipped over.
        if content.startswith('\t'):
            self._handle_tab_character(content, start_column)
            content = content[1:]
            if not content:
                return

        # A code fence toggles fenced-code mode.
        if content.startswith('```'):
            self.in_fenced_code = not self.in_fenced_code

        # Outside fenced code, a recognized leading keyword makes this a keyword line.
        if not self.in_fenced_code:
            words = content.split(maxsplit=1)
            candidate = words[0].capitalize()
            if candidate in self.KEYWORDS:
                self._handle_keyword_line(line, words, candidate, start_column)
                return

        # Everything else is part of a text block.
        self._handle_text_line(line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Record a TAB token for a tab found where spaces are expected.

        Args:
            line: The line to check
            column: The current column number
        """
        self._append_token(TokenType.TAB, "[Tab]", line, column)

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Emit tokens for a line that starts with a keyword.

        Args:
            line: The complete line
            words: The line split into words
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)
        self._append_token(self.KEYWORDS[keyword], keyword, line, start_column)

        # Anything after the keyword itself becomes a KEYWORD_TEXT token.
        if len(words) > 1:
            self._append_token(
                TokenType.KEYWORD_TEXT,
                words[1],
                line,
                start_column + len(keyword) + 1
            )

        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Emit a TEXT token, adjusting indentation relative to the current block.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        if not self.in_text_block:
            self._process_indentation(line, start_column)
        elif start_column > self.indent_column:
            # Continuation text deeper than the block is clamped to the block's indent.
            start_column = self.indent_column
        elif start_column < self.indent_column:
            self._process_indentation(line, start_column)

        self._append_token(TokenType.TEXT, line[start_column - 1:], line, start_column)
        self.in_text_block = True

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Emit INDENT or OUTDENT tokens when the indentation level changes.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        offset = start_column - self.indent_column
        if offset > 0:
            self._handle_indent(line, start_column, offset)
        elif offset < 0:
            self._handle_outdent(line, start_column, offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Emits one INDENT token per level gained, or a single BAD_INDENT token
        when the increase is not a whole number of indentation levels.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        if indent_offset % self.INDENT_SPACES != 0:
            self._append_token(TokenType.BAD_INDENT, "[Bad Indent]", line, start_column)
            return

        for _ in range(indent_offset // self.INDENT_SPACES):
            self._append_token(TokenType.INDENT, "[Indent]", line, start_column)

        self.indent_column = start_column

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Emits one OUTDENT token per level lost, or a single BAD_OUTDENT token
        when the decrease is not a whole number of indentation levels.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self._append_token(TokenType.BAD_OUTDENT, "[Bad Outdent]", line, start_column)
            return

        for _ in range(abs(indent_offset) // self.INDENT_SPACES):
            self._append_token(TokenType.OUTDENT, "[Outdent]", line, start_column)

        self.indent_column = start_column