Coverage for src/m6rclib/metaphor_lexer.py: 100%
91 statements
« prev ^ index » next    coverage.py v7.6.1, created at 2024-11-12 19:59 +0000
# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List, Final

from .metaphor_token import Token, TokenType
class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:
    - Keywords (Action:, Context:, Role:, etc)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword parsing.
    """

    # Number of spaces that make up one indentation level.
    INDENT_SPACES = 4

    # Mapping of keywords to their token types.
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer and eagerly tokenize the input.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.in_text_block: bool = False    # Are we inside a run of TEXT lines?
        self.in_fenced_code: bool = False   # Are we inside a ``` fenced code block?
        self.indent_column: int = 1         # Column (1-based) of the current indent level
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token from the token list.

        Once the list is exhausted, returns an END_OF_FILE token positioned
        at the line following the last processed line.
        """
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _tokenize(self) -> None:
        """
        Tokenize the input file into appropriate tokens.
        Processes each line for indentation, keywords, and text content.
        """
        if not self.input:
            return

        lines: List[str] = self.input.splitlines()
        for line in lines:
            self._process_line(line)
            self.current_line += 1

        # Handle remaining outdents at end of file
        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Handle any remaining outdents needed at the end of file."""
        # Unwind one indentation level per iteration until back at column 1.
        while self.indent_column > 1:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input="",
                    filename=self.filename,
                    line=self.current_line,
                    column=self.indent_column
                )
            )
            self.indent_column -= self.INDENT_SPACES

    def _process_line(self, line: str) -> None:
        """
        Process a single line of input.

        Args:
            line: The line to process
        """
        stripped_line = line.lstrip(' ')
        # 1-based column where the line's content starts.
        start_column = len(line) - len(stripped_line) + 1

        # Blank lines produce no tokens.
        if not stripped_line:
            return

        # Is this line a comment?
        if stripped_line.startswith('#'):
            return

        # Does this line start with a tab character?
        if stripped_line.startswith('\t'):
            self._handle_tab_character(stripped_line, start_column)
            stripped_line = stripped_line[1:]
            if not stripped_line:
                return

        # Does this line start with a code fence?
        if stripped_line.startswith('```'):
            self.in_fenced_code = not self.in_fenced_code

        # If we're not in a fenced code block then look for keywords.
        if not self.in_fenced_code:
            words = stripped_line.split(maxsplit=1)
            # capitalize() normalizes e.g. "action:" / "ACTION:" to "Action:".
            first_word = words[0].capitalize()

            if first_word in self.KEYWORDS:
                self._handle_keyword_line(line, words, first_word, start_column)
                return

        # Treat this as a text block.
        self._handle_text_line(line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Emit a TAB token for a tab character found at the start of content.

        Tabs are not valid Metaphor indentation; the token lets downstream
        consumers report the problem with an accurate position.

        Args:
            line: The line containing the tab
            column: The column at which the tab was found
        """
        self.tokens.append(
            Token(
                type=TokenType.TAB,
                value="[Tab]",
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Handle a line that starts with a keyword.

        Args:
            line: The complete line
            words: The line split into words
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)

        # Create keyword token
        self.tokens.append(
            Token(
                type=self.KEYWORDS[keyword],
                value=keyword,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

        # Handle any text after the keyword
        if len(words) > 1:
            self.tokens.append(
                Token(
                    type=TokenType.KEYWORD_TEXT,
                    value=words[1],
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    # +1 skips the space separating the keyword from its text.
                    column=start_column + len(keyword) + 1
                )
            )

        # A keyword terminates any text block in progress.
        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Handle a line that contains text content.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        # Adjust indentation for continued text blocks
        if self.in_text_block:
            if start_column > self.indent_column:
                # Deeper-indented continuation: clamp to the block's indent so
                # the extra leading spaces are kept as part of the text value.
                start_column = self.indent_column
            elif start_column < self.indent_column:
                self._process_indentation(line, start_column)
        else:
            self._process_indentation(line, start_column)

        text_content = line[start_column - 1:]
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value=text_content,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )
        self.in_text_block = True

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Process the indentation of the current line.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        indent_offset = start_column - self.indent_column

        if indent_offset > 0:
            self._handle_indent(line, start_column, indent_offset)
        elif indent_offset < 0:
            self._handle_outdent(line, start_column, indent_offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Emits one INDENT token per INDENT_SPACES-sized step, or a single
        BAD_INDENT token if the offset is not a multiple of INDENT_SPACES.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation (positive)
        """
        if indent_offset % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_INDENT,
                    value="[Bad Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            # Note: indent_column is deliberately left unchanged on a bad indent.
            return

        while indent_offset > 0:
            self.tokens.append(
                Token(
                    type=TokenType.INDENT,
                    value="[Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset -= self.INDENT_SPACES

        self.indent_column = start_column

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Emits one OUTDENT token per INDENT_SPACES-sized step, or a single
        BAD_OUTDENT token if the offset is not a multiple of INDENT_SPACES.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation (negative)
        """
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_OUTDENT,
                    value="[Bad Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            # Note: indent_column is deliberately left unchanged on a bad outdent.
            return

        while indent_offset < 0:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset += self.INDENT_SPACES

        self.indent_column = start_column