Coverage for src/m6rclib/metaphor_lexer.py: 100% (88 statements)
coverage.py v7.6.1, created at 2024-11-12 12:10 +0000
# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Final

from .metaphor_token import Token, TokenType

class MetaphorLexer:
    """
    Lexer for handling the Metaphor language with its specific syntax.

    The Metaphor language consists of:
    - Keywords (Action:, Context:, Role:, etc.)
    - Indented blocks
    - Text content
    - Include/Embed directives

    This lexer handles proper indentation, text block detection, and keyword parsing.
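
    Example (an illustrative snippet, not taken from the project's docs; the
    recognized keywords are exactly those in KEYWORDS below):

        Role: Reviewer
            You review Python code.
        Context: Project
            Include: project.m6r
        Action: Review
            Check each change for correctness.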
30 """

    # Constants for language elements
    INDENT_SPACES = 4

    # Mapping of keywords to their token types
    KEYWORDS: Final[Dict[str, TokenType]] = {
        "Action:": TokenType.ACTION,
        "Context:": TokenType.CONTEXT,
        "Embed:": TokenType.EMBED,
        "Include:": TokenType.INCLUDE,
        "Role:": TokenType.ROLE
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the MetaphorLexer.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.in_text_block: bool = False
        self.indent_column: int = 1
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token from the token list."""
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _tokenize(self) -> None:
        """
        Tokenize the input file into appropriate tokens.
        Processes each line for indentation, keywords, and text content.
        """
        if not self.input:
            return

        lines: List[str] = self.input.splitlines()
        for line in lines:
            self._process_line(line)
            self.current_line += 1

        # Handle remaining outdents at end of file
        self._handle_final_outdents()

    def _handle_final_outdents(self) -> None:
        """Handle any remaining outdents needed at the end of file."""
        while self.indent_column > 1:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input="",
                    filename=self.filename,
                    line=self.current_line,
                    column=self.indent_column
                )
            )
            self.indent_column -= self.INDENT_SPACES

    def _process_line(self, line: str) -> None:
        """
        Process a single line of input.

        Args:
            line: The line to process
        """
        stripped_line = line.lstrip(' ')
        start_column = len(line) - len(stripped_line) + 1

        if not stripped_line:
            return

        # Is this line a comment?
        if stripped_line.startswith('#'):
            return

        # Does this line start with a tab character?
        if stripped_line.startswith('\t'):
            self._handle_tab_character(stripped_line, start_column)
            stripped_line = stripped_line[1:]
            if not stripped_line:
                return

        self._handle_line_content(line, stripped_line, start_column)

    def _handle_tab_character(self, line: str, column: int) -> None:
        """
        Handle a tab character in the input by recording a TAB token.

        Args:
            line: The line containing the tab
            column: The current column number
        """
        self.tokens.append(
            Token(
                type=TokenType.TAB,
                value="[Tab]",
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=column
            )
        )

    def _handle_line_content(self, full_line: str, stripped_line: str, start_column: int) -> None:
        """
        Process the content of a line after initial cleaning.

        Args:
            full_line: The complete line
            stripped_line: The line with leading whitespace removed
            start_column: The starting column of the content
        """
        words = stripped_line.split(maxsplit=1)
        first_word = words[0].capitalize()

        if first_word in self.KEYWORDS:
            self._handle_keyword_line(full_line, words, first_word, start_column)
        else:
            self._handle_text_line(full_line, start_column)

    def _handle_keyword_line(self, line: str, words: List[str], keyword: str, start_column: int) -> None:
        """
        Handle a line that starts with a keyword.

        Args:
            line: The complete line
            words: The line split into words
            keyword: The keyword found
            start_column: The starting column of the content
        """
        self._process_indentation(line, start_column)

        # Create keyword token
        self.tokens.append(
            Token(
                type=self.KEYWORDS[keyword],
                value=keyword,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )

        # Handle any text after the keyword
        if len(words) > 1:
            self.tokens.append(
                Token(
                    type=TokenType.KEYWORD_TEXT,
                    value=words[1],
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column + len(keyword) + 1
                )
            )

        self.in_text_block = False

    def _handle_text_line(self, line: str, start_column: int) -> None:
        """
        Handle a line that contains text content.

        Args:
            line: The line to process
            start_column: The starting column of the content
        """
        # Adjust indentation for continued text blocks
        if self.in_text_block:
            if start_column > self.indent_column:
                start_column = self.indent_column
            elif start_column < self.indent_column:
                self._process_indentation(line, start_column)
        else:
            self._process_indentation(line, start_column)

        text_content = line[start_column - 1:]
        self.tokens.append(
            Token(
                type=TokenType.TEXT,
                value=text_content,
                input=line,
                filename=self.filename,
                line=self.current_line,
                column=start_column
            )
        )
        self.in_text_block = True

    def _process_indentation(self, line: str, start_column: int) -> None:
        """
        Process the indentation of the current line.

        Args:
            line: The current line
            start_column: The starting column of the content
        """
        indent_offset = start_column - self.indent_column

        if indent_offset > 0:
            self._handle_indent(line, start_column, indent_offset)
        elif indent_offset < 0:
            self._handle_outdent(line, start_column, indent_offset)

    def _handle_indent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle an increase in indentation.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        if indent_offset % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_INDENT,
                    value="[Bad Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        while indent_offset > 0:
            self.tokens.append(
                Token(
                    type=TokenType.INDENT,
                    value="[Indent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset -= self.INDENT_SPACES

        self.indent_column = start_column

    def _handle_outdent(self, line: str, start_column: int, indent_offset: int) -> None:
        """
        Handle a decrease in indentation.

        Args:
            line: The current line
            start_column: The starting column of the content
            indent_offset: The change in indentation
        """
        if abs(indent_offset) % self.INDENT_SPACES != 0:
            self.tokens.append(
                Token(
                    type=TokenType.BAD_OUTDENT,
                    value="[Bad Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            return

        while indent_offset < 0:
            self.tokens.append(
                Token(
                    type=TokenType.OUTDENT,
                    value="[Outdent]",
                    input=line,
                    filename=self.filename,
                    line=self.current_line,
                    column=start_column
                )
            )
            indent_offset += self.INDENT_SPACES

        self.indent_column = start_column
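

# A minimal usage sketch (illustrative; not part of the original module). It
# runs a small Metaphor snippet through the lexer and prints every token up
# to END_OF_FILE. The sample text and the "example.m6r" filename are made up
# for the demo, and it assumes Token (from .metaphor_token) exposes .type and
# .value attributes matching the constructor arguments used above.
if __name__ == "__main__":
    sample = (
        "Role: Reviewer\n"
        "    You review Python code.\n"
        "Action: Review\n"
        "    Check the lexer for correctness.\n"
    )
    lexer = MetaphorLexer(sample, "example.m6r")
    token = lexer.get_next_token()
    while token.type != TokenType.END_OF_FILE:
        print(f"{token.type}: {token.value!r}")
        token = lexer.get_next_token()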