Coverage for src/m6rclib/metaphor_parser.py: 100%
228 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-19 11:15 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-19 11:15 +0000
1# Copyright 2024 M6R Ltd.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15import glob
16import os
17from pathlib import Path
19from typing import List, Set, Optional, Union
21from .metaphor_token import Token, TokenType
22from .embed_lexer import EmbedLexer
23from .metaphor_lexer import MetaphorLexer
24from .metaphor_ast_node import MetaphorASTNode, MetaphorASTNodeType
class MetaphorParserFileAlreadyUsedError(Exception):
    """
    Signal that a file has been requested a second time.

    Attributes:
        filename (str): The file name that was requested again.
        token (Token): The token handed over by the code that raised the error.
    """

    def __init__(self, filename: str, token: Token) -> None:
        description = f"The file '{filename}' has already been used."
        super().__init__(description)

        # Kept on the exception so a handler can report where the reuse happened.
        self.token: Token = token
        self.filename: str = filename
class MetaphorParserSyntaxError(Exception):
    """
    Describe a single syntax error together with the location where it was found.

    Attributes:
        message (str): Description of the problem.
        filename (str): Name of the file in which the problem was found.
        line (int): Line number of the problem.
        column (int): Column number of the problem.
        input_text (str): The input text associated with the problem.
    """

    def __init__(self, message: str, filename: str, line: int, column: int, input_text: str) -> None:
        # The summary used to end with a dangling ", " separator that was followed
        # by nothing; it now stops after the column number.
        super().__init__(f"{message}: file: {filename}, line {line}, column {column}")
        self.message: str = message
        self.filename: str = filename
        self.line: int = line
        self.column: int = column
        self.input_text: str = input_text
class MetaphorParserError(Exception):
    """
    Wrap every syntax error collected during a parse in one exception.

    Attributes:
        errors (List[MetaphorParserSyntaxError]): The individual syntax errors
            supplied by the code that raised this exception.
    """

    def __init__(self, message: str, errors: List[MetaphorParserSyntaxError]) -> None:
        # Keep the caller's list itself (no copy), exactly as it was handed over.
        self.errors: List[MetaphorParserSyntaxError] = errors
        super().__init__(message)
52class MetaphorParser:
53 """
54 Parser class to process tokens and build an Abstract Syntax Tree (AST).
56 Attributes:
57 syntax_tree (MetaphorASTNode): The root node of the AST.
58 parse_errors (List[MetaphorParserSyntaxError]): List of syntax errors encountered during parsing.
59 lexers (List[Union[MetaphorLexer, EmbedLexer]]): Stack of lexers used for parsing multiple files.
60 previously_seen_files (Set[str]): Set of canonical filenames already processed.
61 search_paths (List[str]): List of paths to search for included files.
62 current_token (Optional[Token]): The current token being processed.
63 """
def __init__(self) -> None:
    """Create a parser holding an empty ROOT node and no recorded state."""
    # Token most recently read from a lexer; None until the first read.
    self.current_token: Optional[Token] = None

    # File-lookup state: directories to search and canonical names already used.
    self.search_paths: List[str] = []
    self.previously_seen_files: Set[str] = set()

    # Lexers are used as a stack; the last entry is the one currently being read.
    self.lexers: List[Union[MetaphorLexer, EmbedLexer]] = []

    # Parse results: collected syntax errors and the tree being built.
    self.parse_errors: List[MetaphorParserSyntaxError] = []
    self.syntax_tree: MetaphorASTNode = MetaphorASTNode(MetaphorASTNodeType.ROOT, "")
def _insert_preamble_text(self, text: str) -> None:
    """Attach one line of preamble text to the root of the syntax tree as a TEXT node."""
    text_node = MetaphorASTNode(MetaphorASTNodeType.TEXT, text)
    self.syntax_tree.attach_child(text_node)
75 def _generate_preamble(self) -> None:
76 preamble: List[str] = [
77 "The following preamble describes some elements of a language called Metaphor. Please pay",
78 "extremely close attention to the details as they will affect the way you interpret",
79 "everything that follows after \"BEGIN DESCRIPTION IN METAPHOR:\"",
80 "",
81 "Metaphor has the structure of a document tree with branches and leaves being prefixed",
82 "by the keywords \"Role:\", \"Context:\" or \"Action:\". Each of these indicates the",
83 "start of a new block of information.",
84 "",
85 "Blocks have an optional section name that will immediately follow them on the same line.",
86 "If this is missing then the section name is not defined.",
87 "",
88 "After a keyword line there may be one or more lines of text that will describe the purpose",
89 "of that block. A block may also include one or more optional child blocks inside them and",
90 "that further clarify their parent block. These text blocks and any keywords lines nested",
91 "inside a parent block will be indented by 4 spaces more than its parent.",
92 "",
93 "For example a \"Context:\" indented by 8 spaces is a child of the block above it that is",
94 "indented by 4 spaces. One indented 12 spaces would be a child of the block above it that is",
95 "indented by 8 spaces.",
96 "",
97 "Within the text of a block, you may be presented with code or document fragments inside a",
98 "block delimited by 3 backticks. Please pay close attention to the indentation level of the",
99 "opening 3 backticks. The identation of such code or document fragments is relative to this,",
100 "not relative to the block in which the code or document fragment occurs.",
101 "For example, consider:",
102 " ```plaintext",
103 " text line 1",
104 " text line 2",
105 " ```",
106 " ```plaintext",
107 " text line 3",
108 " ```"
109 "In this example, \"text line 1\" is not indented from the opening 3 backticks and thus has no",
110 "indentation. \"text line 2\" is indented by 2 spaces relative to the opening 3 backticks",
111 " \"text line 3\" is indented by 1 space relative to its opening 3 backticks.",
112 "",
113 "If \"Role:\" blocks exists then these contain details about the role you should fulfil. This",
114 "section may also describe specific skills you have, knowledge you should apply, and the",
115 "approach you take to apply these."
116 "",
117 "\"Context:\" blocks provide context necessary to understand what you will be asked to do.",
118 "",
119 "\"Action:\" blocks describes the task, or tasks, I would like you to do.",
120 "",
121 "When you process the actions please carefully ensure you do all of them accurately and",
122 "complete all the elements requested. Unless otherwise instructed, do not include any",
123 "placeholders in your responses.",
124 "",
125 "BEGIN DESCRIPTION IN METAPHOR:"
126 ]
128 for text in preamble:
129 self._insert_preamble_text(text)
def parse(self, input_text: str, filename: str, search_paths: List[str]) -> MetaphorASTNode:
    """
    Parse an input string and construct the AST.

    The preamble is inserted first; each top-level 'Action', 'Context' and
    'Role' block is then parsed and attached to the root node. A second block
    of the same kind is recorded as a syntax error but is still parsed and
    attached.

    Args:
        input_text (str): The text to be parsed.
        filename (str): The name of the file being parsed.
        search_paths (List[str]): List of paths to search for included files.

    Returns:
        MetaphorASTNode: The root node of the AST.

    Raises:
        MetaphorParserError: If any syntax error was recorded, if a file needed
            while reading tokens could not be found or read, or if a file was
            used more than once.
    """
    self.search_paths = search_paths

    # One flag per top-level block kind, set once a block of that kind was parsed.
    already_defined = {"action": False, "context": False, "role": False}

    try:
        self.lexers.append(MetaphorLexer(input_text, filename))
        self._generate_preamble()

        while True:
            token = self.get_next_token()
            kind = token.type

            if kind == TokenType.END_OF_FILE:
                # Errors are collected while parsing and only reported at the end.
                if not self.parse_errors:
                    return self.syntax_tree

                raise MetaphorParserError("parser error", self.parse_errors)

            if kind == TokenType.ACTION:
                key = "action"
                duplicate_message = "'Action' already defined"
                parse_block = self._parse_action
            elif kind == TokenType.CONTEXT:
                key = "context"
                duplicate_message = "'Context' already defined"
                parse_block = self._parse_context
            elif kind == TokenType.ROLE:
                key = "role"
                duplicate_message = "'Role' already defined"
                parse_block = self._parse_role
            else:
                self._record_syntax_error(token, f"Unexpected token: {token.value} at top level")
                continue

            if already_defined[key]:
                self._record_syntax_error(token, duplicate_message)

            self.syntax_tree.attach_child(parse_block(token))
            already_defined[key] = True
    except FileNotFoundError as e:
        # Report the failure at the position of the token being processed.
        where = self.current_token
        location_error = MetaphorParserSyntaxError(
            f"{e}", where.filename, where.line, where.column, where.input
        )
        self.parse_errors.append(location_error)
        raise MetaphorParserError("parser error", self.parse_errors) from e
    except MetaphorParserFileAlreadyUsedError as e:
        where = e.token
        reuse_error = MetaphorParserSyntaxError(
            f"The file '{e.filename}' has already been used",
            where.filename,
            where.line,
            where.column,
            where.input
        )
        self.parse_errors.append(reuse_error)
        raise MetaphorParserError("parser error", self.parse_errors) from e
def parse_file(self, filename: str, search_paths: List[str]) -> MetaphorASTNode:
    """
    Read a file and parse its contents into the AST.

    Args:
        filename (str): The path to the file to be parsed.
        search_paths (List[str]): List of paths to search for included files.

    Returns:
        MetaphorASTNode: The root node of the AST.

    Raises:
        MetaphorParserError: If the file cannot be read or parsing reports errors.
        MetaphorParserFileAlreadyUsedError: If the initial duplicate check finds
            that this parser has already used the file; that check runs here
            and is not covered by the handlers below.
    """
    try:
        self._check_file_not_loaded(filename)
        contents = self._read_file(filename)
        return self.parse(contents, filename, search_paths)
    except (FileNotFoundError, MetaphorParserError) as e:
        if isinstance(e, FileNotFoundError):
            # No token exists yet, so the error carries an empty location.
            self.parse_errors.append(MetaphorParserSyntaxError(f"{e}", "", 0, 0, ""))

        raise MetaphorParserError("parser error", self.parse_errors) from e
def get_next_token(self) -> Token:
    """
    Return the next token that the block parsers need to see.

    Tokens are read from the lexer on top of the stack. Include and Embed
    tokens are handled here, and an exhausted lexer is popped so that reading
    resumes from the one beneath it. When no lexer is left, an END_OF_FILE
    token is returned.
    """
    while self.lexers:
        token = self.lexers[-1].get_next_token()
        self.current_token = token
        kind = token.type

        if kind == TokenType.INCLUDE:
            self._parse_include()
            continue

        if kind == TokenType.EMBED:
            self._parse_embed()
            continue

        if kind == TokenType.END_OF_FILE:
            self.lexers.pop()
            continue

        return token

    return Token(TokenType.END_OF_FILE, "", "", "", 0, 0)
def _record_syntax_error(self, token, message):
    """Add a syntax error for the token's location to the error list; nothing is raised."""
    self.parse_errors.append(
        MetaphorParserSyntaxError(message, token.filename, token.line, token.column, token.input)
    )
252 def _find_file_path(self, filename):
253 """Try to find a valid path for a file, given all the search path options"""
254 if Path(filename).exists():
255 return filename
257 # If we don't have an absolute path then we can try search paths.
258 if not os.path.isabs(filename):
259 for path in self.search_paths:
260 try_name = os.path.join(path, filename)
261 if Path(try_name).exists():
262 return try_name
264 raise FileNotFoundError(f"File not found: {filename}")
266 def _read_file(self, filename):
267 """Read file content into memory."""
268 try:
269 with open(filename, 'r', encoding='utf-8') as file:
270 return file.read()
271 except FileNotFoundError as e:
272 raise FileNotFoundError(f"File not found: {filename}") from e
273 except PermissionError as e:
274 raise FileNotFoundError(f"You do not have permission to access: {filename}") from e
275 except IsADirectoryError as e:
276 raise FileNotFoundError(f"Is a directory: {filename}") from e
277 except OSError as e:
278 raise FileNotFoundError(f"OS error: {e}") from e
280 def _check_file_not_loaded(self, filename):
281 """Check we have not already loaded a file."""
282 canonical_filename = os.path.realpath(filename)
283 if canonical_filename in self.previously_seen_files:
284 raise MetaphorParserFileAlreadyUsedError(filename, self.current_token)
286 self.previously_seen_files.add(canonical_filename)
def _parse_text(self, token):
    """Build a TEXT node holding the value of the given token."""
    text_value = token.value
    return MetaphorASTNode(MetaphorASTNodeType.TEXT, text_value)
def _parse_action(self, token):
    """
    Parse an 'Action' block, including nested 'Action' blocks, into an AST node.

    The keyword may be followed by a description, which becomes the node's
    label, and must then be followed by an indent. Text lines have to come
    before any nested block. Problems are recorded as syntax errors and
    parsing continues until an outdent or the end of the input.
    """
    header = self.get_next_token()
    if header.type == TokenType.KEYWORD_TEXT:
        label = header.value
        if self.get_next_token().type != TokenType.INDENT:
            self._record_syntax_error(
                token,
                "Expected indent after keyword description for 'Action' block"
            )
    else:
        label = ""
        if header.type != TokenType.INDENT:
            self._record_syntax_error(token, "Expected description or indent for 'Action' block")

    node = MetaphorASTNode(MetaphorASTNodeType.ACTION, label)

    # Becomes True once a nested block was parsed; text after that is an error.
    nested_block_seen = False

    while True:
        child = self.get_next_token()
        kind = child.type

        if kind == TokenType.OUTDENT or kind == TokenType.END_OF_FILE:
            return node

        if kind == TokenType.TEXT:
            if nested_block_seen:
                self._record_syntax_error(child, "Text must come first in an 'Action' block")

            node.attach_child(self._parse_text(child))
        elif kind == TokenType.ACTION:
            node.attach_child(self._parse_action(child))
            nested_block_seen = True
        else:
            self._record_syntax_error(
                child,
                f"Unexpected token: {child.value} in 'Action' block"
            )
def _parse_context(self, token):
    """
    Parse a 'Context' block, including nested 'Context' blocks, into an AST node.

    The keyword may be followed by a description, which becomes the node's
    label, and must then be followed by an indent. Text lines have to come
    before any nested block. Problems are recorded as syntax errors and
    parsing continues until an outdent or the end of the input.
    """
    header = self.get_next_token()
    if header.type == TokenType.KEYWORD_TEXT:
        label = header.value
        if self.get_next_token().type != TokenType.INDENT:
            self._record_syntax_error(
                token,
                "Expected indent after keyword description for 'Context' block"
            )
    else:
        label = ""
        if header.type != TokenType.INDENT:
            self._record_syntax_error(token, "Expected description or indent for 'Context' block")

    node = MetaphorASTNode(MetaphorASTNodeType.CONTEXT, label)

    # Becomes True once a nested block was parsed; text after that is an error.
    nested_block_seen = False

    while True:
        child = self.get_next_token()
        kind = child.type

        if kind == TokenType.OUTDENT or kind == TokenType.END_OF_FILE:
            return node

        if kind == TokenType.TEXT:
            if nested_block_seen:
                self._record_syntax_error(child, "Text must come first in a 'Context' block")

            node.attach_child(self._parse_text(child))
        elif kind == TokenType.CONTEXT:
            node.attach_child(self._parse_context(child))
            nested_block_seen = True
        else:
            self._record_syntax_error(
                child,
                f"Unexpected token: {child.value} in 'Context' block"
            )
def _parse_role(self, token):
    """
    Parse a 'Role' block, including nested 'Role' blocks, into an AST node.

    The keyword may be followed by a description, which becomes the node's
    label, and must then be followed by an indent. Text lines have to come
    before any nested block. Problems are recorded as syntax errors and
    parsing continues until an outdent or the end of the input.
    """
    header = self.get_next_token()
    if header.type == TokenType.KEYWORD_TEXT:
        label = header.value
        if self.get_next_token().type != TokenType.INDENT:
            self._record_syntax_error(
                token,
                "Expected indent after keyword description for 'Role' block"
            )
    else:
        label = ""
        if header.type != TokenType.INDENT:
            self._record_syntax_error(token, "Expected description or indent for 'Role' block")

    node = MetaphorASTNode(MetaphorASTNodeType.ROLE, label)

    # Becomes True once a nested block was parsed; text after that is an error.
    nested_block_seen = False

    while True:
        child = self.get_next_token()
        kind = child.type

        if kind == TokenType.OUTDENT or kind == TokenType.END_OF_FILE:
            return node

        if kind == TokenType.TEXT:
            if nested_block_seen:
                self._record_syntax_error(child, "Text must come first in a 'Role' block")

            node.attach_child(self._parse_text(child))
        elif kind == TokenType.ROLE:
            node.attach_child(self._parse_role(child))
            nested_block_seen = True
        else:
            self._record_syntax_error(
                child,
                f"Unexpected token: {child.value} in 'Role' block"
            )
def _parse_include(self):
    """
    Handle an 'Include': locate the named file and push a lexer for its content.

    The token after the keyword must carry the file name; otherwise a syntax
    error is recorded and nothing is loaded.
    """
    name_token = self.get_next_token()
    if name_token.type == TokenType.KEYWORD_TEXT:
        requested = name_token.value

        # The duplicate check uses the name as written, before it is resolved
        # against the search paths.
        self._check_file_not_loaded(requested)
        resolved = self._find_file_path(requested)
        contents = self._read_file(resolved)
        self.lexers.append(MetaphorLexer(contents, resolved))
        return

    self._record_syntax_error(name_token, "Expected file name for 'Include'")
def _parse_embed(self):
    """
    Handle an 'Embed': push an embed lexer for every file matching a pattern.

    The token after the keyword must carry a file name or wildcard pattern.
    A pattern containing "**/" is expanded recursively. A missing pattern or
    a pattern without matches is recorded as a syntax error.
    """
    pattern_token = self.get_next_token()
    if pattern_token.type != TokenType.KEYWORD_TEXT:
        self._record_syntax_error(pattern_token, "Expected file name or wildcard match for 'Embed'")
        return

    pattern = pattern_token.value
    matches = glob.glob(pattern, recursive="**/" in pattern)
    if not matches:
        self._record_syntax_error(pattern_token, f"{pattern} does not match any files for 'Embed'")
        return

    # NOTE(review): the order of glob results depends on the file system, so
    # the order in which embedded files are read may vary between machines -
    # confirm whether a stable order is wanted.
    for path in matches:
        contents = self._read_file(path)
        self.lexers.append(EmbedLexer(contents, path))