Coverage for src/m6rclib/metaphor_parser.py: 100%
231 statements
« prev ^ index » next coverage.py v7.6.1, created at 2025-01-22 17:26 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2025-01-22 17:26 +0000
1# Copyright 2024 M6R Ltd.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15import glob
16import os
17from pathlib import Path
19from typing import List, Set, Optional, Union
21from .metaphor_token import Token, TokenType
22from .embed_lexer import EmbedLexer
23from .metaphor_lexer import MetaphorLexer
24from .metaphor_ast_node import MetaphorASTNode, MetaphorASTNodeType
class MetaphorParserFileAlreadyUsedError(Exception):
    """
    Signal that a file has been requested more than once.

    Attributes:
        filename (str): Name of the file that was requested again.
        token (Token): Token supplied by the code that raised the error.
    """

    def __init__(self, filename: str, token: Token) -> None:
        self.filename: str = filename
        self.token: Token = token
        description = "The file '{}' has already been used.".format(filename)
        super().__init__(description)
class MetaphorParserSyntaxError(Exception):
    """
    Describe a single syntax error together with its source location.

    Attributes:
        message (str): Description of the problem.
        filename (str): File in which the problem was found.
        line (int): Line number of the problem.
        column (int): Column number of the problem.
        input_text (str): Source text associated with the problem.
    """

    def __init__(self, message: str, filename: str, line: int, column: int, input_text: str) -> None:
        self.message: str = message
        self.filename: str = filename
        self.line: int = line
        self.column: int = column
        self.input_text: str = input_text

        # The trailing ", " is part of the established message format.
        location = "file: {}, line {}, column {}, ".format(filename, line, column)
        super().__init__("{}: {}".format(message, location))
class MetaphorParserError(Exception):
    """
    Wrap the syntax errors collected during a parse into one exception.

    Attributes:
        errors (List[MetaphorParserSyntaxError]): The individual errors being reported.
    """

    def __init__(self, message: str, errors: List[MetaphorParserSyntaxError]) -> None:
        self.errors: List[MetaphorParserSyntaxError] = errors
        super().__init__(message)
class MetaphorParser:
    """
    Parser that consumes lexer tokens and builds an Abstract Syntax Tree (AST).

    The parser keeps a stack of lexers so that 'Include' and 'Embed' tokens can
    splice the contents of other files into the token stream.

    Attributes:
        syntax_tree (MetaphorASTNode): The root node of the AST.
        parse_errors (List[MetaphorParserSyntaxError]): Syntax errors recorded so far.
        lexers (List[Union[MetaphorLexer, EmbedLexer]]): Stack of lexers; the last entry is read first.
        previously_seen_files (Set[str]): Canonical (realpath) names of files already loaded.
        search_paths (List[str]): Paths searched for included files.
        embed_path (Optional[str]): Base path for 'Embed' patterns; set by parse().
        current_token (Optional[Token]): The token most recently read from a lexer.
    """

    # Explanatory text attached to the root node, one TEXT node per entry,
    # ahead of the parsed blocks.
    _PREAMBLE = (
        "The following preamble describes some elements of a language called Metaphor. Please pay",
        "extremely close attention to the details as they will affect the way you interpret",
        "everything that follows after \"BEGIN DESCRIPTION IN METAPHOR:\"",
        "",
        "Metaphor has the structure of a document tree with branches and leaves being prefixed",
        "by the keywords \"Role:\", \"Context:\" or \"Action:\". Each of these indicates the",
        "start of a new block of information.",
        "",
        "Blocks have an optional section name that will immediately follow them on the same line.",
        "If this is missing then the section name is not defined.",
        "",
        "After a keyword line there may be one or more lines of text that will describe the purpose",
        "of that block. A block may also include one or more optional child blocks inside them and",
        "that further clarify their parent block. These text blocks and any keywords lines nested",
        "inside a parent block will be indented by 4 spaces more than its parent.",
        "",
        "For example a \"Context:\" indented by 8 spaces is a child of the block above it that is",
        "indented by 4 spaces. One indented 12 spaces would be a child of the block above it that is",
        "indented by 8 spaces.",
        "",
        "Within the text of a block, you may be presented with code or document fragments inside a",
        "block delimited by 3 backticks. Please pay close attention to the indentation level of the",
        "opening 3 backticks. The indentation of such code or document fragments is relative to this,",
        "not relative to the block in which the code or document fragment occurs.",
        "For example, consider:",
        # The two fences sit at different depths on purpose: the point of the
        # example is that indentation is measured from each fragment's own
        # opening backticks.
        "    ```plaintext",
        "    text line 1",
        "      text line 2",
        "    ```",
        "     ```plaintext",
        "      text line 3",
        "     ```",
        "In this example, \"text line 1\" is not indented from the opening 3 backticks and thus has no",
        "indentation. \"text line 2\" is indented by 2 spaces relative to the opening 3 backticks.",
        "\"text line 3\" is indented by 1 space relative to its opening 3 backticks.",
        "",
        "If \"Role:\" blocks exists then these contain details about the role you should fulfil. This",
        "section may also describe specific skills you have, knowledge you should apply, and the",
        # The comma after the next entry matters: without it Python joins the
        # string with the following "" and the blank separator line is lost.
        "approach you take to apply these.",
        "",
        "\"Context:\" blocks provide context necessary to understand what you will be asked to do.",
        "",
        "\"Action:\" blocks describes the task, or tasks, I would like you to do.",
        "",
        "When you process the actions please carefully ensure you do all of them accurately and",
        "complete all the elements requested. Unless otherwise instructed, do not include any",
        "placeholders in your responses.",
        "",
        "BEGIN DESCRIPTION IN METAPHOR:",
    )

    def __init__(self) -> None:
        self.syntax_tree: MetaphorASTNode = MetaphorASTNode(MetaphorASTNodeType.ROOT, "")
        self.parse_errors: List[MetaphorParserSyntaxError] = []
        self.lexers: List[Union[MetaphorLexer, EmbedLexer]] = []
        self.previously_seen_files: Set[str] = set()
        self.search_paths: List[str] = []
        self.embed_path: Optional[str] = None
        self.current_token: Optional[Token] = None

    def _insert_preamble_text(self, text: str) -> None:
        """Attach one line of preamble text to the root node as a TEXT node."""
        self.syntax_tree.attach_child(MetaphorASTNode(MetaphorASTNodeType.TEXT, text))

    def _generate_preamble(self) -> None:
        """Attach every preamble line to the root node, in order."""
        for text in self._PREAMBLE:
            self._insert_preamble_text(text)

    def parse(self, input_text: str, filename: str, search_paths: List[str], embed_path: Optional[str]=None) -> MetaphorASTNode:
        """
        Parse an input string and construct the AST.

        Args:
            input_text (str): The text to be parsed.
            filename (str): The name of the file being parsed.
            search_paths (List[str]): List of paths to search for included files.
            embed_path: Path used to search for embedded files (uses CWD if None or empty).

        Returns:
            MetaphorASTNode: The root node of the AST. Its children are the preamble
                text nodes followed by the top-level blocks in the order they were parsed.

        Raises:
            MetaphorParserError: If any syntax error was recorded, if an included or
                embedded file cannot be found or read, or if a file is included more
                than once. The individual errors are available in its `errors` list.
        """
        self.search_paths = search_paths
        self.embed_path = embed_path or os.getcwd()

        try:
            self.lexers.append(MetaphorLexer(input_text, filename))
            self._generate_preamble()

            # Top-level keyword -> (name used in messages, method that parses the block).
            top_level = {
                TokenType.ACTION: ("Action", self._parse_action),
                TokenType.CONTEXT: ("Context", self._parse_context),
                TokenType.ROLE: ("Role", self._parse_role),
            }
            seen_blocks = set()

            while True:
                token = self.get_next_token()
                if token.type == TokenType.END_OF_FILE:
                    if self.parse_errors:
                        raise MetaphorParserError("parser error", self.parse_errors)

                    return self.syntax_tree

                handler = top_level.get(token.type)
                if handler is None:
                    self._record_syntax_error(token, f"Unexpected token: {token.value} at top level")
                    continue

                block_name, parse_block = handler
                if token.type in seen_blocks:
                    self._record_syntax_error(token, f"'{block_name}' already defined")

                # A duplicate block is still parsed so that errors inside it are
                # reported in the same pass.
                self.syntax_tree.attach_child(parse_block(token))
                seen_blocks.add(token.type)
        except FileNotFoundError as e:
            err_token = self.current_token
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"{e}", err_token.filename, err_token.line, err_token.column, err_token.input
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e
        except MetaphorParserFileAlreadyUsedError as e:
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"The file '{e.filename}' has already been used",
                e.token.filename,
                e.token.line,
                e.token.column,
                e.token.input
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e

    def parse_file(self, filename: str, search_paths: List[str], embed_path: Optional[str]=None) -> MetaphorASTNode:
        """
        Parse a file and construct the AST.

        Args:
            filename (str): The path to the file to be parsed.
            search_paths (List[str]): List of paths to search for included files.
            embed_path: Path used to search for embedded files (uses CWD if None or empty).

        Returns:
            MetaphorASTNode: The root node of the AST (see parse()).

        Raises:
            MetaphorParserError: If the file cannot be read or parsing reports errors.
            MetaphorParserFileAlreadyUsedError: If `filename` has already been loaded
                by this parser instance.
        """
        try:
            self._check_file_not_loaded(filename)
            input_text = self._read_file(filename)
            # A MetaphorParserError raised by parse() already carries
            # self.parse_errors, so it is allowed to propagate unchanged.
            return self.parse(input_text, filename, search_paths, embed_path)
        except FileNotFoundError as e:
            # No token exists yet, so the error carries an empty location.
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"{e}", "", 0, 0, ""
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e

    def get_next_token(self) -> Token:
        """
        Get the next token from the active lexer.

        'Include' and 'Embed' tokens are handled here by pushing new lexers, and a
        lexer that reaches its end of file is popped, so callers only see an
        END_OF_FILE token once every lexer is exhausted.
        """
        while self.lexers:
            token = self.lexers[-1].get_next_token()
            self.current_token = token

            if token.type == TokenType.INCLUDE:
                self._parse_include()
            elif token.type == TokenType.EMBED:
                self._parse_embed()
            elif token.type == TokenType.END_OF_FILE:
                self.lexers.pop()
            else:
                return token

        return Token(TokenType.END_OF_FILE, "", "", "", 0, 0)

    def _record_syntax_error(self, token: Token, message: str) -> None:
        """Record a syntax error at the position of `token`; this does not raise."""
        self.parse_errors.append(MetaphorParserSyntaxError(
            message, token.filename, token.line, token.column, token.input
        ))

    def _find_file_path(self, filename: str) -> str:
        """
        Find a valid path for a file, trying the name as given and then each search path.

        Raises:
            FileNotFoundError: If no candidate path exists.
        """
        if Path(filename).exists():
            return filename

        # If we don't have an absolute path then we can try search paths.
        if not os.path.isabs(filename):
            for path in self.search_paths:
                try_name = os.path.join(path, filename)
                if Path(try_name).exists():
                    return try_name

        raise FileNotFoundError(f"File not found: {filename}")

    def _read_file(self, filename: str) -> str:
        """
        Read a UTF-8 text file into memory.

        Raises:
            FileNotFoundError: For any OS-level failure. Every OSError is reported
                as FileNotFoundError so callers have a single exception to handle;
                the message says what actually went wrong.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"File not found: {filename}") from e
        except PermissionError as e:
            raise FileNotFoundError(f"You do not have permission to access: {filename}") from e
        except IsADirectoryError as e:
            raise FileNotFoundError(f"Is a directory: {filename}") from e
        except OSError as e:
            raise FileNotFoundError(f"OS error: {e}") from e

    def _check_file_not_loaded(self, filename: str) -> None:
        """
        Check we have not already loaded a file, and remember it if not.

        Raises:
            MetaphorParserFileAlreadyUsedError: If the canonical path of `filename`
                has been seen before.
        """
        canonical_filename = os.path.realpath(filename)
        if canonical_filename in self.previously_seen_files:
            raise MetaphorParserFileAlreadyUsedError(filename, self.current_token)

        self.previously_seen_files.add(canonical_filename)

    def _parse_text(self, token: Token) -> MetaphorASTNode:
        """Parse a text block."""
        return MetaphorASTNode(MetaphorASTNodeType.TEXT, token.value)

    def _parse_block(
        self,
        token: Token,
        block_token_type: TokenType,
        node_type: MetaphorASTNodeType,
        block_name: str,
        article: str
    ) -> MetaphorASTNode:
        """
        Parse one keyword block and everything nested inside it.

        Args:
            token: The keyword token that opened the block.
            block_token_type: Token type of the keyword; only child blocks of this
                same type are accepted inside the block.
            node_type: AST node type to create for the block.
            block_name: Keyword name used in error messages (e.g. "Action").
            article: "a" or "an", used in the "Text must come first" message.

        Returns:
            MetaphorASTNode: The node for this block with its text and child blocks attached.
        """
        label_name = ""

        init_token = self.get_next_token()
        if init_token.type == TokenType.KEYWORD_TEXT:
            label_name = init_token.value
            indent_token = self.get_next_token()
            if indent_token.type != TokenType.INDENT:
                self._record_syntax_error(
                    token,
                    f"Expected indent after keyword description for '{block_name}' block"
                )
        elif init_token.type != TokenType.INDENT:
            self._record_syntax_error(token, f"Expected description or indent for '{block_name}' block")

        block_node = MetaphorASTNode(node_type, label_name)
        seen_child_block = False

        while True:
            token = self.get_next_token()
            if token.type == TokenType.TEXT:
                if seen_child_block:
                    self._record_syntax_error(
                        token,
                        f"Text must come first in {article} '{block_name}' block"
                    )

                # Misplaced text is still attached so parsing can continue.
                block_node.attach_child(self._parse_text(token))
            elif token.type == block_token_type:
                block_node.attach_child(
                    self._parse_block(token, block_token_type, node_type, block_name, article)
                )
                seen_child_block = True
            elif token.type == TokenType.OUTDENT or token.type == TokenType.END_OF_FILE:
                return block_node
            else:
                self._record_syntax_error(
                    token,
                    f"Unexpected token: {token.value} in '{block_name}' block"
                )

    def _parse_action(self, token: Token) -> MetaphorASTNode:
        """Parse an action block and construct its AST node."""
        return self._parse_block(token, TokenType.ACTION, MetaphorASTNodeType.ACTION, "Action", "an")

    def _parse_context(self, token: Token) -> MetaphorASTNode:
        """Parse a Context block."""
        return self._parse_block(token, TokenType.CONTEXT, MetaphorASTNodeType.CONTEXT, "Context", "a")

    def _parse_role(self, token: Token) -> MetaphorASTNode:
        """Parse a Role block."""
        return self._parse_block(token, TokenType.ROLE, MetaphorASTNodeType.ROLE, "Role", "a")

    def _parse_include(self) -> None:
        """
        Parse an Include block and push a lexer for the included file.

        Raises:
            FileNotFoundError: If the file cannot be found or read.
            MetaphorParserFileAlreadyUsedError: If the file has already been loaded.
        """
        token_next = self.get_next_token()
        if token_next.type != TokenType.KEYWORD_TEXT:
            self._record_syntax_error(token_next, "Expected file name for 'Include'")
            return

        filename = token_next.value

        # Resolve against the search paths before the duplicate check, so the
        # check canonicalises the file that will actually be opened rather than
        # the name as written relative to the current directory.
        try_file = self._find_file_path(filename)
        self._check_file_not_loaded(try_file)
        input_text = self._read_file(try_file)
        self.lexers.append(MetaphorLexer(input_text, try_file))

    def _parse_embed(self) -> None:
        """
        Parse an Embed block and push a lexer for every file matching its pattern.

        The pattern is resolved relative to `embed_path`; a pattern containing
        "**/" is matched recursively.

        Raises:
            FileNotFoundError: If a matched file cannot be read.
        """
        token_next = self.get_next_token()
        if token_next.type != TokenType.KEYWORD_TEXT:
            self._record_syntax_error(token_next, "Expected file name or wildcard match for 'Embed'")
            return

        match = token_next.value
        path = os.path.join(self.embed_path, match)
        files = glob.glob(path, recursive="**/" in match)
        if not files:
            self._record_syntax_error(token_next, f"{match} does not match any files for 'Embed' in {self.embed_path}")
            return

        for file in files:
            input_text = self._read_file(file)
            self.lexers.append(EmbedLexer(input_text, file))