Coverage for src/m6rclib/metaphor_parser.py: 100%

231 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2025-01-22 17:26 +0000

1# Copyright 2024 M6R Ltd. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15import glob 

16import os 

17from pathlib import Path 

18 

19from typing import List, Set, Optional, Union 

20 

21from .metaphor_token import Token, TokenType 

22from .embed_lexer import EmbedLexer 

23from .metaphor_lexer import MetaphorLexer 

24from .metaphor_ast_node import MetaphorASTNode, MetaphorASTNodeType 

25 

class MetaphorParserFileAlreadyUsedError(Exception):
    """
    Signal that a file was requested again after it had already been used.

    Attributes:
        filename (str): Name of the file that was requested more than once.
        token (Token): Token supplied by the raiser, kept so a handler can report a location.
    """
    def __init__(self, filename: str, token: "Token") -> None:
        description = f"The file '{filename}' has already been used."
        super().__init__(description)

        # Keep the raw details so handlers do not have to parse the message text.
        self.token: "Token" = token
        self.filename: str = filename

32 

33 

class MetaphorParserSyntaxError(Exception):
    """
    Describe a single syntax error found while parsing.

    The exception text combines the message with the file name, line and column. The
    individual pieces are also stored as attributes so callers can format their own report.

    Attributes:
        message (str): Description of the problem.
        filename (str): Name of the file in which the problem was found.
        line (int): Line number reported for the problem.
        column (int): Column number reported for the problem.
        input_text (str): Input text associated with the problem.
    """
    def __init__(self, message: str, filename: str, line: int, column: int, input_text: str) -> None:
        # The location is the last item in the text, so no trailing separator is emitted.
        super().__init__(f"{message}: file: {filename}, line {line}, column {column}")
        self.message: str = message
        self.filename: str = filename
        self.line: int = line
        self.column: int = column
        self.input_text: str = input_text

43 

44 

class MetaphorParserError(Exception):
    """
    Aggregate exception carrying every syntax error collected by the parser.

    Attributes:
        errors (List[MetaphorParserSyntaxError]): The individual errors, in the order supplied.
    """
    def __init__(self, message: str, errors: List["MetaphorParserSyntaxError"]) -> None:
        super().__init__(message)

        # The list is stored as given (not copied).
        self.errors: List["MetaphorParserSyntaxError"] = errors

50 

51 

class MetaphorParser:
    """
    Parser class to process tokens and build an Abstract Syntax Tree (AST).

    State accumulates across calls to parse()/parse_file(); no method in this class resets it.

    Attributes:
        syntax_tree (MetaphorASTNode): The root node of the AST.
        parse_errors (List[MetaphorParserSyntaxError]): List of syntax errors encountered during parsing.
        lexers (List[Union[MetaphorLexer, EmbedLexer]]): Stack of lexers used for parsing multiple files.
        previously_seen_files (Set[str]): Set of canonical filenames already processed.
        search_paths (List[str]): List of paths to search for included files.
        embed_path (Optional[str]): Base path used to resolve embedded files (set by parse()).
        current_token (Optional[Token]): The current token being processed.
    """

    # Text attached to the syntax tree, one TEXT node per entry, ahead of the parsed document.
    # Every entry must be followed by a comma: adjacent string literals are silently
    # concatenated, which would drop an entry from the preamble.
    _PREAMBLE = (
        "The following preamble describes some elements of a language called Metaphor. Please pay",
        "extremely close attention to the details as they will affect the way you interpret",
        "everything that follows after \"BEGIN DESCRIPTION IN METAPHOR:\"",
        "",
        "Metaphor has the structure of a document tree with branches and leaves being prefixed",
        "by the keywords \"Role:\", \"Context:\" or \"Action:\". Each of these indicates the",
        "start of a new block of information.",
        "",
        "Blocks have an optional section name that will immediately follow them on the same line.",
        "If this is missing then the section name is not defined.",
        "",
        "After a keyword line there may be one or more lines of text that will describe the purpose",
        "of that block. A block may also include one or more optional child blocks inside them and",
        "that further clarify their parent block. These text blocks and any keywords lines nested",
        "inside a parent block will be indented by 4 spaces more than its parent.",
        "",
        "For example a \"Context:\" indented by 8 spaces is a child of the block above it that is",
        "indented by 4 spaces. One indented 12 spaces would be a child of the block above it that is",
        "indented by 8 spaces.",
        "",
        "Within the text of a block, you may be presented with code or document fragments inside a",
        "block delimited by 3 backticks. Please pay close attention to the indentation level of the",
        "opening 3 backticks. The identation of such code or document fragments is relative to this,",
        "not relative to the block in which the code or document fragment occurs.",
        "For example, consider:",
        # The indentation inside the next seven entries is significant: it has to match the
        # description that follows them (0, 2 and 1 spaces relative to the opening backticks).
        "    ```plaintext",
        "    text line 1",
        "      text line 2",
        "    ```",
        "        ```plaintext",
        "         text line 3",
        "        ```",
        "In this example, \"text line 1\" is not indented from the opening 3 backticks and thus has no",
        "indentation. \"text line 2\" is indented by 2 spaces relative to the opening 3 backticks",
        " \"text line 3\" is indented by 1 space relative to its opening 3 backticks.",
        "",
        "If \"Role:\" blocks exists then these contain details about the role you should fulfil. This",
        "section may also describe specific skills you have, knowledge you should apply, and the",
        "approach you take to apply these.",
        "",
        "\"Context:\" blocks provide context necessary to understand what you will be asked to do.",
        "",
        "\"Action:\" blocks describes the task, or tasks, I would like you to do.",
        "",
        "When you process the actions please carefully ensure you do all of them accurately and",
        "complete all the elements requested. Unless otherwise instructed, do not include any",
        "placeholders in your responses.",
        "",
        "BEGIN DESCRIPTION IN METAPHOR:",
    )

    def __init__(self) -> None:
        self.syntax_tree: MetaphorASTNode = MetaphorASTNode(MetaphorASTNodeType.ROOT, "")
        self.parse_errors: List[MetaphorParserSyntaxError] = []
        self.lexers: List[Union[MetaphorLexer, EmbedLexer]] = []
        self.previously_seen_files: Set[str] = set()
        self.search_paths: List[str] = []
        self.embed_path: Optional[str] = None
        self.current_token: Optional[Token] = None

    def _insert_preamble_text(self, text: str) -> None:
        """Attach one line of preamble text to the root of the syntax tree as a TEXT node."""
        self.syntax_tree.attach_child(MetaphorASTNode(MetaphorASTNodeType.TEXT, text))

    def _generate_preamble(self) -> None:
        """Attach every preamble line to the root of the syntax tree, in order."""
        for text in self._PREAMBLE:
            self._insert_preamble_text(text)

    def parse(
        self,
        input_text: str,
        filename: str,
        search_paths: List[str],
        embed_path: Optional[str] = None
    ) -> "MetaphorASTNode":
        """
        Parse an input string and construct the AST.

        Args:
            input_text (str): The text to be parsed.
            filename (str): The name of the file being parsed.
            search_paths (List[str]): List of paths to search for included files.
            embed_path: Path used to search for embedded files (uses CWD if None or empty).

        Returns:
            MetaphorASTNode: The root node of the AST. Its children are the preamble text nodes
                followed by the top-level blocks in the order they were parsed.

        Raises:
            MetaphorParserError: If any syntax error was recorded, if an included or embedded
                file could not be read, or if a file was included more than once. The
                individual problems are available in the exception's `errors` list.
        """
        self.search_paths = search_paths
        self.embed_path = embed_path if embed_path else os.getcwd()

        try:
            self.lexers.append(MetaphorLexer(input_text, filename))
            self._generate_preamble()
            return self._parse_top_level()
        except FileNotFoundError as e:
            # Raised while loading an included or embedded file; report it at the token that
            # was being processed at the time.
            err_token = self.current_token
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"{e}", err_token.filename, err_token.line, err_token.column, err_token.input
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e
        except MetaphorParserFileAlreadyUsedError as e:
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"The file '{e.filename}' has already been used",
                e.token.filename,
                e.token.line,
                e.token.column,
                e.token.input
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e

    def _parse_top_level(self) -> "MetaphorASTNode":
        """
        Consume top-level tokens until end of input and return the root of the AST.

        Each of 'Action', 'Context' and 'Role' may appear once at the top level. A repeated
        block is recorded as a syntax error but is still parsed and attached, so that any
        further errors inside it are reported too.

        Raises:
            MetaphorParserError: If any syntax error has been recorded by the end of input.
        """
        block_parsers = {
            TokenType.ACTION: ("Action", self._parse_action),
            TokenType.CONTEXT: ("Context", self._parse_context),
            TokenType.ROLE: ("Role", self._parse_role),
        }
        seen_keywords: Set[str] = set()

        while True:
            token = self.get_next_token()
            if token.type == TokenType.END_OF_FILE:
                if self.parse_errors:
                    raise MetaphorParserError("parser error", self.parse_errors)

                return self.syntax_tree

            if token.type not in block_parsers:
                self._record_syntax_error(token, f"Unexpected token: {token.value} at top level")
                continue

            keyword, parse_block = block_parsers[token.type]
            if keyword in seen_keywords:
                self._record_syntax_error(token, f"'{keyword}' already defined")

            self.syntax_tree.attach_child(parse_block(token))
            seen_keywords.add(keyword)

    def parse_file(
        self,
        filename: str,
        search_paths: List[str],
        embed_path: Optional[str] = None
    ) -> "MetaphorASTNode":
        """
        Parse a file and construct the AST.

        Args:
            filename (str): The path to the file to be parsed.
            search_paths (List[str]): List of paths to search for included files.
            embed_path: Path used to search for embedded files (uses CWD if None or empty).

        Returns:
            MetaphorASTNode: The root node of the AST.

        Raises:
            MetaphorParserError: If the file cannot be read or there are errors during parsing.
            MetaphorParserFileAlreadyUsedError: If `filename` has already been loaded by this
                parser instance (this check happens before parsing and is not wrapped).
        """
        try:
            self._check_file_not_loaded(filename)
            input_text = self._read_file(filename)
            return self.parse(input_text, filename, search_paths, embed_path)
        except FileNotFoundError as e:
            # No token exists yet for the top-level file, so no location can be reported.
            self.parse_errors.append(MetaphorParserSyntaxError(
                f"{e}", "", 0, 0, ""
            ))
            raise MetaphorParserError("parser error", self.parse_errors) from e
        except MetaphorParserError as e:
            raise MetaphorParserError("parser error", self.parse_errors) from e

    def get_next_token(self) -> "Token":
        """
        Get the next token from the active lexer.

        Include and Embed tokens are handled here by pushing new lexers onto the stack, and a
        lexer that reaches its end of file is popped, so callers only see an END_OF_FILE token
        once every lexer has been exhausted.
        """
        while self.lexers:
            lexer = self.lexers[-1]
            token = lexer.get_next_token()
            self.current_token = token

            if token.type == TokenType.INCLUDE:
                self._parse_include()
            elif token.type == TokenType.EMBED:
                self._parse_embed()
            elif token.type == TokenType.END_OF_FILE:
                self.lexers.pop()
            else:
                return token

        return Token(TokenType.END_OF_FILE, "", "", "", 0, 0)

    def _record_syntax_error(self, token: "Token", message: str) -> None:
        """Record a syntax error at the token's location; parsing continues afterwards."""
        error = MetaphorParserSyntaxError(
            message, token.filename, token.line, token.column, token.input
        )
        self.parse_errors.append(error)

    def _find_file_path(self, filename: str) -> str:
        """
        Try to find a valid path for a file, given all the search path options.

        The name is tried as given first, then (for relative names only) joined to each search
        path in order.

        Raises:
            FileNotFoundError: If no candidate path exists.
        """
        if Path(filename).exists():
            return filename

        # If we don't have an absolute path then we can try search paths.
        if not os.path.isabs(filename):
            for path in self.search_paths:
                try_name = os.path.join(path, filename)
                if Path(try_name).exists():
                    return try_name

        raise FileNotFoundError(f"File not found: {filename}")

    def _read_file(self, filename: str) -> str:
        """
        Read file content into memory as UTF-8 text.

        Raises:
            FileNotFoundError: For any OS-level failure to open or read the file. Every
                OSError is translated to this one type so that callers have a single
                exception to handle.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"File not found: {filename}") from e
        except PermissionError as e:
            raise FileNotFoundError(f"You do not have permission to access: {filename}") from e
        except IsADirectoryError as e:
            raise FileNotFoundError(f"Is a directory: {filename}") from e
        except OSError as e:
            raise FileNotFoundError(f"OS error: {e}") from e

    def _check_file_not_loaded(self, filename: str) -> None:
        """
        Check we have not already loaded a file, and remember it if not.

        Raises:
            MetaphorParserFileAlreadyUsedError: If the canonical form of `filename` has been
                seen before.
        """
        canonical_filename = os.path.realpath(filename)
        if canonical_filename in self.previously_seen_files:
            raise MetaphorParserFileAlreadyUsedError(filename, self.current_token)

        self.previously_seen_files.add(canonical_filename)

    def _parse_text(self, token: "Token") -> "MetaphorASTNode":
        """Parse a text block."""
        return MetaphorASTNode(MetaphorASTNodeType.TEXT, token.value)

    def _parse_block(
        self,
        token: "Token",
        block_token_type: "TokenType",
        node_type: "MetaphorASTNodeType",
        keyword: str,
        article: str
    ) -> "MetaphorASTNode":
        """
        Parse one keyword block (and, recursively, nested blocks of the same kind).

        Args:
            token: The keyword token that opened the block; header errors are reported here.
            block_token_type: Token type that opens a nested block of this kind.
            node_type: AST node type to create for the block.
            keyword: Keyword name used in error messages (e.g. "Action").
            article: Indefinite article matching `keyword` ("a" or "an").

        Returns:
            MetaphorASTNode: The node for this block with its text and nested blocks attached.
        """
        label_name = ""

        init_token = self.get_next_token()
        if init_token.type == TokenType.KEYWORD_TEXT:
            label_name = init_token.value
            indent_token = self.get_next_token()
            if indent_token.type != TokenType.INDENT:
                self._record_syntax_error(
                    token,
                    f"Expected indent after keyword description for '{keyword}' block"
                )
        elif init_token.type != TokenType.INDENT:
            self._record_syntax_error(token, f"Expected description or indent for '{keyword}' block")

        block_node = MetaphorASTNode(node_type, label_name)

        # Text is only allowed before the first nested block.
        seen_child_block = False

        while True:
            token = self.get_next_token()
            if token.type == TokenType.TEXT:
                if seen_child_block:
                    self._record_syntax_error(
                        token,
                        f"Text must come first in {article} '{keyword}' block"
                    )

                block_node.attach_child(self._parse_text(token))
            elif token.type == block_token_type:
                block_node.attach_child(
                    self._parse_block(token, block_token_type, node_type, keyword, article)
                )
                seen_child_block = True
            elif token.type in (TokenType.OUTDENT, TokenType.END_OF_FILE):
                return block_node
            else:
                self._record_syntax_error(
                    token,
                    f"Unexpected token: {token.value} in '{keyword}' block"
                )

    def _parse_action(self, token: "Token") -> "MetaphorASTNode":
        """Parse an action block and construct its AST node."""
        return self._parse_block(token, TokenType.ACTION, MetaphorASTNodeType.ACTION, "Action", "an")

    def _parse_context(self, token: "Token") -> "MetaphorASTNode":
        """Parse a Context block."""
        return self._parse_block(token, TokenType.CONTEXT, MetaphorASTNodeType.CONTEXT, "Context", "a")

    def _parse_role(self, token: "Token") -> "MetaphorASTNode":
        """Parse a Role block."""
        return self._parse_block(token, TokenType.ROLE, MetaphorASTNodeType.ROLE, "Role", "a")

    def _parse_include(self) -> None:
        """
        Parse an Include block and load the included file.

        Raises:
            FileNotFoundError: If the file cannot be found or read.
            MetaphorParserFileAlreadyUsedError: If the file has already been loaded.
        """
        token_next = self.get_next_token()
        if token_next.type != TokenType.KEYWORD_TEXT:
            self._record_syntax_error(token_next, "Expected file name for 'Include'")
            return

        filename = token_next.value

        # Resolve against the search paths before the duplicate check, so the canonical name
        # that is recorded refers to the file actually being loaded rather than to a name
        # relative to the current working directory.
        try_file = self._find_file_path(filename)
        self._check_file_not_loaded(try_file)
        input_text = self._read_file(try_file)
        self.lexers.append(MetaphorLexer(input_text, try_file))

    def _parse_embed(self) -> None:
        """
        Parse an Embed block and load the embedded file(s).

        The name may be a glob pattern, resolved relative to `embed_path`. A pattern
        containing "**/" is matched recursively.

        Raises:
            FileNotFoundError: If a matched file cannot be read.
        """
        token_next = self.get_next_token()
        if token_next.type != TokenType.KEYWORD_TEXT:
            self._record_syntax_error(token_next, "Expected file name or wildcard match for 'Embed'")
            return

        match = token_next.value
        recurse = "**/" in match

        path = os.path.join(self.embed_path, match)
        files = glob.glob(path, recursive=recurse)
        if not files:
            self._record_syntax_error(token_next, f"{match} does not match any files for 'Embed' in {self.embed_path}")
            return

        # glob returns matches in a file-system dependent order and the lexer stack is
        # last-in-first-out, so push in descending order to embed files in ascending path order.
        for file in sorted(files, reverse=True):
            input_text = self._read_file(file)
            self.lexers.append(EmbedLexer(input_text, file))