Coverage for src/m6rclib/metaphor_parser.py: 100%

228 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2025-01-15 09:19 +0000

1# Copyright 2024 M6R Ltd. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15import glob 

16import os 

17from pathlib import Path 

18 

19from typing import List, Set, Optional, Union 

20 

21from .metaphor_token import Token, TokenType 

22from .embed_lexer import EmbedLexer 

23from .metaphor_lexer import MetaphorLexer 

24from .metaphor_ast_node import MetaphorASTNode, MetaphorASTNodeType 

25 

class MetaphorParserFileAlreadyUsedError(Exception):
    """
    Signal that a file has been requested more than once.

    Attributes:
        filename (str): The file name exactly as it was passed in.
        token (Token): The token supplied by the raiser; the parser reads its
            filename, line, column and input when reporting the error.
    """
    def __init__(self, filename: str, token: Token) -> None:
        description = f"The file '{filename}' has already been used."
        super().__init__(description)
        self.token: Token = token
        self.filename: str = filename

32 

33 

class MetaphorParserSyntaxError(Exception):
    """
    Describe one syntax error together with the place it was found.

    Attributes:
        message (str): The bare error description.
        filename (str): Name of the file in which the error was found.
        line (int): Line number of the error.
        column (int): Column number of the error.
        input_text (str): The input text associated with the error.
    """
    def __init__(self, message: str, filename: str, line: int, column: int, input_text: str) -> None:
        # The trailing ", " is part of the existing rendered message and is kept as-is.
        location = f"file: {filename}, line {line}, column {column}, "
        super().__init__(f"{message}: {location}")

        self.input_text: str = input_text
        self.column: int = column
        self.line: int = line
        self.filename: str = filename
        self.message: str = message

43 

44 

class MetaphorParserError(Exception):
    """
    Wrap the syntax errors collected during a parse into a single exception.

    Attributes:
        errors (List[MetaphorParserSyntaxError]): The individual errors, stored
            as the same list object that was passed in (no copy is made).
    """
    def __init__(self, message: str, errors: List[MetaphorParserSyntaxError]) -> None:
        self.errors: List[MetaphorParserSyntaxError] = errors
        super().__init__(message)

50 

51 

52class MetaphorParser: 

53 """ 

54 Parser class to process tokens and build an Abstract Syntax Tree (AST). 

55 

56 Attributes: 

57 syntax_tree (MetaphorASTNode): The root node of the AST. 

58 parse_errors (List[MetaphorParserSyntaxError]): List of syntax errors encountered during parsing. 

59 lexers (List[Union[MetaphorLexer, EmbedLexer]]): Stack of lexers used for parsing multiple files. 

60 previously_seen_files (Set[str]): Set of canonical filenames already processed. 

61 search_paths (List[str]): List of paths to search for included files. 

62 current_token (Optional[Token]): The current token being processed. 

63 """ 

def __init__(self) -> None:
    """Create a parser with an empty ROOT node and no lexers, errors, search paths or seen files."""
    # Lexing state: the stack of active lexers and the token most recently read.
    self.lexers: List[Union[MetaphorLexer, EmbedLexer]] = []
    self.current_token: Optional[Token] = None

    # File bookkeeping: where to look for includes and what has been loaded so far.
    self.search_paths: List[str] = []
    self.previously_seen_files: Set[str] = set()

    # Results: the errors recorded so far and the tree being built.
    self.parse_errors: List[MetaphorParserSyntaxError] = []
    self.syntax_tree: MetaphorASTNode = MetaphorASTNode(MetaphorASTNodeType.ROOT, "")

71 

def _insert_preamble_text(self, text: str) -> None:
    """Wrap one line of preamble text in a TEXT node and attach it to the root of the tree."""
    text_node = MetaphorASTNode(MetaphorASTNodeType.TEXT, text)
    self.syntax_tree.attach_child(text_node)

74 

75 def _generate_preamble(self) -> None: 

76 preamble: List[str] = [ 

77 "The following preamble describes some elements of a language called Metaphor. Please pay", 

78 "extremely close attention to the details as they will affect the way you interpret", 

79 "everything that follows after \"BEGIN DESCRIPTION IN METAPHOR:\"", 

80 "", 

81 "Metaphor has the structure of a document tree with branches and leaves being prefixed", 

82 "by the keywords \"Role:\", \"Context:\" or \"Action:\". Each of these indicates the", 

83 "start of a new block of information.", 

84 "", 

85 "Blocks have an optional section name that will immediately follow them on the same line.", 

86 "If this is missing then the section name is not defined.", 

87 "", 

88 "After a keyword line there may be one or more lines of text that will describe the purpose", 

89 "of that block. A block may also include one or more optional child blocks inside them and", 

90 "that further clarify their parent block. These text blocks and any keywords lines nested", 

91 "inside a parent block will be indented by 4 spaces more than its parent.", 

92 "", 

93 "For example a \"Context:\" indented by 8 spaces is a child of the block above it that is", 

94 "indented by 4 spaces. One indented 12 spaces would be a child of the block above it that is", 

95 "indented by 8 spaces.", 

96 "", 

97 "Within the text of a block, you may be presented with code or document fragments inside a", 

98 "block delimited by 3 backticks. Please pay close attention to the indentation level of the", 

99 "opening 3 backticks. The identation of such code or document fragments is relative to this,", 

100 "not relative to the block in which the code or document fragment occurs.", 

101 "For example, consider:", 

102 " ```plaintext", 

103 " text line 1", 

104 " text line 2", 

105 " ```", 

106 " ```plaintext", 

107 " text line 3", 

108 " ```", 

109 "In this example, \"text line 1\" is not indented from the opening 3 backticks and thus has no", 

110 "indentation. \"text line 2\" is indented by 2 spaces relative to the opening 3 backticks", 

111 " \"text line 3\" is indented by 1 space relative to its opening 3 backticks.", 

112 "", 

113 "If \"Role:\" blocks exists then these contain details about the role you should fulfil. This", 

114 "section may also describe specific skills you have, knowledge you should apply, and the", 

115 "approach you take to apply these." 

116 "", 

117 "\"Context:\" blocks provide context necessary to understand what you will be asked to do.", 

118 "", 

119 "\"Action:\" blocks describes the task, or tasks, I would like you to do.", 

120 "", 

121 "When you process the actions please carefully ensure you do all of them accurately and", 

122 "complete all the elements requested. Unless otherwise instructed, do not include any", 

123 "placeholders in your responses.", 

124 "", 

125 "BEGIN DESCRIPTION IN METAPHOR:" 

126 ] 

127 

128 for text in preamble: 

129 self._insert_preamble_text(text) 

130 

def parse(self, input_text: str, filename: str, search_paths: List[str]) -> MetaphorASTNode:
    """
    Parse an input string and construct the AST.

    The preamble is generated first, then each top-level 'Action', 'Context'
    and 'Role' block is parsed and attached to the root node as it is
    encountered.  A repeated top-level block is reported as an error but is
    still parsed and attached.

    Args:
        input_text (str): The text to be parsed.
        filename (str): The name of the file being parsed.
        search_paths (List[str]): List of paths to search for included files.

    Returns:
        MetaphorASTNode: The root node of the AST.

    Raises:
        MetaphorParserError: If any syntax error was recorded, if a file could
            not be found or read (the FileNotFoundError is converted into a
            recorded error), or if a file was used more than once.  The
            individual errors are available on the exception's `errors` list.
    """
    self.search_paths = search_paths

    try:
        self.lexers.append(MetaphorLexer(input_text, filename))
        self._generate_preamble()

        seen_action_tree: bool = False
        seen_context_tree: bool = False
        seen_role_tree: bool = False

        while True:
            token = self.get_next_token()
            if token.type == TokenType.ACTION:
                if seen_action_tree:
                    self._record_syntax_error(token, "'Action' already defined")

                # Parsed even when duplicated so the block's tokens are consumed.
                self.syntax_tree.attach_child(self._parse_action(token))
                seen_action_tree = True
            elif token.type == TokenType.CONTEXT:
                if seen_context_tree:
                    self._record_syntax_error(token, "'Context' already defined")

                self.syntax_tree.attach_child(self._parse_context(token))
                seen_context_tree = True
            elif token.type == TokenType.ROLE:
                if seen_role_tree:
                    self._record_syntax_error(token, "'Role' already defined")

                self.syntax_tree.attach_child(self._parse_role(token))
                seen_role_tree = True
            elif token.type == TokenType.END_OF_FILE:
                # Errors are accumulated during the whole parse and reported together.
                if self.parse_errors:
                    raise MetaphorParserError("parser error", self.parse_errors)

                return self.syntax_tree
            else:
                self._record_syntax_error(token, f"Unexpected token: {token.value} at top level")
    except FileNotFoundError as e:
        err_token = self.current_token
        if err_token is None:
            # No token has been read yet, so there is no source location to report.
            error = MetaphorParserSyntaxError(f"{e}", filename, 0, 0, "")
        else:
            error = MetaphorParserSyntaxError(
                f"{e}", err_token.filename, err_token.line, err_token.column, err_token.input
            )

        self.parse_errors.append(error)
        raise MetaphorParserError("parser error", self.parse_errors) from e
    except MetaphorParserFileAlreadyUsedError as e:
        self.parse_errors.append(MetaphorParserSyntaxError(
            f"The file '{e.filename}' has already been used",
            e.token.filename,
            e.token.line,
            e.token.column,
            e.token.input
        ))
        raise MetaphorParserError("parser error", self.parse_errors) from e

199 

def parse_file(self, filename: str, search_paths: List[str]) -> MetaphorASTNode:
    """
    Read a file from disk and parse its contents into an AST.

    Args:
        filename (str): The path to the file to be parsed.
        search_paths (List[str]): List of paths to search for included files.

    Returns:
        MetaphorASTNode: The root node of the AST, as returned by `parse`.

    Raises:
        MetaphorParserError: If the file cannot be read (the FileNotFoundError
            is recorded as an error with no location) or if parsing fails.
        MetaphorParserFileAlreadyUsedError: If this parser has already loaded
            the file; the duplicate check's exception is not converted here.
    """
    try:
        self._check_file_not_loaded(filename)
        return self.parse(self._read_file(filename), filename, search_paths)
    except FileNotFoundError as read_error:
        # There is no token yet, so the error carries an empty location.
        unreadable = MetaphorParserSyntaxError(f"{read_error}", "", 0, 0, "")
        self.parse_errors.append(unreadable)
        raise MetaphorParserError("parser error", self.parse_errors) from read_error
    except MetaphorParserError as parse_error:
        raise MetaphorParserError("parser error", self.parse_errors) from parse_error

226 

def get_next_token(self) -> Token:
    """
    Return the next token from the lexer on top of the stack.

    INCLUDE and EMBED tokens are handled here (they may push further lexers)
    and are not returned.  When a lexer reports END_OF_FILE it is popped and
    reading continues with the one beneath it.  Once the stack is empty an
    END_OF_FILE token with empty fields is returned.
    """
    while self.lexers:
        candidate = self.lexers[-1].get_next_token()
        self.current_token = candidate
        kind = candidate.type

        if kind == TokenType.END_OF_FILE:
            # This lexer is exhausted; resume with whichever lexer is beneath it.
            self.lexers.pop()
            continue

        if kind == TokenType.INCLUDE:
            self._parse_include()
            continue

        if kind == TokenType.EMBED:
            self._parse_embed()
            continue

        return candidate

    return Token(TokenType.END_OF_FILE, "", "", "", 0, 0)

244 

def _record_syntax_error(self, token, message):
    """
    Record a syntax error at the token's location.

    Nothing is raised here: the error is appended to `parse_errors`, using the
    token's filename, line, column and input for the location.
    """
    self.parse_errors.append(
        MetaphorParserSyntaxError(message, token.filename, token.line, token.column, token.input)
    )

251 

252 def _find_file_path(self, filename): 

253 """Try to find a valid path for a file, given all the search path options""" 

254 if Path(filename).exists(): 

255 return filename 

256 

257 # If we don't have an absolute path then we can try search paths. 

258 if not os.path.isabs(filename): 

259 for path in self.search_paths: 

260 try_name = os.path.join(path, filename) 

261 if Path(try_name).exists(): 

262 return try_name 

263 

264 raise FileNotFoundError(f"File not found: {filename}") 

265 

266 def _read_file(self, filename): 

267 """Read file content into memory.""" 

268 try: 

269 with open(filename, 'r', encoding='utf-8') as file: 

270 return file.read() 

271 except FileNotFoundError as e: 

272 raise FileNotFoundError(f"File not found: {filename}") from e 

273 except PermissionError as e: 

274 raise FileNotFoundError(f"You do not have permission to access: {filename}") from e 

275 except IsADirectoryError as e: 

276 raise FileNotFoundError(f"Is a directory: {filename}") from e 

277 except OSError as e: 

278 raise FileNotFoundError(f"OS error: {e}") from e 

279 

280 def _check_file_not_loaded(self, filename): 

281 """Check we have not already loaded a file.""" 

282 canonical_filename = os.path.realpath(filename) 

283 if canonical_filename in self.previously_seen_files: 

284 raise MetaphorParserFileAlreadyUsedError(filename, self.current_token) 

285 

286 self.previously_seen_files.add(canonical_filename) 

287 

def _parse_text(self, token):
    """Build a TEXT node whose value is the token's value."""
    text_node = MetaphorASTNode(MetaphorASTNodeType.TEXT, token.value)
    return text_node

291 

def _parse_block(self, token, name, article, block_token_type, node_type, parse_nested):
    """
    Parse one keyword block; shared by the 'Action', 'Context' and 'Role' parsers.

    A block is an optional label (KEYWORD_TEXT) followed by an INDENT, then any
    number of TEXT lines, then any number of nested blocks of the same kind,
    ending at an OUTDENT or END_OF_FILE.  Problems are recorded through
    `_record_syntax_error` and parsing carries on.

    Args:
        token (Token): The keyword token that opened the block; header errors
            are reported at its location.
        name (str): Keyword name used in error messages, e.g. "Action".
        article (str): Indefinite article matching `name` ("a" or "an").
        block_token_type (TokenType): Token type that opens a nested block of
            the same kind.
        node_type (MetaphorASTNodeType): Node type of the node to create.
        parse_nested: Callable taking the nested keyword token and returning
            the nested block's node.

    Returns:
        MetaphorASTNode: The node for this block with its children attached.
    """
    label_name = ""

    # Set once a nested block has been seen; text is only allowed before that.
    seen_nested_block = False

    init_token = self.get_next_token()
    if init_token.type == TokenType.KEYWORD_TEXT:
        label_name = init_token.value
        indent_token = self.get_next_token()
        if indent_token.type != TokenType.INDENT:
            self._record_syntax_error(
                token,
                f"Expected indent after keyword description for '{name}' block"
            )
    elif init_token.type != TokenType.INDENT:
        self._record_syntax_error(token, f"Expected description or indent for '{name}' block")

    block_node = MetaphorASTNode(node_type, label_name)

    while True:
        token = self.get_next_token()
        if token.type == TokenType.TEXT:
            if seen_nested_block:
                self._record_syntax_error(
                    token, f"Text must come first in {article} '{name}' block"
                )

            # Misplaced text is reported but still attached.
            block_node.attach_child(self._parse_text(token))
        elif token.type == block_token_type:
            block_node.attach_child(parse_nested(token))
            seen_nested_block = True
        elif token.type in (TokenType.OUTDENT, TokenType.END_OF_FILE):
            return block_node
        else:
            self._record_syntax_error(
                token,
                f"Unexpected token: {token.value} in '{name}' block"
            )

def _parse_action(self, token):
    """Parse an Action block, including nested Action blocks, and return its AST node."""
    return self._parse_block(
        token, "Action", "an", TokenType.ACTION, MetaphorASTNodeType.ACTION, self._parse_action
    )

def _parse_context(self, token):
    """Parse a Context block, including nested Context blocks, and return its AST node."""
    return self._parse_block(
        token, "Context", "a", TokenType.CONTEXT, MetaphorASTNodeType.CONTEXT, self._parse_context
    )

def _parse_role(self, token):
    """Parse a Role block, including nested Role blocks, and return its AST node."""
    return self._parse_block(
        token, "Role", "a", TokenType.ROLE, MetaphorASTNodeType.ROLE, self._parse_role
    )

405 

def _parse_include(self) -> None:
    """
    Parse an Include block and push a lexer for the included file.

    The next token must be KEYWORD_TEXT holding the file name; otherwise a
    syntax error is recorded and nothing is included.

    Raises:
        FileNotFoundError: If the file cannot be found or read.
        MetaphorParserFileAlreadyUsedError: If the resolved file has already
            been loaded.
    """
    token_next = self.get_next_token()
    if token_next.type != TokenType.KEYWORD_TEXT:
        self._record_syntax_error(token_next, "Expected file name for 'Include'")
        return

    filename = token_next.value

    # Resolve against the search paths before the duplicate check, so the check
    # compares the file that will actually be opened rather than a name taken
    # relative to the current directory.
    try_file = self._find_file_path(filename)
    self._check_file_not_loaded(try_file)
    input_text = self._read_file(try_file)
    self.lexers.append(MetaphorLexer(input_text, try_file))

418 

def _parse_embed(self) -> None:
    """
    Parse an Embed block and push a lexer for every matching file.

    The next token must be KEYWORD_TEXT holding a file name or glob pattern;
    a pattern containing "**/" is expanded recursively.  A syntax error is
    recorded, and nothing is embedded, when the token is missing or the
    pattern matches no files.

    Matches are lexed in sorted path order so the result does not depend on
    the order in which the file system lists directory entries.

    Raises:
        FileNotFoundError: If a matching path cannot be read.
    """
    token_next = self.get_next_token()
    if token_next.type != TokenType.KEYWORD_TEXT:
        self._record_syntax_error(token_next, "Expected file name or wildcard match for 'Embed'")
        return

    match = token_next.value
    files = glob.glob(match, recursive="**/" in match)
    if not files:
        self._record_syntax_error(token_next, f"{match} does not match any files for 'Embed'")
        return

    # `lexers` is a stack (the last one pushed is read first), so push in
    # descending order to have the files lexed in ascending order.
    for file in sorted(files, reverse=True):
        input_text = self._read_file(file)
        self.lexers.append(EmbedLexer(input_text, file))

438 self.lexers.append(EmbedLexer(input_text, file))