Coverage for src/m6rclib/embed_lexer.py: 100%
28 statements
# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List

from .metaphor_token import Token, TokenType

class EmbedLexer:
    """
    Lexer for embedded content: wraps the text of a file in a fenced code
    block, tagged with a language inferred from the filename extension.
    """

    file_exts: Dict[str, str] = {
        "bash": "bash",
        "c": "c",
        "clj": "clojure",
        "cpp": "cpp",
        "cs": "csharp",
        "css": "css",
        "dart": "dart",
        "ebnf": "ebnf",
        "erl": "erlang",
        "ex": "elixir",
        "go": "go",
        "groovy": "groovy",
        "h": "c",
        "hpp": "cpp",
        "hs": "haskell",
        "html": "html",
        "java": "java",
        "js": "javascript",
        "json": "json",
        "kt": "kotlin",
        "lua": "lua",
        "m": "objectivec",
        "m6r": "metaphor",
        "md": "markdown",
        "mm": "objectivec",
        "php": "php",
        "pl": "perl",
        "py": "python",
        "r": "r",
        "rb": "ruby",
        "rkt": "racket",
        "rs": "rust",
        "scala": "scala",
        "sh": "bash",
        "sql": "sql",
        "swift": "swift",
        "ts": "typescript",
        "vb": "vbnet",
        "vbs": "vbscript",
        "xml": "xml",
        "yaml": "yaml",
        "yml": "yaml"
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the EmbedLexer for handling embedded content.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token, or an END_OF_FILE token once the list is exhausted."""
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _get_language_from_file_extension(self, filename: str) -> str:
        """Get a language name from a filename extension."""
        extension: str = ""
        if '.' in filename:
            extension = filename.rsplit('.', 1)[-1].lower()

        return self.file_exts.get(extension, "plaintext")
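
    # Illustrative results, derived from the mapping and fallback above:
    #   "embed_lexer.py"  -> "python"
    #   "Makefile"        -> "plaintext"  (no '.' in the name)
    #   "archive.tar.GZ"  -> "plaintext"  ("gz" is not in file_exts)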

    def _tokenize(self) -> None:
        """Tokenize the input file and wrap it as embedded content."""
        # Emit a header line naming the file, then open a fenced code block
        # tagged with the language inferred from the filename extension.
        self.tokens.append(Token(TokenType.TEXT, f"File: {self.filename}", "", self.filename, 0, 1))
        self.tokens.append(
            Token(
                TokenType.TEXT,
                "```" + self._get_language_from_file_extension(self.filename),
                "",
                self.filename,
                0,
                1
            )
        )

        # Emit one TEXT token per line of input, tracking the line number.
        lines = self.input.splitlines()
        for line in lines:
            token = Token(TokenType.TEXT, line, line, self.filename, self.current_line, 1)
            self.tokens.append(token)
            self.current_line += 1

        # Close the fenced code block.
        self.tokens.append(Token(TokenType.TEXT, "```", "", self.filename, self.current_line, 1))
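

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: wraps a small
    # snippet and drains the token stream. Run as
    # `python -m m6rclib.embed_lexer` (with the package importable) so the
    # relative import above resolves. It assumes Token exposes its
    # constructor arguments as `type` and `value` attributes; the Token(...)
    # calls above suggest those names but this file does not confirm them.
    lexer = EmbedLexer("print('hello')\n", "example.py")
    token = lexer.get_next_token()
    while token.type != TokenType.END_OF_FILE:
        print(token.value)
        token = lexer.get_next_token()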