Coverage for src/m6rclib/embed_lexer.py: 100%

29 statements  

coverage.py v7.6.1, created at 2024-11-11 21:21 +0000

# Copyright 2024 M6R Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List

from .metaphor_token import Token, TokenType


class EmbedLexer:
    """
    Lexer for handling embedded content like code blocks.
    """

    file_exts: Dict[str, str] = {
        "bash": "bash",
        "c": "c",
        "clj": "clojure",
        "cpp": "cpp",
        "cs": "csharp",
        "css": "css",
        "dart": "dart",
        "ebnf": "ebnf",
        "erl": "erlang",
        "ex": "elixir",
        "go": "go",
        "groovy": "groovy",
        "h": "c",
        "hpp": "cpp",
        "hs": "haskell",
        "html": "html",
        "java": "java",
        "js": "javascript",
        "json": "json",
        "kt": "kotlin",
        "lua": "lua",
        "m": "objectivec",
        "m6r": "metaphor",
        "md": "markdown",
        "mm": "objectivec",
        "php": "php",
        "pl": "perl",
        "py": "python",
        "r": "r",
        "rb": "ruby",
        "rkt": "racket",
        "rs": "rust",
        "scala": "scala",
        "sh": "bash",
        "sql": "sql",
        "swift": "swift",
        "ts": "typescript",
        "vb": "vbnet",
        "vbs": "vbscript",
        "xml": "xml",
        "yaml": "yaml",
        "yml": "yaml"
    }

    def __init__(self, input_text: str, filename: str) -> None:
        """
        Initialize the EmbedLexer for handling embedded content.

        Args:
            input_text (str): The text content to be lexically analyzed
            filename (str): Name of the file being processed
        """
        self.filename: str = filename
        self.tokens: List[Token] = []
        self.current_line: int = 1
        self.input: str = input_text
        self._tokenize()

    def get_next_token(self) -> Token:
        """Return the next token from the token list."""
        if self.tokens:
            return self.tokens.pop(0)

        return Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1)

    def _get_language_from_file_extension(self, filename: str) -> str:
        """Get a language name from a filename extension."""
        extension: str = ""
        if '.' in filename:
            extension = (filename.rsplit('.', 1)[-1]).lower()

        return self.file_exts.get(extension, "plaintext")

    def _tokenize(self) -> None:
        """Tokenize the input file and handle embedded content."""
        # Emit a header naming the file, then open a fenced code block
        # tagged with the language inferred from the file extension.
        self.tokens.append(Token(TokenType.TEXT, f"File: {self.filename}", "", self.filename, 0, 1))
        self.tokens.append(
            Token(
                TokenType.TEXT,
                "```" + self._get_language_from_file_extension(self.filename),
                "",
                self.filename,
                0,
                1
            )
        )

        # Each input line becomes a TEXT token inside the fence.
        lines = self.input.splitlines()
        for line in lines:
            token = Token(TokenType.TEXT, line, line, self.filename, self.current_line, 1)
            self.tokens.append(token)
            self.current_line += 1

        # Close the fence and terminate the stream.
        self.tokens.append(Token(TokenType.TEXT, "```", "", self.filename, self.current_line, 1))
        self.tokens.append(Token(TokenType.END_OF_FILE, "", "", self.filename, self.current_line, 1))
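
For context, a minimal usage sketch follows. It is an illustration, not part of the covered module: the sample input and file name are invented, the absolute import path m6rclib.metaphor_token is inferred from the relative import above, and the token attribute names (type, value) are assumptions, since this file only shows the Token constructor's positional arguments.

    from m6rclib.embed_lexer import EmbedLexer
    from m6rclib.metaphor_token import TokenType

    # Lex a one-line Python snippet; the lexer wraps it in a fenced block.
    lexer = EmbedLexer("print('hello')\n", "example.py")

    # Expected values, in order: "File: example.py", "```python",
    # "print('hello')", "```", then an END_OF_FILE token ends the loop.
    token = lexer.get_next_token()
    while token.type != TokenType.END_OF_FILE:  # assumed attribute name
        print(token.value)                      # assumed attribute name
        token = lexer.get_next_token()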