phml.core.formats.parse

Pythonic Hypertext Markup Language (phml) parser.

  1"""Pythonic Hypertext Markup Language (phml) parser."""
  2from copy import deepcopy
  3from operator import itemgetter
  4import re
  5
  6from phml.core.nodes import (
  7    AST,
  8    Comment,
  9    DocType,
 10    Element,
 11    Point,
 12    Position,
 13    Root,
 14    Text,
 15    Node
 16)
 17
 18def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
 19    """Parse a string as a hypertest markup document."""
 20
 21    phml_parser = HypertextMarkupParser()
 22
 23    if isinstance(data, str):
 24        return phml_parser.parse(data, auto_close=auto_close)
 25    raise Exception(f"Data passed to {class_name}.parse must be a str")
 26
 27def strip_blank_lines(data_lines: list[str]) -> list[str]:
 28    """Strip the blank lines at the start and end of a list."""
 29    data_lines = [line.replace("\r\n", "\n") for line in data_lines]
 30    # remove leading blank lines
 31    for idx in range(0, len(data_lines)):  # pylint: disable=consider-using-enumerate
 32        if data_lines[idx].strip() != "":
 33            data_lines = data_lines[idx:]
 34            break
 35        if idx == len(data_lines) - 1:
 36            data_lines = []
 37            break
 38
 39    # Remove trailing blank lines
 40    if len(data_lines) > 0:
 41        for idx in range(len(data_lines) - 1, -1, -1):
 42            if data_lines[idx].replace("\n", " ").strip() != "":
 43                data_lines = data_lines[: idx + 1]
 44                break
 45
 46    return data_lines
 47
 48def strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]:
 49    """This function takes a possibly mutliline string and strips leading and trailing
 50    blank lines. Given the current position it will also calculate the line and column
 51    taht the data ends at.
 52    """
 53    if "pre" not in cur_tags:
 54        data_lines = data.split("\n")
 55
 56        # If multiline data block
 57        if len(data_lines) > 1:
 58            data_lines = strip_blank_lines(data_lines)
 59
 60            data = "\n".join(data_lines)
 61        # Else it is a single line data block
 62        else:
 63            data = data_lines[0]
 64
 65    return data
 66
 67
 68self_closing = [
 69    "area",
 70    "base",
 71    "br",
 72    "col",
 73    "embed",
 74    "hr",
 75    "img",
 76    "input",
 77    "link",
 78    "meta",
 79    "param",
 80    "source",
 81    "track",
 82    "wbr",
 83    "command",
 84    "keygen",
 85    "menuitem",
 86    "Slot",
 87]
 88
 89# Main form of tokenization
 90class RE:
 91    tag_start = re.compile(r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")
 92    """Matches the start of a tag `<!name|</name|<name`"""
 93
 94    tag_end = re.compile(r"(?P<closing>/?)>")
 95    """Matches the end of a tag `/>|>`."""
 96
 97    comment = re.compile(r"<!--((?:.|\s)*)-->")
 98    """Matches all html style comments `<!--Comment-->`."""
 99    comment_close = re.compile(r"-->")
100
101    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")
102    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""
103    
104    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
105
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Count the newlines and the trailing columns found in `source[:start]`.

        Returns:
            tuple[int, int]: The number of newlines before `start` and the number
            of characters between the last of those newlines (or the beginning of
            the string when there is none) and `start`.
        """
        head = source[:start]
        # rpartition leaves the whole string in the last slot when no newline exists.
        return head.count("\n"), len(head.rpartition("\n")[2])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        Args:
            text (str): The source text that does not belong to a tag.
            pos (Position): The running position of the parser. It is updated in
                place so that `pos.start` ends up after the parsed text.

        Returns:
            list[Node]: The text and comment nodes in the order they were found.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column),
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column),
            )

            # If there is text before the comment then add a text element
            if comment.start() > 0:
                elements.append(Text(text[:comment.start()], position=deepcopy(pos)))

            text = text[comment.end():]
            elements.append(Comment(comment.group(1), position=deepcopy(pos)))

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            # Move the running start past the text. The columns only accumulate
            # when the text stays on the line it started on.
            pos.start.line += line
            pos.start.column = self.__calc_col(line, col, pos.start.column)

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column)),
                ),
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Values written as `{value/}` are stored under the attribute name prefixed
        with `:`. Attributes without a value and the values `yes` and `true`
        become `True`; the values `no` and `false` become `False`.

        Example:
            `<name (attributes)>`
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name, value, double, single, no_bracket = itemgetter(
                'name', 'value', 'double', 'single', 'open'
            )(attr.groupdict())

            bracket = RE.bracket_attributte.match(value) if value is not None else None
            if bracket is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracket.group(1)
            else:
                value = double or single or no_bracket

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure that `RE.tag_start` matches somewhere in `source`.

        Returns:
            tuple: The source left after the tag, the tag start as
            `(index, matched text, named groups)`, the parsed attributes, the tag
            end as `(index, matched text, named groups)` and the text/comment
            nodes found before the tag.

        Raises:
            Exception: If the comment or tag is never closed.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        if begin[2]["comment"] is not None:
            end = RE.comment_close.search(source)
            if end is None:
                raise Exception("Comment was not closed")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = {"data": source[:end[0]]}
        else:
            # `</>` style tags report their slash through the second group.
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

            end = RE.tag_end.search(source)
            if end is None:
                raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = self.__parse_attributes(source[:end[0]])

        consumed = end[0] + len(end[1])
        line, col = self.__calc_line_col(source, consumed)
        position.end.line = position.start.line + line
        # A tag that spans several lines ends at the column reached on its last
        # line; otherwise the columns continue from the end of the tag start.
        position.end.column = self.__calc_col(line, col, position.end.column)

        return source[consumed:], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): When true, tags listed in `self_closing` are treated
                as self closing even without a trailing `/`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened
                tag, or if a comment or tag is never closed.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            if len(elems) > 0:
                current.extend(elems)

            if begin[2]["comment"] is not None:
                current.append(Comment(attr["data"], position=deepcopy(position)))
            else:
                name = begin[2]["name"] or ''
                if begin[2]["opening"] == "/":
                    # A closing tag without any open tag can not be balanced.
                    if len(self.tag_stack) == 0:
                        raise Exception(
                            f"Unbalanced tags: closing tag {name!r} has no open tag at {position}"
                        )
                    if name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
                        )

                    self.tag_stack.pop()
                    current.position.end.line = position.end.line
                    current.position.end.column = position.end.column

                    current = current.parent
                elif begin[2]["opening"] == "!":
                    current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and begin[2]["opening"] is None
                ):
                    self.tag_stack.append(name)
                    current.append(Element(name, attr, position=deepcopy(position)))
                    current = current.children[-1]
                else:
                    current.append(Element(name, attr, position=deepcopy(position), startend=True))

            position.start = deepcopy(position.end)

        if len(source) > 0:
            elems = self.__parse_text_comment(source, position)
            current.extend(elems)

        # NOTE(review): when tags are still open here `current` is the innermost
        # open element and not the root - confirm that this is intended.
        return AST(current)
def parse_hypertest_markup( data: str, class_name: str, auto_close: bool = True) -> phml.core.nodes.AST.AST:
def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
    """Parse a string as a hypertext markup document.

    Args:
        data (str): The html or phml source text to parse.
        class_name (str): Name of the calling class. It is only used to build the
            error message raised when `data` is not a string.
        auto_close (bool): Forwarded unchanged to `HypertextMarkupParser.parse`.

    Returns:
        AST: Whatever `HypertextMarkupParser.parse` returns for `data`.

    Raises:
        TypeError: If `data` is not a `str`.
    """
    # Validate first so that invalid input fails before a parser is created.
    if not isinstance(data, str):
        raise TypeError(f"Data passed to {class_name}.parse must be a str")

    return HypertextMarkupParser().parse(data, auto_close=auto_close)

Parse a string as a hypertext markup document.

def strip_blank_lines(data_lines: list[str]) -> list[str]:
def strip_blank_lines(data_lines: list[str]) -> list[str]:
    """Drop the blank entries found at both ends of a list of lines.

    Windows style line endings inside each entry are converted to a single
    newline first. An entry counts as blank when nothing is left after
    stripping its whitespace. Blank entries between non-blank ones are kept.

    Args:
        data_lines (list[str]): The lines to trim. The list is not modified.

    Returns:
        list[str]: A new list without leading and trailing blank entries. It is
        empty when every entry is blank.
    """
    normalized = [entry.replace("\r\n", "\n") for entry in data_lines]

    # Index of the first entry that holds something other than whitespace.
    first_filled = next(
        (index for index, entry in enumerate(normalized) if entry.strip() != ""),
        None,
    )
    if first_filled is None:
        return []
    normalized = normalized[first_filled:]

    # Walk backwards to the last non-blank entry. The scan always stops because
    # the first remaining entry is known to be non-blank.
    last_filled = len(normalized) - 1
    while normalized[last_filled].replace("\n", " ").strip() == "":
        last_filled -= 1

    return normalized[: last_filled + 1]

Strip the blank lines at the start and end of a list.

def strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]:
def strip(data: str, cur_tags: list[str]) -> str:
    """Strip the leading and trailing blank lines of a possibly multiline string.

    The data is returned untouched when `"pre"` is one of `cur_tags` or when
    the data consists of a single line.

    Args:
        data (str): The text to trim.
        cur_tags (list[str]): Tag names that are checked for `"pre"`.

    Returns:
        str: The text with its leading and trailing blank lines removed.
    """
    # Single line data has no blank lines to remove, so only multiline data
    # outside of a `pre` tag is processed.
    if "pre" in cur_tags or "\n" not in data:
        return data

    return "\n".join(strip_blank_lines(data.split("\n")))

This function takes a possibly multiline string and strips leading and trailing blank lines. Given the current position it will also calculate the line and column that the data ends at.

class RE:
 91class RE:
 92    tag_start = re.compile(r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")
 93    """Matches the start of a tag `<!name|</name|<name`"""
 94
 95    tag_end = re.compile(r"(?P<closing>/?)>")
 96    """Matches the end of a tag `/>|>`."""
 97
 98    comment = re.compile(r"<!--((?:.|\s)*)-->")
 99    """Matches all html style comments `<!--Comment-->`."""
100    comment_close = re.compile(r"-->")
101
102    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")
103    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""
104    
105    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
RE()
tag_start = re.compile('(?P<comment><!--)|<(?!!--)(?P<opening>!|\\/)?(?P<name>([\\w:\\.]+\\-?)+)|<(?P<opening2>/)?(?=\\s+>|>)')

Matches the start of a tag <!name|</name|<name

tag_end = re.compile('(?P<closing>/?)>')

Matches the end of a tag />|>.

comment = re.compile('<!--((?:.|\\s)*)-->')

Matches all html style comments <!--Comment-->.

attribute = re.compile('(?P<name>[\\w:\\-@]+)(?:=(?P<value>\\{(?P<curly>[^\\}]*)\\/\\}|\\"(?P<double>[^\\"]*)\\"|\'(?P<single>[^\']*)\'|(?P<open>[^>\'\\"]+)))?')

Matches a tags attributes attr|attr=value|attr='value'|attr="value".

class HypertextMarkupParser:
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Count the newlines and the trailing columns found in `source[:start]`.

        Returns:
            tuple[int, int]: The number of newlines before `start` and the number
            of characters between the last of those newlines (or the beginning of
            the string when there is none) and `start`.
        """
        head = source[:start]
        # rpartition leaves the whole string in the last slot when no newline exists.
        return head.count("\n"), len(head.rpartition("\n")[2])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        Args:
            text (str): The source text that does not belong to a tag.
            pos (Position): The running position of the parser. It is updated in
                place so that `pos.start` ends up after the parsed text.

        Returns:
            list[Node]: The text and comment nodes in the order they were found.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column),
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column),
            )

            # If there is text before the comment then add a text element
            if comment.start() > 0:
                elements.append(Text(text[:comment.start()], position=deepcopy(pos)))

            text = text[comment.end():]
            elements.append(Comment(comment.group(1), position=deepcopy(pos)))

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            # Move the running start past the text. The columns only accumulate
            # when the text stays on the line it started on.
            pos.start.line += line
            pos.start.column = self.__calc_col(line, col, pos.start.column)

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column)),
                ),
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Values written as `{value/}` are stored under the attribute name prefixed
        with `:`. Attributes without a value and the values `yes` and `true`
        become `True`; the values `no` and `false` become `False`.

        Example:
            `<name (attributes)>`
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name, value, double, single, no_bracket = itemgetter(
                'name', 'value', 'double', 'single', 'open'
            )(attr.groupdict())

            bracket = RE.bracket_attributte.match(value) if value is not None else None
            if bracket is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracket.group(1)
            else:
                value = double or single or no_bracket

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure that `RE.tag_start` matches somewhere in `source`.

        Returns:
            tuple: The source left after the tag, the tag start as
            `(index, matched text, named groups)`, the parsed attributes, the tag
            end as `(index, matched text, named groups)` and the text/comment
            nodes found before the tag.

        Raises:
            Exception: If the comment or tag is never closed.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        if begin[2]["comment"] is not None:
            end = RE.comment_close.search(source)
            if end is None:
                raise Exception("Comment was not closed")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = {"data": source[:end[0]]}
        else:
            # `</>` style tags report their slash through the second group.
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

            end = RE.tag_end.search(source)
            if end is None:
                raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = self.__parse_attributes(source[:end[0]])

        consumed = end[0] + len(end[1])
        line, col = self.__calc_line_col(source, consumed)
        position.end.line = position.start.line + line
        # A tag that spans several lines ends at the column reached on its last
        # line; otherwise the columns continue from the end of the tag start.
        position.end.column = self.__calc_col(line, col, position.end.column)

        return source[consumed:], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): When true, tags listed in `self_closing` are treated
                as self closing even without a trailing `/`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened
                tag, or if a comment or tag is never closed.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            if len(elems) > 0:
                current.extend(elems)

            if begin[2]["comment"] is not None:
                current.append(Comment(attr["data"], position=deepcopy(position)))
            else:
                name = begin[2]["name"] or ''
                if begin[2]["opening"] == "/":
                    # A closing tag without any open tag can not be balanced.
                    if len(self.tag_stack) == 0:
                        raise Exception(
                            f"Unbalanced tags: closing tag {name!r} has no open tag at {position}"
                        )
                    if name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
                        )

                    self.tag_stack.pop()
                    current.position.end.line = position.end.line
                    current.position.end.column = position.end.column

                    current = current.parent
                elif begin[2]["opening"] == "!":
                    current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and begin[2]["opening"] is None
                ):
                    self.tag_stack.append(name)
                    current.append(Element(name, attr, position=deepcopy(position)))
                    current = current.children[-1]
                else:
                    current.append(Element(name, attr, position=deepcopy(position), startend=True))

            position.start = deepcopy(position.end)

        if len(source) > 0:
            elems = self.__parse_text_comment(source, position)
            current.extend(elems)

        # NOTE(review): when tags are still open here `current` is the innermost
        # open element and not the root - confirm that this is intended.
        return AST(current)

Parse html/xml like source code strings.

HypertextMarkupParser()
tag_stack = []

Current stack of tags in order of when they are opened.

def is_self_closing(self, name: str, auto_closing: bool) -> bool:
240    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
241        """Check if the tag is self closing. Only check if auto_closing is toggled on."""
242
243        if auto_closing:
244            return name in self_closing
245        return False

Check if the tag is self closing. Only check if auto_closing is toggled on.

def parse(self, source: str, auto_close: bool = True) -> phml.core.nodes.nodes.Root:
247    def parse(self, source: str, auto_close: bool = True) -> Root:
248        """Parse a given html or phml string into it's corresponding phml ast.
249
250        Args:
251            source (str): The html or phml source to parse.
252
253        Returns:
254            AST: A phml AST representing the parsed code source.
255        """
256
257        self.tag_stack = []
258        current = Root()
259        position = Position((0, 0), (0, 0))
260
261        while RE.tag_start.search(source) is not None:
262            source, begin, attr, end, elems = self.__parse_tag(source, position)
263
264            if len(elems) > 0:
265                current.extend(elems)
266
267            if begin[2]["comment"] is not None:
268                current.append(Comment(attr["data"], position=deepcopy(position)))
269            else:
270                name = begin[2]["name"] or ''
271                if begin[2]["opening"] == "/":
272                    if name != self.tag_stack[-1]:
273                        print("Tag Stack", self.tag_stack)
274                        raise Exception(
275                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
276                        )
277
278                    self.tag_stack.pop()
279                    current.position.end.line = position.end.line
280                    current.position.end.column = position.end.column
281
282                    current = current.parent
283                elif begin[2]["opening"] == "!":
284                    current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
285                elif (
286                    end[2]["closing"] != "/"
287                    and not self.is_self_closing(name, auto_close)
288                    and begin[2]["opening"] is None
289                ):
290                    self.tag_stack.append(name)
291                    current.append(Element(name, attr, position=deepcopy(position)))
292                    current = current.children[-1]
293                else:
294                    current.append(Element(name, attr, position=deepcopy(position), startend=True))
295
296            position.start = deepcopy(position.end)
297
298        if len(source) > 0:
299            elems = self.__parse_text_comment(source, position)
300            current.extend(elems)
301
302        return AST(current)

Parse a given html or phml string into its corresponding phml ast.

Args
  • source (str): The html or phml source to parse.
Returns

AST: A phml AST representing the parsed code source.