Python-HTML-Parser/HTMLTree.py at master · wangxl1998/Python-HTML-Parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
class HTMLNode():
    """A single element of a parsed HTML document together with its children."""

    def __init__(self, tag, content, parent, attrib):
        """Constructor of HTMLNode. Represents a single HTML element and all its content.

            Args:
                tag     (String)    : HTML tag of element.
                content (String)    : Content of element. (E.g. text inside a <h1></h1>)
                parent  (HTMLNode)  : Parent HTMLNode, or None if the node has no parent.
                attrib  (Dict)      : Dictionary containing all attributes found inside the HTML tag. (E.g. width, height etc.)
        """
        # id() of the object itself; unique among the nodes that are alive at the same time.
        self.id = id(self)
        self.tag = tag
        self.content = content
        self.parent = parent
        self.children = []
        self.attrib = attrib

    def add_child(self, child_node):
        """Method to add child to HTMLNode.

            The child's own ``parent`` attribute is not touched; the caller sets it.

            Args:
                child_node (HTMLNode) : New child.
        """
        self.children.append(child_node)

    def remove_child(self, child_node):
        """Method to remove child from HTMLNode.

            Args:
                child_node (HTMLNode) : Node to remove.

            Raises:
                ValueError : If child_node is not a child of this node.
        """
        self.children.remove(child_node)

    def __setitem__(self, key, item):
        """Replace the child stored at index ``key``."""
        self.children[key] = item

    def __getitem__(self, key):
        """Return the child stored at index ``key``."""
        return self.children[key]

    def __str__(self):
        """Return a one-line, newline-terminated description of the node."""
        if self.parent is not None:
            # Show the parent as "<id> <tag>" rather than its full description
            # so the output stays on one line.
            parent_text = str(self.parent.id) + " " + self.parent.tag
        else:
            parent_text = self.parent
        return "{%s | Tag: %s; Content: %s; Parent: %s; Attrib: %s}\n" % (
            self.id, self.tag, self.content, parent_text, self.attrib)


class HTMLTree():
    """Small, lenient HTML parser that builds a tree of HTMLNode objects."""

    def __init__(self):
        """Constructor of HTMLTree class.

            Usage:
                my_tree = HTMLTree()
        """
        # First container element met while parsing. It is NOT stored in self.nodes.
        self.root = None
        # Every parsed element except the root, in document order.
        self.nodes = []
        # Void elements: they never get children. 'br/' is kept because '<br/>'
        # is parsed with the slash as part of the tag name.
        self.no_parent_tags = [
            'img', 'link', 'meta', 'br/', 'br', 'area', 'base', 'col', 'input',
            'embed', 'hr', 'source', 'track', 'wbr',
        ]

    def parse_from_string(self, string):
        """Method to parse HTML from given String.

            Comments and the doctype (everything starting with '<!') are skipped.
            Attributes without a value are parsed but carry no value, so
            construct_node drops them.

            Args:
                string (String): String to parse from.

            Usage:
                Obj.parse_from_string(<html string>)
        """
        import re  # function-scope import: this module has no top-level import section

        # An attribute name, optionally followed by '=' and a double-quoted,
        # single-quoted or bare value. '/' is excluded from names so the slash
        # of a self-closing tag is not mistaken for an attribute.
        attr_pattern = re.compile(r"""([^\s=/]+)(?:\s*=\s*("[^"]*"|'[^']*'|\S*))?""")

        raw_nodes = []

        for chunk in string.split('<'):
            chunk = chunk.strip()
            if not chunk or chunk.startswith('!'):
                continue

            # Everything before the first '>' is the tag itself, the rest is its
            # text content (which may legitimately contain further '>').
            head, _, content = chunk.partition('>')

            # Split on any whitespace so newlines/tabs after the tag name work too.
            pieces = head.split(None, 1)
            tag = pieces[0] if pieces else ''
            attr_text = pieces[1] if len(pieces) > 1 else ''

            tag_map = []
            for match in attr_pattern.finditer(attr_text):
                name, value = match.groups()
                if value is None:
                    tag_map.append([name])
                    continue
                if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
                    value = value[1:-1]
                else:
                    # Stay lenient with unbalanced double quotes.
                    value = value.strip('"')
                tag_map.append([name, value])

            raw_nodes.append([tag, tag_map, content])

        self.construct_tree(raw_nodes)

    def construct_tree(self, raw_nodes):
        """Constructs DOM tree based on previous parsed string.
           Is automatically called by parse functions.

           The first container element becomes self.root and is not added to
           self.nodes; every other element is appended to self.nodes. Text that
           follows a closing tag is not attached to any node.

           Args:
                raw_nodes (List) : List of lists where each sublist contains information about one node.
        """
        cur_parent = None

        for raw_node in raw_nodes:
            node = self.construct_node(raw_node)

            # Closing tag: step back out of the element it closes.
            if node.tag.startswith('/'):
                cur_parent = self._close_element(cur_parent, node.tag[1:])
                continue

            # Void elements and self-closing tags ('<hr/>') never become parents.
            is_void = node.tag in self.no_parent_tags or node.tag.endswith('/')

            if self.root is None and not is_void:
                self.root = cur_parent = node
                continue

            node.parent = cur_parent
            # cur_parent is None when an element shows up before any root exists.
            if cur_parent is not None:
                cur_parent.add_child(node)
            self.nodes.append(node)

            if not is_void:
                cur_parent = node

    def _close_element(self, cur_parent, tag):
        """Return the parent to continue with after a closing tag for ``tag``.

            Only elements that are still open (cur_parent and its ancestors) are
            considered, so nested elements with the same tag close correctly.
            Unclosed elements in between are closed implicitly. A closing tag
            without a matching open element is ignored.
        """
        node = cur_parent
        while node is not None:
            if node.tag == tag:
                # An element without a parent (the root) stays current so that
                # trailing nodes still have somewhere to attach.
                return node.parent if node.parent is not None else node
            node = node.parent
        return cur_parent

    def construct_node(self, node_data):
        """Method to construct node based on a raw_node item.

            Args:
                node_data (List) : List containing infos about node. (tag, tag_map, content)

            Returns:
                HTMLNode : HTMLNode based on given node_data. Its parent is None.
        """
        tag = node_data[0]
        content = node_data[2]
        # Entries without a value (length 1) are skipped.
        attrib = {row[0]: row[1] for row in node_data[1] if len(row) > 1}

        return HTMLNode(tag, content, None, attrib)

    def parse_from_file(self, file):
        """Method to read html from file. After it finished reading the content is parsed as a string.

            Args:
                file (String): Filename or path to file.

            Usage:
                OBJ.parse_from_file(<MyHtmlFile>)
        """
        with open(file, 'r') as f:
            self.parse_from_string(f.read())

    def find_tag(self, tag):
        """Method to find HTML elements based on their tag.

            Only self.nodes is searched, which does not include the root.

            Args:
                tag (String) : Tag to look for.

            Returns:
                List : Contains all found HTMLNodes with given tag.
        """
        return [x for x in self.nodes if x.tag == tag]

    def find_id(self, id):
        """Method to find HTML element with given id.
           If multiple elements have the same id (should not happen in valid HTML),
           only the first is returned.

            Args:
                id (String) : id to look for.

            Returns:
                HTMLNode : If node with given id is found else...
                None
        """
        for node in self.nodes:
            if 'id' in node.attrib and node.attrib['id'] == id:
                return node
        return None

    def find_class(self, _class):
        """Method to find all HTML elements with given class.

            An element matches when its class attribute equals the given value
            or when the value is one of its whitespace-separated class names.

            Args:
                _class (String) : Name of class to look for.

            Returns:
                List : List containing all found HTMLNodes.

        """
        found = []
        for node in self.nodes:
            if 'class' not in node.attrib:
                continue
            value = node.attrib['class']
            if value == _class or (isinstance(value, str) and _class in value.split()):
                found.append(node)
        return found

    def __setitem__(self, key, item):
        """Replace the node stored at index ``key`` of self.nodes."""
        self.nodes[key] = item

    def __getitem__(self, key):
        """Return the node stored at index ``key`` of self.nodes."""
        return self.nodes[key]

    def __str__(self):
        """Return the concatenated string form of every node in self.nodes."""
        return "".join(str(n) for n in self.nodes)