Completed
Push — master ( be0bb5...16e7a2 )
by Matthew
01:20
created

XmlParser.set_handlers()   A

Complexity

Conditions 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
dl 0
loc 4
rs 10
c 1
b 0
f 0
1
from xml.parsers import expat
2
from html.parser import HTMLParser
3
4
class TagStack(object):
    """Stack of open tags kept as several parallel lists.

    Every call to :meth:`push` adds one frame, i.e. one entry to each of
    ``tags``, ``args``, ``data`` and ``dataAdded``; :meth:`pop` removes the
    most recent entry from each of them again.
    """

    def __init__(self):
        # Tag names, outermost tag first.
        self.tags = []
        # Attribute object handed to push() for each tag.
        self.args = []
        # One list of data chunks per frame.
        self.data = []
        # Per-frame flag: True once add_data() ran, False after
        # clear_frame_data().
        self.dataAdded = []
        # Number of frames; incremented by push() and decremented by pop().
        self.stackSize = 0
        # Not read or updated by any method of this class; kept so existing
        # code that looks at the attribute keeps working.
        self.frameHasData = False

    def push(self, tag, args):
        """Open a new frame for ``tag`` with attributes ``args``."""
        self.tags.append(tag)
        self.args.append(args)
        self.data.append([])
        self.dataAdded.append(False)
        self.stackSize += 1

    def add_data(self, data):
        """Append ``data`` to the top frame and mark that frame as used.

        Raises:
            IndexError: if the stack is empty.
        """
        self.data[-1].append(data)
        self.dataAdded[-1] = True

    def clear_frame_data(self):
        """Discard the top frame's data and reset its "data added" flag.

        Raises:
            IndexError: if the stack is empty.
        """
        self.data[-1] = []
        self.dataAdded[-1] = False

    def is_data_added(self, posRel=0):
        """Return the "data added" flag of a frame.

        Args:
            posRel: distance from the top of the stack (0 = top frame,
                1 = the frame below it, ...).
        """
        return self.dataAdded[-1 - posRel]

    def pop(self):
        """Remove the top frame and return it as ``(tag, args, data)``.

        Raises:
            IndexError: if the stack is empty.
        """
        self.dataAdded.pop()
        stackFrame = (self.tags.pop(), self.args.pop(), self.data.pop())
        self.stackSize -= 1
        return stackFrame

    def peek(self, posRel=0):
        """Return ``(tag, args, data)`` of a frame without removing it.

        Args:
            posRel: distance from the top of the stack (0 = top frame).
        """
        pos = -1 - posRel
        return (self.tags[pos], self.args[pos], self.data[pos])

    def path(self):
        """Return the open tag names joined with ``/``, outermost first."""
        return '/'.join(self.tags)
44
45
46
class BaseParser(object):
    """Base class for parsers that are selected by tag path.

    A parser constructed with ``tag``, ``parent`` and ``root`` all ``None``
    is the root: it creates the shared :class:`TagStack` and becomes its own
    ``root``.  Every other parser reuses ``root.stack``.

    Sub-parsers are registered against a tag path (the ``/``-joined names of
    the open tags).  When :meth:`start` reaches such a path it instantiates
    the registered class; the new instance calls :meth:`set_handlers` from
    its constructor.  When the sub-parser sees the end of the tag it was
    created for, it calls :meth:`integrate` and asks its parent to reinstall
    its handlers.

    Subclasses override :meth:`set_handlers`, :meth:`init_data`,
    :meth:`parse` and :meth:`integrate`; here they do nothing.
    """

    def __init__(self, parser, tag, parent, root):
        """Store the collaborators and wire this parser up.

        Args:
            parser: the underlying event parser whose callbacks
                ``set_handlers`` is expected to assign.
            tag: name of the tag this parser was created for
                (``None`` for the root).
            parent: the parser that created this one (``None`` for the root).
            root: the root parser, whose ``stack`` is shared
                (``None`` for the root itself).
        """
        self.parser = parser
        self.parent = parent
        self.tag = tag
        self.root = root

        # Only a parser created without parent, tag and root is the root.
        self.isRoot = parent is None and tag is None and root is None

        if self.isRoot:
            self.stack = TagStack()
            self.root = self
        else:
            self.stack = self.root.stack

        # Maps tag path -> {'object': parser class[, 'instance': parser]}.
        self.parsers = {}

        self.set_handlers()
        self.init_data()

    def set_handlers(self):
        """Install this parser's callbacks on ``self.parser`` (stub)."""
        pass

    def restore_handlers(self):
        """Ask the parent, if there is one, to reinstall its handlers."""
        if self.parent is not None:
            self.parent.set_handlers()

    def start(self, tag, attrs):
        """Handle an opening tag.

        Pushes a new stack frame and, when a sub-parser is registered for
        the resulting tag path, instantiates it and remembers the instance
        under the ``'instance'`` key of the registration.

        Args:
            tag: name of the opened tag.
            attrs: the tag's attributes, either a dict or anything
                ``dict()`` accepts (e.g. a list of ``(name, value)`` pairs).
        """
        if not isinstance(attrs, dict):
            attrs = dict(attrs)

        self.stack.push(tag, attrs)

        # Registrations are keyed by the exact tag path, so a direct lookup
        # finds the (at most one) matching entry.
        entry = self.parsers.get(self.stack.path())
        if entry is not None:
            entry['instance'] = self.switch_parser(entry['object'])

    def data(self, data):
        """Handle character data for the current tag.

        The stripped text replaces whatever data the top frame already
        holds, then :meth:`parse` is called.
        """
        # We need to check if the stack frame has been used
        # previously and clear the previous data if so.
        if self.stack.is_data_added():
            self.stack.clear_frame_data()
        self.stack.add_data(data.strip())
        self.parse()

    def end(self, tag):
        """Handle a closing tag and pop its stack frame.

        ``parse`` is called here only when the frame never received data
        (otherwise ``data`` has already called it).  If ``tag`` is the tag
        this parser was created for, the parser integrates its result and
        hands control back to its parent.
        """
        if not self.stack.is_data_added():
            self.parse()

        if tag == self.tag:
            self.integrate()
            self.restore_handlers()

        self.stack.pop()

    def switch_parser(self, parser):
        """Instantiate ``parser`` as a child of this parser.

        The child is created for the tag on top of the stack and shares
        this parser's underlying ``self.parser`` and root.

        Returns:
            The newly created parser instance.
        """
        tag = self.stack.peek()[0]
        return parser(self.parser, tag, self, self.root)

    def register_parser(self, stackTree, parser):
        """Register the class ``parser`` for the tag path ``stackTree``."""
        self.parsers[stackTree] = {'object': parser}

    # The following method stubs are what the parsing sub-classes
    # will be implemented within.
    def init_data(self):
        """Called once at the end of construction (stub)."""
        pass

    def parse(self):
        """Called from ``data`` and, for data-less frames, ``end`` (stub)."""
        pass

    def integrate(self):
        """Called when this parser's own tag is closed (stub)."""
        pass
126
127
class XmlParser(BaseParser):
    """BaseParser variant driven by expat-style callback attributes."""

    def set_handlers(self):
        """Route the underlying parser's element and text events to this object.

        Assigns ``start``, ``data`` and ``end`` to the parser's
        ``StartElementHandler``, ``CharacterDataHandler`` and
        ``EndElementHandler`` attributes respectively.
        """
        self.parser.StartElementHandler = self.start
        self.parser.CharacterDataHandler = self.data
        self.parser.EndElementHandler = self.end
132
133
class HtmlParser(BaseParser):
    """BaseParser variant driven by ``HTMLParser``-style handler methods."""

    def set_handlers(self):
        """Route the underlying parser's tag and text events to this object.

        Replaces the parser instance's ``handle_starttag``, ``handle_data``
        and ``handle_endtag`` attributes with ``start``, ``data`` and
        ``end`` respectively.
        """
        self.parser.handle_starttag = self.start
        self.parser.handle_data = self.data
        self.parser.handle_endtag = self.end
138
139
def parse_html_data(rootParser, htmlData):
    """Parse an HTML string with a freshly created root parser.

    Args:
        rootParser: callable invoked as
            ``rootParser(htmlParser, None, None, None)``; it is expected to
            install its handlers on the given ``HTMLParser`` instance.
        htmlData: the HTML document as a ``str``.

    Returns:
        The object returned by ``rootParser``.
    """
    htmlParser = HTMLParser()
    root = rootParser(htmlParser, None, None, None)

    # The document is fed line by line with surrounding whitespace removed,
    # so indentation and line breaks never reach the handlers.
    for line in htmlData.split('\n'):
        htmlParser.feed(line.strip())
    return root
146
147
def parse_html(rootParser, htmlPath, encoding='utf-8'):
    """Parse an HTML file with a freshly created root parser.

    Args:
        rootParser: callable invoked as
            ``rootParser(htmlParser, None, None, None)``; it is expected to
            install its handlers on the given ``HTMLParser`` instance.
        htmlPath: path of the HTML file to read.
        encoding: text encoding used to decode the file.

    Returns:
        The object returned by ``rootParser``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid in ``encoding``.
    """
    htmlParser = HTMLParser()
    root = rootParser(htmlParser, None, None, None)

    # HTMLParser.feed() only accepts str, so the file must be read in text
    # mode; feeding the bytes lines of a binary-mode file raises TypeError.
    with open(htmlPath, 'r', encoding=encoding) as htmlFile:
        for line in htmlFile:
            htmlParser.feed(line.strip())

    return root
156
157
def parse_xml_data(rootParser, xmlData):
    """Parse an XML string with a freshly created root parser.

    Args:
        rootParser: callable invoked as
            ``rootParser(xmlParser, None, None, None)``; it is expected to
            install its handlers on the given expat parser.
        xmlData: the XML document as a ``str``.

    Returns:
        The object returned by ``rootParser``.

    Raises:
        xml.parsers.expat.ExpatError: if the document is not well formed,
            including when it is empty or ends before the root element is
            closed.
    """
    xmlParser = expat.ParserCreate()
    root = rootParser(xmlParser, None, None, None)

    # The document is fed line by line with surrounding whitespace removed.
    for line in xmlData.split('\n'):
        xmlParser.Parse(line.strip(), False)

    # Signal end of input so expat validates that the document is complete;
    # without the final flag a truncated document is accepted silently.
    xmlParser.Parse(b'', True)
    return root
166
167
def parse_xml(rootParser, xmlPath):
    """Parse an XML file with a freshly created root parser.

    Args:
        rootParser: callable invoked as
            ``rootParser(xmlParser, None, None, None)``; it is expected to
            install its handlers on the given expat parser.
        xmlPath: path of the XML file to read.

    Returns:
        The object returned by ``rootParser``.

    Raises:
        OSError: if the file cannot be opened.
        xml.parsers.expat.ExpatError: if the document is not well formed.
    """
    xmlParser = expat.ParserCreate()
    root = rootParser(xmlParser, None, None, None)

    # The file is read as bytes and fed line by line with surrounding
    # whitespace removed.
    with open(xmlPath, 'rb') as xmlFile:
        for line in xmlFile:
            xmlParser.Parse(line.strip(), False)

    # Signal end of input so expat can report an incomplete document.
    xmlParser.Parse(b'', True)

    return root
179