Passed
Push — main ( ced03c...e8bc85 )
by Eran
01:33
created

html_dom.html_dom_graph_model()   B

Complexity

Conditions 8

Size

Total Lines 32
Code Lines 25

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 25
nop 1
dl 0
loc 32
rs 7.3333
c 0
b 0
f 0
1
import base64
2
3
import requests
4
from bs4 import BeautifulSoup, Tag
5
6
import graphinate
7
8
9
def load_html_from_url(url="https://www.google.com", timeout=10.0):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request. Defaults to ``https://www.google.com``.
        timeout: Seconds to wait on the server before giving up. Forwarded
            unchanged to ``requests.get``; pass ``None`` to wait without limit.

    Returns:
        The response body as decoded by requests (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails,
            including when the timeout expires.
    """
    # requests applies no timeout unless one is given, so a stalled server
    # would otherwise block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    return response.text
12
13
14
def load_html(file_path, encoding="utf-8"):
    """Read an HTML file from disk and return its full contents as a string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        The entire file content as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in *encoding*.
    """
    # An explicit encoding avoids locale-dependent decoding, which can fail
    # or silently corrupt non-ASCII markup on some platforms.
    with open(file_path, encoding=encoding) as file:
        return file.read()
17
18
19
def html_dom_graph_model(html_content):
    """Build a graphinate model describing the DOM of *html_content*.

    The markup is parsed with BeautifulSoup's ``html.parser``. Every parsed
    descendant that has a name becomes a node, and every named direct child
    of such a descendant produces a parent -> child edge.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The graphinate model named "HTML DOM Graph" with its node and edge
        generators registered.
    """
    graph_model = graphinate.model(name="HTML DOM Graph")
    soup = BeautifulSoup(html_content, 'html.parser')

    def node_type(tag: Tag):
        # Drop any leading/trailing square brackets from the element name
        # (presumably for names such as '[document]' -- TODO confirm).
        return tag.name.strip('[]')

    def node_key(tag: Tag):
        # Tags are identified by where they start in the parsed source.
        if isinstance(tag, Tag):
            return str((tag.sourceline, tag.sourcepos))
        # Anything else is keyed by the base64 form of its own text.
        encoded = base64.b64encode(tag.encode())
        return encoded.decode()

    def node_label(tag: Tag):
        # The label is the element's full string rendering.
        return str(tag)

    # NOTE(review): the decorated function names are kept as-is in case
    # graphinate derives type names from them -- confirm before renaming.
    @graph_model.node(node_type, key=node_key, label=node_label)
    def html_node():
        # Entries whose name is None are skipped (presumably text nodes).
        yield from (
            element for element in soup.descendants
            if element.name is not None
        )

    @graph_model.edge()
    def contains():
        for parent in soup.descendants:
            if parent.name is None:
                continue
            for child in parent.children:
                if child.name is None:
                    continue
                yield {
                    'source': node_key(parent),
                    'target': node_key(child)
                }

    return graph_model
51
52
53
if __name__ == '__main__':
    # Download the default page and turn its markup into a DOM graph model.
    dom_model = html_dom_graph_model(load_html_from_url())

    # Hand the model to graphinate with the GraphQL builder; the output is
    # passed to graphinate.graphql (presumably serving it -- TODO confirm).
    graphinate.materialize(
        dom_model,
        builder=graphinate.builders.GraphQLBuilder,
        builder_output_handler=graphinate.graphql,
    )
61
62