page_links   A
last analyzed

Complexity

Total Complexity 7

Size/Duplication

Total Lines 68
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 7
eloc 38
dl 0
loc 68
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
B page_links_graph_model() 0 43 7
1
from urllib.parse import urlparse
2
3
import requests
4
from bs4 import BeautifulSoup
5
from loguru import logger
6
7
import graphinate
8
9
# Crawl depth used when the caller does not pass ``max_depth``.
# At 0 only the start page is fetched; the links found on it are not followed.
DEFAULT_MAX_DEPTH = 0
10
11
12
def page_links_graph_model(max_depth: int = DEFAULT_MAX_DEPTH, request_timeout: float | None = 10.0):
    """
    Create a graph model based on page links.

    Every ``<a href=...>`` on a crawled page whose target is an absolute
    HTTP(S) URL becomes an edge from the page URL to the target URL.

    Args:
        max_depth (int, optional): The maximum depth to crawl for page links.
            With 0 only the start page is fetched. Defaults to DEFAULT_MAX_DEPTH.
        request_timeout (float | None, optional): Seconds to wait for each page
            request before giving up on that page. Pass None to wait
            indefinitely. Defaults to 10.0.

    Returns:
        GraphModel: A graph model representing the page links.
    """

    def _links(url: str, depth=0, **kwargs):
        """Yield ``{'source', 'target'}`` dicts for the links of ``url``, recursing up to ``max_depth``."""
        # Pass ``url`` as a format argument: loguru only interpolates ``{url}``
        # when it is supplied, otherwise the braces are logged literally.
        logger.debug('Analyzing Page: {url}', url=url)
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as error:
            # One unreachable page must not abort the whole crawl.
            logger.warning('Skipping Page: {url} ({error})', url=url, error=error)
            return

        soup = BeautifulSoup(response.text, 'lxml')
        logger.debug('Done Analyzing Page: {url}', url=url)

        for link in soup.find_all('a', href=True):
            child_url = link.get('href')

            if child_url.startswith('javascript:'):  # Skip JavaScript links
                continue

            if child_url.startswith('//'):  # Handle protocol-relative URLs
                child_url = f"https:{child_url}"

            try:
                has_host = bool(urlparse(child_url).netloc)
            except ValueError:  # Skip malformed URLs (e.g. unbalanced IPv6 brackets)
                continue

            if not has_host:  # Skip relative URLs (they are not resolved against the page URL)
                continue

            if not child_url.startswith('http'):  # Skip non-HTTP URLs
                continue

            yield {'source': url, 'target': child_url}
            if depth < max_depth:
                yield from _links(child_url, depth=depth + 1, **kwargs)

    graph_model = graphinate.model(name='Web')

    @graph_model.edge()
    def link(url, **kwargs):
        yield from _links(url, **kwargs)

    return graph_model
55
56
57
if __name__ == '__main__':
    # Follow links one level beyond the start page.
    web_model = page_links_graph_model(1)

    # Start page of the crawl.
    # Alternative start page: 'https://github.com/erivlis/graphinate'
    build_kwargs = dict(url='https://erivlis.github.io/graphinate/')

    # Build the schema from the model as a directed graph and pass it to the GraphQL server.
    graphql_builder = graphinate.builders.GraphQLBuilder(
        web_model,
        graph_type=graphinate.GraphType.DiGraph,
    )
    graphql_schema = graphql_builder.build(
        default_node_attributes={'type': 'url'},
        **build_kwargs,
    )
    graphinate.graphql.server(graphql_schema)
68