|
1
|
|
|
from urllib.parse import urlparse |
|
2
|
|
|
|
|
3
|
|
|
import requests |
|
4
|
|
|
from bs4 import BeautifulSoup |
|
5
|
|
|
from loguru import logger |
|
6
|
|
|
|
|
7
|
|
|
import graphinate |
|
8
|
|
|
|
|
9
|
|
|
DEFAULT_MAX_DEPTH = 0 |
|
10
|
|
|
|
|
11
|
|
|
|
|
12
|
|
|
def page_links_graph_model(max_depth: int = DEFAULT_MAX_DEPTH):
    """
    Create a graph model based on page links.

    Args:
        max_depth (int, optional): The maximum depth to crawl for page links.
            Defaults to DEFAULT_MAX_DEPTH.

    Returns:
        GraphModel: A graph model representing the page links.
    """

    def _links(url: str, depth=0, **kwargs):
        """Yield {'source', 'target'} edge dicts for every crawlable link on *url*,
        recursing into child pages while ``depth < max_depth``."""
        # Log before fetching (the original logged "Analyzing" only after the
        # request had already completed) and use f-strings — without the `f`
        # prefix the literal text '{url}' was being logged.
        logger.debug(f'Analyzing Page: {url}')
        # A timeout prevents the crawl from hanging indefinitely on an
        # unresponsive server.
        reqs = requests.get(url, timeout=30)
        soup = BeautifulSoup(reqs.text, 'lxml')
        logger.debug(f'Done Analyzing Page: {url}')

        for link in soup.find_all('a', href=True):
            child_url = link.get('href')

            if child_url.startswith('javascript:'):  # Skip JavaScript links
                continue

            if child_url.startswith('//'):  # Handle protocol-relative URLs
                child_url = f"https:{child_url}"

            if not bool(urlparse(child_url).netloc):  # Skip relative URLs
                # child_url = urljoin(url, child_url)
                continue

            if not child_url.startswith('http'):  # Skip non-HTTP URLs
                continue

            # Emit the edge, then recurse into the child page while still
            # under the depth budget.
            yield {'source': url, 'target': child_url}
            if depth < max_depth:
                yield from _links(child_url, depth=depth + 1, **kwargs)

    graph_model = graphinate.model(name='Web')

    @graph_model.edge()
    def link(url, **kwargs):
        yield from _links(url, **kwargs)

    return graph_model
|
55
|
|
|
|
|
56
|
|
|
|
|
57
|
|
|
if __name__ == '__main__':
    # Build a model that follows links one level deep from the root URL.
    web_model = page_links_graph_model(1)

    crawl_params = {
        # 'url': 'https://github.com/erivlis/graphinate'
        'url': 'https://erivlis.github.io/graphinate/'
    }

    # Materialize the model as a directed-graph GraphQL schema and serve it.
    gql_builder = graphinate.builders.GraphQLBuilder(web_model, graph_type=graphinate.GraphType.DiGraph)
    gql_schema = gql_builder.build(default_node_attributes={'type': 'url'}, **crawl_params)
    graphinate.graphql.server(gql_schema)