1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
""" |
3
|
|
|
This module contains command to obtain news from importpython.com |
4
|
|
|
and save them to the database.
5
|
|
|
To use it run something like |
6
|
|
|
python manage.py import_importpython --number 67 |
7
|
|
|
If no args specified parses latest news page. |
8
|
|
|
""" |
9
|
|
|
from __future__ import unicode_literals |
10
|
|
|
|
11
|
|
|
from urllib.error import URLError |
12
|
|
|
from urllib.request import urlopen |
13
|
|
|
|
14
|
|
|
from typing import Dict, Union, Tuple, List |
15
|
|
|
|
16
|
|
|
from django.core.management.base import BaseCommand |
17
|
|
|
from bs4 import BeautifulSoup |
18
|
|
|
|
19
|
|
|
from digest.management.commands import ( |
20
|
|
|
apply_parsing_rules, |
21
|
|
|
apply_video_rules, |
22
|
|
|
save_item |
23
|
|
|
) |
24
|
|
|
|
25
|
|
|
from digest.models import ( |
26
|
|
|
ITEM_STATUS_CHOICES, |
27
|
|
|
ParsingRules, |
28
|
|
|
Section, |
29
|
|
|
Resource |
30
|
|
|
) |
31
|
|
|
|
32
|
|
|
ResourceDict = Dict[str, Union[str, int, Resource]] |
33
|
|
|
ItemTuple = Tuple[BeautifulSoup, BeautifulSoup] |
34
|
|
|
|
35
|
|
|
|
36
|
|
|
class ImportPythonParser(object):
    """Fetches importpython.com newsletter issues and extracts news items.

    Builds issue URLs, downloads their HTML, and turns each news entry into
    a dict suitable for ``save_item``.
    """

    BASE_URL = "http://importpython.com"
    RESOURCE_NAME = "importpython"

    @staticmethod
    def _get_url_content(url: str) -> str:
        """Return the decoded body of *url*, or ``''`` on network failure.

        Fix: the original returned raw ``bytes`` (from ``urlopen().read()``)
        despite the ``-> str`` annotation, while returning ``''`` (str) on
        error.  We decode here so callers consistently receive text.
        """
        try:
            raw = urlopen(url, timeout=10).read()
        except URLError:
            return ''
        # assumes the site serves UTF-8 — TODO confirm; errors='replace'
        # keeps a best-effort result instead of raising on stray bytes.
        return raw.decode('utf-8', errors='replace')

    @classmethod
    def get_latest_issue_url(cls) -> str:
        """Return the URL of the newest issue listed on the archive page."""
        archive_url = "/".join([cls.BASE_URL, "newsletter", "archive"])
        content = cls._get_url_content(archive_url)
        soup = BeautifulSoup(content, "lxml")
        # The first "info" div on the archive page describes the latest issue.
        el = soup.find_all("div", "info")[0]
        href = el.find("h2").find("a")["href"]
        return cls.BASE_URL + href

    @classmethod
    def get_issue_url(cls, number: int) -> str:
        """Return the issue URL corresponding to the issue number.

        The site hosted issues at three different path schemes over time:
        >=16 under /newsletter/no/, 12-15 under /newsletter/draft/, and
        2-11 as static files.

        Raises:
            ValueError: if *number* is below 2 (no such issue).
        """
        number = int(number)
        if number >= 16:
            return "/".join([cls.BASE_URL, "newsletter", "no", str(number)])
        elif 12 <= number <= 15:
            return "/".join([cls.BASE_URL, "newsletter", "draft", str(number)])
        # Fix: upper bound was 14, overlapping (and shadowed by) the
        # 12..15 branch above; only 2..11 can actually reach this arm.
        elif 2 <= number <= 11:
            return "/".join([cls.BASE_URL, "static", "files",
                             "issue{}.html".format(number)])
        else:
            # Fix: typo in the message ("Incorre" -> "Incorrect").
            raise ValueError("Incorrect page number: {}".format(number))

    def _get_all_news_blocks(self,
                             soup: BeautifulSoup) -> List[ItemTuple]:
        """Return (subtitle, body) element pairs, one per news item."""
        # TODO: add tags parsing
        subtitle_els = soup.find_all("div", "subtitle")
        # Each subtitle div is immediately followed by the item's body div.
        body_texts = [el.find_next_sibling("div") for el in subtitle_els]
        return list(zip(subtitle_els, body_texts))

    def _get_block_dict(self, el: ItemTuple) -> ResourceDict:
        """Convert a (subtitle, body) element pair into an item dict.

        The dict carries the keys expected by ``save_item``.
        """
        # NOTE: hits the DB on every call; acceptable for the handful of
        # items per issue.  Uses BASE_URL so the link stays consistent
        # with the rest of the class.
        resource, _created = Resource.objects.get_or_create(
            title='ImportPython',
            link=self.BASE_URL
        )

        subtitle, body = el
        anchor = subtitle.find("a")
        text = body.text
        return {
            'title': anchor.text,
            'link': anchor['href'],
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }

    def get_blocks(self, url: str) -> List[ResourceDict]:
        """Get news dictionaries from the specified URL."""
        content = self._get_url_content(url)
        soup = BeautifulSoup(content, "lxml")
        blocks = self._get_all_news_blocks(soup)
        return [self._get_block_dict(block) for block in blocks]
115
|
|
|
|
116
|
|
|
|
117
|
|
|
def _apply_rules_wrap(**kwargs):
    # TODO: move this function into separate module
    # as it is used in several parsing modules
    rules = kwargs

    def _apply_rules(item: dict) -> dict:
        """Enrich *item* in place with parsing- and video-rule results."""
        # Parsing rules only run when query_rules were supplied;
        # otherwise this step is a no-op.
        if kwargs.get('query_rules'):
            item.update(apply_parsing_rules(item, **rules))
        item.update(apply_video_rules(item))
        return item

    return _apply_rules
130
|
|
|
|
131
|
|
|
|
132
|
|
|
def main(url: str = "", number: int = 0) -> None:
    """Parse an importpython.com issue and save its items to the database.

    Args:
        url: explicit issue URL; takes precedence when both args are given.
        number: issue number to resolve into a URL when *url* is empty.

    When neither argument is given, the latest issue is parsed.

    Fix: the default for *number* was ``""`` (a string default on an
    ``int`` parameter); ``0`` is equally falsy, so behavior is unchanged.
    """
    data = {
        'query_rules': ParsingRules.objects.filter(is_activated=True).all(),
        'query_sections': Section.objects.all(),
        'query_statuses': [x[0] for x in ITEM_STATUS_CHOICES],
    }
    _apply_rules = _apply_rules_wrap(**data)

    parser = ImportPythonParser()
    if number and not url:
        url = parser.get_issue_url(number)
    if not number and not url:
        url = parser.get_latest_issue_url()
    blocks = parser.get_blocks(url)
    for block in map(_apply_rules, blocks):
        save_item(block)
149
|
|
|
|
150
|
|
|
|
151
|
|
|
class Command(BaseCommand):
    help = """This command parses importpython.com site\
and saves posts from it to the database.
You may either specify url by using --url argument, or
implicitly specify issue number by using --number argument."""

    def add_arguments(self, parser):
        """Register the optional --url and --number CLI arguments."""
        parser.add_argument('--url', type=str, help='Url to parse data from')
        parser.add_argument('--number', type=int,
                            help='Number of "issue" to parse')

    def handle(self, *args, **options):
        """Dispatch to main() based on which argument was provided."""
        url = options.get('url')
        number = options.get('number')
        if url is not None:
            main(url=url)
        elif number is not None:
            main(number=int(number))
        else:
            # No arguments: parse the latest issue.
            main()
170
|
|
|
|