GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Push — master ( 5f7ff6...6e3631 )
by
unknown
8s
created

ImportPythonParser.get_latest_issue_url()   A

Complexity

Conditions 1

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 1
Metric Value
c 2
b 1
f 1
dl 0
loc 10
rs 9.4285
cc 1
1
# -*- coding: utf-8 -*-
2
"""
3
This module contains command to obtain news from importpython.com
4
and save them to the database.
5
To use it run something like
6
python manage.py import_importpython --number 67
7
If no args specified parses latest news page.
8
"""
9
from __future__ import unicode_literals
10
11
from urllib.error import URLError
12
from urllib.request import urlopen
13
14
from typing import Dict, Union, Tuple, List
15
16
from django.core.management.base import BaseCommand
17
from bs4 import BeautifulSoup
18
19
from digest.management.commands import (
20
    apply_parsing_rules,
21
    apply_video_rules,
22
    save_item
23
)
24
25
from digest.models import (
26
    ITEM_STATUS_CHOICES,
27
    ParsingRules,
28
    Section,
29
    Resource
30
)
31
32
ResourceDict = Dict[str, Union[str, int, Resource]]
33
ItemTuple = Tuple[BeautifulSoup, BeautifulSoup]
34
35
36
class ImportPythonParser(object):
    """Scrapes importpython.com newsletter issues into news-item dicts."""

    BASE_URL = "http://importpython.com"
    RESOURCE_NAME = "importpython"

    @staticmethod
    def _get_url_content(url: str) -> Union[str, bytes]:
        """Return the raw response body for *url*, or ``''`` on failure.

        NOTE: on success this returns ``bytes`` (``urlopen(...).read()`` is
        not decoded) — the original annotation said ``str``, which was wrong.
        Callers feed the result to BeautifulSoup, which accepts both.
        NOTE(review): a socket timeout on Python < 3.10 is not a URLError
        and would propagate — confirm whether that is intended.
        """
        try:
            result = urlopen(url, timeout=10).read()
        except URLError:
            return ''
        else:
            return result

    @classmethod
    def get_latest_issue_url(cls) -> str:
        """Return the absolute URL of the newest issue in the archive."""
        archive_url = "/".join([cls.BASE_URL, "newsletter", "archive"])
        content = cls._get_url_content(archive_url)
        soup = BeautifulSoup(content, "lxml")
        # First "info" div on the archive page holds the latest issue's link.
        el = soup.find_all("div", "info")[0]
        href = el.find("h2").find("a")["href"]
        link = cls.BASE_URL + href
        return link

    @classmethod
    def get_issue_url(cls, number: int) -> str:
        """Return the URL for issue *number*.

        Issues live under different paths depending on their age.

        Raises:
            ValueError: if *number* is below 2.
        """
        number = int(number)
        if number >= 16:
            return "/".join([cls.BASE_URL, "newsletter", "no", str(number)])
        elif 12 <= number <= 15:
            return "/".join([cls.BASE_URL, "newsletter", "draft", str(number)])
        elif 2 <= number <= 11:
            # Upper bound tightened from 14 to 11: issues 12-14 were already
            # captured by the branch above, so behavior is unchanged.
            return "/".join([cls.BASE_URL, "static", "files",
                             "issue{}.html".format(number)])
        else:
            # Typo fixed: was "Incorre page number".
            raise ValueError("Incorrect page number: {}".format(number))

    def _get_all_news_blocks(self,
                             soap: "BeautifulSoup") -> "List[ItemTuple]":
        """Return (subtitle, body) element pairs, one per news entry."""
        # TODO: add tags parsing
        subtitle_els = soap.find_all("div", "subtitle")
        # Each subtitle div is immediately followed by a div with the body.
        body_texts = [el.find_next_sibling("div") for el in subtitle_els]
        return list(zip(subtitle_els, body_texts))

    def _get_block_dict(self, el: "ItemTuple") -> "ResourceDict":
        """Convert one (subtitle, body) element pair into an item dict."""
        resource, created = Resource.objects.get_or_create(
            title='ImportPython',
            link='http://importpython.com'
        )

        subtitle, body = el
        title = subtitle.find("a").text
        url = subtitle.find("a")['href']
        text = body.text
        return {
            'title': title,
            'link': url,
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }

    def get_blocks(self, url: str) -> "List[ResourceDict]":
        """Get news dictionaries from the specified URL."""
        content = self._get_url_content(url)
        soup = BeautifulSoup(content, "lxml")
        blocks = self._get_all_news_blocks(soup)
        items = map(self._get_block_dict, blocks)
        return list(items)
115
116
117
def _apply_rules_wrap(**kwargs):
118
    # TODO: move this function into separate module
119
    # as it is used in several parsing modules
120
    rules = kwargs
121
122
    def _apply_rules(item: dict) -> dict:
123
        item.update(
124
            apply_parsing_rules(item, **rules)
125
            if kwargs.get('query_rules') else {})
126
        item.update(apply_video_rules(item))
127
        return item
128
129
    return _apply_rules
130
131
132
def main(url: str = "", number: int = 0) -> None:
    """Parse one importpython.com issue and save its items to the database.

    Args:
        url: explicit issue URL; takes priority over *number*.
        number: issue number to build a URL for; falsy means "latest issue".

    Fix: the *number* default was ``""`` (a str) despite the ``int``
    annotation; ``0`` is equally falsy, so callers see no behavior change.
    """
    data = {
        'query_rules': ParsingRules.objects.filter(is_activated=True).all(),
        'query_sections': Section.objects.all(),
        'query_statuses': [x[0] for x in ITEM_STATUS_CHOICES],
    }
    _apply_rules = _apply_rules_wrap(**data)

    parser = ImportPythonParser()
    # Resolve the URL: explicit url wins, then an issue number, else latest.
    if number and not url:
        url = parser.get_issue_url(number)
    if not number and not url:
        url = parser.get_latest_issue_url()
    blocks = parser.get_blocks(url)
    with_rules_applied = map(_apply_rules, blocks)
    for block in with_rules_applied:
        save_item(block)
149
150
151
class Command(BaseCommand):
    # Django management command entry point; delegates all work to main().
    help = """This command parses importpython.com site\
 and saves posts from it to the database.
 You may either specify url by using --url argument, or
 implicitly specify issue number by using --number argument."""

    def add_arguments(self, parser):
        """Register the alternative --url / --number options."""
        parser.add_argument('--url', type=str, help='Url to parse data from')
        parser.add_argument('--number',
                            type=int,
                            help='Number of "issue" to parse')

    def handle(self, *args, **options):
        """Dispatch to main() based on whichever option was supplied."""
        url = options.get('url')
        number = options.get('number')
        if url is not None:
            main(url=url)
        elif number is not None:
            main(number=int(number))
        else:
            main()
170