1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
""" |
3
|
|
|
This module contains command to obtain news from importpython.com |
4
|
|
|
and save them to the database.
5
|
|
|
To use it run something like |
6
|
|
|
python manage.py import_importpython --number 67 |
7
|
|
|
If no args specified parses latest news page. |
8
|
|
|
""" |
9
|
|
|
from __future__ import unicode_literals |
10
|
|
|
|
11
|
|
|
from urllib.error import URLError |
12
|
|
|
from urllib.request import urlopen |
13
|
|
|
|
14
|
|
|
from typing import Dict, Union, Tuple, List |
15
|
|
|
|
16
|
|
|
from django.core.management.base import BaseCommand |
17
|
|
|
from bs4 import BeautifulSoup |
18
|
|
|
|
19
|
|
|
from digest.management.commands import ( |
20
|
|
|
apply_parsing_rules, |
21
|
|
|
apply_video_rules, |
22
|
|
|
save_item |
23
|
|
|
) |
24
|
|
|
|
25
|
|
|
from digest.models import ( |
26
|
|
|
ITEM_STATUS_CHOICES, |
27
|
|
|
ParsingRules, |
28
|
|
|
Section, |
29
|
|
|
Resource |
30
|
|
|
) |
31
|
|
|
|
32
|
|
|
ResourceDict = Dict[str, Union[str, int, Resource]] |
33
|
|
|
ItemTuple = Tuple[BeautifulSoup, BeautifulSoup] |
34
|
|
|
|
35
|
|
|
|
36
|
|
|
class ImportPythonParser(object):
    """Fetches importpython.com newsletter issues and extracts news items.

    Builds issue URLs, downloads their HTML, and turns each news entry into
    a dict suitable for ``save_item``.
    """

    BASE_URL = "http://importpython.com"
    RESOURCE_NAME = "importpython"

    @staticmethod
    def _get_url_content(url: str) -> str:
        """Return the decoded body of *url*, or ``''`` on network failure.

        Fix: the original returned raw ``bytes`` (from ``urlopen().read()``)
        despite the ``-> str`` annotation, while returning ``''`` (str) on
        error.  We decode here so callers consistently receive text.
        """
        try:
            raw = urlopen(url, timeout=10).read()
        except URLError:
            return ''
        # assumes the site serves UTF-8 — TODO confirm; errors='replace'
        # keeps a best-effort result instead of raising on stray bytes.
        return raw.decode('utf-8', errors='replace')

    @classmethod
    def get_latest_issue_url(cls) -> str:
        """Return the URL of the newest issue listed on the archive page."""
        archive_url = "/".join([cls.BASE_URL, "newsletter", "archive"])
        content = cls._get_url_content(archive_url)
        soup = BeautifulSoup(content, "lxml")
        # The first "info" div on the archive page describes the latest issue.
        el = soup.find_all("div", "info")[0]
        href = el.find("h2").find("a")["href"]
        return cls.BASE_URL + href

    @classmethod
    def get_issue_url(cls, number: int) -> str:
        """Return the issue URL corresponding to the issue number.

        The site hosted issues at three different path schemes over time:
        >=16 under /newsletter/no/, 12-15 under /newsletter/draft/, and
        2-11 as static files.

        Raises:
            ValueError: if *number* is below 2 (no such issue).
        """
        number = int(number)
        if number >= 16:
            return "/".join([cls.BASE_URL, "newsletter", "no", str(number)])
        elif 12 <= number <= 15:
            return "/".join([cls.BASE_URL, "newsletter", "draft", str(number)])
        # Fix: upper bound was 14, overlapping (and shadowed by) the
        # 12..15 branch above; only 2..11 can actually reach this arm.
        elif 2 <= number <= 11:
            return "/".join([cls.BASE_URL, "static", "files",
                             "issue{}.html".format(number)])
        else:
            # Fix: typo in the message ("Incorre" -> "Incorrect").
            raise ValueError("Incorrect page number: {}".format(number))

    def _get_all_news_blocks(self,
                             soup: BeautifulSoup) -> List[ItemTuple]:
        """Return (subtitle, body) element pairs, one per news item."""
        # TODO: add tags parsing
        subtitle_els = soup.find_all("div", "subtitle")
        # Each subtitle div is immediately followed by the item's body div.
        body_texts = [el.find_next_sibling("div") for el in subtitle_els]
        return list(zip(subtitle_els, body_texts))

    def _get_block_dict(self, el: ItemTuple) -> ResourceDict:
        """Convert a (subtitle, body) element pair into an item dict.

        The dict carries the keys expected by ``save_item``.
        """
        # NOTE: hits the DB on every call; acceptable for the handful of
        # items per issue.  Uses BASE_URL so the link stays consistent
        # with the rest of the class.
        resource, _created = Resource.objects.get_or_create(
            title='ImportPython',
            link=self.BASE_URL
        )

        subtitle, body = el
        anchor = subtitle.find("a")
        text = body.text
        return {
            'title': anchor.text,
            'link': anchor['href'],
            'raw_content': text,
            'http_code': 200,
            'content': text,
            'description': text,
            'resource': resource,
            'language': 'en',
        }

    def get_blocks(self, url: str) -> List[ResourceDict]:
        """Get news dictionaries from the specified URL."""
        content = self._get_url_content(url)
        soup = BeautifulSoup(content, "lxml")
        blocks = self._get_all_news_blocks(soup)
        return [self._get_block_dict(block) for block in blocks]
115
|
|
|
|
116
|
|
|
|
117
|
|
|
def _apply_rules_wrap(**kwargs):
    # TODO: move this function into separate module
    # as it is used in several parsing modules
    rules = kwargs

    def _apply_rules(item: dict) -> dict:
        """Enrich *item* in place with parsing- and video-rule results."""
        # Parsing rules only run when query_rules were supplied;
        # otherwise this step is a no-op.
        if kwargs.get('query_rules'):
            item.update(apply_parsing_rules(item, **rules))
        item.update(apply_video_rules(item))
        return item

    return _apply_rules
130
|
|
|
|
131
|
|
|
|
132
|
|
|
def main(url: str = "", number: int = 0) -> None:
    """Parse an importpython.com issue and save its items to the database.

    Args:
        url: explicit issue URL; takes precedence when both args are given.
        number: issue number to resolve into a URL when *url* is empty.

    When neither argument is given, the latest issue is parsed.

    Fix: the default for *number* was ``""`` (a string default on an
    ``int`` parameter); ``0`` is equally falsy, so behavior is unchanged.
    """
    data = {
        'query_rules': ParsingRules.objects.filter(is_activated=True).all(),
        'query_sections': Section.objects.all(),
        'query_statuses': [x[0] for x in ITEM_STATUS_CHOICES],
    }
    _apply_rules = _apply_rules_wrap(**data)

    parser = ImportPythonParser()
    if number and not url:
        url = parser.get_issue_url(number)
    if not number and not url:
        url = parser.get_latest_issue_url()
    blocks = parser.get_blocks(url)
    for block in map(_apply_rules, blocks):
        save_item(block)
149
|
|
|
|
150
|
|
|
|
151
|
|
|
class Command(BaseCommand):
    help = """This command parses importpython.com site\
and saves posts from it to the database.
You may either specify url by using --url argument, or
implicitly specify issue number by using --number argument."""

    def add_arguments(self, parser):
        """Register the optional --url and --number CLI arguments."""
        parser.add_argument('--url', type=str, help='Url to parse data from')
        parser.add_argument('--number', type=int,
                            help='Number of "issue" to parse')

    def handle(self, *args, **options):
        """Dispatch to main() based on which argument was provided."""
        url = options.get('url')
        number = options.get('number')
        if url is not None:
            main(url=url)
        elif number is not None:
            main(number=int(number))
        else:
            # No arguments: parse the latest issue.
            main()
170
|
|
|
|