ParserMixin.get_product_attrs()   A

Complexity
    Conditions: 1

Size
    Total Lines: 9

Duplication
    Lines: 0
    Ratio: 0%

Importance
    Changes: 1
    Bugs: 0
    Features: 1
Metric    Value
cc        1
dl        0
loc       9
rs        9.6666
c         1
b         0
f         1
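The cc and loc rows read like ordinary cyclomatic-complexity and line-count figures for the graded method. If you want to sanity-check comparable numbers locally, the sketch below uses radon under that assumption; radon is not necessarily the tool that produced this report, and the module path is a guess.

# Hedged sketch: radon is assumed to yield comparable cc/line figures; the
# file path below is illustrative, not taken from this report.
from radon.complexity import cc_visit
from radon.raw import analyze

source = open('pysmartprice/parser.py').read()

# One entry per function, class and method; a straight-line method such as
# get_product_attrs should report complexity 1 (the "Conditions 1" above).
for block in cc_visit(source):
    print(block.name, block.complexity)

# Raw line counts for the whole module; the per-method "Total Lines 9" above
# is a finer-grained count than this module-level figure.
print('module loc:', analyze(source).loc)

The analyzed module itself is reproduced below.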
import multiprocessing
from bs4 import BeautifulSoup

from pysmartprice.results import SmartPriceResult
from pysmartprice.helpers import scrape, scrape_helper
from pysmartprice import constants


class BaseParser(object):
    def __init__(self, mapper, **kwargs):
        # Resolve the category URL, fetch it and parse every product item on
        # the first page into a SmartPriceResult.
        self.mapper = mapper
        self.params = kwargs
        self.url = constants.URL_MAPPER[self.mapper]
        self.response = scrape(self._make_url(self.url), **kwargs)
        self.soup = BeautifulSoup(self.response, 'lxml')
        self.result = [
            SmartPriceResult(self.get_product_attrs(item))
            for item in self.products_html
            ]

    def _make_url(self, target):
        return '{}{}'.format(constants.SMARTPRICE_WEB_URL, target)

    @property
    def price_results(self):
        # Follow the pagination when the page advertises a range of result
        # pages; otherwise the first page is all there is.
        if self.get_page_range:
            return self.process_multiple_pages()

        return self.result


class ParserMixin(object):
    def get_product_attrs(self, item):
        # Extract the displayed attributes of a single 'prdct-item' container.
        return dict(
            img=item.find('img').get('src'),
            title=item.find('a', attrs={'class': 'prdct-item__name'}).text,
            url=item.find(
                'a', attrs={'class': 'prdct-item__name'}).get('href'),
            best_price=item.find(
                'span', attrs={'class': 'prdct-item__prc-val'}).text,
            product_id=item.get('data-mspid')
        )

    @property
    def products_html(self):
        # All product item containers on the page currently held in self.soup.
        html = self.soup.findAll('div', attrs={'class': 'prdct-item'})
        return html

    def process_multiple_pages(self):
        results = self.result
        first_page, last_page = self.get_page_range
        paged_url = self.get_paged_url
        page_urls = []

        # Build the URL (and query params) for every page after the first.
        for page in range(first_page+1, last_page+1):
            url = paged_url.replace('.html', '-{}.html'.format(page))
            params = self.params.copy()
            if self.params.get('page', None):
                params.update({'page': page})
            page_urls.append((self._make_url(url), params))

        # Scrape pages in parallel
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()*2)

        for page in pool.map(scrape_helper, page_urls):
            self.soup = BeautifulSoup(page, 'lxml')

            results += [
                SmartPriceResult(self.get_product_attrs(item))
                for item in self.products_html
                ]
        return results

    @property
    def get_page_range(self):
        # Read the first and last result-page numbers from the pagination
        # widget; None means no pagination markers were found.
        page_range = self.soup.findAll(
            'span', attrs={'class': 'pgntn__rslt-page'})

        if not page_range:
            return None

        first_page = int(page_range[0].text)
        last_page = int(page_range[1].text)
        return first_page, last_page
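For a concrete picture of what ParserMixin.get_product_attrs() returns, here is a minimal sketch run against a made-up product snippet. The CSS classes and the data-mspid attribute are the ones the method actually reads; every value in the markup is invented for illustration, and the classes listed above are assumed to be in scope.

# Illustrative input only: the markup values are invented; the selectors match
# the ones used by get_product_attrs() above.
from bs4 import BeautifulSoup

sample = """
<div class="prdct-item" data-mspid="12345">
  <img src="http://example.com/phone.jpg"/>
  <a class="prdct-item__name" href="/example-phone-msp12345">Example Phone</a>
  <span class="prdct-item__prc-val">9,999</span>
</div>
"""

item = BeautifulSoup(sample, 'lxml').find('div', attrs={'class': 'prdct-item'})
attrs = ParserMixin().get_product_attrs(item)
# attrs == {'img': 'http://example.com/phone.jpg',
#           'title': 'Example Phone',
#           'url': '/example-phone-msp12345',
#           'best_price': '9,999',
#           'product_id': '12345'}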
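BaseParser leaves get_product_attrs, products_html, get_page_range and get_paged_url to be supplied by a mixin or subclass, so in practice the two classes are combined in a concrete parser. The sketch below shows one plausible shape; the class name, the 'mobiles' mapper key and the get_paged_url body are assumptions for illustration, not taken from the package.

# Hypothetical concrete parser: the name, the URL_MAPPER key and the
# get_paged_url implementation are assumed, not copied from pysmartprice.
class MobilePhonesParser(ParserMixin, BaseParser):

    @property
    def get_paged_url(self):
        # Simplest choice: derive page N from the base listing URL, which
        # process_multiple_pages() rewrites as 'foo.html' -> 'foo-N.html'.
        return self.url


parser = MobilePhonesParser('mobiles')   # fetches and parses the first page
for product in parser.price_results:     # SmartPriceResult objects
    print(product)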