1
|
|
|
#!/usr/bin/python |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
1 |
|
import logging |
4
|
1 |
|
import os |
5
|
1 |
|
import sys |
6
|
1 |
|
import time |
7
|
1 |
|
from urllib.parse import urlencode, urlparse, parse_qsl, urlunparse |
8
|
|
|
|
9
|
1 |
|
from selenium import webdriver |
10
|
1 |
|
from selenium.webdriver import DesiredCapabilities |
11
|
1 |
|
from selenium.webdriver.chrome.options import Options |
12
|
|
|
|
13
|
1 |
|
# Absolute path of the bundled chromedriver binary, resolved relative to this
# file: <parent dir>/bin/ChromeHeadless/<sys.platform>/chromedriver.
# Every component is passed to os.path.join separately so the correct
# separator is used on each platform (a literal backslash inside a component
# is only a path separator on Windows).
CHROMEDRIVER_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__), os.path.pardir,
    'bin', 'ChromeHeadless', sys.platform, 'chromedriver'))
15
|
1 |
|
# Module-level Chrome options object shared by every SeleniumHelper instance;
# SeleniumHelper.__init__ adds the "--headless" argument to it.
CHROME_OPTIONS = Options()
16
|
|
|
|
17
|
|
|
|
18
|
1 |
|
class SeleniumHelper(webdriver.Chrome):
    """Headless Chrome implementation with selenium.

    Extends ``webdriver.Chrome`` so that :meth:`get` accepts ``params`` and
    ``headers`` arguments in the style of ``requests.get``.
    """

    def __init__(self, log_level=logging.ERROR, *args, **kwargs) -> None:
        """Start a headless Chrome session and configure logging.

        :param log_level: level handed to ``logging.basicConfig``
        :param args: extra positional arguments forwarded to ``webdriver.Chrome``
        :param kwargs: extra keyword arguments forwarded to ``webdriver.Chrome``
        """
        # CHROME_OPTIONS is shared module state: add the flag only once so
        # creating several helpers does not keep appending duplicates.
        if "--headless" not in CHROME_OPTIONS.arguments:
            CHROME_OPTIONS.add_argument("--headless")
        super().__init__(executable_path=CHROMEDRIVER_PATH, chrome_options=CHROME_OPTIONS, *args, **kwargs)
        logging.basicConfig(level=log_level,
                            format='[%(asctime)s.%(msecs)03d %(levelname)s %(name)s] %(message)s',
                            datefmt="%H:%M:%S")
        self.logger = logging.getLogger("selenium_logger")

    def get(self, url, params=None, headers=None):
        """Rebuild similar behaviour to requests.get function

        :param url: address to load
        :param params: optional mapping merged into the query string of ``url``;
            values given here override same-named parameters already in ``url``
        :param headers: optional mapping of header names to values; when given,
            the driver is re-initialised with capabilities derived from it
        :return: this driver instance, after the page has been loaded
        """
        if headers:
            desired_capabilities = DesiredCapabilities.CHROME.copy()
            # NOTE(review): this key pattern resembles PhantomJS's
            # ``phantomjs.page.customHeaders.*`` capability -- confirm that
            # chromedriver actually honours it.
            for key, value in headers.items():
                desired_capabilities['chrome.page.customHeaders.{0:s}'.format(key.lower())] = value
            # Re-running the base initialiser starts a fresh driver session;
            # shut the current one down first so its chromedriver/browser
            # processes are not left orphaned.
            self.quit()
            super().__init__(executable_path=CHROMEDRIVER_PATH, chrome_options=CHROME_OPTIONS,
                             desired_capabilities=desired_capabilities)
        if params:
            # Index 4 of the urlparse() 6-tuple is the query string.
            url_parts = list(urlparse(url))
            query = dict(parse_qsl(url_parts[4]))
            query.update(params)

            url_parts[4] = urlencode(query)
            url = urlunparse(url_parts)

        super().get(url)
        # Poll once per second while the Cloudflare interstitial text is still
        # in the page. NOTE(review): there is no upper bound on this wait.
        while 'Your browser will redirect to your requested content shortly.' in self.page_source:
            self.logger.debug("sleeping to pass cloudflare")
            time.sleep(1)

        return self
57
|
|
|
|