1
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
|
2
|
|
|
|
3
|
|
|
try: |
4
|
|
|
import logging |
5
|
|
|
import sys |
6
|
|
|
import requests |
7
|
|
|
import os |
8
|
|
|
from bs4 import BeautifulSoup |
9
|
|
|
from osm_poi_matchmaker.utils import config |
10
|
|
|
from osm_poi_matchmaker.utils.enums import FileType |
11
|
|
|
except ImportError as err: |
12
|
|
|
logging.error('Error %s import module: %s', __name__, err) |
13
|
|
|
logging.exception('Exception occurred') |
14
|
|
|
|
15
|
|
|
sys.exit(128) |
16
|
|
|
|
17
|
|
|
|
18
|
|
|
def download_content(link, verify_link=config.get_download_verify_link(), post_parm=None, headers=None,
                     encoding='utf-8'):
    """Download the body of ``link`` via HTTP GET (or POST when a payload is given).

    :param link: URL to download.
    :param verify_link: TLS certificate verification flag passed straight to requests.
        NOTE(review): the default is evaluated once at import time — confirm that is intended.
    :param post_parm: optional POST payload; when ``None`` a plain GET is issued.
    :param headers: optional HTTP header dict; it is NOT modified by this call.
    :param encoding: encoding forced onto the response before decoding its text.
    :return: the decoded response body on HTTP 200, otherwise ``None``
        (also ``None`` on connection failure).
    """
    try:
        if post_parm is None:
            logging.debug('Downloading without post parameters.')
            page = requests.get(link, verify=verify_link, headers=headers)
        else:
            logging.debug('Downloading with post parameters.')
            headers_static = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}
            # Merge into a fresh dict: the original code called headers.update(...),
            # which mutated the caller's dict as a side effect.
            headers = {**headers, **headers_static} if headers is not None else headers_static
            page = requests.post(link, verify=verify_link, data=post_parm, headers=headers)
        # Force the declared encoding so page.text decodes consistently (was duplicated
        # in both branches before).
        page.encoding = encoding
    except requests.exceptions.ConnectionError as e:
        logging.warning('Unable to open connection. (%s)', e)
        return None
    return page.text if page.status_code == 200 else None
38
|
|
|
|
39
|
|
|
|
40
|
|
|
def save_downloaded_soup(link, file, filetype, post_data=None, verify=config.get_download_verify_link(), headers=None):
    """Obtain a dataset either from ``link`` or from the cached ``file`` on disk.

    Freshly downloaded content is written to ``file`` (prettified for HTML/XML,
    verbatim for CSV/JSON). When downloading is impossible, an already cached
    file is used as a fallback.

    :param link: source URL, or ``None`` to force file-only operation.
    :param file: path of the cache file to read/write.
    :param filetype: a ``FileType`` member selecting how content is parsed/saved.
    :param post_data: optional POST payload forwarded to ``download_content``.
    :param verify: TLS verification flag forwarded to ``download_content``.
    :param headers: optional HTTP headers forwarded to ``download_content``.
    :return: parsed soup / raw text, or ``None`` when nothing could be obtained.
    """
    # Initialize up front: the original code hit UnboundLocalError at `return soup`
    # when there was neither a downloadable URL nor a cached file.
    soup = None
    if config.get_download_use_cached_data() is True and os.path.isfile(file):
        soup = readfile(file, filetype)
    elif link is not None:
        soup = download_content(link, verify, post_data, headers)
        if soup is not None:
            logging.info('We got content, write to file.')
            if not os.path.exists(config.get_directory_cache_url()):
                os.makedirs(config.get_directory_cache_url())
            with open(file, mode='w', encoding='utf-8') as code:
                if filetype == FileType.html:
                    soup = BeautifulSoup(soup, 'html.parser')
                    code.write(str(soup.prettify()))
                elif filetype == FileType.xml:
                    soup = BeautifulSoup(soup, 'lxml', from_encoding='utf-8')
                    logging.debug('original encoding: %s', soup.original_encoding)
                    code.write(str(soup.prettify()))
                elif filetype in (FileType.csv, FileType.json):
                    code.write(str(soup))
                else:
                    logging.error('Unexpected type to write: %s', filetype)
        elif os.path.exists(file):
            # Download failed but a stale cache exists — better than nothing.
            logging.info(
                'The %s link returned error code other than 200 but there is an already downloaded file. Try to open it.',
                link)
            soup = readfile(file, filetype)
        else:
            # Typo fixed in the message below ("downbloaded" -> "downloaded").
            logging.warning(
                'Skipping dataset: %s. There is not downloadable URL, nor already downloaded file.', link)
    elif os.path.exists(file):
        # File-only mode: no URL at all, rely on the manually maintained file.
        soup = readfile(file, filetype)
        if filetype == FileType.html:
            soup = BeautifulSoup(soup, 'html.parser')
        elif filetype == FileType.xml:
            soup = BeautifulSoup(soup, 'lxml')
        logging.info(
            'Using file only: %s. There is not downloadable URL only just the file. Do not forget to update file manually!',
            file)
    else:
        logging.warning(
            'Cannot use download and file: %s. There is not downloadable URL, nor already downloaded file.',
            file)
    return soup
|
|
|
|
86
|
|
|
|
87
|
|
|
|
88
|
|
|
def readfile(r_filename, r_filetype):
    """Read a cached dataset file from disk.

    :param r_filename: path of the cached file.
    :param r_filetype: a ``FileType`` member selecting how the content is returned.
    :return: a ``BeautifulSoup`` object for HTML, the raw text for CSV/JSON/XML,
        or ``None`` when the file is missing, the type is unknown, or reading fails.
    """
    try:
        if not os.path.exists(r_filename):
            return None
        with open(r_filename, mode='r', encoding='utf-8') as code:
            if r_filetype == FileType.html:
                return BeautifulSoup(code.read(), 'html.parser')
            if r_filetype in (FileType.csv, FileType.json, FileType.xml):
                return code.read()
            # Explicit None: the original returned an unbound `soup` here, raising
            # UnboundLocalError that was silently absorbed by the broad except below.
            logging.error('Unexpected type to read: %s', r_filetype)
            return None
    except Exception as e:
        logging.error(e)
        logging.exception('Exception occurred')
        return None