osm_poi_matchmaker.create_db   A
last analyzed

Complexity

Total Complexity 17

Size/Duplication

Total Lines 187
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 149
dl 0
loc 187
rs 10
c 0
b 0
f 0
wmc 17

5 Methods

Rating   Name   Duplication   Size   Complexity  
A WorkflowManager.start_exporter() 0 14 2
A WorkflowManager.join() 0 2 1
A WorkflowManager.start_matcher() 0 11 2
A WorkflowManager.start_poi_harvest() 0 13 3
A WorkflowManager.__init__() 0 7 1

5 Functions

Rating   Name   Duplication   Size   Complexity  
B main() 0 54 3
A init_log() 0 2 1
A import_basic_data() 0 12 1
A load_common_data() 0 3 1
A load_poi_data() 0 8 2
1
#!/usr/bin/python
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
# -*- coding: utf-8 -*-
3
4
__author__ = 'kami911'
5
__program__ = 'create_db'
6
__version__ = '0.7.0'
7
8
try:
9
    import os
10
    import logging
11
    import logging.config
12
    import sys
13
    import numpy as np
14
    import pandas as pd
15
    import multiprocessing
16
    import datetime
17
    from osm_poi_matchmaker.utils import config, timing
18
    from osm_poi_matchmaker.libs.osm import timestamp_now
0 ignored issues
show
Unused Code introduced by
Unused timestamp_now imported from osm_poi_matchmaker.libs.osm
Loading history...
19
    from osm_poi_matchmaker.libs.online_poi_matching import online_poi_matching
20
    from osm_poi_matchmaker.libs.import_poi_data_module import import_poi_data_module
21
    from osm_poi_matchmaker.libs.export import export_raw_poi_data, export_raw_poi_data_xml, export_grouped_poi_data, \
0 ignored issues
show
Coding Style introduced by
This line is too long as per the coding-style (119/100).

This check looks for lines that are too long. You can specify the maximum line length.

Loading history...
Unused Code introduced by
Unused export_raw_poi_data_xml imported from osm_poi_matchmaker.libs.export
Loading history...
22
        export_grouped_poi_data_with_postcode_groups
23
    from sqlalchemy.orm import scoped_session, sessionmaker
24
    from osm_poi_matchmaker.dao.poi_base import POIBase
0 ignored issues
show
introduced by
Imports from package osm_poi_matchmaker are not grouped
Loading history...
25
except ImportError as err:
26
    logging.error('Error %s import module: %s', __name__, err)
27
    logging.exception('Exception occurred')
28
29
    sys.exit(128)
30
31
# Column names of a POI record. Not referenced anywhere else in the visible part of this
# module; presumably consumed by other modules -- verify before changing the order.
POI_COLS = [
    'poi_code',
    'poi_postcode',
    'poi_city',
    'poi_name',
    'poi_branch',
    'poi_website',
    'original',
    'poi_addr_street',
    'poi_addr_housenumber',
    'poi_conscriptionnumber',
    'poi_ref',
    'poi_geom',
]
# Not referenced in the visible part of this module; presumably a retry count for callers.
RETRY = 3
35
36
37
def init_log():
    """Configure the ``logging`` package from the ``log.conf`` file.

    The file name is relative, so it is looked up in the current working directory.
    """
    logging.config.fileConfig('log.conf')
39
40
41
def import_basic_data(session):
    """Run the city/postcode and the street type importers.

    Each importer is constructed with *session*, the source XML URL and the cache
    directory from the configuration, and is then executed through ``process()``.

    :param session: database session handed unchanged to both importers.
    """
    # Kept as function-level imports like the original; the reason is not visible from
    # this file (possibly deferred loading or avoiding a circular import).
    from osm_poi_matchmaker.dataproviders.hu_generic import (
        hu_city_postcode_from_xml,
        hu_street_types_from_xml,
    )

    cache_dir = config.get_directory_cache_url()

    logging.info('Importing cities ...')
    city_import = hu_city_postcode_from_xml(
        session, 'http://httpmegosztas.posta.hu/PartnerExtra/OUT/ZipCodes.xml', cache_dir)
    city_import.process()

    logging.info('Importing street types ...')
    street_type_import = hu_street_types_from_xml(
        session, 'http://httpmegosztas.posta.hu/PartnerExtra/OUT/StreetTypes.xml', cache_dir)
    street_type_import.process()
53
54
55
def load_poi_data(database):
    """Load the ``poi_address`` table and normalise its city and postcode columns.

    The configured output directory is created as a side effect when it is missing.

    :param database: object providing ``query_all_gpd_in_order(table_name)``.
    :return: the queried frame; ``poi_addr_city`` and ``poi_postcode`` are converted to
        ``int`` with missing values replaced by ``0``.
    """
    logging.info('Loading POI_data from database ...')
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(config.get_directory_output(), exist_ok=True)
    # Build Dataframe from our POI database
    addr_data = database.query_all_gpd_in_order('poi_address')
    int_columns = ['poi_addr_city', 'poi_postcode']
    addr_data[int_columns] = addr_data[int_columns].fillna('0').astype(int)
    return addr_data
63
64
65
def load_common_data(database):
    """Return the content of the ``poi_common`` table.

    :param database: object providing ``query_all_pd(table_name)``.
    :return: whatever ``database.query_all_pd('poi_common')`` returns.
    """
    logging.info('Loading common data from database ...')
    return database.query_all_pd('poi_common')
68
69
70
class WorkflowManager:
    """Fan the harvest, export and matching steps out to worker processes.

    Every ``start_*`` method creates a new ``multiprocessing.Pool`` sized to the CPU
    count, submits its work with ``map_async`` and closes the pool. ``join()`` waits for
    every pool started since the previous ``join()``.
    """

    def __init__(self):
        """Create the shared manager and queue and reset the bookkeeping attributes."""
        self.manager = multiprocessing.Manager()
        # Filled by start_poi_harvest(); nothing in this class reads it back.
        self.queue = self.manager.Queue()
        # Upper-case attribute name kept so existing users of the attribute keep working.
        self.NUMBER_OF_PROCESSES = multiprocessing.cpu_count()
        self.items = 0
        # Most recently created pool, None until a start_* method has run.
        self.pool = None
        # Result handle of the most recent map_async() call.
        self.results = []
        # Pools started but not joined yet. Keeping them referenced prevents an earlier
        # pool from being dropped (and its work lost) when a new one is started.
        self._open_pools = []

    def _map_async(self, func, tasks):
        """Submit *tasks* to a fresh pool and register the pool for ``join()``."""
        self.results = []
        pool = multiprocessing.Pool(processes=self.NUMBER_OF_PROCESSES)
        self.pool = pool
        self._open_pools.append(pool)
        try:
            self.results = pool.map_async(func, tasks)
        finally:
            # Always close: Pool.join() raises ValueError on a pool that is still open.
            pool.close()
        return self.results

    def start_poi_harvest(self):
        """Start ``import_poi_data_module`` for every enabled data provider module.

        Errors while starting the pool are logged and swallowed (best effort).
        """
        modules = config.get_dataproviders_modules_enable()
        for module in modules:
            self.queue.put(module)
        try:
            # Start multiprocessing in case multiple cores
            logging.info('Starting processing on %s cores.', self.NUMBER_OF_PROCESSES)
            self._map_async(import_poi_data_module, modules)
        except Exception as err:  # pylint: disable=broad-except
            logging.error(err)
            logging.exception('Exception occurred')

    def start_exporter(self, data: pd.DataFrame, postfix: str = '',
                       to_do=export_grouped_poi_data):
        """Start one export task per distinct ``poi_code`` found in *data*.

        Each task is the list ``[output_dir, 'poi_address_<postfix><code>', rows,
        'poi_address']`` where *rows* are the rows of *data* having that code.

        :param data: frame with a ``poi_code`` column.
        :param postfix: text inserted between ``poi_address_`` and the POI code.
        :param to_do: callable executed in the workers with one task list each.

        Errors while starting the pool are logged and swallowed (best effort).
        """
        output_dir = config.get_directory_output()
        modules = [
            [output_dir, 'poi_address_{}{}'.format(postfix, code),
             data[data.poi_code == code], 'poi_address']
            for code in data['poi_code'].unique()
        ]
        try:
            # Start multiprocessing in case multiple cores
            logging.info('Starting processing on %s cores.', self.NUMBER_OF_PROCESSES)
            self._map_async(to_do, modules)
        except Exception as err:  # pylint: disable=broad-except
            logging.error(err)
            logging.exception('Exception occurred')

    def start_matcher(self, data, comm_data):
        """Run ``online_poi_matching`` over *data* split into one chunk per process.

        Unlike the other ``start_*`` methods this one blocks until the workers finish.

        :param data: frame to split with ``numpy.array_split``.
        :param comm_data: passed unchanged to every worker next to its chunk.
        :return: the worker results concatenated with ``pandas.concat``, or ``None``
            when anything failed (the error is logged).
        """
        try:
            chunks = [(part, comm_data)
                      for part in np.array_split(data, self.NUMBER_OF_PROCESSES)]
            results = self._map_async(online_poi_matching, chunks)
            return pd.concat(list(results.get()), sort=False)
        except Exception as err:  # pylint: disable=broad-except
            logging.error(err)
            logging.exception('Exception occurred')
            return None

    def join(self):
        """Wait for all pools started since the last call; no-op when there are none."""
        pending, self._open_pools = self._open_pools, []
        for pool in pending:
            pool.join()
123
124
125
def main():
    """Run the whole import, export and matching workflow.

    Steps, in order: import the base data, harvest the enabled data providers, load and
    merge the address and common tables, export the raw and the per-``poi_code`` file
    sets, run the online matching and export the merged file sets.

    :raises RuntimeError: when the matching step produced no result.
    :raises SystemExit: with code 1 on ``KeyboardInterrupt`` or ``SystemExit``.
    """
    logging.info('Starting %s ...', __program__)
    db = POIBase('{}://{}:{}@{}:{}/{}'.format(
        config.get_database_type(),
        config.get_database_writer_username(),
        config.get_database_writer_password(),
        config.get_database_writer_host(),
        config.get_database_writer_port(),
        config.get_database_poi_database()))
    try:
        import_basic_data(db.session)
        manager = WorkflowManager()
        # Download and load POI dataset to database
        manager.start_poi_harvest()
        manager.join()
        # Load the address and the common dataset back from the database
        poi_addr_data = load_poi_data(db)
        poi_common_data = load_common_data(db)
        logging.info('Merging dataframes ...')
        poi_addr_data = pd.merge(poi_addr_data, poi_common_data,
                                 left_on='poi_common_id', right_on='pc_id', how='inner')
        # Add additional empty fields
        poi_addr_data['osm_id'] = None
        poi_addr_data['osm_node'] = None
        poi_addr_data['osm_version'] = None
        poi_addr_data['osm_changeset'] = None
        poi_addr_data['osm_timestamp'] = datetime.datetime.now()
        poi_addr_data['osm_live_tags'] = None
        # Export non-transformed data
        export_raw_poi_data(poi_addr_data, poi_common_data)
        logging.info('Saving poi_code grouped filesets...')
        # Export non-transformed filesets
        manager.start_exporter(poi_addr_data)
        manager.join()
        logging.info('Merging with OSM datasets ...')
        poi_addr_data['osm_nodes'] = None
        poi_addr_data['poi_distance'] = None
        # Enrich POI datasets from online OpenStreetMap database
        logging.info('Starting online POI matching part...')
        poi_addr_data = manager.start_matcher(poi_addr_data, poi_common_data)
        manager.join()
        if poi_addr_data is None:
            # start_matcher() logs its own error and returns None; stop here with a
            # clear message instead of failing later inside the exporters.
            raise RuntimeError('Online POI matching returned no data')
        # Export filesets
        export_raw_poi_data(poi_addr_data, poi_common_data, '_merge')
        manager.start_exporter(poi_addr_data, 'merge_')
        # Wait for the first export before starting the next one: each start replaces
        # the manager's pool, so an unjoined pool would be dropped while still working.
        manager.join()
        manager.start_exporter(poi_addr_data, 'merge_',
                               export_grouped_poi_data_with_postcode_groups)
        manager.join()
    except (KeyboardInterrupt, SystemExit):
        logging.info('Interrupt signal received')
        sys.exit(1)
179
180
181
if __name__ == '__main__':
    # Select the matcher mode before anything reads the configuration.
    config.set_mode(config.Mode.matcher)
    init_log()
    # Started right before main() so that timer.end() below reports the duration of the run.
    timer = timing.Timing()
    main()
    logging.info('Total duration of process: %s. Finished, exiting and go home ...',
                 timer.end())
187