Passed
Pull Request — master (#348)
by
unknown
02:14
created

closestgeoname   A

Complexity

Total Complexity 21

Size/Duplication

Total Lines 254
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 167
dl 0
loc 254
rs 10
c 0
b 0
f 0
wmc 21

8 Functions

Rating   Name   Duplication   Size   Complexity  
A query_closest_city() 0 52 4
A main() 0 14 2
A reporthook() 0 12 2
B download_dataset() 0 41 5
A import_dump() 0 28 1
A extract_zip() 0 11 4
A query_db_size() 0 2 1
A generate_db() 0 45 2
1
#!/usr/bin/python3
2
3
import pandas as pd
4
import sqlite3
5
import os
6
import argparse
7
import urllib.request
8
import time
9
import sys
10
from zipfile import ZipFile
11
12
# Schema of geonames databases from
13
# as of 07/01/2020
14
CITY_COLNAMES = ['Geonameid',
15
                'Name',
16
                'Asciiname',
17
                'Alternatenames',
18
                'Latitude',
19
                'Longitude',
20
                'FeatureClass',
21
                'FeatureCode',
22
                'CountryCode',
23
                'Cc2',
24
                'Admin1Code',
25
                'Admin2Code',
26
                'Admin3Code',
27
                'Admin4Code',
28
                'Population',
29
                'Elevation',
30
                'Dem',
31
                'Timezone',
32
                'ModificationDate']
33
STATE_COLNAMES = ["AdCode", "State", "StateClean", "ID"]
34
DBFILENAME = 'geonames.sqlite'
35
MIN_QUERY_DIST = 0.1 # Metres
36
37
def import_dump(city_filename, admin_filename, country_filename, city_colnames, state_colnames, encoding='utf-8', delimiter='\t'):
38
    MULTIPLIER = 1.4 # DB size versus original text file
39
    filesize = os.stat(city_filename).st_size/1048576 # In MB
40
    print("Initial filesize\t{} MB\nExpected database size\t{} MB\n".format(
41
                            round(filesize,2), round(filesize*MULTIPLIER,2)))
42
43
    df = pd.read_csv(city_filename, delimiter=delimiter, encoding=encoding,
44
                     header=None, names=city_colnames, low_memory=False)
45
46
    # Filter only the necessary columns
47
    cities = df[['Geonameid', 'Name', 'Latitude', 'Longitude', 'Admin1Code', 'CountryCode']]
48
49
    # Admin 1 table (commonly the state names)
50
    states = pd.read_csv(admin_filename, encoding='utf-8', delimiter='\t', header=None, names=state_colnames)
51
52
    # Split the Country/Admin code by the separating decimal
53
    splitdf = states['AdCode'].str.split(".", expand=True)
54
    states['CountryID'] = splitdf[0]
55
    states['AdminCode'] = splitdf[1]
56
    states = states[['State', 'CountryID', 'AdminCode']]
57
58
    # This file is a bit messier, skip the first 51 rows, only use columns 0 and 4
59
    countries = pd.read_csv(country_filename, encoding='utf-8', delimiter='\t', header=None, skiprows=51)
60
    countries = countries[[0, 4]]
61
    countries.columns = ['ISO', 'Country']
62
    states = pd.merge(states, countries, left_on='CountryID', right_on='ISO', how='left')
63
64
    return cities, states, countries
65
66
def query_db_size(db_path):
67
    print("Database size {} MB\n".format(round(os.stat(db_path).st_size/1048576, 2)))
68
69
def generate_db(db_path, cities, states, countries):
70
    with sqlite3.connect(db_path) as conn:
71
        # Use pandas SQL export to generate SQLite DB
72
        print("Populating database", end='... ')
73
        cities.to_sql('cities', conn, if_exists='replace', index=False)
74
        states.to_sql('states', conn, if_exists='replace', index=False)
75
        countries.to_sql('countries', conn, if_exists='replace', index=False)
76
        print("Done")
77
        query_db_size(db_path)
78
79
        # Initialise spatialite
80
        conn.enable_load_extension(True)
81
        conn.load_extension("mod_spatialite")
82
        conn.execute("SELECT InitSpatialMetaData(1);")
83
84
        # Build geometry columns
85
        print("Building geometry columns", end='... ')
86
        conn.execute(
87
            """
88
            SELECT AddGeometryColumn('cities', 'geom', 4326, 'POINT', 2);
89
            """
90
        )
91
        print("Done")
92
        query_db_size(db_path)
93
94
        # Form geometry column 'geom' from latitude and logitude columns
95
        print("Generating spatial columns from lat/long columns", end='... ')
96
        conn.execute(
97
            """
98
            UPDATE cities
99
            SET geom = MakePoint(Longitude, Latitude, 4326);
100
            """
101
        )
102
        print("Done")
103
        query_db_size(db_path)
104
105
        # Generate the spatial index for super-fast queries
106
        print("Building spatial index", end='... ')
107
        conn.execute(
108
            """
109
            SELECT createspatialindex('cities', 'geom');
110
            """
111
        )
112
        print("Done")
113
        query_db_size(db_path)
114
115
def query_closest_city(db_path, latitude, longitude, epsg=4326, query_buffer_distance=0.1):
116
    # Start the buffer size for searching nearest points at a low number for speed,
117
    # but keep iterating (doubling distance in size) until somewhere is found.
118
    # Hence, this is faster for huge datasets as less points are considered in the spatial
119
    # query, but slower for small datasets as more iterations will need to occur if no point
120
    # exists
121
    row = None
122
    while row is None:
123
        # Prevent an infinite loop
124
        if query_buffer_distance > 12756 * 1000: # 12,756 km Longest distance on earth
125
            print("Distance more than length of earth. Never going to find this point")
126
            return None
127
128
        # Form tuple to represent values missing in SQL query
129
        query_tuple = (longitude,
130
                    latitude,
131
                    epsg,
132
                    longitude,
133
                    latitude,
134
                    epsg,
135
                    query_buffer_distance)
136
        with sqlite3.connect(db_path) as conn:
137
            conn.enable_load_extension(True)
138
            conn.load_extension("mod_spatialite")
139
            cur = conn.cursor()
140
141
            # Query database with spatial index
142
            cur.execute(
143
                """
144
                select a.Name, b.State, b.Country from (
145
                    select *, distance(geom, makepoint(?, ?, ?)) dist
146
                    from cities
147
                    where rowid in (
148
                        select rowid
149
                        from spatialindex
150
                        where f_table_name = 'cities'
151
                        and f_geometry_column = 'geom'
152
                        and search_frame = buffer(makepoint(?, ?, ?), ?)
153
                    )
154
                    order by dist
155
                    limit 1
156
                ) a,
157
                states b
158
                where a.CountryCode == b.ISO
159
                and a.Admin1Code == b.AdminCode;
160
                """, query_tuple
161
            )
162
            row = cur.fetchone()
163
164
        query_buffer_distance *= 2
165
166
    return row
167
168
# Reporthook function to show file download progress. Code from
169
# https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html
170
def reporthook(count, block_size, total_size):
171
    global start_time
172
    if count == 0:
173
        start_time = time.time()
174
        return
175
    duration = time.time() - start_time
176
    progress_size = int(count * block_size)
177
    speed = int(progress_size / (1024 * duration))
178
    percent = int(count * block_size * 100 / total_size)
179
    sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
180
                    (percent, progress_size / (1024 * 1024), speed, duration))
181
    sys.stdout.flush()
182
183
def download_dataset(city_colnames, state_colnames, db_path):
184
    options = [
185
    ("http://download.geonames.org/export/dump/allCountries.zip",
186
     "all countries combined in one file (HUGE! > 1.2 GB)"),
187
    ("http://download.geonames.org/export/dump/cities500.zip",
188
     "all cities with a population > 500 or seats of adm div down to PPLA4 (ca 185.000)"),
189
    ("http://download.geonames.org/export/dump/cities1000.zip",
190
     "all cities with a population > 1000 or seats of adm div down to PPLA3 (ca 130.000)"),
191
    ("http://download.geonames.org/export/dump/cities5000.zip",
192
     "all cities with a population > 5000 or PPLA (ca 50.000)"),
193
    ("http://download.geonames.org/export/dump/cities15000.zip",
194
     "all cities with a population > 15000 or capitals (ca 25.000)"),
195
    ]
196
197
    # Let user choose which file to download
198
    for id, option in enumerate(options):
199
        print("[{}] {}:\t{}".format(id, option[0].split('/')[-1], option[1]))
200
    try:
201
        choice = int(input("Choose which file to download: "))
202
        if choice < 0 or choice >= len(options):
203
            exit("Error: Choose between {} and {}".format(0, len(options)-1))
204
    except ValueError:
205
        exit('Error: Not an integer')
206
207
    urllib.request.urlretrieve(options[choice][0], "rawdata.zip", reporthook)
208
    urllib.request.urlretrieve("http://download.geonames.org/export/dump/countryInfo.txt", "countryInfo.txt", reporthook)
209
    urllib.request.urlretrieve("http://download.geonames.org/export/dump/admin1CodesASCII.txt", "admin1CodesASCII.txt", reporthook)
210
211
    extracted_txt = extract_zip('rawdata.zip')
212
213
    print()
214
    # Create database
215
    cities, states, countries = import_dump(extracted_txt, "admin1CodesASCII.txt", "countryInfo.txt", city_colnames, state_colnames)
216
    generate_db(db_path, cities, states, countries)
217
218
    # Remove downloaded files
219
    os.remove("rawdata.zip")
220
    os.remove(extracted_txt)
221
    os.remove("countryInfo.txt")
222
    os.remove("admin1CodesASCII.txt")
223
    print("Success")
224
225
def extract_zip(filename):
226
    with ZipFile('rawdata.zip', 'r') as zipObj:
227
        files = zipObj.namelist()
228
        # Iterate over the file names
229
        for fileName in files:
230
            # Check filename endswith csv
231
            if fileName.endswith('.txt'):
232
                # Extract a single file from zip
233
                zipObj.extract(fileName)
234
                filename = fileName
235
    return filename
236
237
def main():
238
    if os.path.exists(os.path.join(os.getcwd(), DBFILENAME)):
239
        parser = argparse.ArgumentParser()
240
        parser.add_argument("--database", type=str, help="Set the file for database (default: geonames.sqlite)", default="geonames.sqlite")
241
        parser.add_argument("longitude", type=float, help="X coordinate (Longitude)")
242
        parser.add_argument("latitude", type=float, help="Y coordinate (Latitude)")
243
        args = parser.parse_args()
244
245
        result = query_closest_city(args.database, args.latitude, args.longitude,
246
                                        query_buffer_distance=MIN_QUERY_DIST)
247
        print("{}, {}, {}".format(result[0], result[1], result[2]))
248
    else:
249
        print("GeoNames database", DBFILENAME, "does not exist. Choose an option")
250
        download_dataset(CITY_COLNAMES, STATE_COLNAMES, DBFILENAME)
251
252
if __name__ == "__main__":
253
    main()
254