#!/usr/bin/python3
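"""Reverse-geocode coordinates to the nearest city using a local GeoNames
gazetteer stored in SpatiaLite.

On first run (no geonames.sqlite present) the script offers to download a
GeoNames dump and build the database; afterwards it can be queried with
`longitude latitude` arguments. Requires the mod_spatialite SQLite extension.
"""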

import pandas as pd
import sqlite3
import os
import argparse
import urllib.request
import time
import sys
from zipfile import ZipFile

# Schema of the GeoNames database dumps (http://download.geonames.org/export/dump/)
# as of 07/01/2020
CITY_COLNAMES = ['Geonameid',
                 'Name',
                 'Asciiname',
                 'Alternatenames',
                 'Latitude',
                 'Longitude',
                 'FeatureClass',
                 'FeatureCode',
                 'CountryCode',
                 'Cc2',
                 'Admin1Code',
                 'Admin2Code',
                 'Admin3Code',
                 'Admin4Code',
                 'Population',
                 'Elevation',
                 'Dem',
                 'Timezone',
                 'ModificationDate']
STATE_COLNAMES = ["AdCode", "State", "StateClean", "ID"]
DBFILENAME = 'geonames.sqlite'
MIN_QUERY_DIST = 0.1  # Initial search radius, in CRS units (degrees for EPSG:4326)

def import_dump(city_filename, admin_filename, country_filename, city_colnames, state_colnames, encoding='utf-8', delimiter='\t'):
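    """Read the GeoNames city dump plus the admin1 and country info files
    into three DataFrames (cities, states, countries) ready for SQL export."""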
    MULTIPLIER = 1.4  # DB size versus original text file
    filesize = os.stat(city_filename).st_size / 1048576  # In MB
    print("Initial filesize\t{} MB\nExpected database size\t{} MB\n".format(
        round(filesize, 2), round(filesize * MULTIPLIER, 2)))

    df = pd.read_csv(city_filename, delimiter=delimiter, encoding=encoding,
                     header=None, names=city_colnames, low_memory=False)

    # Keep only the necessary columns
    cities = df[['Geonameid', 'Name', 'Latitude', 'Longitude', 'Admin1Code', 'CountryCode']]

    # Admin 1 table (commonly the state names)
    states = pd.read_csv(admin_filename, encoding='utf-8', delimiter='\t', header=None, names=state_colnames)

    # Split the combined country/admin code (e.g. 'US.CA') on the separating period
    splitdf = states['AdCode'].str.split(".", expand=True)
    states['CountryID'] = splitdf[0]
    states['AdminCode'] = splitdf[1]
    states = states[['State', 'CountryID', 'AdminCode']]

    # This file is a bit messier: skip the first 51 header rows, use only columns 0 and 4
    countries = pd.read_csv(country_filename, encoding='utf-8', delimiter='\t', header=None, skiprows=51)
    countries = countries[[0, 4]]
    countries.columns = ['ISO', 'Country']
    states = pd.merge(states, countries, left_on='CountryID', right_on='ISO', how='left')

    return cities, states, countries

def query_db_size(db_path):
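    """Print the current size of the SQLite database file in MB."""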
    print("Database size {} MB\n".format(round(os.stat(db_path).st_size / 1048576, 2)))

def generate_db(db_path, cities, states, countries):
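    """Export the DataFrames to SQLite, then add a SpatiaLite geometry
    column and spatial index to the cities table."""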
    with sqlite3.connect(db_path) as conn:
        # Use the pandas SQL export to populate the SQLite DB
        print("Populating database", end='... ')
        cities.to_sql('cities', conn, if_exists='replace', index=False)
        states.to_sql('states', conn, if_exists='replace', index=False)
        countries.to_sql('countries', conn, if_exists='replace', index=False)
        print("Done")
        query_db_size(db_path)

        # Initialise SpatiaLite
        conn.enable_load_extension(True)
        conn.load_extension("mod_spatialite")
        conn.execute("SELECT InitSpatialMetaData(1);")

        # Build the geometry column
        print("Building geometry columns", end='... ')
        conn.execute(
            """
            SELECT AddGeometryColumn('cities', 'geom', 4326, 'POINT', 2);
            """
        )
        print("Done")
        query_db_size(db_path)

        # Form geometry column 'geom' from the latitude and longitude columns
        print("Generating spatial columns from lat/long columns", end='... ')
        conn.execute(
            """
            UPDATE cities
            SET geom = MakePoint(Longitude, Latitude, 4326);
            """
        )
        print("Done")
        query_db_size(db_path)

        # Generate the spatial index for fast queries
        print("Building spatial index", end='... ')
        conn.execute(
            """
            SELECT CreateSpatialIndex('cities', 'geom');
            """
        )
        print("Done")
        query_db_size(db_path)

def query_closest_city(db_path, latitude, longitude, epsg=4326, query_buffer_distance=0.1):
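    """Return a (Name, State, Country) tuple for the city nearest the given
    coordinates, or None if the search radius grows past Earth's diameter."""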
    # Start the buffer for searching nearest points at a small radius for speed,
    # but keep iterating (doubling the distance) until somewhere is found.
    # This is faster for huge datasets, as fewer points are considered in the
    # spatial query, but slower for small datasets, as more iterations are
    # needed when no point exists nearby.
    row = None
    while row is None:
        # Prevent an infinite loop
        if query_buffer_distance > 12756 * 1000:  # 12,756 km: Earth's diameter
            print("Search distance exceeds Earth's diameter. Never going to find this point")
            return None

        # Form a tuple to fill the placeholders in the SQL query
        query_tuple = (longitude,
                       latitude,
                       epsg,
                       longitude,
                       latitude,
                       epsg,
                       query_buffer_distance)
        with sqlite3.connect(db_path) as conn:
            conn.enable_load_extension(True)
            conn.load_extension("mod_spatialite")
            cur = conn.cursor()

            # Query the database through the spatial index
            cur.execute(
                """
                select a.Name, b.State, b.Country from (
                    select *, distance(geom, makepoint(?, ?, ?)) dist
                    from cities
                    where rowid in (
                        select rowid
                        from spatialindex
                        where f_table_name = 'cities'
                        and f_geometry_column = 'geom'
                        and search_frame = buffer(makepoint(?, ?, ?), ?)
                    )
                    order by dist
                    limit 1
                ) a,
                states b
                where a.CountryCode = b.ISO
                and a.Admin1Code = b.AdminCode;
                """, query_tuple
            )
            row = cur.fetchone()

        query_buffer_distance *= 2

    return row

# Reporthook function to show file download progress. Code from
# https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html
def reporthook(count, block_size, total_size):
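    """urlretrieve reporthook: print percentage, size, speed and elapsed time."""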
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # Floor the duration to avoid a divide-by-zero on very fast first chunks
    duration = max(time.time() - start_time, 1e-6)
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
                     (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()

def download_dataset(city_colnames, state_colnames, db_path):
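    """Interactively download a GeoNames dump plus the admin1 and country
    files, build the SpatiaLite database, and clean up the downloads."""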
    options = [
        ("http://download.geonames.org/export/dump/allCountries.zip",
         "all countries combined in one file (HUGE! > 1.2 GB)"),
        ("http://download.geonames.org/export/dump/cities500.zip",
         "all cities with a population > 500 or seats of adm div down to PPLA4 (ca 185.000)"),
        ("http://download.geonames.org/export/dump/cities1000.zip",
         "all cities with a population > 1000 or seats of adm div down to PPLA3 (ca 130.000)"),
        ("http://download.geonames.org/export/dump/cities5000.zip",
         "all cities with a population > 5000 or PPLA (ca 50.000)"),
        ("http://download.geonames.org/export/dump/cities15000.zip",
         "all cities with a population > 15000 or capitals (ca 25.000)"),
    ]

    # Let the user choose which file to download
    for idx, option in enumerate(options):
        print("[{}] {}:\t{}".format(idx, option[0].split('/')[-1], option[1]))
    try:
        choice = int(input("Choose which file to download: "))
        if choice < 0 or choice >= len(options):
            sys.exit("Error: Choose between {} and {}".format(0, len(options) - 1))
    except ValueError:
        sys.exit('Error: Not an integer')

    urllib.request.urlretrieve(options[choice][0], "rawdata.zip", reporthook)
    urllib.request.urlretrieve("http://download.geonames.org/export/dump/countryInfo.txt", "countryInfo.txt", reporthook)
    urllib.request.urlretrieve("http://download.geonames.org/export/dump/admin1CodesASCII.txt", "admin1CodesASCII.txt", reporthook)

    extracted_txt = extract_zip('rawdata.zip')

    print()
    # Create the database
    cities, states, countries = import_dump(extracted_txt, "admin1CodesASCII.txt", "countryInfo.txt", city_colnames, state_colnames)
    generate_db(db_path, cities, states, countries)

    # Remove the downloaded files
    os.remove("rawdata.zip")
    os.remove(extracted_txt)
    os.remove("countryInfo.txt")
    os.remove("admin1CodesASCII.txt")
    print("Success")

def extract_zip(filename):
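    """Extract every .txt file from the given zip archive into the current
    directory and return the name of the last one extracted."""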
    with ZipFile(filename, 'r') as zipObj:
        files = zipObj.namelist()
        extracted = None
        # Iterate over the file names
        for fileName in files:
            # Check whether the filename ends with .txt
            if fileName.endswith('.txt'):
                # Extract a single file from the zip
                zipObj.extract(fileName)
                extracted = fileName
    return extracted

def main():
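    """Query the database if it already exists; otherwise offer to build it."""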
    if os.path.exists(os.path.join(os.getcwd(), DBFILENAME)):
        parser = argparse.ArgumentParser()
        parser.add_argument("--database", type=str, help="Set the file for the database (default: geonames.sqlite)", default=DBFILENAME)
        parser.add_argument("longitude", type=float, help="X coordinate (Longitude)")
        parser.add_argument("latitude", type=float, help="Y coordinate (Latitude)")
        args = parser.parse_args()

        result = query_closest_city(args.database, args.latitude, args.longitude,
                                    query_buffer_distance=MIN_QUERY_DIST)
        if result is None:
            sys.exit("No city found for the given coordinates")
        print("{}, {}, {}".format(result[0], result[1], result[2]))
    else:
        print("GeoNames database", DBFILENAME, "does not exist. Choose an option")
        download_dataset(CITY_COLNAMES, STATE_COLNAMES, DBFILENAME)


if __name__ == "__main__":
    main()