#!/usr/bin/python3
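"""Reverse-geocode coordinates to the nearest city using a local GeoNames
gazetteer stored in SpatiaLite.

On first run (no geonames.sqlite present) the script offers to download a
GeoNames dump and build the database; afterwards it can be queried with
`longitude latitude` arguments. Requires the mod_spatialite SQLite extension.
"""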

import pandas as pd
import sqlite3
import os
import argparse
import urllib.request
import time
import sys
from zipfile import ZipFile

# Schema of the GeoNames database dumps (http://download.geonames.org/export/dump/)
# as of 07/01/2020
CITY_COLNAMES = ['Geonameid',
                 'Name',
                 'Asciiname',
                 'Alternatenames',
                 'Latitude',
                 'Longitude',
                 'FeatureClass',
                 'FeatureCode',
                 'CountryCode',
                 'Cc2',
                 'Admin1Code',
                 'Admin2Code',
                 'Admin3Code',
                 'Admin4Code',
                 'Population',
                 'Elevation',
                 'Dem',
                 'Timezone',
                 'ModificationDate']
STATE_COLNAMES = ["AdCode", "State", "StateClean", "ID"]
DBFILENAME = 'geonames.sqlite'
MIN_QUERY_DIST = 0.1  # Initial search radius, in CRS units (degrees for EPSG:4326)

def import_dump(city_filename, admin_filename, country_filename, city_colnames, state_colnames, encoding='utf-8', delimiter='\t'):
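    """Read the GeoNames city dump plus the admin1 and country info files
    into three DataFrames (cities, states, countries) ready for SQL export."""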
    MULTIPLIER = 1.4  # DB size versus original text file
    filesize = os.stat(city_filename).st_size / 1048576  # In MB
    print("Initial filesize\t{} MB\nExpected database size\t{} MB\n".format(
        round(filesize, 2), round(filesize * MULTIPLIER, 2)))

    df = pd.read_csv(city_filename, delimiter=delimiter, encoding=encoding,
                     header=None, names=city_colnames, low_memory=False)

    # Keep only the necessary columns
    cities = df[['Geonameid', 'Name', 'Latitude', 'Longitude', 'Admin1Code', 'CountryCode']]

    # Admin 1 table (commonly the state names)
    states = pd.read_csv(admin_filename, encoding='utf-8', delimiter='\t', header=None, names=state_colnames)

    # Split the combined country/admin code (e.g. 'US.CA') on the separating period
    splitdf = states['AdCode'].str.split(".", expand=True)
    states['CountryID'] = splitdf[0]
    states['AdminCode'] = splitdf[1]
    states = states[['State', 'CountryID', 'AdminCode']]

    # This file is a bit messier: skip the first 51 header rows, use only columns 0 and 4
    countries = pd.read_csv(country_filename, encoding='utf-8', delimiter='\t', header=None, skiprows=51)
    countries = countries[[0, 4]]
    countries.columns = ['ISO', 'Country']
    states = pd.merge(states, countries, left_on='CountryID', right_on='ISO', how='left')

    return cities, states, countries

def query_db_size(db_path):
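    """Print the current size of the SQLite database file in MB."""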
    print("Database size {} MB\n".format(round(os.stat(db_path).st_size / 1048576, 2)))

def generate_db(db_path, cities, states, countries):
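    """Export the DataFrames to SQLite, then add a SpatiaLite geometry
    column and spatial index to the cities table."""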
    with sqlite3.connect(db_path) as conn:
        # Use the pandas SQL export to populate the SQLite DB
        print("Populating database", end='... ')
        cities.to_sql('cities', conn, if_exists='replace', index=False)
        states.to_sql('states', conn, if_exists='replace', index=False)
        countries.to_sql('countries', conn, if_exists='replace', index=False)
        print("Done")
        query_db_size(db_path)

        # Initialise SpatiaLite
        conn.enable_load_extension(True)
        conn.load_extension("mod_spatialite")
        conn.execute("SELECT InitSpatialMetaData(1);")

        # Build the geometry column
        print("Building geometry columns", end='... ')
        conn.execute(
            """
            SELECT AddGeometryColumn('cities', 'geom', 4326, 'POINT', 2);
            """
        )
        print("Done")
        query_db_size(db_path)

        # Form geometry column 'geom' from the latitude and longitude columns
        print("Generating spatial columns from lat/long columns", end='... ')
        conn.execute(
            """
            UPDATE cities
            SET geom = MakePoint(Longitude, Latitude, 4326);
            """
        )
        print("Done")
        query_db_size(db_path)

        # Generate the spatial index for fast queries
        print("Building spatial index", end='... ')
        conn.execute(
            """
            SELECT CreateSpatialIndex('cities', 'geom');
            """
        )
        print("Done")
        query_db_size(db_path)

def query_closest_city(db_path, latitude, longitude, epsg=4326, query_buffer_distance=0.1):
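    """Return a (Name, State, Country) tuple for the city nearest the given
    coordinates, or None if the search radius grows past Earth's diameter."""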
    # Start the buffer for searching nearest points at a small radius for speed,
    # but keep iterating (doubling the distance) until somewhere is found.
    # This is faster for huge datasets, as fewer points are considered in the
    # spatial query, but slower for small datasets, as more iterations are
    # needed when no point exists nearby.
    row = None
    while row is None:
        # Prevent an infinite loop
        if query_buffer_distance > 12756 * 1000:  # 12,756 km: Earth's diameter
            print("Search distance exceeds Earth's diameter. Never going to find this point")
            return None

        # Form a tuple to fill the placeholders in the SQL query
        query_tuple = (longitude,
                       latitude,
                       epsg,
                       longitude,
                       latitude,
                       epsg,
                       query_buffer_distance)
        with sqlite3.connect(db_path) as conn:
            conn.enable_load_extension(True)
            conn.load_extension("mod_spatialite")
            cur = conn.cursor()

            # Query the database through the spatial index
            cur.execute(
                """
                select a.Name, b.State, b.Country from (
                    select *, distance(geom, makepoint(?, ?, ?)) dist
                    from cities
                    where rowid in (
                        select rowid
                        from spatialindex
                        where f_table_name = 'cities'
                        and f_geometry_column = 'geom'
                        and search_frame = buffer(makepoint(?, ?, ?), ?)
                    )
                    order by dist
                    limit 1
                ) a,
                states b
                where a.CountryCode = b.ISO
                and a.Admin1Code = b.AdminCode;
                """, query_tuple
            )
            row = cur.fetchone()

        query_buffer_distance *= 2

    return row

# Reporthook function to show file download progress. Code from
# https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html
def reporthook(count, block_size, total_size):
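    """urlretrieve reporthook: print percentage, size, speed and elapsed time."""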
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # Floor the duration to avoid a divide-by-zero on very fast first chunks
    duration = max(time.time() - start_time, 1e-6)
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
                     (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()

def download_dataset(city_colnames, state_colnames, db_path):
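    """Interactively download a GeoNames dump plus the admin1 and country
    files, build the SpatiaLite database, and clean up the downloads."""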
    options = [
        ("http://download.geonames.org/export/dump/allCountries.zip",
         "all countries combined in one file (HUGE! > 1.2 GB)"),
        ("http://download.geonames.org/export/dump/cities500.zip",
         "all cities with a population > 500 or seats of adm div down to PPLA4 (ca 185.000)"),
        ("http://download.geonames.org/export/dump/cities1000.zip",
         "all cities with a population > 1000 or seats of adm div down to PPLA3 (ca 130.000)"),
        ("http://download.geonames.org/export/dump/cities5000.zip",
         "all cities with a population > 5000 or PPLA (ca 50.000)"),
        ("http://download.geonames.org/export/dump/cities15000.zip",
         "all cities with a population > 15000 or capitals (ca 25.000)"),
    ]

    # Let the user choose which file to download
    for idx, option in enumerate(options):
        print("[{}] {}:\t{}".format(idx, option[0].split('/')[-1], option[1]))
    try:
        choice = int(input("Choose which file to download: "))
        if choice < 0 or choice >= len(options):
            sys.exit("Error: Choose between {} and {}".format(0, len(options) - 1))
    except ValueError:
        sys.exit('Error: Not an integer')

    urllib.request.urlretrieve(options[choice][0], "rawdata.zip", reporthook)
    urllib.request.urlretrieve("http://download.geonames.org/export/dump/countryInfo.txt", "countryInfo.txt", reporthook)
    urllib.request.urlretrieve("http://download.geonames.org/export/dump/admin1CodesASCII.txt", "admin1CodesASCII.txt", reporthook)

    extracted_txt = extract_zip('rawdata.zip')

    print()
    # Create the database
    cities, states, countries = import_dump(extracted_txt, "admin1CodesASCII.txt", "countryInfo.txt", city_colnames, state_colnames)
    generate_db(db_path, cities, states, countries)

    # Remove the downloaded files
    os.remove("rawdata.zip")
    os.remove(extracted_txt)
    os.remove("countryInfo.txt")
    os.remove("admin1CodesASCII.txt")
    print("Success")

def extract_zip(filename):
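    """Extract every .txt file from the given zip archive into the current
    directory and return the name of the last one extracted."""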
    with ZipFile(filename, 'r') as zipObj:
        files = zipObj.namelist()
        extracted = None
        # Iterate over the file names
        for fileName in files:
            # Check whether the filename ends with .txt
            if fileName.endswith('.txt'):
                # Extract a single file from the zip
                zipObj.extract(fileName)
                extracted = fileName
    return extracted

def main():
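    """Query the database if it already exists; otherwise offer to build it."""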
    if os.path.exists(os.path.join(os.getcwd(), DBFILENAME)):
        parser = argparse.ArgumentParser()
        parser.add_argument("--database", type=str, help="Set the file for the database (default: geonames.sqlite)", default=DBFILENAME)
        parser.add_argument("longitude", type=float, help="X coordinate (Longitude)")
        parser.add_argument("latitude", type=float, help="Y coordinate (Latitude)")
        args = parser.parse_args()

        result = query_closest_city(args.database, args.latitude, args.longitude,
                                    query_buffer_distance=MIN_QUERY_DIST)
        if result is None:
            sys.exit("No city found for the given coordinates")
        print("{}, {}, {}".format(result[0], result[1], result[2]))
    else:
        print("GeoNames database", DBFILENAME, "does not exist. Choose an option")
        download_dataset(CITY_COLNAMES, STATE_COLNAMES, DBFILENAME)


if __name__ == "__main__":
    main()