Passed
Pull Request — dev (#970)
by
unknown
02:05 queued 11s
created

data.datasets.zensus.adjust_zensus_misc()   A

Complexity

Conditions 2

Size

Total Lines 31
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 31
rs 9.95
c 0
b 0
f 0
cc 2
nop 0
1
"""The central module containing all code dealing with importing Zensus data.
2
"""
3
4
from pathlib import Path
5
from urllib.request import urlretrieve
6
import csv
7
import json
8
import os
9
import zipfile
10
11
from shapely.geometry import Point, shape
12
from shapely.prepared import prep
13
import pandas as pd
14
15
from egon.data import db, subprocess
16
from egon.data.config import settings
17
from egon.data.datasets import Dataset
18
import egon.data.config
19
20
21
class ZensusPopulation(Dataset):
    """Dataset wiring up download, table creation and import of the Zensus
    population data."""

    def __init__(self, dependencies):
        # The pipeline order: fetch the archive, create the target table,
        # then load the data into postgres.
        pipeline = (
            download_zensus_pop,
            create_zensus_pop_table,
            population_to_postgres,
        )
        super().__init__(
            name="ZensusPopulation",
            version="0.0.1",
            dependencies=dependencies,
            tasks=pipeline,
        )
33
34
35
class ZensusMiscellaneous(Dataset):
    """Dataset wiring up download, table creation and import of the Zensus
    data on households, buildings and apartments."""

    def __init__(self, dependencies):
        # Same three-step pipeline as for the population data, but for the
        # miscellaneous per-hectare files.
        pipeline = (
            download_zensus_misc,
            create_zensus_misc_tables,
            zensus_misc_to_postgres,
        )
        super().__init__(
            name="ZensusMiscellaneous",
            version="0.0.1",
            dependencies=dependencies,
            tasks=pipeline,
        )
47
48
49
def download_and_check(url, target_file, max_iteration=5):
    """Download file from url (http) if it doesn't exist and check afterwards.

    If the downloaded file is a bad zip, it is removed and re-downloaded.
    This repeats until the file is fine or `max_iteration` retries have
    failed, in which case a `StopIteration` is raised.
    """
    attempts = 0
    while True:
        # Only fetch when the file is not already present on disk.
        if not os.path.isfile(target_file):
            # Guard clause: anything that isn't an http(s) URL is rejected.
            if not url.lower().startswith("http"):
                raise ValueError("No http url")
            urlretrieve(url, target_file)

        # Opening the archive validates its central directory.
        try:
            with zipfile.ZipFile(target_file):
                print(f"Zip file {target_file} is good.")
            return
        except zipfile.BadZipFile as ex:
            # Corrupt download: discard and retry, up to the retry budget.
            os.remove(target_file)
            attempts += 1
            if attempts > max_iteration:
                raise StopIteration(
                    f"Max iteration of {max_iteration} is exceeded"
                ) from ex
77
78
79
def download_zensus_pop():
    """Download Zensus csv file on population per hectare grid cell.

    The archive is stored in ``./zensus_population`` under the file name
    configured in the ``zensus_population.original_data`` section of the
    data configuration.
    """
    data_config = egon.data.config.datasets()
    zensus_population_config = data_config["zensus_population"][
        "original_data"
    ]
    download_directory = Path(".") / "zensus_population"
    # Create the folder if it does not exist already. `exist_ok=True`
    # avoids the check-then-create race of the previous
    # `os.path.exists`/`os.mkdir` pair.
    download_directory.mkdir(parents=True, exist_ok=True)

    target_file = (
        download_directory / zensus_population_config["target"]["file"]
    )

    url = zensus_population_config["source"]["url"]
    download_and_check(url, target_file, max_iteration=5)
96
97
98
def download_zensus_misc():
    """Download Zensus csv files on data per hectare grid cell.

    Downloads the remaining zensus data sets on households, buildings and
    apartments into ``./zensus_population``. The source URLs are paired
    positionally with the target file names from the
    ``zensus_misc.processed.file_table_map`` configuration.
    """

    # Get data config
    data_config = egon.data.config.datasets()
    download_directory = Path(".") / "zensus_population"
    # Create the folder if it does not exist already. `exist_ok=True`
    # avoids the check-then-create race of the previous
    # `os.path.exists`/`os.mkdir` pair.
    download_directory.mkdir(parents=True, exist_ok=True)

    # Download remaining zensus data set on households, buildings, apartments
    zensus_config = data_config["zensus_misc"]["original_data"]
    zensus_misc_processed = data_config["zensus_misc"]["processed"]
    zensus_url = zensus_config["source"]["url"]
    zensus_files = zensus_misc_processed["file_table_map"].keys()

    # zip() already yields (url, file name) pairs lazily; no need to
    # materialize the pairing in a list first.
    for url, path in zip(zensus_url, zensus_files):
        download_and_check(url, download_directory / path, max_iteration=5)
119
120
121
def create_zensus_pop_table():
    """Create tables for zensus data in postgres database"""

    # Schema and table names come from the data configuration file.
    processed = egon.data.config.datasets()["zensus_population"]["processed"]

    # Make sure the target schema exists before creating the table.
    db.execute_sql(
        f"CREATE SCHEMA IF NOT EXISTS {processed['schema']};"
    )

    # Fully qualified name of the population table.
    population_table = f"{processed['schema']}.{processed['table']}"

    # Recreate from scratch; CASCADE drops dependent objects as well.
    db.execute_sql(f"DROP TABLE IF EXISTS {population_table} CASCADE;")

    db.execute_sql(
        f"CREATE TABLE {population_table}"
        f""" (id        SERIAL NOT NULL,
              grid_id    character varying(254) NOT NULL,
              x_mp       int,
              y_mp       int,
              population smallint,
              geom_point geometry(Point,3035),
              geom geometry (Polygon, 3035),
              CONSTRAINT {processed['table']}_pkey
              PRIMARY KEY (id)
        );
        """
    )
155
156
157
def create_zensus_misc_tables():
    """Create tables for zensus data in postgres database"""

    # Table names come from the data configuration file.
    processed = egon.data.config.datasets()["zensus_misc"]["processed"]

    # Make sure the target schema exists before creating the tables.
    db.execute_sql(
        f"CREATE SCHEMA IF NOT EXISTS {processed['schema']};"
    )

    # One table each for household, apartment and building data.
    for table in processed["file_table_map"].values():
        qualified_name = f"{processed['schema']}.{table}"

        # Recreate from scratch; CASCADE drops dependent objects as well.
        db.execute_sql(f"DROP TABLE IF EXISTS {qualified_name} CASCADE;")
        db.execute_sql(
            f"CREATE TABLE {qualified_name}"
            f""" (id                 SERIAL,
                  grid_id            VARCHAR(50),
                  grid_id_new        VARCHAR (50),
                  attribute          VARCHAR(50),
                  characteristics_code smallint,
                  characteristics_text text,
                  quantity           smallint,
                  quantity_q         smallint,
                  zensus_population_id int,
                  CONSTRAINT {table}_pkey PRIMARY KEY (id)
            );
            """
        )
189
190
191
def target(source, dataset):
    """Generate the target path corresponding to a source path.

    Parameters
    ----------
    source : Path
        Path to the source csv-file. Only its file name (stem and suffix)
        is used; the target always lives in ``./zensus_population``.
    dataset: str
        Toggles between production (`dataset='Everything'`) and test mode e.g.
        (`dataset='Schleswig-Holstein'`).
        In production mode, data covering entire Germany
        is used. In the test mode a subset of this data is used for testing the
        workflow.

    Returns
    -------
    Path
        Path to target csv-file

    """
    # Insert the dataset name between stem and suffix, e.g.
    # "data.csv" -> "zensus_population/data.Schleswig-Holstein.csv".
    return (
        Path(".")
        / "zensus_population"
        / f"{source.stem}.{dataset}{source.suffix}"
    )
214
215
216
def select_geom():
    """Select the union of the geometries of Schleswig-Holstein from the
    database, convert their projection to the one used in the CSV file,
    output the result to stdout as a GeoJSON string and read it into a
    prepared shape for filtering.

    """
    docker_db_config = db.credentials()

    # Connection string handed to ogr2ogr's PostgreSQL driver.
    pg_connection = (
        f"PG:host={docker_db_config['HOST']}"
        f" user='{docker_db_config['POSTGRES_USER']}'"
        f" password='{docker_db_config['POSTGRES_PASSWORD']}'"
        f" port={docker_db_config['PORT']}"
        f" dbname='{docker_db_config['POSTGRES_DB']}'"
    )

    # Reproject from WGS84 to ETRS89/LAEA (the CSV's CRS) and emit GeoJSON
    # on stdout so no temporary file is needed.
    geojson = subprocess.run(
        [
            "ogr2ogr",
            "-s_srs",
            "epsg:4326",
            "-t_srs",
            "epsg:3035",
            "-f",
            "GeoJSON",
            "/vsistdout/",
            pg_connection,
            "-sql",
            "SELECT ST_Union(geometry) FROM boundaries.vg250_lan",
        ],
        text=True,
    )
    features = json.loads(geojson.stdout)["features"]
    assert (
        len(features) == 1
    ), f"Found {len(features)} geometry features, expected exactly one."

    # prep() speeds up the many intersects() calls done by the caller.
    return prep(shape(features[0]["geometry"]))
247
248
249
def filter_zensus_population(filename, dataset):
    """Filter lines in the source CSV file and copy the appropriate ones
    to the destination based on geometry.


    Parameters
    ----------
    filename : str
        Path to input csv-file
    dataset: str, optional
        Toggles between production (`dataset='Everything'`) and test mode e.g.
        (`dataset='Schleswig-Holstein'`).
        In production mode, data covering entire Germany
        is used. In the test mode a subset of this data is used for testing the
        workflow.

    Returns
    -------
    Path
        Path to output csv-file

    """

    csv_file = Path(filename).resolve(strict=True)
    destination_file = target(csv_file, dataset)

    if not os.path.isfile(destination_file):
        # Fetching the boundary geometry spawns an ogr2ogr subprocess and
        # hits the database, so only do it when the filtered file still has
        # to be produced (previously it ran unconditionally on every call).
        boundary = select_geom()

        with open(csv_file, mode="r", newline="") as input_lines:
            rows = csv.DictReader(input_lines, delimiter=";")
            with open(destination_file, mode="w", newline="") as destination:
                output = csv.DictWriter(
                    destination, delimiter=";", fieldnames=rows.fieldnames
                )
                output.writeheader()
                # Keep only cells whose center point lies inside the
                # boundary geometry.
                output.writerows(
                    row
                    for row in rows
                    if boundary.intersects(
                        Point(float(row["x_mp_100m"]), float(row["y_mp_100m"]))
                    )
                )
    return destination_file
295
296
297
def filter_zensus_misc(filename, dataset):
    """Filter lines in the source CSV file and copy the appropriate ones
    to the destination based on grid_id values.


    Parameters
    ----------
    filename : str
        Path to input csv-file
    dataset: str, optional
        Toggles between production (`dataset='Everything'`) and test mode e.g.
        (`dataset='Schleswig-Holstein'`).
        In production mode, data covering entire Germany
        is used. In the test mode a subset of this data is used for testing the
        workflow.

    Returns
    -------
    Path
        Path to output csv-file

    """
    csv_file = Path(filename).resolve(strict=True)
    destination_file = target(csv_file, dataset)

    if not os.path.isfile(destination_file):
        # The grid_id lookup hits the database, so only run the query when
        # the filtered file still has to be produced (previously it ran
        # unconditionally on every call).
        gitter_ids = set(
            pd.read_sql(
                "SELECT grid_id from society.destatis_zensus_population_per_ha",
                con=db.engine(),
            ).grid_id.values
        )

        with open(
            csv_file, mode="r", newline="", encoding="iso-8859-1"
        ) as inputs:
            rows = csv.DictReader(inputs, delimiter=",")
            with open(
                destination_file,
                mode="w",
                newline="",
                encoding="iso-8859-1",
            ) as destination:
                output = csv.DictWriter(
                    destination, delimiter=",", fieldnames=rows.fieldnames
                )
                output.writeheader()
                # Keep only rows whose grid cell is known to be populated.
                output.writerows(
                    row for row in rows if row["Gitter_ID_100m"] in gitter_ids
                )
    return destination_file
346
347
348
def population_to_postgres():
    """Import Zensus population data to postgres database.

    Extracts every member of the downloaded zip archive, optionally filters
    it to the test-mode boundary, loads it via `psql \\copy` and then
    derives point/polygon geometries and spatial indexes.
    """
    # Get information from data configuration file
    data_config = egon.data.config.datasets()
    zensus_population_orig = data_config["zensus_population"]["original_data"]
    zensus_population_processed = data_config["zensus_population"]["processed"]
    input_file = (
        Path(".")
        / "zensus_population"
        / zensus_population_orig["target"]["file"]
    )
    dataset = settings()["egon-data"]["--dataset-boundary"]

    # Read database configuration from docker-compose.yml
    docker_db_config = db.credentials()

    population_table = (
        f"{zensus_population_processed['schema']}"
        f".{zensus_population_processed['table']}"
    )

    # The psql connection arguments are the same for every file, so build
    # them once outside the loop.
    psql_args = [
        "psql",
        "-h",
        f"{docker_db_config['HOST']}",
        "-p",
        f"{docker_db_config['PORT']}",
        "-d",
        f"{docker_db_config['POSTGRES_DB']}",
        "-U",
        f"{docker_db_config['POSTGRES_USER']}",
    ]

    with zipfile.ZipFile(input_file) as zf:
        for filename in zf.namelist():

            zf.extract(filename)

            if dataset == "Everything":
                filename_insert = filename
            else:
                filename_insert = filter_zensus_population(filename, dataset)

            command = [
                "-c",
                rf"\copy {population_table} (grid_id, x_mp, y_mp, population)"
                rf" FROM '{filename_insert}' DELIMITER ';' CSV HEADER;",
            ]
            subprocess.run(
                psql_args + command,
                env={"PGPASSWORD": docker_db_config["POSTGRES_PASSWORD"]},
            )

            # Remove each extracted file right after its import. This used
            # to run once after the loop, which left all but the last file
            # behind and raised a NameError for an empty archive.
            os.remove(filename)

    db.execute_sql(
        f"UPDATE {population_table} zs"
        " SET geom_point=ST_SetSRID(ST_MakePoint(zs.x_mp, zs.y_mp), 3035);"
    )

    db.execute_sql(
        f"UPDATE {population_table} zs"
        """ SET geom=ST_SetSRID(
                (ST_MakeEnvelope(zs.x_mp-50,zs.y_mp-50,zs.x_mp+50,zs.y_mp+50)),
                3035
            );
        """
    )

    db.execute_sql(
        f"CREATE INDEX {zensus_population_processed['table']}_geom_idx ON"
        f" {population_table} USING gist (geom);"
    )

    db.execute_sql(
        f"CREATE INDEX"
        f" {zensus_population_processed['table']}_geom_point_idx"
        f" ON  {population_table} USING gist (geom_point);"
    )
419
420
421
def zensus_misc_to_postgres():
    """Import data on buildings, households and apartments to postgres db.

    For every configured zip archive, extracts its csv members, optionally
    filters them to the test-mode boundary, loads them via `psql \\copy`,
    links the rows to the population table and adds a foreign key. Finally
    the combined table is created and unpopulated cells are removed.
    """

    # Get information from data configuration file
    data_config = egon.data.config.datasets()
    zensus_misc_processed = data_config["zensus_misc"]["processed"]
    zensus_population_processed = data_config["zensus_population"]["processed"]
    file_path = Path(".") / "zensus_population"
    dataset = settings()["egon-data"]["--dataset-boundary"]

    population_table = (
        f"{zensus_population_processed['schema']}"
        f".{zensus_population_processed['table']}"
    )

    # Read database configuration from docker-compose.yml
    docker_db_config = db.credentials()

    # The psql connection arguments are the same for every file, so build
    # them once outside the loops.
    psql_args = [
        "psql",
        "-h",
        f"{docker_db_config['HOST']}",
        "-p",
        f"{docker_db_config['PORT']}",
        "-d",
        f"{docker_db_config['POSTGRES_DB']}",
        "-U",
        f"{docker_db_config['POSTGRES_USER']}",
    ]

    for input_file, table in zensus_misc_processed["file_table_map"].items():
        with zipfile.ZipFile(file_path / input_file) as zf:
            csvfiles = [n for n in zf.namelist() if n.lower().endswith("csv")]
            for filename in csvfiles:
                zf.extract(filename)

                if dataset == "Everything":
                    filename_insert = filename
                else:
                    filename_insert = filter_zensus_misc(filename, dataset)

                command = [
                    "-c",
                    rf"\copy {zensus_population_processed['schema']}.{table}"
                    f"""(grid_id,
                        grid_id_new,
                        attribute,
                        characteristics_code,
                        characteristics_text,
                        quantity,
                        quantity_q)
                        FROM '{filename_insert}' DELIMITER ','
                        CSV HEADER
                        ENCODING 'iso-8859-1';""",
                ]
                subprocess.run(
                    psql_args + command,
                    env={"PGPASSWORD": docker_db_config["POSTGRES_PASSWORD"]},
                )

                # Remove each extracted file right after its import. This
                # used to run once after the loop, which left all but the
                # last file behind and raised a NameError for archives
                # without any csv member.
                os.remove(filename)

        db.execute_sql(
            f"""UPDATE {zensus_population_processed['schema']}.{table} as b
                    SET zensus_population_id = zs.id
                    FROM {population_table} zs
                    WHERE b.grid_id = zs.grid_id;"""
        )

        db.execute_sql(
            f"""ALTER TABLE {zensus_population_processed['schema']}.{table}
                    ADD CONSTRAINT {table}_fkey
                    FOREIGN KEY (zensus_population_id)
                    REFERENCES {population_table}(id);"""
        )

    # Create combined table
    create_combined_zensus_table()

    # Delete entries for unpopulated cells
    adjust_zensus_misc()
494
495
496
def create_combined_zensus_table():
    """Create combined table with buildings, apartments and population per cell

    Only apartment and building data with acceptable data quality
    (quantity_q<2) is used, all other data is dropped. For more details on data
    quality see Zensus docs:
    https://www.zensus2011.de/DE/Home/Aktuelles/DemografischeGrunddaten.html

    If there's no data on buildings or apartments for a certain cell, the value
    for building_count resp. apartment_count contains NULL.
    """
    # The SQL lives next to this module, so resolve it relative to __file__.
    script_path = os.path.join(
        os.path.dirname(__file__), "create_combined_zensus_table.sql"
    )
    db.execute_sql_script(script_path)
511
512
513
def adjust_zensus_misc():
    """Delete unpopulated cells in zensus-households, -buildings and -apartments

    Some unpopulated zensus cells are listed in:
    - egon_destatis_zensus_household_per_ha
    - egon_destatis_zensus_building_per_ha
    - egon_destatis_zensus_apartment_per_ha

    This can be caused by missing population
    information due to privacy or other special cases (e.g. holiday homes
    are listed as buildings but are not permanently populated.)
    In the following tasks of egon-data, only data of populated cells is used.

    Returns
    -------
    None.

    """
    # Get information from data configuration file
    data_config = egon.data.config.datasets()
    zensus_population_processed = data_config["zensus_population"]["processed"]
    zensus_misc_processed = data_config["zensus_misc"]["processed"]

    population_table = (
        f"{zensus_population_processed['schema']}"
        f".{zensus_population_processed['table']}"
    )

    # Only the table names are needed here; the file names (the map's keys)
    # were previously unpacked into an unused variable.
    for table in zensus_misc_processed["file_table_map"].values():
        db.execute_sql(
            f"""
             DELETE FROM {zensus_population_processed['schema']}.{table} as b
             WHERE b.zensus_population_id IN (
                 SELECT id FROM {population_table}
                 WHERE population < 0);"""
        )
549