Passed
Pull Request — dev (#1375)
by
unknown
02:18
created

data.datasets.vg250.Vg250.__init__()   B

Complexity

Conditions 1

Size

Total Lines 91
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 74
dl 0
loc 91
rs 7.8509
c 0
b 0
f 0
cc 1
nop 2

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
"""The central module containing all code dealing with VG250 data.
2
3
This module either directly contains the code dealing with importing VG250
4
data, or it re-exports everything needed to handle it. Please refrain
5
from importing code from any modules below this one, because it might
6
lead to unwanted behaviour.
7
8
If you have to import code from a module below this one because the code
9
isn't exported from this module, please file a bug, so we can fix this.
10
"""
11
12
from pathlib import Path
13
from urllib.request import urlretrieve
14
import codecs
15
import datetime
16
import json
17
import os
18
import time
19
20
from geoalchemy2 import Geometry
21
import geopandas as gpd
22
23
from egon.data import db
24
from egon.data.config import settings
25
from egon.data.datasets import Dataset
26
from egon.data.metadata import (
27
    context,
28
    licenses_datenlizenz_deutschland,
29
    meta_metadata,
30
)
31
import egon.data.config
32
from egon_validation import (
33
    RowCountValidation,
34
    DataTypeValidation,
35
    NotNullAndNotNaNValidation,
36
    WholeTableNotNullAndNotNaNValidation,
37
    ValueSetValidation,
38
    SRIDUniqueNonZero
39
)
40
41
42
def download_files():
43
    """
44
    Download VG250 (Verwaltungsgebiete) shape files.
45
46
    Data is downloaded from source specified in *datasets.yml* in section
47
    *vg250/original_data/source/url* and saved to file specified in
48
    *vg250/original_data/target/file*.
49
50
    """
51
    data_config = egon.data.config.datasets()
52
    vg250_config = data_config["vg250"]["original_data"]
53
54
    download_directory = Path(".") / "vg250"
55
    # Create the folder, if it does not exist already
56
    if not os.path.exists(download_directory):
57
        os.mkdir(download_directory)
58
59
    target_file = download_directory / vg250_config["target"]["file"]
60
61
    if not os.path.isfile(target_file):
62
        urlretrieve(vg250_config["source"]["url"], target_file)
63
64
65
def to_postgres():
66
    """
67
    Writes original VG250 data to database.
68
69
    Creates schema boundaries if it does not yet exist.
70
    Newly creates all tables specified as keys in *datasets.yml* in section
71
    *vg250/processed/file_table_map*.
72
73
    """
74
75
    # Get information from data configuration file
76
    data_config = egon.data.config.datasets()
77
    vg250_orig = data_config["vg250"]["original_data"]
78
    vg250_processed = data_config["vg250"]["processed"]
79
80
    # Create target schema
81
    db.execute_sql(f"CREATE SCHEMA IF NOT EXISTS {vg250_processed['schema']};")
82
83
    zip_file = Path(".") / "vg250" / vg250_orig["target"]["file"]
84
    engine_local_db = db.engine()
85
86
    # Extract shapefiles from zip archive and send it to postgres db
87
    for filename, table in vg250_processed["file_table_map"].items():
88
        # Open files and read .shp (within .zip) with geopandas
89
        data = gpd.read_file(
90
            f"zip://{zip_file}!vg250_01-01.geo84.shape.ebenen/"
91
            f"vg250_ebenen_0101/{filename}"
92
        )
93
94
        boundary = settings()["egon-data"]["--dataset-boundary"]
95
        if boundary != "Everything":
96
            # read-in borders of federal state Schleswig-Holstein
97
            data_sta = gpd.read_file(
98
                f"zip://{zip_file}!vg250_01-01.geo84.shape.ebenen/"
99
                f"vg250_ebenen_0101/VG250_LAN.shp"
100
            ).query(f"GEN == '{boundary}'")
101
            data_sta.BEZ = "Bundesrepublik"
102
            data_sta.NUTS = "DE"
103
            # import borders of Schleswig-Holstein as borders of state
104
            if table == "vg250_sta":
105
                data = data_sta
106
            # choose only areas in Schleswig-Holstein
107
            else:
108
                data = data[
109
                    data.within(data_sta.dissolve(by="GEN").geometry.values[0])
110
                ]
111
112
        # Set index column and format column headings
113
        data.index.set_names("id", inplace=True)
114
        data.columns = [x.lower() for x in data.columns]
115
116
        # Drop table before inserting data
117
        db.execute_sql(
118
            f"DROP TABLE IF EXISTS "
119
            f"{vg250_processed['schema']}.{table} CASCADE;"
120
        )
121
122
        # create database table from geopandas dataframe
123
        data.to_postgis(
124
            table,
125
            engine_local_db,
126
            schema=vg250_processed["schema"],
127
            index=True,
128
            if_exists="replace",
129
            dtype={"geometry": Geometry()},
130
        )
131
132
        db.execute_sql(
133
            f"ALTER TABLE {vg250_processed['schema']}.{table} "
134
            f"ADD PRIMARY KEY (id);"
135
        )
136
137
        # Add index on geometry column
138
        db.execute_sql(
139
            f"CREATE INDEX {table}_geometry_idx ON "
140
            f"{vg250_processed['schema']}.{table} USING gist (geometry);"
141
        )
142
143
144
def add_metadata():
145
    """Writes metadata JSON string into table comment."""
146
    # Prepare variables
147
    vg250_config = egon.data.config.datasets()["vg250"]
148
149
    title_and_description = {
150
        "vg250_sta": {
151
            "title": "BKG - Verwaltungsgebiete 1:250.000 - Staat (STA)",
152
            "description": "Staatsgrenzen der Bundesrepublik Deutschland",
153
        },
154
        "vg250_lan": {
155
            "title": "BKG - Verwaltungsgebiete 1:250.000 - Länder (LAN)",
156
            "description": "Landesgrenzen der Bundesländer in der "
157
            "Bundesrepublik Deutschland",
158
        },
159
        "vg250_rbz": {
160
            "title": "BKG - Verwaltungsgebiete 1:250.000 - Regierungsbezirke "
161
            "(RBZ)",
162
            "description": "Grenzen der Regierungsbezirke in der "
163
            "Bundesrepublik Deutschland",
164
        },
165
        "vg250_krs": {
166
            "title": "BKG - Verwaltungsgebiete 1:250.000 - Kreise (KRS)",
167
            "description": "Grenzen der Landkreise in der "
168
            "Bundesrepublik Deutschland",
169
        },
170
        "vg250_vwg": {
171
            "title": "BKG - Verwaltungsgebiete 1:250.000 - "
172
            "Verwaltungsgemeinschaften (VWG)",
173
            "description": "Grenzen der Verwaltungsgemeinschaften in der "
174
            "Bundesrepublik Deutschland",
175
        },
176
        "vg250_gem": {
177
            "title": "BKG - Verwaltungsgebiete 1:250.000 - Gemeinden (GEM)",
178
            "description": "Grenzen der Gemeinden in der "
179
            "Bundesrepublik Deutschland",
180
        },
181
    }
182
183
    licenses = [
184
        licenses_datenlizenz_deutschland(
185
            attribution="© Bundesamt für Kartographie und Geodäsie "
186
            "2020 (Daten verändert)"
187
        )
188
    ]
189
190
    vg250_source = {
191
        "title": "Verwaltungsgebiete 1:250 000 (Ebenen)",
192
        "description": "Der Datenbestand umfasst sämtliche Verwaltungseinheiten der "
193
        "hierarchischen Verwaltungsebenen vom Staat bis zu den Gemeinden "
194
        "mit ihren Grenzen, statistischen Schlüsselzahlen, Namen der "
195
        "Verwaltungseinheit sowie die spezifische Bezeichnung der "
196
        "Verwaltungsebene des jeweiligen Landes.",
197
        "path": vg250_config["original_data"]["source"]["url"],
198
        "licenses": licenses,
199
    }
200
201
    for table in vg250_config["processed"]["file_table_map"].values():
202
        schema_table = ".".join([vg250_config["processed"]["schema"], table])
203
        meta = {
204
            "name": schema_table,
205
            "title": title_and_description[table]["title"],
206
            "id": "WILL_BE_SET_AT_PUBLICATION",
207
            "description": title_and_description[table]["title"],
208
            "language": ["de-DE"],
209
            "publicationDate": datetime.date.today().isoformat(),
210
            "context": context(),
211
            "spatial": {
212
                "location": None,
213
                "extent": "Germany",
214
                "resolution": "1:250000",
215
            },
216
            "temporal": {
217
                "referenceDate": "2020-01-01",
218
                "timeseries": {
219
                    "start": None,
220
                    "end": None,
221
                    "resolution": None,
222
                    "alignment": None,
223
                    "aggregationType": None,
224
                },
225
            },
226
            "sources": [vg250_source],
227
            "licenses": licenses,
228
            "contributors": [
229
                {
230
                    "title": "Guido Pleßmann",
231
                    "email": "http://github.com/gplssm",
232
                    "date": time.strftime("%Y-%m-%d"),
233
                    "object": None,
234
                    "comment": "Imported data",
235
                },
236
                {
237
                    "title": "Jonathan Amme",
238
                    "email": "http://github.com/nesnoj",
239
                    "date": time.strftime("%Y-%m-%d"),
240
                    "object": None,
241
                    "comment": "Metadata extended",
242
                },
243
            ],
244
            "resources": [
245
                {
246
                    "profile": "tabular-data-resource",
247
                    "name": schema_table,
248
                    "path": None,
249
                    "format": "PostgreSQL",
250
                    "encoding": "UTF-8",
251
                    "schema": {
252
                        "fields": vg250_metadata_resources_fields(),
253
                        "primaryKey": ["id"],
254
                        "foreignKeys": [],
255
                    },
256
                    "dialect": {"delimiter": None, "decimalSeparator": "."},
257
                }
258
            ],
259
            "metaMetadata": meta_metadata(),
260
        }
261
262
        meta_json = "'" + json.dumps(meta) + "'"
263
264
        db.submit_comment(
265
            meta_json, vg250_config["processed"]["schema"], table
266
        )
267
268
269
def nuts_mview():
270
    """
271
    Creates MView boundaries.vg250_lan_nuts_id.
272
273
    """
274
    db.execute_sql_script(
275
        os.path.join(os.path.dirname(__file__), "vg250_lan_nuts_id_mview.sql")
276
    )
277
278
279
def cleaning_and_preperation():
280
    """
281
    Creates tables and MViews with cleaned and corrected geometry data.
282
283
    The following table is created:
284
      * boundaries.vg250_gem_clean where municipalities (Gemeinden) that are fragmented
285
        are cleaned from ringholes
286
287
    The following MViews are created:
288
      * boundaries.vg250_gem_hole
289
      * boundaries.vg250_gem_valid
290
      * boundaries.vg250_krs_area
291
      * boundaries.vg250_lan_union
292
      * boundaries.vg250_sta_bbox
293
      * boundaries.vg250_sta_invalid_geometry
294
      * boundaries.vg250_sta_tiny_buffer
295
      * boundaries.vg250_sta_union
296
297
    """
298
299
    db.execute_sql_script(
300
        os.path.join(os.path.dirname(__file__), "cleaning_and_preparation.sql")
301
    )
302
303
304
def vg250_metadata_resources_fields():
305
    """
306
    Returns metadata string for VG250 tables.
307
308
    """
309
310
    return [
311
        {
312
            "description": "Index",
313
            "name": "id",
314
            "type": "integer",
315
            "unit": "none",
316
        },
317
        {
318
            "description": "Administrative level",
319
            "name": "ade",
320
            "type": "integer",
321
            "unit": "none",
322
        },
323
        {
324
            "description": "Geofactor",
325
            "name": "gf",
326
            "type": "integer",
327
            "unit": "none",
328
        },
329
        {
330
            "description": "Particular areas",
331
            "name": "bsg",
332
            "type": "integer",
333
            "unit": "none",
334
        },
335
        {
336
            "description": "Territorial code",
337
            "name": "ars",
338
            "type": "string",
339
            "unit": "none",
340
        },
341
        {
342
            "description": "Official Municipality Key",
343
            "name": "ags",
344
            "type": "string",
345
            "unit": "none",
346
        },
347
        {
348
            "description": "Seat of the administration (territorial code)",
349
            "name": "sdv_ars",
350
            "type": "string",
351
            "unit": "none",
352
        },
353
        {
354
            "description": "Geographical name",
355
            "name": "gen",
356
            "type": "string",
357
            "unit": "none",
358
        },
359
        {
360
            "description": "Designation of the administrative unit",
361
            "name": "bez",
362
            "type": "string",
363
            "unit": "none",
364
        },
365
        {
366
            "description": "Identifier",
367
            "name": "ibz",
368
            "type": "integer",
369
            "unit": "none",
370
        },
371
        {
372
            "description": "Note",
373
            "name": "bem",
374
            "type": "string",
375
            "unit": "none",
376
        },
377
        {
378
            "description": "Name generation",
379
            "name": "nbd",
380
            "type": "string",
381
            "unit": "none",
382
        },
383
        {
384
            "description": "Land (state)",
385
            "name": "sn_l",
386
            "type": "string",
387
            "unit": "none",
388
        },
389
        {
390
            "description": "Administrative district",
391
            "name": "sn_r",
392
            "type": "string",
393
            "unit": "none",
394
        },
395
        {
396
            "description": "District",
397
            "name": "sn_k",
398
            "type": "string",
399
            "unit": "none",
400
        },
401
        {
402
            "description": "Administrative association – front part",
403
            "name": "sn_v1",
404
            "type": "string",
405
            "unit": "none",
406
        },
407
        {
408
            "description": "Administrative association – rear part",
409
            "name": "sn_v2",
410
            "type": "string",
411
            "unit": "none",
412
        },
413
        {
414
            "description": "Municipality",
415
            "name": "sn_g",
416
            "type": "string",
417
            "unit": "none",
418
        },
419
        {
420
            "description": "Function of the 3rd key digit",
421
            "name": "fk_s3",
422
            "type": "string",
423
            "unit": "none",
424
        },
425
        {
426
            "description": "European statistics key",
427
            "name": "nuts",
428
            "type": "string",
429
            "unit": "none",
430
        },
431
        {
432
            "description": "Filled territorial code",
433
            "name": "ars_0",
434
            "type": "string",
435
            "unit": "none",
436
        },
437
        {
438
            "description": "Filled Official Municipality Key",
439
            "name": "ags_0",
440
            "type": "string",
441
            "unit": "none",
442
        },
443
        {
444
            "description": "Effectiveness",
445
            "name": "wsk",
446
            "type": "string",
447
            "unit": "none",
448
        },
449
        {
450
            "description": "DLM identifier",
451
            "name": "debkg_id",
452
            "type": "string",
453
            "unit": "none",
454
        },
455
        {
456
            "description": "Territorial code (deprecated column)",
457
            "name": "rs",
458
            "type": "string",
459
            "unit": "none",
460
        },
461
        {
462
            "description": "Seat of the administration (territorial code, deprecated column)",
463
            "name": "sdv_rs",
464
            "type": "string",
465
            "unit": "none",
466
        },
467
        {
468
            "description": "Filled territorial code (deprecated column)",
469
            "name": "rs_0",
470
            "type": "string",
471
            "unit": "none",
472
        },
473
        {
474
            "description": "Geometry of areas as WKB",
475
            "name": "geometry",
476
            "type": "Geometry(Polygon, srid=4326)",
477
            "unit": "none",
478
        },
479
    ]
480
481
482
class Vg250(Dataset):
483
    """
484
    Obtains and processes VG250 data and writes it to database.
485
486
    Original data is downloaded using :py:func:`download_files` function and written
487
    to database using :py:func:`to_postgres` function.
488
489
    *Dependencies*
490
      No dependencies
491
492
    *Resulting tables*
493
      * :py:func:`boundaries.vg250_gem <to_postgres>` is created and filled
494
      * :py:func:`boundaries.vg250_krs <to_postgres>` is created and filled
495
      * :py:func:`boundaries.vg250_lan <to_postgres>` is created and filled
496
      * :py:func:`boundaries.vg250_rbz <to_postgres>` is created and filled
497
      * :py:func:`boundaries.vg250_sta <to_postgres>` is created and filled
498
      * :py:func:`boundaries.vg250_vwg <to_postgres>` is created and filled
499
      * :py:func:`boundaries.vg250_lan_nuts_id <nuts_mview>` is created and filled
500
      * :py:func:`boundaries.vg250_gem_hole <cleaning_and_preperation>` is created and
501
        filled
502
      * :py:func:`boundaries.vg250_gem_valid <cleaning_and_preperation>` is created and
503
        filled
504
      * :py:func:`boundaries.vg250_krs_area <cleaning_and_preperation>` is created and
505
        filled
506
      * :py:func:`boundaries.vg250_lan_union <cleaning_and_preperation>` is created and
507
        filled
508
      * :py:func:`boundaries.vg250_sta_bbox <cleaning_and_preperation>` is created and
509
        filled
510
      * :py:func:`boundaries.vg250_sta_invalid_geometry <cleaning_and_preperation>` is
511
        created and filled
512
      * :py:func:`boundaries.vg250_sta_tiny_buffer <cleaning_and_preperation>` is
513
        created and filled
514
      * :py:func:`boundaries.vg250_sta_union <cleaning_and_preperation>` is
515
        created and filled
516
517
    """
518
519
    filename = egon.data.config.datasets()["vg250"]["original_data"]["source"][
520
        "url"
521
    ]
522
523
    #:
524
    name: str = "VG250"
525
    #:
526
    version: str = filename + "-0.0.4"
527
528
    def __init__(self, dependencies):
529
        super().__init__(
530
            name=self.name,
531
            version=self.version,
532
            dependencies=dependencies,
533
            tasks=(
534
                download_files,
535
                to_postgres,
536
                nuts_mview,
537
                add_metadata,
538
                cleaning_and_preperation,
539
            ),
540
            validation={
541
                "data_quality": [
542
                    RowCountValidation(
543
                        table="boundaries.vg250_krs",
544
                        rule_id="TEST_ROW_COUNT.vg250_krs",
545
                        expected_count={"Schleswig-Holstein":27, "Everything":431}
546
                    ),
547
                    DataTypeValidation(
548
                        table="boundaries.vg250_krs",
549
                        rule_id="TEST_DATA_MULTIPLE_TYPES.vg250_krs",
550
                        column_types={"Schleswig-Holstein":{"id":"bigint","ade":"integer", "gf":"integer", "bsg":"integer","ars":"text",
551
                                      "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"integer",
552
                                      "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text",
553
                                      "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text",
554
                                      "ars_0":"text", "ags_0":"text", "wsk":"timestamp without time zone", "debkg_id":"text", "rs":"text",
555
                                      "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"},
556
                                      "Everything":{"id":"bigint","ade":"bigint", "gf":"bigint", "bsg":"bigint","ars":"text",
557
                                      "ags":"text", "sdv_ars":"text", "gen":"text", "bez":"text","ibz":"bigint",
558
                                      "bem":"text", "nbd":"text", "sn_l":"text", "sn_r":"text", "sn_k":"text",
559
                                      "sn_v1":"text", "sn_v2":"text", "sn_g":"text", "fk_s3":"text", "nuts":"text",
560
                                      "ars_0":"text", "ags_0":"text", "wsk":"text", "debkg_id":"text", "rs":"text",
561
                                      "sdv_rs":"text", "rs_0":"text", "geometry":"geometry"}
562
                                      }
563
                    ),
564
                    NotNullAndNotNaNValidation(
565
                        table="boundaries.vg250_krs",
566
                        rule_id="TEST_NOT_NAN.vg250_krs",
567
                        columns=["gf","bsg"]
568
                    ),
569
                    WholeTableNotNullAndNotNaNValidation(
570
                        table="boundaries.vg250_krs",
571
                        rule_id="TEST_WHOLE_TABLE_NOT_NAN.vg250_krs"
572
                    ),
573
                    SRIDUniqueNonZero(
574
                        table="boundaries.vg250_krs",
575
                        rule_id="SRIDUniqueNonZero.vg250_krs.geometry",
576
                        column="geometry"
577
                    ),
578
                    ValueSetValidation(
579
                        table="boundaries.vg250_krs",
580
                        rule_id="TEST_VALUE_SET_NBD.vg250_krs",
581
                        column="nbd",
582
                        expected_values=["ja", "nein"]
583
                    ),
584
                    RowCountValidation(
585
                        table="society.destatis_zensus_population_per_ha_inside_germany",
586
                        rule_id="TEST_ROW_COUNT.destatis_zensus_population_per_ha_inside_germany",
587
                        expected_count={"Schleswig-Holstein": 143521, "Everything": 3177723}
588
                    ),
589
                    DataTypeValidation(
590
                        table="society.destatis_zensus_population_per_ha_inside_germany",
591
                        rule_id="TEST_DATA_MULTIPLE_TYPES.destatis_zensus_population_per_ha_inside_germany",
592
                        column_types={
593
                            "id": "integer", "grid_id": "character varying (254)", "population": "smallint",
594
                            "geom_point": "geometry","geom": "geometry"
595
                        }
596
                    ),
597
                    NotNullAndNotNaNValidation(
598
                        table="society.destatis_zensus_population_per_ha_inside_germany",
599
                        rule_id="TEST_NOT_NAN.destatis_zensus_population_per_ha_inside_germany",
600
                        columns=["id", "grid_id", "population", "geom_point", "geom"]
601
                    ),
602
                    WholeTableNotNullAndNotNaNValidation(
603
                        table="society.destatis_zensus_population_per_ha_inside_germany",
604
                        rule_id="TEST_WHOLE_TABLE_NOT_NAN.destatis_zensus_population_per_ha_inside_germany"
605
                    ),
606
                    SRIDUniqueNonZero(
607
                        table="society.destatis_zensus_population_per_ha_inside_germany",
608
                        rule_id="SRIDUniqueNonZero.destatis_zensus_population_per_ha_inside_germany.geom_point",
609
                        column="geom_point"
610
                    ),
611
                    SRIDUniqueNonZero(
612
                        table="society.destatis_zensus_population_per_ha_inside_germany",
613
                        rule_id="SRIDUniqueNonZero.destatis_zensus_population_per_ha_inside_germany.geom",
614
                        column="geom"
615
                    ),
616
                ]
617
            },
618
            on_validation_failure="continue"
619
        )
620