aggregatedexposuremodel.py 90.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#!/usr/bin/env python3

# Copyright (C) 2021:
#   Helmholtz-Zentrum Potsdam Deutsches GeoForschungsZentrum GFZ
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
# General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.

19
import os
20
21
22
23
import abc
import logging
import numpy
import pandas
24
import geopandas
25
import pyproj
26
from gdeimporter.exposureentity import ExposureEntity
27
from gdeimporter.dataunit import DataUnit
28
from gdeimporter.tools.spatial import SpatialTools
29
from gdeimporter.tools.buildingtaxonomy import GEMTaxonomy_v3
30

31
32
33
34
35
36
37
38
39
40
41
42

logger = logging.getLogger()


class AggregatedExposureModel(abc.ABC):
    """This class represents an input aggregated exposure model.

    Attributes:
        self.model_name (str):
            Name of the input aggregated model.
        self.exposure_format (str):
            Format of the input aggregated model. Currently supported values: "esrm20".
43
44
45
46
        self.domain_geometry (Shapely Polygon or Multipolygon):
            Geometry within which the AggregatedExposureModel is defined. It should not
            follow the boundaries of its exposure entities with precision but allow to determine
            if the bounding boxes of its exposure entities are fully contained within it.
47
48
49
50
        self.occupancy_cases (dict):
            Dictionary in which each first level key corresponds to an occupancy case (e.g.
            "residential", "commercial", "industrial") for which the input aggregated exposure
            model is defined. The subkeys will point at specific mappings for each sub-class.
51
52
53
54
55
        self.exposure_entities (dictionary of ExposureEntity):
            Dictionary of instances of ExposureEntity objects, each of which represent an
            exposure entity where the input aggregated exposure model is defined. See
            attributes in description of ExposureEntity. The keys of the dictionary are the
            names of the corresponding exposure entities.
56
57
        self.filename_pattern (str):
            Pattern of the names of the files that define the input aggregated exposure model.
58
59
60
        self.boundary_filename_pattern (str):
            Pattern of the names of the geodata files that contain the boundaries of the
            exposure entities of the input aggregated exposure model.
61
62
        self.currency (str):
            Currency used in the AggregatedExposureModel to express building replacement costs.
63
64
65
    """

    def __init__(self, configuration):
66
67
68
69
70
71
        """
        Args:
            configuration (Configuration object):
                Instance of the Configuration class.
        """

72
73
        self.model_name = configuration.model_name
        self.exposure_format = configuration.exposure_format
74
        self.domain_geometry = self._retrieve_domain_geometry(configuration)
75
76
77
        self.occupancy_cases = None
        self.exposure_entities = None
        self.filename_pattern = None
78
        self.boundary_filename_pattern = None
79
        self.currency = None
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

    def retrieve_exposure_entities(self, configuration):
        """This function retrieves the exposure entities for which an input aggregated exposure
        model is defined, together with the definition of their types of data units for each
        occupancy case, by reading and processing the relevant data (as specified in the
        respective subclasses).

        Exposure entities can be, for example, countries or any other spatial/administrative
        unit for which an aggregated exposure model is defined. Arbitrary polygons can be
        administrative units, Voronoi cells, etc.

        Args:
            configuration (Configuration object):
                Instance of the Configuration class.

        Returns:
            exposure_entities (dictionary of ExposureEntity):
                Dictionary of instances of ExposureEntity objects, each of which represent an
                exposure entity where the input aggregated exposure model is defined. See
                attributes in description of ExposureEntity. The keys of the dictionary are the
                names of the corresponding exposure entities.
        """

        raise NotImplementedError

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
    def write_source_to_database(self, db_cursor, db_table):
        """This function stores self.model_name, self.exposure_format to an SQL database table
        named db_table. The cursor to the database is provided as db_cursor.

        If an entry already exists in db_table with name = self.model_name, the format field is
        updated as per self.exposure_format and the aggregated_source_id remains the same. If an
        entry does not already exist, it is created. It is assumed that the database table can
        automatically assign sequential values of aggregated_source_id. The assigned value is
        returned by this function.

        Args:
            db_cursor (psycopg2.extensions.cursor):
                Cursor object that allows to execute commands on a target SQL database.
            db_table (str):
                Name of the table within the SQL database (associated with db_cursor), in which
                information on the AggregatedExposureModel will be stored. This table is assumed
                to contain, at least, the following fields:
                    aggregated_source_id (int):
                        ID of the source of the aggregated exposure model.
                    name (str):
                        Name of the source of the aggregated exposure model.
                    format (str):
                        Format of the source files.

        Returns:
            aggregated_source_id (int):
                ID of the aggregated source in db_table.
            writing_mode (str):
                "Inserted", if an entry did not already exist in db_table with name =
                self.model_name and was created; "Updated" if an entry already existed in
                db_table with name = self.model_name and then only the format was updated;
                "Error" if more than one entries already existed in db_table with name =
                self.model_name and thus nothing was done.
        """

        sql_commands = {}
        sql_commands["insert"] = "INSERT INTO %s(name,format) VALUES('%s','%s');" % (
            db_table,
            self.model_name,
            self.exposure_format,
        )
        sql_commands["update"] = "UPDATE %s SET format='%s' WHERE name='%s';" % (
            db_table,
            self.exposure_format,
            self.model_name,
        )
        sql_commands["query"] = "SELECT aggregated_source_id FROM %s WHERE name='%s';" % (
            db_table,
            self.model_name,
        )

        # Check if an entry already exists for a source with this name
        db_cursor.execute(sql_commands["query"])
        exec_result = db_cursor.fetchall()

        if len(exec_result) == 0:  # No entry with this name exists yet --> append
            db_cursor.execute(sql_commands["insert"])
            # Retrieve the source ID (assigned automatically by SQL)
            db_cursor.execute(sql_commands["query"])
            exec_result = db_cursor.fetchall()
            aggregated_source_id = exec_result[0][0]
            writing_mode = "Inserted"
        elif len(exec_result) == 1:  # Entry exists --> overwrite
            db_cursor.execute(sql_commands["update"])
            # Retrieve the source ID
            aggregated_source_id = exec_result[0][0]
            writing_mode = "Updated"
        else:  # More than one entries found, this is an error
            logger.error(
                "ERROR IN write_source_to_database: "
                "MORE THAN ONE ENTRY FOUND FOR SOURCE NAME %s" % (self.model_name)
            )
            aggregated_source_id = numpy.nan
            writing_mode = "Error"

        return aggregated_source_id, writing_mode

182
183
184
    def store_data_units(
        self,
        db_data_units_config,
185
186
        db_table_data_units,
        db_table_data_units_buildings,
187
188
189
190
191
        exposure_entity_name,
        occupancy_case,
        aggregated_source_id,
    ):
        """This function stores all data units associated with
Cecilia Nievas's avatar
Cecilia Nievas committed
192
193
194
195
        self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case] and all the
        information associated with their building classes to the tables db_table_data_units and
        db_table_data_units_buildings in the database whose credentials are indicated in
        db_data_units_config.
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211

        Args:
            db_data_units_config (dict):
                Dictionary containing the credentials needed to connect to the SQL database in
                which information on the data units is stored. The keys of the dictionary need
                to be:
                    host (str):
                        SQL database host address.
                    dbname (str):
                        Name of the SQL database.
                    port (int):
                        Port where the SQL database can be found.
                    username (str):
                        User name to connect to the SQL database.
                    password (str):
                        Password associated with self.username.
212
            db_table_data_units (str):
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
                Name of the table of the SQL database where the data units will be stored. It is
                assumed that this table contains, at least, the following fields:
                    data_unit_id (str):
                        ID of the data unit.
                    exposure_entity (str):
                        3-character code of the exposure entity.
                    occupancy_case (enum):
                        SQL enumerated type describing the building occupancy cases.
                    aggregated_source_id (int):
                        ID of the source of the aggregated exposure model.
                    buildings_total (float):
                        Total number of buildings in the DataUnit.
                    dwellings_total (float):
                        Total number of dwellings in the DataUnit.
                    people_census (float):
                        Total number of census people in the DataUnit.
                    cost_total (float):
                        Total replacement cost of buildings in the DataUnit.
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
            db_table_data_units_buildings (str):
                Name of the table of the SQL database where the attributes of the building
                classes of the Data Unit will be stored. It is assumed that this table contains,
                at least, the following fields:
                    building_class_name (str):
                        Building class as per the GEM Building Taxonomy.
                    settlement_type (enum):
                        Type of settlement within the Data Unit. Possible values: "urban",
                        "rural", "big_city", "all".
                    occupancy_subtype (str):
                        Details on the occupancy, if relevant to characterise the building
                        class.
                    aggregated_source_id (int):
                        ID of the source of the aggregated exposure model.
                    exposure_entity (str):
                        3-character code of the exposure entity.
                    occupancy_case (enum):
                        SQL enumerated type describing the building occupancy cases.
                    data_unit_id (str):
                        ID of the data unit.
                    proportions (float):
                        Proportions in which each of the building classes defined by
                        building_class_name, settlement_type and occupancy_subtype are
                        present in the Data Unit. This column needs to add up to 1.0 for all
                        rows with the same building_class_name, settlement_type,
                        occupancy_subtype, data_unit_id, occupancy_case and
                        aggregated_source_id.
                    census_people_per_building (float):
                        Number of census-derived people (i.e. not accounting for time of the
                        day) per building of this class.
                    total_cost_per_building (float):
                        Total replacement cost of a building of this class, including costs of
                        structural and non-structural components as well as contents.
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
            exposure_entity_name (str):
                Name of the ExposureEntity whose data units will be stored. It needs to be a
                key of self.exposure_entities and self.exposure_entities[exposure_entity_name]
                needs to have at least the following attributes:
                    code (str):
                        3-character code that uniquely identifies this ExposureEntity.
                    occupancy_cases (dict):
                        Dictionary as defined in the attributes of ExposureEntity.
                        'occupancy_case' needs to be one of its keys, with existing subkey
                        'data_units'.
            occupancy_case (str):
                Name of the occupancy case (e.g. "residential", "commercial", "industrial")
                associated with this Data Unit. It needs to be a key of
                self.exposure_entities[exposure_entity_name].occupancy_cases.
            aggregated_source_id (int):
                ID of the source of the aggregated exposure model.
        Returns:
281
282
283
284
285
286
287
            This function calls DataUnit.write_data_unit_to_database() to store the following
            attributes of all data units present in self.exposure_entities[exposure_entity_name]
            .occupancy_cases[occupancy_case]["data_units"]: ID, occupancy_case, total_buildings,
            total_dwellings, total_people["Census"] and total_cost["Total"]. This function also
            calls DataUnit.write_data_unit_buildings_to_database() to store attributes and
            proportions of building classes present in all data units of self.exposure_entities
            [exposure_entity_name].occupancy_cases[occupancy_case]["data_units"].
288
289
290
291
292
293
294
295
296
        """

        data_units = self.exposure_entities[exposure_entity_name].occupancy_cases[
            occupancy_case
        ]["data_units"]

        for data_unit_id in data_units.keys():
            data_units[data_unit_id].write_data_unit_to_database(
                db_data_units_config,
297
298
299
300
301
302
303
304
                db_table_data_units,
                aggregated_source_id,
                occupancy_case,
                self.exposure_entities[exposure_entity_name].code,
            )
            data_units[data_unit_id].write_data_unit_buildings_to_database(
                db_data_units_config,
                db_table_data_units_buildings,
305
306
307
308
309
310
311
                aggregated_source_id,
                occupancy_case,
                self.exposure_entities[exposure_entity_name].code,
            )

        return

312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
    def _retrieve_domain_geometry(self, configuration):
        """This function retrieves the geometry of the geographic domain of the
        AggregatedExposureModel, following the path indicated in 'configuration'. If
        configuration.domain_boundary_filepath is None, it returns None.

        Args:
            configuration (Configuration object):
                Instance of the Configuration class.

        Returns:
            domain_polygon (Shapely Polygon or Multipolygon):
                Geometry retrieved from configuration.domain_boundary_filepath.
        """

        if configuration.domain_boundary_filepath is None:
            return None

        # Read the data file (errors will be handled by geopandas)
        domain_polygon_gdf = geopandas.GeoDataFrame.from_file(
            os.path.join(configuration.domain_boundary_filepath),
        )
        domain_polygon_gdf = domain_polygon_gdf.to_crs("EPSG:4326")

        if domain_polygon_gdf.shape[0] > 1:
            domain_polygon_gdf = domain_polygon_gdf.dissolve()

        domain_polygon = domain_polygon_gdf["geometry"].to_numpy()[0]

        return domain_polygon

342
343
344
345
346
347

class ExposureModelESRM20(AggregatedExposureModel):
    """This class represents the European Seismic Risk Model 2020 (ESRM20) aggregated exposure
    model.

    See details in https://git.gfz-potsdam.de/dynamicexposure/datasources/-/tree/master/esrm20.
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362

    Attributes:
        self.occupancy_cases (dict):
            Dictionary in which each first level key corresponds to an occupancy case (e.g.
            "residential", "commercial", "industrial") for which ESRM20 is defined. Each first
            level key contains three sub-keys: "short", "sheet_name" and
            "data_units_types_field":
                short (str):
                    Name of the occupancy case as found in the ESRM20 model files.
                sheet_name (str):
                    Name of the sheet in the meatadata file of the input aggregated exposure
                    model from which info on this occupancy case can be retrieved.
                data_units_types_field (str):
                    Name of the field in sheet_name from which to retrieve information on the
                    types of data_units.
363
364
365
366
367
368
369
370
                costs_disaggregation_header (str):
                    Name of the header associated with the occupancy case in the file that
                    contains information on the disaggregation of replacement costs.
                population_time_distribution_fields (dict):
                    Dictionary specifying the names of the fields within the file
                    self.file_structure["population_time_distribution"]["filename"] associated
                    with the occupancy case and a specific time of the day. It contains the
                    subkeys "Day", "Night" and "Transit".
371
372
        self.filename_pattern (str):
            Pattern of the names of the ESRM20 CSV files.
373
374
375
        self.boundary_filename_pattern (str):
            Pattern of the names of the geodata files that contain the boundaries of the
            exposure entities of the ESRM20 model.
376
377
378
379
380
381
382
        self.file_structure (dict):
            Dictionary specifying the file structure of the ESRM20 model files within the data
            pathname specified in the configuration file. It contains the following subkeys:
                metadata (str):
                    Relative path to the metadata .xlsx file.
                CSVs (str):
                    Relative path to the folder that contains the CSV files per exposure entity.
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
                population_time_distribution (dict):
                    Dictionary specifying the file and fields from which information on the
                    distribution of the census population present in each building at a specific
                    time of the day can be retrieved. It contains the following subkeys:
                        filename (str):
                            Relative path to the file containing these data.
                        sheet-name (str):
                            Name of the tab in the .xlsx file.
                costs_disaggregation (dict):
                    Dictionary specifying the file structure and fields from which information
                    on the disaggregation of replacement costs into structural components,
                    non-structural components and contents can be retrieved. It contains the
                    following subkeys:
                        filename (str):
                            Relative path to the file containing these data.
                        sheet-name (str):
                            Name of the tab in the metadata .xlsx file where the information on
                            cost disaggregation can be found.
                        fields (dict):
                            Names of the fields containing the factors to disaggregate
                            replacement costs into structural components, non-structural
                            components and contents.
405
406
407
408
        self.csv_column_names
            Names of columns of interest in the CSV files.
        self.currency (str):
            Currency used in ESRM20 to express building replacement costs.
409
410
411
412
413
        self.exposure_entities (dictionary of ExposureEntity):
            Dictionary of instances of ExposureEntity objects, each of which represent an
            exposure entity where the input aggregated exposure model is defined. See
            attributes in description of ExposureEntity. The keys of the dictionary are the
            names of the corresponding exposure entities.
414
415
    """

416
417
418
419
420
421
422
    def __init__(self, configuration):
        super().__init__(configuration)
        self.occupancy_cases = {
            "residential": {
                "short": "Res",
                "sheet_name": "RES",
                "data_units_types_field": "Admin level resolution/aggregation",
423
424
425
426
427
428
                "costs_disaggregation_header": "RES",
                "population_time_distribution_fields": {
                    "Day": "RES_DAY",
                    "Night": "RES_NIGHT",
                    "Transit": "RES_TRANSIT",
                },
429
430
431
432
433
            },
            "commercial": {
                "short": "Com",
                "sheet_name": "COM",
                "data_units_types_field": "Admin level resolution/aggregation",
434
435
436
437
438
439
                "costs_disaggregation_header": "COM",
                "population_time_distribution_fields": {
                    "Day": "NONRES_DAY",
                    "Night": "NONRES_NIGHT",
                    "Transit": "NONRES_TRANSIT",
                },
440
441
442
443
444
            },
            "industrial": {
                "short": "Ind",
                "sheet_name": "IND",
                "data_units_types_field": "Resolution",
445
446
447
448
449
450
                "costs_disaggregation_header": "IND",
                "population_time_distribution_fields": {
                    "Day": "NONRES_DAY",
                    "Night": "NONRES_NIGHT",
                    "Transit": "NONRES_TRANSIT",
                },
451
452
453
454
            },
        }
        self.filename_pattern = {
            "filename": "Exposure_Model_%s_%s.csv",
455
456
457
            "first": "name",
            "second": "short",
        }
458
        self.boundary_filename_pattern = {
459
            "subfolder": "Adm%s",
460
461
462
463
            "filename": "Adm%s_%s.shp",
            "first": "data_units_level",
            "second": "name",
        }
464
465
466
        self.file_structure = {
            "metadata": "sources/European_Exposure_Model_Data_Inputs_Sources.xlsx",
            "CSVs": "_exposure_models",
467
468
            "population_time_distribution": {
                "filename": "social_indicators/population_distribution_PAGER.xlsx",
469
470
471
472
473
474
475
476
477
                "sheet_name": "data",
            },
            "costs_disaggregation": {
                "filename": "sources/European_Exposure_Model_Data_Inputs_Sources.xlsx",
                "sheet_name": "Notes",
                "fields": {
                    "Structural": "Replacement cost: structural / total",
                    "Non-Structural": "Replacement cost: non-structural / total",
                    "Contents": "Replacement cost: contents / total",
478
479
                },
            },
480
        }
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
        self.csv_column_names = {
            "Buildings": "BUILDINGS",
            "Dwellings": "DWELLINGS",
            "People": {
                "Census": "OCCUPANTS_PER_ASSET",
                "Day": "OCCUPANTS_PER_ASSET_DAY",
                "Night": "OCCUPANTS_PER_ASSET_NIGHT",
                "Transit": "OCCUPANTS_PER_ASSET_TRANSIT",
                "Average": "OCCUPANTS_PER_ASSET_AVERAGE",
            },
            "Cost": {
                "Total": "TOTAL_REPL_COST_EUR",
                "Structural": "COST_STRUCTURAL_EUR",
                "Non-Structural": "COST_NONSTRUCTURAL_EUR",
                "Contents": "COST_CONTENTS_EUR",
            },
497
498
499
            "building_class_name": "TAXONOMY",
            "settlement_type": "SETTLEMENT_TYPE",
            "occupancy_subtype": "OCCUPANCY_TYPE",
500
501
502
503
504
505
            "area_per_dwelling": "AREA_PER_DWELLING_SQM",
            "cost_per_area": "COST_PER_AREA_EUR",
            "dwellings_per_building": "DWELLINGS_PER_BUILDING",
            "people_per_dwelling": "PEOPLE_PER_DWELLING",
            "census_people_per_building": "OCCUPANTS_PER_BUILDING",
            "total_cost_per_building": "TOTAL_REPL_COST_PER_BUILDING",
506
        }
507
        self.currency = "EUR 2020"
508
509
        self.exposure_entities = self.retrieve_exposure_entities(configuration)

510
511
512
513
514
515
516
    def retrieve_exposure_entities(self, configuration):
        """This function retrieves the exposure entities of the ESRM20 model from the
        corresponding .xlsx metadata file.

        Args:
            configuration (Configuration object):
                Instance of the Configuration class, with at least the following attributes:
517
                    exposure_format (str):
518
519
                        Format of the input aggregated model. Currently supported values:
                        "esrm20" (any other input will return an empty dictionary).
520
521
522
                    data_pathname (str):
                        Path to the directory that contains the input aggregated exposure model
                        data.
523
524
525
526
527
                    exposure_entities_code (str or dict):
                        If "ISO3", the country ISO3 codes will be automatically retrieved and
                        used as the codes for the exposure entities. If it is a dictionary
                        instead, its contents will be used to assign the codes to the exposure
                        entities.
528
529
530
                    boundaries_pathname (str):
                        Path to the directory that contains the geodata files with the
                        boundaries.
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546

        Returns:
            exposure_entities (dictionary of ExposureEntity):
                Dictionary of instances of ExposureEntity objects, each of which represent an
                exposure entity where the input aggregated exposure model is defined. See
                attributes in description of ExposureEntity. The keys of the dictionary are the
                names of the corresponding exposure entities.
        """

        exposure_entities = {}

        logger.info(
            "Retrieving exposure_entities from exposure with format %s "
            "with retrieve_exposure_entities" % configuration.exposure_format
        )

547
548
549
550
551
552
        # Read metadata on the distribution of population during day, night and transit times
        population_time = pandas.read_excel(
            os.path.join(
                configuration.data_pathname,
                self.file_structure["population_time_distribution"]["filename"],
            ),
553
            sheet_name=self.file_structure["population_time_distribution"]["sheet_name"],
554
555
556
            index_col=0,  # Use first column as index
        )

557
558
559
        # Read metadata on the disaggregation of costs and retrieve factors
        costs_disaggregation_all = self._retrieve_costs_disaggregation(configuration)

560
561
562
        # Needs to go by occupancy case because the names and properties of the exposure
        # entities can only be read from the metadata file for a sheet that is associated
        # with a particular occupancy case:
563
        for case in self.occupancy_cases.keys():
564

565
            # Read the general metadata file (errors will be handled by pandas)
566
            metadata = pandas.read_excel(
567
                os.path.join(configuration.data_pathname, self.file_structure["metadata"]),
568
                sheet_name=self.occupancy_cases[case]["sheet_name"],
569
570
571
572
573
574
575
576
577
578
579
580
581
582
                header=None,  # Otherwise we cannot handle repeated column names properly
                index_col=0,  # Use first column as index
            )

            # Retrieve names of exposure entities
            read_names = numpy.array(metadata.loc["Variables", :])
            # Fix the possibility that there might be other rows named "Variables" other
            # than the first one
            if len(read_names.shape) > 1:
                read_names = read_names[0, :]

            # Check if there are repeated names of exposure entities (terminate if True)
            if len(read_names) != len(numpy.unique(read_names)):
                logger.critical(
583
584
585
                    "Error: repeated names of exposure entities found "
                    "in occupancy case %s. "
                    "'retrieve_exposure_entities' could not run." % (case)
586
587
588
589
590
591
592
593
594
595
596
597
598
                )
                break

            # Use first row as header, once it has been confirmed that there are no repeated
            # names of exposure entities
            new_header = metadata.loc["Variables", :]
            if len(new_header.shape) > 1:  # When there are more than one "Variables" row
                new_header = new_header.iloc[0, :]
            metadata = metadata[1:]  # Keep the data below the header row
            metadata.columns = new_header  # Set the header row as the new column names

            # Retrieve the row from which the types of data units can be interpreted
            data_units_types_row = metadata.loc[
599
                self.occupancy_cases[case]["data_units_types_field"], :
600
601
602
            ]
            if len(data_units_types_row.shape) > 1:  # This should not occur
                logger.critical(
603
604
605
606
607
608
                    "Error reading %s: row not found."
                    % (
                        os.path.join(
                            configuration.data_pathname, self.file_structure["metadata"]
                        )
                    )
609
610
611
612
613
614
                )
                data_units_types_row = data_units_types_row.iloc[0, :]
                data_units_types_row.iloc[:] = "unknown"

            for exposure_entity in read_names:
                if exposure_entity not in exposure_entities.keys():
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
                    # Retrieve boundaries of the exposure entity
                    geometry_exposure_entity = self._read_exposure_entity_geometry(
                        configuration,
                        exposure_entity,
                        datatypes={"ID_0": str},
                    )

                    if self.domain_geometry is not None:
                        # Verify that the bounding box of 'geometry' lies within the domain
                        # polygon of the Aggregated Exposure Model
                        exposure_entity_fully_inside = (
                            SpatialTools.check_bounding_box_inside_polygon(
                                geometry_exposure_entity, self.domain_geometry
                            )
                        )
                        if not exposure_entity_fully_inside:
                            # Trim the boundaries of the exposure entity
632
633
634
635
                            geometry_exposure_entity = (
                                SpatialTools.single_geometries_intersection(
                                    geometry_exposure_entity, self.domain_geometry
                                )
636
637
638
639
640
641
642
643
                            )
                            logger.warning(
                                "Boundary for Exposure Entity '%s' was trimmed to fit within "
                                "the domain polygon of the Aggregated Exposure Model."
                                % (exposure_entity)
                            )

                    # Create ExposureEntity object
644
                    exposure_entities[exposure_entity] = ExposureEntity(
645
646
647
648
                        exposure_entity,
                        configuration.exposure_entities_code,
                        geometry_exposure_entity,
                        self.currency,
649
                    )
650

651
                # Retrieve metadata regarding the data units of this exposure_entity
652
653
654
655
656
657
                output = self._map_data_units_types(data_units_types_row.loc[exposure_entity])
                (data_units_type, data_units_level, data_units_definition) = output
                output = {
                    "data_units_type": data_units_type,
                    "data_units_level": data_units_level,
                    "data_units_definition": data_units_definition,
658
                    "population_time_distribution": {},
659
660
                }

661
662
663
                for time in self.occupancy_cases[case][
                    "population_time_distribution_fields"
                ].keys():
664
665
666
                    output["population_time_distribution"][time] = round(
                        population_time.loc[
                            exposure_entity,
667
668
669
                            self.occupancy_cases[case]["population_time_distribution_fields"][
                                time
                            ],
670
671
672
673
674
                        ]
                        / 100.0,
                        7,
                    )

675
676
                output["costs_disaggregation"] = costs_disaggregation_all[case]

677
678
679
680
681
                # Write the contents occupancy_cases to the ExposureEntity object
                exposure_entities[exposure_entity].occupancy_cases[case] = output

        return exposure_entities

682
683
684
685
686
687
    def get_data_units(
        self,
        configuration,
        exposure_entity_name,
        occupancy_case,
    ):
688
689
        """This function adds the DataUnits associated with an ExposureEntity with name
        'exposure_entity_name' and occupancy `occupancy_case`.
690
691
692

        Args:
            configuration (Configuration object):
693
                Instance of the Configuration class, with at least the following attributes:
694
695
696
                    data_pathname (str):
                        Path to the directory that contains the input aggregated exposure model
                        data.
697
698
699
700
701
702
703
704
705
706
707
708
709
710
                    number_cores (int):
                        Number of CPU cores to be used to run
                        ExposureEntity.sum_surface_area_of_data_units(), which is called by this
                        function.
                    surface_threshold (positive float):
                        Absolute threshold of the difference between the surface area of the
                        exposure entity and that of all of its data units that defines the
                        actions to be taken by this function (percentage equal to or larger than
                        zero). Default: 1.0.
                    force_creation_data_units (bool):
                        If True, a data unit covering the spatial extent of
                        'exposure_entity_name' that is not covered by already existing data
                        units will be created, irrespective of all the built-in checks that make
                        such a decision automatically.
711
712
713
            exposure_entity_name (str):
                Name of the ExposureEntity whose data units will be retrieved. It needs to be a
                key of self.exposure_entities and self.exposure_entities[exposure_entity_name]
714
                needs to have at least the following attributes:
715
                    name (str):
716
                        Name of the ExposureEntity.
717
718
719
720
721
722
723
                    occupancy_cases (dict):
                        Dictionary as defined in the attributes of ExposureEntity.
                        'occupancy_case' needs to be one of its keys, with existing subkey
                        'data_units_level'.
            occupancy_case (str):
                Name of the occupancy case (e.g. "residential", "commercial", "industrial") for
                which the names of the data units will be retrieved. It needs to exist as a key
724
                of self.exposure_entities[exposure_entity_name].occupancy_cases.
725
726

        Returns:
727
728
729
730
731
            The function creates the subkey 'data_units' in
            self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case] and
            stores in it a dictionary with instances of DataUnit. The keys of the dictionary are
            the data unit IDs. The dictionary is empty if it was impossible to retrieve the IDs
            of the data units.
732
733
        """

734
        # Read names of data units from ESRM20's CSV files
735
        id_column_name_data = "ID_%s" % (
736
737
738
739
740
            str(
                self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
                    "data_units_level"
                ]
            )
741
        )
742
        datatypes = {id_column_name_data: str}
743
        data_table = self._read_data_table(
744
745
746
747
            configuration,
            self.exposure_entities[exposure_entity_name],
            occupancy_case,
            datatypes,
748
749
        )
        try:
750
            data_units_ids = list(data_table[id_column_name_data].unique())
751
        except KeyError:
752
            data_units_ids = []
753
            logger.critical(
754
755
                "Error while retrieving 'data_units_ids' of %s, %s: column `%s` not found"
                % (exposure_entity_name, occupancy_case, id_column_name_data)
756
757
            )

758
759
760
761
762
        # Read geometries of data units from ESRM20's boundary files
        self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
            "data_units"
        ] = {}

763
        id_column_name_geometries = "ID_%s" % (
764
765
766
767
            self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
                self.boundary_filename_pattern["first"]
            ]
        )
768
        geometries_table = self._read_data_units_geometries_table(
769
770
771
772
773
774
            configuration,
            self.exposure_entities[exposure_entity_name],
            occupancy_case,
            datatypes,
        )

775
776
777
778
779
780
        # Retrieve total number of buildings, dwellings, people and cost
        results = self._retrieve_totals_all_data_units(
            data_table, id_column_name_data, data_units_ids, exposure_entity_name
        )
        total_buildings, total_dwellings, total_people, total_cost = results

781
        # Retrieve proportions and properties of building classes
782
783
        building_classes_proportions_and_properties = (
            self._retrieve_building_classes_of_data_units(
784
785
786
787
788
789
                data_table,
                id_column_name_data,
                data_units_ids,
                exposure_entity_name,
                occupancy_case,
                False,
790
            )
791
792
        )

793
        for data_unit_id in data_units_ids:
794
795
            self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
                "data_units"
796
797
798
799
800
801
802
803
            ][data_unit_id] = DataUnit(
                data_unit_id,
                geometries_table,
                id_column_name_geometries,
                total_buildings[data_unit_id],
                total_dwellings[data_unit_id],
                total_people[data_unit_id],
                total_cost[data_unit_id],
804
                building_classes_proportions_and_properties[data_unit_id],
805
806
            )

807
808
809
810
811
812
813
814
        # Check that the data units cover all of the exposure entity; if not, create an
        # additional filler unit to "fill in" the gaps
        if len(data_units_ids) > 0:  # nothing to "fill in" if no data units are defined at all
            filler_data_unit_geometry_table = self.ensure_full_geographic_coverage(
                exposure_entity_name,
                occupancy_case,
                geometries_table,
                configuration.number_cores,
815
816
                configuration.data_units_min_admisible_area,
                configuration.data_units_max_admisible_area,
817
818
819
820
                configuration.data_units_surface_threshold,
                configuration.force_creation_data_units,
            )
            if filler_data_unit_geometry_table.shape[0] > 0:
821
822
823
                # Retrieve proportions and properties of building classes (weighted average of
                # the whole exposure entity, as available from 'data_table'), applicable to all
                # filler data units created
824
825
826
                (
                    building_classes_proportions_and_properties,
                    _,
827
828
829
                ) = self._retrieve_building_classes(
                    data_table, exposure_entity_name, occupancy_case, True
                )
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857

                for i in range(filler_data_unit_geometry_table.shape[0]):
                    filler_data_unit_id = filler_data_unit_geometry_table["ID"].to_numpy()[i]

                    # Create new DataUnit object
                    self.exposure_entities[exposure_entity_name].occupancy_cases[
                        occupancy_case
                    ]["data_units"][filler_data_unit_id] = DataUnit(
                        filler_data_unit_id,
                        filler_data_unit_geometry_table,
                        "ID",
                        0.0,  # buildings
                        0.0,  # dwellings
                        {
                            "Census": 0.0,
                            "Day": 0.0,
                            "Night": 0.0,
                            "Transit": 0.0,
                            "Average": 0.0,
                        },
                        {
                            "Total": 0.0,
                            "Structural": 0.0,
                            "Non-Structural": 0.0,
                            "Contents": 0.0,
                        },
                        building_classes_proportions_and_properties,
                    )
858

859
860
        return

861
862
863
864
865
866
    def ensure_full_geographic_coverage(
        self,
        exposure_entity_name,
        occupancy_case,
        data_units_geometries_all,
        number_cores,
867
868
        min_admisible_area,
        max_admisible_area,
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
        surface_threshold=1.0,
        force_creation_data_units=False,
    ):
        """This function verifies that the data units that already exist in self.
        exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case]["data_units"]
        cover the whole geographical extent of self.exposure_entities[exposure_entity_name].
        geometry and creates a data unit to cover the missing area if needed. A series of checks
        are carried out to decide whether the creation of the data unit is needed:

        - If the number of already defined data units is different from the number of data units
        in 'data_units_geometries_all', a data unit is created.
        - Otherwise, the surface area of self.exposure_entities[exposure_entity_name].geometry
        is compared against the total surface area of all already defined data units:
            - If the percent difference between the two is larger than 'surface_threshold', a
            data unit is created.
            - If the percent difference between the two is smaller than 'surface_threshold' with
            a negative sign, a warning is logged, as this means that the area of the exposure
            entity is smaller than that of all already defined data units. A manual verification
            by the user is needed in this case. The function can be re-run with
            'force_creation_data_units' set to True so that a data unit is created irrespective
            of this check.
            - If the percent difference between the two is larger than 'surface_threshold' with
            a negative sign and smaller than 'surface_threshold', no data unit is created,
            unless 'force_creation_data_units' is set to True.

        Setting 'force_creation_data_units' set to True leads to the outcome of these checks to
        be ignored and a data unit to be created.

        The checks are required to avoid creating data units due to the geometries of the
        exposure entities and data units being defined with different resolutions or precision
        levels, i.e. to account for involuntary inconsistencies in these geometries.

        Args:
            exposure_entity_name (str):
                Name of the ExposureEntity for which the function will be run.
            occupancy_case (str):
                Name of the occupancy case (e.g. "residential", "commercial", "industrial") for
                which the function will be run. It needs to exist as a key of
                self.exposure_entities[exposure_entity_name].occupancy_cases.
            data_units_geometries_all (GeoPandas GeoDataFrame):
                GeoPandas GeoDataFrame containing the geometries of all data units associated
                with 'exposure_entity_name' and 'occupancy_case', irrespective of whether
                exposure is defined for them in the aggregated exposure model or not. They
                would normally be retrieved from a geodatafile.
            number_cores (int):
                Number of CPU cores to be used to run
                ExposureEntity.sum_surface_area_of_data_units(), which is called by this
                function.
917
918
919
920
921
922
            min_admisible_area (float):
                Minimum surface area (in m2) of the data unit(s) created by this function.
            max_admisible_area (float):
                Maximum surface area (in m2) of the data unit(s) created by this function. If
                the area to cover is larger than 'max_admisible_area', it gets successively
                subdivided until complying with this requisite.
923
924
925
926
927
            surface_threshold (positive float):
                Absolute threshold of the difference between the surface area of the exposure
                entity and that of all of its data units that defines the actions to be taken by
                this function (percentage equal to or larger than zero). Default: 1.0.
            force_creation_data_units (bool):
928
929
930
                If True, (a) data unit(s) covering the spatial extent of 'exposure_entity_name'
                that is not covered by already existing data units will be created, irrespective
                of all the built-in checks that make such a decision automatically.
931
932
933
934
935
936
937

        Returns:
            filler_data_unit_geometry_table (GeoPandas GeoDataFrame):
                GeoDataFrame with maximum one row. If empty, no data units were created as by
                this function. If not empty, it contains the geometry and name of the data unit
                created to cover the missing area of 'exposure_entity_name'. It contains the
                following columns:
938
                    ID:
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
                        ID of the created data unit.
                    geometry:
                        Geometry of the created data unit.
        """

        # Retrieve/prepare relevant variables
        # Number of data units that exist for 'exposure_entity_name' and 'occupancy_case'
        num_data_units_defined = len(
            self.exposure_entities[exposure_entity_name]
            .occupancy_cases[occupancy_case]["data_units"]
            .keys()
        )
        # Geometry of 'exposure_entity_name'
        geometry_exposure_entity = self.exposure_entities[exposure_entity_name].geometry
        # List of the geometries of all data units existing in the geodatafile for
        # 'exposure_entity_name' and 'occupancy_case'
        data_units_geometries_all_list = list(data_units_geometries_all["geometry"].to_numpy())
        # Geometries of all data units existing in the geodatafile for 'exposure_entity_name'
        # and 'occupancy_case', merged together as one Shapely Polygon/MultyPolygon
        data_units_geometries_all_dissolved = data_units_geometries_all.dissolve()[
            "geometry"
        ].to_numpy()[0]

        # Check that 'data_units_geometries_all' are contained in self.domain_geometry
        if self.domain_geometry is not None:
            data_units_all_fully_inside = SpatialTools.check_bounding_box_inside_polygon(
                data_units_geometries_all_dissolved, self.domain_geometry
            )
            if not data_units_all_fully_inside:
                # Trim the data units
                data_units_geometries_all_list = SpatialTools.multiple_geometries_intersection(
                    data_units_geometries_all_list, self.domain_geometry, False
                )

        num_data_units_all = len(data_units_geometries_all_list)

        new_data_unit_geometry = None

        if (num_data_units_defined != num_data_units_all) or force_creation_data_units:
            # Create data units to cover missing areas
            new_data_unit_geometry = SpatialTools.single_geometries_difference(
                geometry_exposure_entity,
                self.exposure_entities[exposure_entity_name].join_data_units(occupancy_case),
            )
        else:
            # Check difference in surface area of exposure entity and defined data units
            # Surface area of the data units for which exposure of occupancy_case is defined
            surface_data_units_defined = self.exposure_entities[
                exposure_entity_name
            ].sum_surface_area_of_data_units(occupancy_case, number_cores)
            # Surface area of the exposure entity (defined as administrative level 0)
            surface_exposure_entity = SpatialTools.get_area_albers_equal_area(
                geometry_exposure_entity
            )
            surface_difference = (
                100.0
                * (surface_exposure_entity - surface_data_units_defined)
                / surface_exposure_entity
            )

            if surface_difference > surface_threshold:
                # Create data units to cover what is missing