Commit 5565d977 authored by Cecilia Nievas's avatar Cecilia Nievas
Browse files

Added DataUnit class and feature to read data unit geometries

parent b9650126
Pipeline #25892 passed with stage
in 1 minute and 54 seconds
model_name: esrm20
exposure_format: esrm20 # Only supported value for now
data_pathname: path_to_directory_with_model_data
boundaries_pathname: path_to_directory_with_boundary_files
occupancies_to_run: residential, commercial # From 'occupancy_cases', industrial not supported
......@@ -21,7 +21,9 @@ import abc
import logging
import numpy
import pandas
import geopandas
from gdeimporter.exposureentity import ExposureEntity
from gdeimporter.dataunit import DataUnit
logger = logging.getLogger()
......@@ -45,6 +47,9 @@ class AggregatedExposureModel(abc.ABC):
names of the corresponding exposure entities.
self.filename_pattern (str):
Pattern of the names of the files that define the input aggregated exposure model.
self.boundary_filename_pattern (str):
Pattern of the names of the geodata files that contain the boundaries of the
exposure entities of the input aggregated exposure model.
"""
def __init__(self, configuration):
......@@ -53,6 +58,7 @@ class AggregatedExposureModel(abc.ABC):
self.occupancy_cases = None
self.exposure_entities = None
self.filename_pattern = None
self.boundary_filename_pattern = None
def retrieve_exposure_entities(self, configuration):
"""This function retrieves the exposure entities for which an input aggregated exposure
......@@ -101,6 +107,9 @@ class ExposureModelESRM20(AggregatedExposureModel):
types of data_units.
self.filename_pattern (str):
Pattern of the names of the ESRM20 CSV files.
self.boundary_filename_pattern (str):
Pattern of the names of the geodata files that contain the boundaries of the
exposure entities of the ESRM20 model.
self.file_structure (dict):
Dictionary specifying the file structure of the ESRM20 model files within the data
pathname specified in the configuration file. It contains the following subkeys:
......@@ -139,6 +148,11 @@ class ExposureModelESRM20(AggregatedExposureModel):
"first": "name",
"second": "short",
}
self.boundary_filename_pattern = {
"filename": "Adm%s_%s.shp",
"first": "data_units_level",
"second": "name",
}
self.file_structure = {
"metadata": "sources/European_Exposure_Model_Data_Inputs_Sources.xlsx",
"CSVs": "_exposure_models",
......@@ -244,9 +258,9 @@ class ExposureModelESRM20(AggregatedExposureModel):
return exposure_entities
def get_data_units_names(self, configuration, exposure_entity, occupancy_case):
"""This function retrieves the names of the data units associated with a specified
exposure_entity and occupancy_case in the ESRM20 model.
def get_data_units(self, configuration, exposure_entity_name, occupancy_case):
"""This function adds the DataUnits associated with an ExposureEntity with name
'exposure_entity_name' and occupancy `occupancy_case`.
Args:
configuration (Configuration object):
......@@ -254,10 +268,12 @@ class ExposureModelESRM20(AggregatedExposureModel):
data_pathname (str):
Path to the directory that contains the input aggregated exposure model
data.
exposure_entity (ExposureEntity object):
Instance of the ExposureEntity class, with at leas the following attribute:
exposure_entity_name (str):
Name of the ExposureEntity whose data units will be retrieved. It needs to be a
key of self.exposure_entities and self.exposure_entities[exposure_entity_name]
needs to have at least the following attribute:
name (str):
Name of the exposure entity.
Name of the ExposureEntity.
occupancy_cases (dict):
Dictionary as defined in the attributes of ExposureEntity.
'occupancy_case' needs to be one of its keys, with existing subkey
......@@ -265,33 +281,61 @@ class ExposureModelESRM20(AggregatedExposureModel):
occupancy_case (str):
Name of the occupancy case (e.g. "residential", "commercial", "industrial") for
which the names of the data units will be retrieved. It needs to exist as a key
of exposure_entity.occupancy_cases.
of self.exposure_entities[exposure_entity_name].occupancy_cases.
Returns:
The function will update 'exposure_entity', creating the subkey 'data_units_names'
to exposure_entity.occupancy_cases[occupancy_case] and storing in it the list of
data unit names (as strings). The list will be empty if it was impossible to
retrieve the names (but the subkey will still be created).
The function creates the subkey 'data_units' in
self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case] and
stores in it a dictionary with instances of DataUnit. The keys of the dictionary are
the data unit IDs. The dictionary is empty if it was impossible to retrieve the IDs
of the data units.
"""
# Read names of data units from ESRM20's CSV files
target_column_name = "ID_%s" % (
str(exposure_entity.occupancy_cases[occupancy_case]["data_units_level"])
str(
self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
"data_units_level"
]
)
)
datatypes = {target_column_name: str}
data_table = self._read_data_table(
configuration, exposure_entity, occupancy_case, datatypes
configuration,
self.exposure_entities[exposure_entity_name],
occupancy_case,
datatypes,
)
try:
exposure_entity.occupancy_cases[occupancy_case]["data_units_names"] = list(
data_table[target_column_name].unique()
)
data_units_names = list(data_table[target_column_name].unique())
except KeyError:
exposure_entity.occupancy_cases[occupancy_case]["data_units_names"] = []
data_units_names = []
logger.critical(
"Error while retrieving 'data_units_names' of %s, %s: column `%s` not found"
% (exposure_entity.name, occupancy_case, target_column_name)
% (exposure_entity_name, occupancy_case, target_column_name)
)
# Read geometries of data units from ESRM20's boundary files
self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
"data_units"
] = {}
target_column_name = "ID_%s" % (
self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
self.boundary_filename_pattern["first"]
]
)
geometries_table = self._read_geometries_table(
configuration,
self.exposure_entities[exposure_entity_name],
occupancy_case,
datatypes,
)
for data_unit_name in data_units_names:
self.exposure_entities[exposure_entity_name].occupancy_cases[occupancy_case][
"data_units"
][data_unit_name] = DataUnit(data_unit_name, geometries_table, target_column_name)
return
def _map_data_units_types(self, original_description):
......@@ -384,4 +428,69 @@ class ExposureModelESRM20(AggregatedExposureModel):
sep=",",
)
if exposure_entity.name == "France" and "ID_5" in datatypes:
data_table["ID_5"] = numpy.array(
[val.zfill(5) for val in data_table["ID_5"].values]
)
return data_table
def _read_geometries_table(self, configuration, exposure_entity, occupancy_case, datatypes):
    """This function loads the geodata file with the ESRM20-compatible boundaries of an
    exposure entity and occupancy case, and returns its contents as a GeoPandas
    GeoDataFrame re-projected to EPSG:4326.

    The name of the file is built from self.boundary_filename_pattern: its "filename"
    template is filled in with (1) the value stored under the key named by "first" in
    exposure_entity.occupancy_cases[occupancy_case] and (2) the attribute of
    exposure_entity named by "second".

    Args:
        configuration (Configuration object):
            Instance of the Configuration class, with at least the following attribute:
                boundaries_pathname (str):
                    Path to the directory that contains the geodata files with the
                    boundaries.
        exposure_entity (ExposureEntity object):
            Instance of the ExposureEntity class, with at least the following attributes:
                name (str):
                    Name of the exposure entity.
                occupancy_cases (dict):
                    Dictionary defining the type, level and definition of the data units
                    used for each occupancy case of the model.
        occupancy_case (str):
            Name of the occupancy case (e.g. "residential", "commercial", "industrial") for
            which the geodata file will be read. It is used as a key of
            exposure_entity.occupancy_cases.
        datatypes (dict):
            Dictionary indicating the data type/s of (a) column/s of interest in the
            geodata file (keys are column names, values are the desired types). GeoPandas
            first determines the data types on its own and the columns listed here are
            then cast to the requested types. It can be an empty dictionary.

    Returns:
        geometries_table (GeoPandas GeoDataFrame):
            GeoDataFrame with all contents of the geodata file. Geometries are returned in
            EPSG:4326.
    """

    pattern = self.boundary_filename_pattern
    first_field = exposure_entity.occupancy_cases[occupancy_case][pattern["first"]]
    second_field = getattr(exposure_entity, pattern["second"])
    filename = pattern["filename"] % (first_field, second_field)
    filepath = os.path.join(configuration.boundaries_pathname, filename)

    # Reading errors are left to geopandas; the result is re-projected to EPSG:4326
    geometries_table = geopandas.GeoDataFrame.from_file(filepath).to_crs("EPSG:4326")

    # The data types are enforced after reading because dtype=datatypes does not work in
    # geopandas.GeoDataFrame.from_file
    for column_name, wanted_type in datatypes.items():
        column = geometries_table[column_name]
        if wanted_type is str and column.dtype == "float64":
            # Integers that geopandas has read as floats go through int first, so that
            # the conversion to str is applied to the integer values
            column = column.astype(int)
        geometries_table[column_name] = column.astype(wanted_type)

    return geometries_table
......@@ -32,6 +32,9 @@ class Configuration:
Format of the input aggregated model. Currently supported values: "esrm20".
self.data_pathname (str):
Path to the directory that contains the input aggregated exposure model data.
self.boundaries_pathname (str):
Path to the directory that contains the boundaries of the exposure units covered
by the input aggregated exposure model.
self.occupancies_to_run (list of str):
List of keys of occupancy_cases of the input aggregated exposure model for which
data will be retrieved.
......@@ -39,6 +42,7 @@ class Configuration:
REQUIRES = [
"data_pathname",
"boundaries_pathname",
"occupancies_to_run",
]
......@@ -55,6 +59,7 @@ class Configuration:
self.exposure_format = self._assign_parameter(config, "exposure_format")
self.data_pathname = self._assign_parameter(config, "data_pathname")
self.boundaries_pathname = self._assign_parameter(config, "boundaries_pathname")
self.occupancies_to_run = self._assign_listed_parameters(config, "occupancies_to_run")
# Terminate if critical parameters are missing (not all parameters are critical)
......
#!/usr/bin/env python3
# Copyright (C) 2021:
# Helmholtz-Zentrum Potsdam Deutsches GeoForschungsZentrum GFZ
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
# General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
import logging
import numpy
logger = logging.getLogger()
class DataUnit:
    """This class represents the smallest geographical unit of an ExposureEntity, that is, the
    smallest geographical unit where an exposure model is defined, i.e. where data is available
    from an input aggregated exposure model for a particular occupancy case.

    Attributes:
        self.id (str):
            ID of the DataUnit (e.g. ID of the administrative unit it represents).
        self.geometry (Shapely polygon or None):
            Geometry of the data unit, as read from the table of geometries passed to the
            constructor. It is None if the geometry could not be retrieved.
    """

    def __init__(self, dataunit_id, geometries_table, target_column_name):
        """Store the ID of the data unit and look its geometry up in 'geometries_table'.

        Args:
            dataunit_id (str):
                ID of the DataUnit.
            geometries_table (GeoPandas GeoDataFrame):
                Table of geometries, as described in get_data_unit_geometry().
            target_column_name (str):
                Name of the column of 'geometries_table' in which 'dataunit_id' will be
                sought.
        """

        self.id = dataunit_id
        self.geometry = self.get_data_unit_geometry(geometries_table, target_column_name)

    def get_data_unit_geometry(self, geometries_table, target_column_name):
        """This function retrieves the geometry of the data unit, by reading it from
        geometries_table. If the ID of the data unit appears more than once in the target
        column, the geometry of the first occurrence is returned.

        Args:
            geometries_table (GeoPandas GeoDataFrame):
                GeoPandas GeoDataFrame containing at least two columns:
                    target_column_name (str):
                        Column where the ID of the Data Unit will be sought. The data type of
                        the elements of this column needs to be 'string', otherwise the
                        function might fail to find the name of the Data Unit in it.
                    geometry (Shapely geometry):
                        Geometry.
            target_column_name (str):
                Name of the column in which the ID of the Data Unit will be sought.

        Returns:
            geometry (Shapely geometry):
                Geometry of the DataUnit (in the same EPSG as 'geometries_table'). It is None
                (and a critical message is logged) if the ID of the data unit is not found in
                the target column, or if either the target column or the 'geometry' column
                does not exist in 'geometries_table'.
        """

        # Only the column look-ups are guarded: a missing column is treated like a missing
        # ID (log and return None) instead of propagating a KeyError to the caller
        try:
            candidate_ids = geometries_table[target_column_name].values
            geometries = geometries_table["geometry"].values
        except KeyError as missing_column:
            logger.critical(
                "Error: geometry of data unit '%s' not retrieved, column %s does not exist",
                self.id,
                missing_column,
            )
            return None

        # flatnonzero ravels its input, so it also copes with the comparison collapsing to a
        # single boolean (e.g. when the column does not hold strings)
        matching_rows = numpy.flatnonzero(candidate_ids == self.id)

        if len(matching_rows) == 0:
            logger.critical(
                "Error: geometry of data unit '%s' not found in target column %s",
                self.id,
                target_column_name,
            )
            return None

        return geometries[matching_rows[0]]
......@@ -63,9 +63,11 @@ class ExposureEntity:
| | (if "polygon" type) in which the data unit is defined. E.g.
| | "WGS84" for cells, "NUTS" for polygons that represent
| | administrative units.
| |_ data_units_names (list of str):
| | List of names of the data units associated with this
| | ExposureEntity and occupancy case.
| |_ data_units (dict of DataUnit):
| | Dictionary of instances of DataUnit objects, each of which
| | represent the smallest geographical unit where an exposure model
| | is defined. See attributes in description of DataUnit. The keys
| | of the dictionary are the IDs of the corresponding data units.
|_ occupancy_cases.keys()[1]
| |_ data_units_type: ...
|_ ...
......
......@@ -23,7 +23,7 @@ from gdeimporter.aggregatedexposuremodel import ExposureModelESRM20
# Add a logger printing error, warning and info messages to the screen
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))
EXPOSURE_MODELS = {"esrm20": ExposureModelESRM20}
......@@ -47,12 +47,10 @@ def main():
if len(aem.exposure_entities) < 1:
raise ValueError("no exposure entities found in %s" % (aem.model_name))
# Retrieve names of data units per exposure entity and occupancy case
# Retrieve data units per exposure entity and occupancy case
for exposure_entity_name in aem.exposure_entities:
for occupancy_case in config.occupancies_to_run:
aem.get_data_units_names(
config, aem.exposure_entities[exposure_entity_name], occupancy_case
)
aem.get_data_units(config, exposure_entity_name, occupancy_case)
print("Name of the model: %s" % (aem.model_name))
print("Format: %s" % (aem.exposure_format))
......@@ -72,23 +70,38 @@ def main():
aem.exposure_entities[exposure_entity].occupancy_cases[case][attr],
)
)
if (
"data_units_names"
in aem.exposure_entities[exposure_entity].occupancy_cases[case]
):
if "data_units" in aem.exposure_entities[exposure_entity].occupancy_cases[case]:
if (
len(
aem.exposure_entities[exposure_entity].occupancy_cases[case][
"data_units_names"
"data_units"
]
)
> 0
):
print(" data_units_names: retrieved")
print(" data_units names: retrieved")
geometries_retrieved = True
for data_unit_id in aem.exposure_entities[exposure_entity].occupancy_cases[
case
]["data_units"]:
if (
aem.exposure_entities[exposure_entity]
.occupancy_cases[case]["data_units"][data_unit_id]
.geometry
is None
):
geometries_retrieved = False
break
if geometries_retrieved:
print(" data_units geometries: retrieved")
else:
print(" data_units geometries: not retrieved")
else:
print(" data_units_names: not retrieved")
print(" data_units names: not retrieved")
print(" data_units geometries: not retrieved")
else:
print(" data_units_names: not retrieved")
print(" data_units names: not retrieved")
print(" data_units geometries: not retrieved")
# Leave the program
logger.info("gde-importer has finished")
......
......@@ -28,7 +28,7 @@ setup(
keywords="Global Dynamic Exposure, GDE, buildings, exposure model",
author="Helmholtz-Zentrum Potsdam Deutsches GeoForschungsZentrum GFZ",
license="AGPLv3+",
install_requires=["numpy", "pyyaml", "pandas", "openpyxl"],
install_requires=["numpy", "pyyaml", "pandas", "openpyxl", "geopandas"],
extras_require={
"tests": tests_require,
"linters": linters_require,
......
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
\ No newline at end of file
GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]
\ No newline at end of file
Entity,DataUnit,LonW,LonE,LatN,LatS
Entity_1,Unit_X,11.00,12.00,35.00,34.00
Entity_1,Unit_Y,12.00,13.00,36.00,35.00
Entity_1,Unit_Z,13.00,14.00,37.00,36.00
Entity_1,Unit_1,14.00,15.00,38.00,37.00
Entity_1,Unit_2,15.00,16.00,39.00,38.00
Entity_1,Unit_3,16.00,17.00,40.00,39.00
Entity_2,Unit_A,21.00,22.00,45.00,44.00
Entity_2,Unit_B,22.00,23.00,46.00,45.00
Entity_2,Unit_C,23.00,24.00,47.00,46.00
model_name: esrm20
exposure_format: esrm20
data_pathname: /some/path/to/directory
boundaries_pathname: /some/path/to/directory
occupancies_to_run: residential, commercial, industrial
model_name: esrm20
exposure_format: esrm20
data_pathname: /some/path/to/directory
boundaries_pathname: /some/path/to/directory
occupancies_to_run: residential
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment