From 131edb30f5f3b9c39d0bf2c728411b16d629646a Mon Sep 17 00:00:00 2001 From: Cecilia Nievas <cnievas@gfz-potsdam.de> Date: Fri, 8 Apr 2022 18:41:53 +0200 Subject: [PATCH] Added feature to process data-unit tiles --- .gitlab-ci.yml | 1 + config_example.yml | 8 + gdecore/configuration.py | 32 ++++ gdecore/database_queries.py | 71 ++++++++ gdecore/gdecore.py | 23 ++- gdecore/processor.py | 199 ++++++++++++++++++++++ tests/data/config_for_testing_good.yml | 7 + tests/data/config_for_testing_missing.yml | 7 + tests/data/test_database_set_up.sql | 13 ++ tests/test_configuration.py | 10 ++ tests/test_database_queries.py | 24 +++ tests/test_processor.py | 105 ++++++++++++ 12 files changed, 497 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 558305c..449c565 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,6 +14,7 @@ variables: GDEIMPORTER_DB: ${POSTGRES_DB} GDEIMPORTER_USER: ${POSTGRES_USER} GDEIMPORTER_PASSWORD: ${POSTGRES_PASSWORD} + GDEIMPORTER_SOURCEID: 1 cache: paths: diff --git a/config_example.yml b/config_example.yml index 3ce8db1..bd48285 100644 --- a/config_example.yml +++ b/config_example.yml @@ -14,3 +14,11 @@ database_obm_buildings: # Database where info on the OBM buildings is stored port: port_number # Leave empty if a port number is not needed username: username password: password_of_username +database_completeness: # Database where completeness per tile is stored + host: host_name + dbname: database_name + port: port_number # Leave empty if a port number is not needed + username: username + password: password_of_username + sourceid: 1 +number_cores: 1 # Number of cores used for parallelisation diff --git a/gdecore/configuration.py b/gdecore/configuration.py index b6d3c7a..ff69775 100644 --- a/gdecore/configuration.py +++ b/gdecore/configuration.py @@ -75,6 +75,25 @@ class Configuration: User name to connect to the SQL database. password (str): Password associated with self.username. 
+ self.database_completeness (dict): + Dictionary containing the credentials needed to connect to the SQL database in which + the completeness per quadtile is stored. The exact parameters needed depend on the + database. They can be: + host (str): + SQL database host address. + dbname (str): + Name of the SQL database. + port (int): + Port where the SQL database can be found. + username (str): + User name to connect to the SQL database. + password (str): + Password associated with self.username. + sourceid (int): + ID of the built-up area source dataset used to determine completeness that + will be sought for. + self.number_cores (int): + Number of cores that will be used to run the code. """ REQUIRES = [ @@ -84,6 +103,8 @@ class Configuration: "occupancies_to_run", "database_gde_tiles", "database_obm_buildings", + "database_completeness", + "number_cores", ] def __init__(self, filepath, force_config_over_hierarchies=False): @@ -114,6 +135,13 @@ class Configuration: force_config_over_hierarchies, ) + self.database_completeness = ConfigurationMethods.retrieve_database_credentials( + config, + "database_completeness", + "test_db_completeness.env", + force_config_over_hierarchies, + ) + self.exposure_entities_to_run = ConfigurationMethods.assign_listed_parameters( config, "exposure_entities_to_run" ) @@ -141,6 +169,10 @@ class Configuration: config, "occupancies_to_run" ) + self.number_cores = ConfigurationMethods.assign_integer_parameter( + config, "number_cores" + ) + # Terminate if critical parameters are missing (not all parameters are critical) for key_parameter in self.REQUIRES: if getattr(self, key_parameter) is None: diff --git a/gdecore/database_queries.py b/gdecore/database_queries.py index d49e639..3ef8045 100644 --- a/gdecore/database_queries.py +++ b/gdecore/database_queries.py @@ -785,3 +785,74 @@ class DatabaseQueries: ) return data_unit_tiles + + @staticmethod + def get_automatic_completeness_of_quadkey(quadkey, db_completeness_config, db_table): + 
"""This function retrieves the completeness status of a tile with respect to the + representation of buildings in OpenStreetMap (OSM). If a tile is OSM-complete, then all + buildings that exist in reality are represented in OSM. If a tile is OSM-incomplete, + then some buildings that exist in reality are not yet represented in OSM. + + Args: + quadkey (str): + Quadkey of the tile for which the completeness status will be retrieved. + db_completeness_config (dict): + Dictionary containing the credentials needed to connect to the SQL database in + which completeness data are stored. The keys of the dictionary need to be: + host (str): + SQL database host address. + dbname (str): + Name of the SQL database. + port (int): + Port where the SQL database can be found. + username (str): + User name to connect to the SQL database. + password (str): + Password associated with 'username'. + sourceid (int): + ID of the automatic completeness source dataset to be queried. + db_table (str): + Name of the table of the SQL database where the completeness data are stored. It + is assumed that this table contains, at least, the following fields: + quadkey (str): + String indicating the quadkey of a tile. + completeness (int): + Completeness code: 0 = incomplete, 1 = complete. + source_id (int): + ID of the source used to define the built-up area and completeness. + + Returns: + completeness (bool): + True if the tile is OSM-complete, False if it is OSM-incomplete. 
+ """ + + sql_query = "SELECT completeness FROM %s WHERE (quadkey='%s' AND source_id=%s);" + + db_completeness = Database(**db_completeness_config) + db_completeness.create_connection_and_cursor() + + db_completeness.cursor.execute( + sql_query % (db_table, quadkey, db_completeness_config["sourceid"]) + ) + exec_result = db_completeness.cursor.fetchall() + + if len(exec_result) == 0: + # If quadkey not found => GHSL built-up area is zero => treat as complete + completeness_code = 1 + elif len(exec_result) == 1: + completeness_code = exec_result[0][0] + else: # More than one entries found, this is an error + # This should not happen, as the database should not allow two entries with the + # same primary key + logger.error( + "ERROR in get_tile_automatic_completeness: " + "more than one entry found for quadkey='%s' AND source_id='%s " + % (quadkey, db_completeness_config["sourceid"]) + ) + completeness_code = 0 + + db_completeness.close_connection() + + completeness = bool(completeness_code) + + return completeness diff --git a/gdecore/gdecore.py b/gdecore/gdecore.py index b8a4a6f..93df43a 100644 --- a/gdecore/gdecore.py +++ b/gdecore/gdecore.py @@ -102,15 +102,16 @@ def main(): ) ) - # Retrieve OBM buildings and assign building classes and probabilities to them for i, data_unit_id in enumerate(data_units_ids): + # Going by data unit so as to minimise intersection operations, need to hold + # excessively large data in RAM and because building classes are associated with + # specific data units aux_log_string = ( "Data unit '%s' (of exposure entity '%s' and occupancy case '%s')" % (data_unit_id, exposure_entity_code, occupancy_case) ) - # Going by data unit so as to minimise intersection operations and because - # building classes are associated with specific data units + # Retrieve OBM buildings and assign building classes and probabilities to them # Retrieve OBM buildings obm_buildings_raw = ( DatabaseQueries.get_OBM_buildings_in_data_unit_by_occupancy_case( @@ 
-135,6 +136,13 @@ def main(): % (aux_log_string, str(obm_buildings.shape[0])) ) + del obm_buildings_raw + + # Calculate number of OBM buildings per quadkey + obm_buildings_per_quadkey = GDEProcessor.calculate_buildings_per_quadkey( + obm_buildings["quadkey"].to_numpy() + ) + # Retrieve building classes of this data unit data_unit_building_classes = DatabaseQueries.get_building_classes_of_data_unit( data_unit_id, @@ -172,6 +180,15 @@ def main(): % (aux_log_string, str(data_unit_tiles.shape[0])) ) + # Calculate remainder buildings in data-unit tiles + data_unit_tiles = GDEProcessor.process_group_data_unit_tiles( + data_unit_tiles, + obm_buildings_per_quadkey, + config.database_completeness, + "obm_built_area_assessments", + config.number_cores, + ) + # Leave the program logger.info("gde-core has finished") sys.exit() diff --git a/gdecore/processor.py b/gdecore/processor.py index 58e7886..7beaf08 100644 --- a/gdecore/processor.py +++ b/gdecore/processor.py @@ -18,10 +18,13 @@ import logging from copy import deepcopy +from multiprocessing import Pool +from functools import partial import numpy import mercantile import pandas import pyproj +from gdecore.database_queries import DatabaseQueries logger = logging.getLogger() @@ -683,3 +686,199 @@ class GDEProcessor: building_classes.loc[:, "proportions"] = old_proportions / old_proportions.sum() return building_classes + + @staticmethod + def calculate_buildings_per_quadkey(quadkeys_of_buildings): + """This function identifies unique elements in 'quadkeys_of_buildings' and returns a + Pandas DataFrame in which the indices are the unique elements of 'quadkeys_of_buildings' + and the column "counts" indicates the number of times each index (quadkey) is present in + the input array. + + Args: + quadkeys_of_buildings (arr of str): + Array of quadkeys in which elements can be repeated. 
+ + Returns: + counts_per_quadkey (Pandas DataFrame): + DataFrame in which the indices are the unique elements of + 'quadkeys_of_buildings' and the column "counts" indicates the number of times + each index (quadkey) is present in 'quadkeys_of_buildings'. + """ + + unique_quadkeys, counts = numpy.unique(quadkeys_of_buildings, return_counts=True) + + counts_per_quadkey = pandas.DataFrame({"counts": counts}, index=unique_quadkeys) + + return counts_per_quadkey + + @staticmethod + def process_group_data_unit_tiles( + data_unit_tiles, + obm_buildings_per_quadkey, + db_completeness_config, + db_table, + number_cores=1, + ): + """This function processes the data-unit tiles contained in 'data_unit_tiles' using as + many cores as indicated by 'number_cores'. The processing consists of retrieving the + OSM-completeness value of each data-unit tile and calculating the number of remainder + buildings as a function of the number of aggregated buildings, OBM buildings and + completeness. + + Args: + data_unit_tiles (Pandas DataFrame): + Pandas DataFrame with data-unit tiles. It contains the following columns: + quadkey (str): + String indicating the quadkey of a tile. + aggregated_buildings (float): + Number of buildings in the data-unit tile as per the aggregated exposure + model with ID 'aggregated_source_id'. + obm_buildings_per_quadkey (Pandas DataFrame): + Pandas DataFrame with number of OBM buildings ("counts" column) per quadkey + (index). + db_completeness_config (dict): + Dictionary containing the credentials needed to connect to the SQL database in + which completeness data are stored. The keys of the dictionary need to be: + host (str): + SQL database host address. + dbname (str): + Name of the SQL database. + port (int): + Port where the SQL database can be found. + username (str): + User name to connect to the SQL database. + password (str): + Password associated with self.username. 
+ sourceid (int): + ID of the automatic completeness source dataset to be queried. + db_table (str): + Name of the table of the SQL database where the completeness data are stored. It + is assumed that this table contains, at least, the following fields: + quadkey (str): + String indicating the quadkey of a tile. + completeness (int): + Completeness code: 0 = incomplete, 1 = complete. + source_id (int): + ID of the source used to define the built-up area and completeness. + number_cores (int): + Number of CPU cores to be used to run this function. Default: 1. + + Returns: + data_unit_tiles_full (Pandas DataFrame): + Pandas DataFrame with data-unit tiles. It contains the following columns: + quadkey (str): + String indicating the quadkey of a tile. + aggregated_buildings (float): + Number of buildings in the data-unit tile as per the aggregated exposure + model with ID 'aggregated_source_id'. + remainder_buildings (float): + Number of remainder buildings in the data-unit tile. + complete (bool): + True if the tile is OSM-complete, False if it is OSM-incomplete. 
+ """ + + # Combine 'data_unit_tiles' and 'obm_buildings_per_quadkey' to have numbers of + # aggregated and OBM buildings per data-unit tiles in the same DataFrame + data_unit_tiles_full = data_unit_tiles.join(obm_buildings_per_quadkey.counts, "quadkey") + data_unit_tiles_full = data_unit_tiles_full.rename(columns={"counts": "obm_buildings"}) + + # Prepare 'data_unit_tiles' for parallel processing: + # each tuple contains (quadkey, aggregated_buildings, obm_buildings) of a data-unit tile + data_unit_tiles_list = [ + ( + data_unit_tiles_full["quadkey"].to_numpy()[j], + data_unit_tiles_full["aggregated_buildings"].to_numpy()[j], + data_unit_tiles_full["obm_buildings"].to_numpy()[j], + ) + for j in range(data_unit_tiles.shape[0]) + ] + + # Process data-unit tiles in parallel + p = Pool(processes=number_cores) + func = partial( + GDEProcessor.process_data_unit_tile, + db_completeness_config, + db_table, + ) + completeness_and_remainder = p.map(func, data_unit_tiles_list) + p.close() + p.join() + + data_unit_tiles_full["remainder_buildings"] = [ + completeness_and_remainder[i][0] for i in range(len(completeness_and_remainder)) + ] + data_unit_tiles_full["complete"] = [ + completeness_and_remainder[i][1] for i in range(len(completeness_and_remainder)) + ] + + return data_unit_tiles_full + + @staticmethod + def process_data_unit_tile(db_completeness_config, db_table, data_unit_tiles_attributes): + """This function calculates the number of buildings expected to exist in the data-unit + tile apart from the OBM buildings, i.e. the "remainder" buildings. If the tile is + complete, the number of remainder buildings is zero. If the tile is incomplete, the + number of remainder buildings is the difference between the number of aggregated + buildings and the number of OBM buildings, with a minimum value of zero. + + Args: + db_completeness_config (dict): + Dictionary containing the credentials needed to connect to the SQL database in + which completeness data are stored. 
The keys of the dictionary need to be: + host (str): + SQL database host address. + dbname (str): + Name of the SQL database. + port (int): + Port where the SQL database can be found. + username (str): + User name to connect to the SQL database. + password (str): + Password associated with 'username'. + sourceid (int): + ID of the automatic completeness source dataset to be queried. + db_table (str): + Name of the table of the SQL database where the completeness data are stored. It + is assumed that this table contains, at least, the following fields: + quadkey (str): + String indicating the quadkey of a tile. + completeness (int): + Completeness code: 0 = incomplete, 1 = complete. + source_id (int): + ID of the source used to define the built-up area and completeness. + data_unit_tiles_attributes (tuple of (str, float, int)): + Attributes of this data-unit tile. The elements of the tuple are: + quadkey (str): + String indicating the quadkey of a tile. + aggregated_buildings (float): + Number of buildings in the data-unit tile as per an aggregated exposure + model. + obm_buildings (int): + Number of OBM buildings in the data-unit tile. + + Returns: + remainder_buildings, complete (tuple of (float, bool)): + The elements of the tuple are: + remainder_buildings (float): + Number of remainder buildings in the data-unit tile. + complete (bool): + True if the 'quadkey' in 'data_unit_tiles_attributes' is OSM-complete, + False if it is OSM-incomplete. 
+ """ + + # Split contents of data_unit_tiles_attributes + quadkey = data_unit_tiles_attributes[0] + aggregated_buildings = data_unit_tiles_attributes[1] + obm_buildings = data_unit_tiles_attributes[2] + + # Retrieve completeness value + complete = DatabaseQueries.get_automatic_completeness_of_quadkey( + quadkey, db_completeness_config, db_table + ) + + if complete: + remainder_buildings = 0.0 + else: + remainder_buildings = max(0.0, aggregated_buildings - obm_buildings) + + return (remainder_buildings, complete) diff --git a/tests/data/config_for_testing_good.yml b/tests/data/config_for_testing_good.yml index a4c957c..73010bc 100644 --- a/tests/data/config_for_testing_good.yml +++ b/tests/data/config_for_testing_good.yml @@ -12,3 +12,10 @@ database_obm_buildings: dbname: some_database_name username: some_username password: some_password +database_completeness: + host: host.somewhere.xx + dbname: some_database_name + username: some_username + password: some_password + sourceid: 1 +number_cores: 1 diff --git a/tests/data/config_for_testing_missing.yml b/tests/data/config_for_testing_missing.yml index d577ba0..3f7c4c3 100644 --- a/tests/data/config_for_testing_missing.yml +++ b/tests/data/config_for_testing_missing.yml @@ -11,3 +11,10 @@ database_obm_buildings: dbname: some_database_name username: some_username password: some_password +database_completeness: + host: host.somewhere.xx + dbname: some_database_name + username: some_username + password: some_password + sourceid: 1 +number_cores: 1 diff --git a/tests/data/test_database_set_up.sql b/tests/data/test_database_set_up.sql index be3a9da..c253762 100644 --- a/tests/data/test_database_set_up.sql +++ b/tests/data/test_database_set_up.sql @@ -3,6 +3,7 @@ DROP TABLE IF EXISTS data_units; DROP TABLE IF EXISTS obm_buildings; DROP TABLE IF EXISTS data_units_buildings; DROP TABLE IF EXISTS data_unit_tiles; +DROP TABLE IF EXISTS obm_built_area_assessments; DROP TYPE IF EXISTS occupancycase; DROP TYPE IF EXISTS 
settlement; DROP EXTENSION IF EXISTS postgis; @@ -197,3 +198,15 @@ VALUES ('122010321033023130', 2, 'residential', 'ABC', 'ABC_10269', 0.0, 0.0, 0. ('122010321033023120', 2, 'commercial', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 17.6), ('122010321033023132', 2, 'residential', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 34.4), ('122010321033023132', 2, 'commercial', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 11.5); + +CREATE TABLE obm_built_area_assessments +( + quadkey char(18), + source_id SMALLINT, + completeness SMALLINT, + PRIMARY KEY (quadkey, source_id) +); +INSERT INTO obm_built_area_assessments(quadkey, source_id, completeness) +VALUES ('122010321033023130', 1, 0), +('122010321033023120', 1, 0), +('122010321033023132', 1, 1); diff --git a/tests/test_configuration.py b/tests/test_configuration.py index 8d767c1..eabcd39 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -33,12 +33,22 @@ def test_Configuration(): assert returned_config.database_gde_tiles["dbname"] == "some_database_name" assert returned_config.database_gde_tiles["username"] == "some_username" assert returned_config.database_gde_tiles["password"] == "some_password" + assert returned_config.database_obm_buildings["host"] == "host.somewhere.xx" + assert returned_config.database_obm_buildings["dbname"] == "some_database_name" + assert returned_config.database_obm_buildings["username"] == "some_username" + assert returned_config.database_obm_buildings["password"] == "some_password" + assert returned_config.database_completeness["host"] == "host.somewhere.xx" + assert returned_config.database_completeness["dbname"] == "some_database_name" + assert returned_config.database_completeness["username"] == "some_username" + assert returned_config.database_completeness["password"] == "some_password" + assert returned_config.database_completeness["sourceid"] == 1 assert len(returned_config.exposure_entities_to_run) == 1 assert returned_config.exposure_entities_to_run[0] == "Italy" assert 
returned_config.exposure_entities_code == "ISO3" assert len(returned_config.occupancies_to_run) == 2 assert returned_config.occupancies_to_run[0] == "residential" assert returned_config.occupancies_to_run[1] == "commercial" + assert returned_config.number_cores == 1 # Test case in which the file is not found with pytest.raises(OSError) as excinfo: diff --git a/tests/test_database_queries.py b/tests/test_database_queries.py index e534064..9f7eedd 100644 --- a/tests/test_database_queries.py +++ b/tests/test_database_queries.py @@ -379,3 +379,27 @@ def test_get_data_unit_tiles_of_data_unit_as_DataFrame(test_db): assert returned_data_unit_tiles.shape[0] == 0 assert "quadkey" in returned_data_unit_tiles.columns assert "aggregated_buildings" in returned_data_unit_tiles.columns + + +def test_get_automatic_completeness_of_quadkey(test_db): + # Database connection (the Configuration class will define the credentials based on whether + # the code is running in the CI or locally) + config = Configuration( + os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml") + ) + + quadkeys = [ + "122010321033023130", + "122010321033023120", + "122010321033023132", + "999999999999999999", # will not be found -> return complete + ] + + expected_completeness = [False, False, True, True] + + for i, quadkey in enumerate(quadkeys): + returned_completeness = DatabaseQueries.get_automatic_completeness_of_quadkey( + quadkey, config.database_completeness, "obm_built_area_assessments" + ) + + assert returned_completeness == expected_completeness[i] diff --git a/tests/test_processor.py b/tests/test_processor.py index f4622f8..2c5c2fa 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -408,3 +408,108 @@ def test_narrow_down_by_commercial_occupancy_details(): assert returned_obm_building_classes.shape[0] == building_classes.shape[0] for bdg_class_name in building_classes["building_class_name"].to_numpy(): assert bdg_class_name in 
returned_obm_building_classes["building_class_name"].to_numpy() + + +def test_calculate_buildings_per_quadkey(): + + quadkeys_of_buildings = numpy.array( + [ + "122010321033023130", + "122010321033023120", + "122010321033023120", + "122010321033023130", + "122010321033023132", + "122010321033023132", + "122010321033023132", + "122010321033023132", + "122010321033023130", + ] + ) + + returned_counts_per_quadkey = GDEProcessor.calculate_buildings_per_quadkey( + quadkeys_of_buildings + ) + + assert returned_counts_per_quadkey.shape[0] == 3 + assert returned_counts_per_quadkey.loc["122010321033023130", "counts"] == 3 + assert returned_counts_per_quadkey.loc["122010321033023120", "counts"] == 2 + assert returned_counts_per_quadkey.loc["122010321033023132", "counts"] == 4 + + +def test_process_data_unit_tile(test_db): + # Database connection (the Configuration class will define the credentials based on whether + # the code is running in the CI or locally) + config = Configuration( + os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml") + ) + + data_unit_tiles_attributes = [ + ("122010321033023130", 15.7, 3), + ("122010321033023120", 39.1, 41), + ("122010321033023132", 34.4, 12), + ] + + expected_output = [ + (12.7, False), + (0.0, False), + (0.0, True), + ] + + for i in range(len(data_unit_tiles_attributes)): + returned_output = GDEProcessor.process_data_unit_tile( + config.database_completeness, + "obm_built_area_assessments", + data_unit_tiles_attributes[i], + ) + + assert round(returned_output[0], 2) == round(expected_output[i][0], 2) + assert returned_output[1] == expected_output[i][1] # completeness + + +def test_process_group_data_unit_tiles(test_db): + # Database connection (the Configuration class will define the credentials based on whether + # the code is running in the CI or locally) + config = Configuration( + os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml") + ) + + obm_buildings_per_quadkey = 
pandas.DataFrame( + {"counts": [3, 41, 12]}, + index=["122010321033023130", "122010321033023120", "122010321033023132"], + ) + + data_unit_tiles = pandas.DataFrame( + { + "quadkey": ["122010321033023120", "122010321033023132", "122010321033023130"], + "aggregated_buildings": [39.1, 34.4, 15.7], + } + ) + + returned_data_unit_tiles_full = GDEProcessor.process_group_data_unit_tiles( + data_unit_tiles, + obm_buildings_per_quadkey, + config.database_completeness, + "obm_built_area_assessments", + number_cores=1, + ) + + expected_output = {} + expected_output["122010321033023130"] = (12.7, False) + expected_output["122010321033023120"] = (0.0, False) + expected_output["122010321033023132"] = (0.0, True) + + assert returned_data_unit_tiles_full.shape[0] == 3 + + for quadkey in expected_output.keys(): + assert ( + returned_data_unit_tiles_full[returned_data_unit_tiles_full.quadkey == quadkey][ + "remainder_buildings" + ].to_numpy()[0] + == expected_output[quadkey][0] + ) + assert ( + returned_data_unit_tiles_full[returned_data_unit_tiles_full.quadkey == quadkey][ + "complete" + ].to_numpy()[0] + == expected_output[quadkey][1] + ) -- GitLab