Commit 131edb30 authored by Cecilia Nievas's avatar Cecilia Nievas
Browse files

Added feature to process data-unit tiles

parent 9d969f3d
Pipeline #41260 passed with stage
in 2 minutes and 35 seconds
......@@ -14,6 +14,7 @@ variables:
GDEIMPORTER_DB: ${POSTGRES_DB}
GDEIMPORTER_USER: ${POSTGRES_USER}
GDEIMPORTER_PASSWORD: ${POSTGRES_PASSWORD}
GDEIMPORTER_SOURCEID: 1
cache:
paths:
......
......@@ -14,3 +14,11 @@ database_obm_buildings: # Database where info on the OBM buildings is stored
port: port_number # Leave empty if a port number is not needed
username: username
password: password_of_username
database_completeness: # Database where completeness per tile is stored
host: host_name
dbname: database_name
port: port_number # Leave empty if a port number is not needed
username: username
password: password_of_username
sourceid: 1
number_cores: 1 # Number of cores used for parallelisation
......@@ -75,6 +75,25 @@ class Configuration:
User name to connect to the SQL database.
password (str):
Password associated with self.username.
self.database_completeness (dict):
Dictionary containing the credentials needed to connect to the SQL database in which
the completeness per quadtile is stored. The exact parameters needed depend on the
database. They can be:
host (str):
SQL database host address.
dbname (str):
Name of the SQL database.
port (int):
Port where the SQL database can be found.
username (str):
User name to connect to the SQL database.
password (str):
Password associated with self.username.
sourceid (int):
ID of the built-up area source dataset used to determine completeness that
will be sought for.
self.number_cores (int):
Number of cores that will be used to run the code.
"""
REQUIRES = [
......@@ -84,6 +103,8 @@ class Configuration:
"occupancies_to_run",
"database_gde_tiles",
"database_obm_buildings",
"database_completeness",
"number_cores",
]
def __init__(self, filepath, force_config_over_hierarchies=False):
......@@ -114,6 +135,13 @@ class Configuration:
force_config_over_hierarchies,
)
self.database_completeness = ConfigurationMethods.retrieve_database_credentials(
config,
"database_completeness",
"test_db_completeness.env",
force_config_over_hierarchies,
)
self.exposure_entities_to_run = ConfigurationMethods.assign_listed_parameters(
config, "exposure_entities_to_run"
)
......@@ -141,6 +169,10 @@ class Configuration:
config, "occupancies_to_run"
)
self.number_cores = ConfigurationMethods.assign_integer_parameter(
config, "number_cores"
)
# Terminate if critical parameters are missing (not all parameters are critical)
for key_parameter in self.REQUIRES:
if getattr(self, key_parameter) is None:
......
......@@ -785,3 +785,74 @@ class DatabaseQueries:
)
return data_unit_tiles
@staticmethod
def get_automatic_completeness_of_quadkey(quadkey, db_completeness_config, db_table):
    """This function retrieves the completeness status of a tile with respect to the
    representation of buildings in OpenStreetMap (OSM). If a tile is OSM-complete, then all
    buildings that exist in reality are represented in OSM. If a tile is OSM-incomplete,
    then some buildings that exist in reality are not yet represented in OSM.

    Args:
        quadkey (str):
            Quadkey of the tile for which the completeness status will be retrieved.
        db_completeness_config (dict):
            Dictionary containing the credentials needed to connect to the SQL database in
            which completeness data are stored. The keys of the dictionary need to be:
                host (str):
                    SQL database host address.
                dbname (str):
                    Name of the SQL database.
                port (int):
                    Port where the SQL database can be found.
                username (str):
                    User name to connect to the SQL database.
                password (str):
                    Password associated with self.username.
                sourceid (int):
                    ID of the automatic completeness source dataset that will be sought for.
        db_table (str):
            Name of the table of the SQL database where the completeness data are stored. It
            is assumed that this table contains, at least, the following fields:
                quadkey (str):
                    String indicating the quadkey of a tile.
                completeness (int):
                    Completeness code: 0 = incomplete, 1 = complete.
                source_id (int):
                    ID of the source used to define the built-up area and completeness.

    Returns:
        completeness (bool):
            True if the tile is OSM-complete, False if it is OSM-incomplete. A quadkey that
            is not found in 'db_table' is treated as complete (True). If more than one entry
            is found for the quadkey and source ID, an error is logged and the tile is
            treated as incomplete (False).
    """

    source_id = db_completeness_config["sourceid"]

    # NOTE(review): the query is assembled by string interpolation. The table name cannot be
    # bound as a query parameter, but 'quadkey' and 'source_id' could be passed as bound
    # parameters if the driver behind 'Database' supports it -- confirm and consider changing.
    sql_query = "SELECT completeness FROM %s WHERE (quadkey='%s' AND source_id=%s);"

    db_completeness = Database(**db_completeness_config)
    db_completeness.create_connection_and_cursor()

    # Release the connection even if the query fails, so that a failing tile does not leave
    # an open connection behind (this function is called once per data-unit tile)
    try:
        db_completeness.cursor.execute(sql_query % (db_table, quadkey, source_id))
        exec_result = db_completeness.cursor.fetchall()
    finally:
        db_completeness.close_connection()

    if len(exec_result) == 0:
        # If quadkey not found => GHSL built-up area is zero => treat as complete
        completeness_code = 1
    elif len(exec_result) == 1:
        completeness_code = exec_result[0][0]
    else:
        # More than one entry found. This should not happen, as the database should not
        # allow two entries with the same primary key. Treat the tile as incomplete.
        logger.error(
            "ERROR in get_automatic_completeness_of_quadkey: "
            "more than one entry found for quadkey='%s' AND source_id='%s'",
            quadkey,
            source_id,
        )
        completeness_code = 0

    return bool(completeness_code)
......@@ -102,15 +102,16 @@ def main():
)
)
# Retrieve OBM buildings and assign building classes and probabilities to them
for i, data_unit_id in enumerate(data_units_ids):
# Going by data unit so as to minimise intersection operations, need to hold
# excessively large data in RAM and because building classes are associated with
# specific data units
aux_log_string = (
"Data unit '%s' (of exposure entity '%s' and occupancy case '%s')"
% (data_unit_id, exposure_entity_code, occupancy_case)
)
# Going by data unit so as to minimise intersection operations and because
# building classes are associated with specific data units
# Retrieve OBM buildings and assign building classes and probabilities to them
# Retrieve OBM buildings
obm_buildings_raw = (
DatabaseQueries.get_OBM_buildings_in_data_unit_by_occupancy_case(
......@@ -135,6 +136,13 @@ def main():
% (aux_log_string, str(obm_buildings.shape[0]))
)
del obm_buildings_raw
# Calculate number of OBM buildings per quadkey
obm_buildings_per_quadkey = GDEProcessor.calculate_buildings_per_quadkey(
obm_buildings["quadkey"].to_numpy()
)
# Retrieve building classes of this data unit
data_unit_building_classes = DatabaseQueries.get_building_classes_of_data_unit(
data_unit_id,
......@@ -172,6 +180,15 @@ def main():
% (aux_log_string, str(data_unit_tiles.shape[0]))
)
# Calculate remainder buildings in data-unit tiles
data_unit_tiles = GDEProcessor.process_group_data_unit_tiles(
data_unit_tiles,
obm_buildings_per_quadkey,
config.database_completeness,
"obm_built_area_assessments",
config.number_cores,
)
# Leave the program
logger.info("gde-core has finished")
sys.exit()
......
......@@ -18,10 +18,13 @@
import logging
from copy import deepcopy
from multiprocessing import Pool
from functools import partial
import numpy
import mercantile
import pandas
import pyproj
from gdecore.database_queries import DatabaseQueries
logger = logging.getLogger()
......@@ -683,3 +686,199 @@ class GDEProcessor:
building_classes.loc[:, "proportions"] = old_proportions / old_proportions.sum()
return building_classes
@staticmethod
def calculate_buildings_per_quadkey(quadkeys_of_buildings):
    """This function counts how many times each distinct quadkey occurs in
    'quadkeys_of_buildings' and returns the result as a Pandas DataFrame indexed by quadkey.

    Args:
        quadkeys_of_buildings (arr of str):
            Array of quadkeys in which elements can be repeated.

    Returns:
        counts_per_quadkey (Pandas DataFrame):
            DataFrame whose index holds the unique elements of 'quadkeys_of_buildings' and
            whose only column, "counts", holds the number of times each of them is present
            in 'quadkeys_of_buildings'.
    """

    distinct_quadkeys, occurrences = numpy.unique(
        quadkeys_of_buildings, return_counts=True
    )

    # Build the one-column DataFrame from a named Series indexed by the distinct quadkeys
    return pandas.Series(occurrences, index=distinct_quadkeys, name="counts").to_frame()
@staticmethod
def process_group_data_unit_tiles(
    data_unit_tiles,
    obm_buildings_per_quadkey,
    db_completeness_config,
    db_table,
    number_cores=1,
):
    """This function processes the data-unit tiles contained in 'data_unit_tiles' using as
    many cores as indicated by 'number_cores'. The processing consists of retrieving the
    OSM-completeness value of each data-unit tile and calculating the number of remainder
    buildings as a function of the number of aggregated buildings, OBM buildings and
    completeness.

    Args:
        data_unit_tiles (Pandas DataFrame):
            Pandas DataFrame with data-unit tiles. It contains the following columns:
                quadkey (str):
                    String indicating the quadkey of a tile.
                aggregated_buildings (float):
                    Number of buildings in the data-unit tile as per the aggregated exposure
                    model with ID 'aggregated_source_id'.
        obm_buildings_per_quadkey (Pandas DataFrame):
            Pandas DataFrame with number of OBM buildings ("counts" column) per quadkey
            (index). Quadkeys of 'data_unit_tiles' that are missing from this index are
            taken to have zero OBM buildings.
        db_completeness_config (dict):
            Dictionary containing the credentials needed to connect to the SQL database in
            which completeness data are stored. The keys of the dictionary need to be:
                host (str):
                    SQL database host address.
                dbname (str):
                    Name of the SQL database.
                port (int):
                    Port where the SQL database can be found.
                username (str):
                    User name to connect to the SQL database.
                password (str):
                    Password associated with self.username.
                sourceid (int):
                    ID of the automatic completeness source dataset that will be sought for.
        db_table (str):
            Name of the table of the SQL database where the completeness data are stored. It
            is assumed that this table contains, at least, the following fields:
                quadkey (str):
                    String indicating the quadkey of a tile.
                completeness (int):
                    Completeness code: 0 = incomplete, 1 = complete.
                source_id (int):
                    ID of the source used to define the built-up area and completeness.
        number_cores (int):
            Number of CPU cores to be used to run this function. Default: 1.

    Returns:
        data_unit_tiles_full (Pandas DataFrame):
            Pandas DataFrame with data-unit tiles. It contains the following columns:
                quadkey (str):
                    String indicating the quadkey of a tile.
                aggregated_buildings (float):
                    Number of buildings in the data-unit tile as per the aggregated exposure
                    model with ID 'aggregated_source_id'.
                obm_buildings (int or float):
                    Number of OBM buildings in the data-unit tile (zero if the quadkey is
                    not present in 'obm_buildings_per_quadkey').
                remainder_buildings (float):
                    Number of remainder buildings in the data-unit tile.
                complete (bool):
                    True if the tile is OSM-complete, False if it is OSM-incomplete.
    """

    # Combine 'data_unit_tiles' and 'obm_buildings_per_quadkey' to have numbers of
    # aggregated and OBM buildings per data-unit tiles in the same DataFrame
    data_unit_tiles_full = data_unit_tiles.join(
        obm_buildings_per_quadkey["counts"], "quadkey"
    ).rename(columns={"counts": "obm_buildings"})

    # Tiles without any OBM building are absent from 'obm_buildings_per_quadkey' and come
    # out of the (left) join as NaN. Left as NaN, the subtraction carried out for incomplete
    # tiles would yield NaN and the remainder would silently collapse to zero instead of
    # the number of aggregated buildings.
    data_unit_tiles_full["obm_buildings"] = data_unit_tiles_full["obm_buildings"].fillna(0)

    # Prepare 'data_unit_tiles' for parallel processing:
    # each tuple contains (quadkey, aggregated_buildings, obm_buildings) of a data-unit tile
    data_unit_tiles_list = list(
        zip(
            data_unit_tiles_full["quadkey"].to_numpy(),
            data_unit_tiles_full["aggregated_buildings"].to_numpy(),
            data_unit_tiles_full["obm_buildings"].to_numpy(),
        )
    )

    func = partial(
        GDEProcessor.process_data_unit_tile,
        db_completeness_config,
        db_table,
    )

    # Process data-unit tiles in parallel, making sure the worker processes are released
    # even if the processing of a tile raises an exception
    pool = Pool(processes=number_cores)
    try:
        completeness_and_remainder = pool.map(func, data_unit_tiles_list)
    finally:
        pool.close()
        pool.join()

    # Each result is a tuple of (remainder_buildings, complete), in the order of the tiles
    data_unit_tiles_full["remainder_buildings"] = [
        result[0] for result in completeness_and_remainder
    ]
    data_unit_tiles_full["complete"] = [result[1] for result in completeness_and_remainder]

    return data_unit_tiles_full
@staticmethod
def process_data_unit_tile(db_completeness_config, db_table, data_unit_tiles_attributes):
    """This function calculates the number of buildings expected to exist in the data-unit
    tile apart from the OBM buildings, i.e. the "remainder" buildings. If the tile is
    complete, the number of remainder buildings is zero. If the tile is incomplete, the
    number of remainder buildings is the difference between the number of aggregated
    buildings and the number of OBM buildings, with a minimum value of zero.

    Args:
        db_completeness_config (dict):
            Dictionary containing the credentials needed to connect to the SQL database in
            which completeness data are stored. The keys of the dictionary need to be:
                host (str):
                    SQL database host address.
                dbname (str):
                    Name of the SQL database.
                port (int):
                    Port where the SQL database can be found.
                username (str):
                    User name to connect to the SQL database.
                password (str):
                    Password associated with self.username.
                sourceid (int):
                    ID of the automatic completeness source dataset that will be sought for.
        db_table (str):
            Name of the table of the SQL database where the completeness data are stored. It
            is assumed that this table contains, at least, the following fields:
                quadkey (str):
                    String indicating the quadkey of a tile.
                completeness (int):
                    Completeness code: 0 = incomplete, 1 = complete.
                source_id (int):
                    ID of the source used to define the built-up area and completeness.
        data_unit_tiles_attributes (tuple of (str, float, int)):
            Attributes of this data-unit tile. The elements of the tuple are:
                quadkey (str):
                    String indicating the quadkey of a tile.
                aggregated_buildings (float):
                    Number of buildings in the data-unit tile as per an aggregated exposure
                    model.
                obm_buildings (int):
                    Number of OBM buildings in the data-unit tile. A missing value (NaN or
                    None) is interpreted as zero OBM buildings.

    Returns:
        remainder_buildings, complete (tuple of (float, bool)):
            The elements of the tuple are:
                remainder_buildings (float):
                    Number of remainder buildings in the data-unit tile.
                complete (bool):
                    True if the 'quadkey' in 'data_unit_tiles_attributes' is OSM-complete,
                    False if it is OSM-incomplete.
    """

    # Split contents of data_unit_tiles_attributes
    quadkey = data_unit_tiles_attributes[0]
    aggregated_buildings = data_unit_tiles_attributes[1]
    obm_buildings = data_unit_tiles_attributes[2]

    # A tile without OBM buildings may arrive with a missing count (e.g. NaN produced by a
    # join). With NaN, max(0.0, aggregated_buildings - NaN) evaluates to 0.0, which would
    # silently drop all remainder buildings of an incomplete tile.
    if pandas.isna(obm_buildings):
        obm_buildings = 0

    # Retrieve completeness value
    complete = DatabaseQueries.get_automatic_completeness_of_quadkey(
        quadkey, db_completeness_config, db_table
    )

    if complete:
        # All existing buildings are already represented in OBM
        remainder_buildings = 0.0
    else:
        # More OBM buildings than aggregated ones cannot result in a negative remainder
        remainder_buildings = max(0.0, aggregated_buildings - obm_buildings)

    return (remainder_buildings, complete)
......@@ -12,3 +12,10 @@ database_obm_buildings:
dbname: some_database_name
username: some_username
password: some_password
database_completeness:
host: host.somewhere.xx
dbname: some_database_name
username: some_username
password: some_password
sourceid: 1
number_cores: 1
......@@ -11,3 +11,10 @@ database_obm_buildings:
dbname: some_database_name
username: some_username
password: some_password
database_completeness:
host: host.somewhere.xx
dbname: some_database_name
username: some_username
password: some_password
sourceid: 1
number_cores: 1
......@@ -3,6 +3,7 @@ DROP TABLE IF EXISTS data_units;
DROP TABLE IF EXISTS obm_buildings;
DROP TABLE IF EXISTS data_units_buildings;
DROP TABLE IF EXISTS data_unit_tiles;
DROP TABLE IF EXISTS obm_built_area_assessments;
DROP TYPE IF EXISTS occupancycase;
DROP TYPE IF EXISTS settlement;
DROP EXTENSION IF EXISTS postgis;
......@@ -197,3 +198,15 @@ VALUES ('122010321033023130', 2, 'residential', 'ABC', 'ABC_10269', 0.0, 0.0, 0.
('122010321033023120', 2, 'commercial', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 17.6),
('122010321033023132', 2, 'residential', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 34.4),
('122010321033023132', 2, 'commercial', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 11.5);
-- Test fixture: completeness status per tile (quadkey) and built-up area source.
-- 'completeness' is a code: 0 = incomplete, 1 = complete.
-- The composite primary key allows at most one row per (quadkey, source_id) pair.
CREATE TABLE obm_built_area_assessments
(
quadkey char(18),
source_id SMALLINT,
completeness SMALLINT,
PRIMARY KEY (quadkey, source_id)
);
-- Two incomplete tiles and one complete tile, all for source_id = 1.
INSERT INTO obm_built_area_assessments(quadkey, source_id, completeness)
VALUES ('122010321033023130', 1, 0),
('122010321033023120', 1, 0),
('122010321033023132', 1, 1);
......@@ -33,12 +33,22 @@ def test_Configuration():
assert returned_config.database_gde_tiles["dbname"] == "some_database_name"
assert returned_config.database_gde_tiles["username"] == "some_username"
assert returned_config.database_gde_tiles["password"] == "some_password"
assert returned_config.database_obm_buildings["host"] == "host.somewhere.xx"
assert returned_config.database_obm_buildings["dbname"] == "some_database_name"
assert returned_config.database_obm_buildings["username"] == "some_username"
assert returned_config.database_obm_buildings["password"] == "some_password"
assert returned_config.database_completeness["host"] == "host.somewhere.xx"
assert returned_config.database_completeness["dbname"] == "some_database_name"
assert returned_config.database_completeness["username"] == "some_username"
assert returned_config.database_completeness["password"] == "some_password"
assert returned_config.database_completeness["sourceid"] == 1
assert len(returned_config.exposure_entities_to_run) == 1
assert returned_config.exposure_entities_to_run[0] == "Italy"
assert returned_config.exposure_entities_code == "ISO3"
assert len(returned_config.occupancies_to_run) == 2
assert returned_config.occupancies_to_run[0] == "residential"
assert returned_config.occupancies_to_run[1] == "commercial"
assert returned_config.number_cores == 1
# Test case in which the file is not found
with pytest.raises(OSError) as excinfo:
......
......@@ -379,3 +379,27 @@ def test_get_data_unit_tiles_of_data_unit_as_DataFrame(test_db):
assert returned_data_unit_tiles.shape[0] == 0
assert "quadkey" in returned_data_unit_tiles.columns
assert "aggregated_buildings" in returned_data_unit_tiles.columns
def test_get_automatic_completeness_of_quadkey(test_db):
    """Check the completeness retrieved for tiles that are incomplete, complete and absent
    from the completeness table of the test database."""

    # Database connection (the Configuration class will define the credentials based on whether
    # the code is running in the CI or locally)
    config_path = os.path.join(
        os.path.dirname(__file__), "data", "config_for_testing_good.yml"
    )
    config = Configuration(config_path)

    # Pairs of (quadkey, expected completeness)
    cases = (
        ("122010321033023130", False),
        ("122010321033023120", False),
        ("122010321033023132", True),
        ("999999999999999999", True),  # will not be found -> return complete
    )

    for quadkey, expected in cases:
        returned_completeness = DatabaseQueries.get_automatic_completeness_of_quadkey(
            quadkey, config.database_completeness, "obm_built_area_assessments"
        )

        assert returned_completeness == expected
......@@ -408,3 +408,108 @@ def test_narrow_down_by_commercial_occupancy_details():
assert returned_obm_building_classes.shape[0] == building_classes.shape[0]
for bdg_class_name in building_classes["building_class_name"].to_numpy():
assert bdg_class_name in returned_obm_building_classes["building_class_name"].to_numpy()
def test_calculate_buildings_per_quadkey():
    """Check that repeated quadkeys are counted correctly, one row per unique quadkey."""

    # Expected number of occurrences of each quadkey in the input array below
    expected_counts = {
        "122010321033023130": 3,
        "122010321033023120": 2,
        "122010321033023132": 4,
    }

    quadkeys_of_buildings = numpy.array(
        [
            "122010321033023130",
            "122010321033023120",
            "122010321033023120",
            "122010321033023130",
            "122010321033023132",
            "122010321033023132",
            "122010321033023132",
            "122010321033023132",
            "122010321033023130",
        ]
    )

    returned_counts_per_quadkey = GDEProcessor.calculate_buildings_per_quadkey(
        quadkeys_of_buildings
    )

    assert returned_counts_per_quadkey.shape[0] == 3
    for quadkey, expected_count in expected_counts.items():
        assert returned_counts_per_quadkey.loc[quadkey, "counts"] == expected_count
def test_process_data_unit_tile(test_db):
    """Check the remainder buildings and completeness returned for individual data-unit
    tiles, using the completeness table of the test database."""

    # Database connection (the Configuration class will define the credentials based on whether
    # the code is running in the CI or locally)
    config_path = os.path.join(
        os.path.dirname(__file__), "data", "config_for_testing_good.yml"
    )
    config = Configuration(config_path)

    # Each case pairs the attributes of a data-unit tile, (quadkey, aggregated_buildings,
    # obm_buildings), with the expected output, (remainder_buildings, complete)
    cases = [
        (("122010321033023130", 15.7, 3), (12.7, False)),
        (("122010321033023120", 39.1, 41), (0.0, False)),
        (("122010321033023132", 34.4, 12), (0.0, True)),
    ]

    for tile_attributes, expected in cases:
        returned_output = GDEProcessor.process_data_unit_tile(
            config.database_completeness,
            "obm_built_area_assessments",
            tile_attributes,
        )

        assert round(returned_output[0], 2) == round(expected[0], 2)
        assert returned_output[1] == expected[1]  # completeness
def test_process_group_data_unit_tiles(test_db):
# Database connection (the Configuration class will define the credentials based on whether
# the code is running in the CI or locally)
config = Configuration(
os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml")
)
obm_buildings_per_quadkey = pandas.DataFrame(
{"counts": [3, 41, 12]},
index=["122010321033023130", "122010321033023120", "122010321033023132"],
)
data_unit_tiles = pandas.DataFrame(
{
"quadkey": ["122010321033023120", "122010321033023132", "122010321033023130"],
"aggregated_buildings": [39.1, 34.4, 15.7],
}
)
returned_data_unit_tiles_full = GDEProcessor.process_group_data_unit_tiles(
data_unit_tiles,
obm_buildings_per_quadkey,
config.database_completeness,
"obm_built_area_assessments",
number_cores=1,
)
expected_output = {}
expected_output["122010321033023130"] = (12.7, False)
expected_output["122010321033023120"] = (0.0, False)
expected_output["122010321033023132"] = (0.0, True)
assert returned_data_unit_tiles_full.shape[0] == 3