From 131edb30f5f3b9c39d0bf2c728411b16d629646a Mon Sep 17 00:00:00 2001
From: Cecilia Nievas <cnievas@gfz-potsdam.de>
Date: Fri, 8 Apr 2022 18:41:53 +0200
Subject: [PATCH] Added feature to process data-unit tiles

---
 .gitlab-ci.yml                            |   1 +
 config_example.yml                        |   8 +
 gdecore/configuration.py                  |  32 ++++
 gdecore/database_queries.py               |  71 ++++++++
 gdecore/gdecore.py                        |  23 ++-
 gdecore/processor.py                      | 199 ++++++++++++++++++++++
 tests/data/config_for_testing_good.yml    |   7 +
 tests/data/config_for_testing_missing.yml |   7 +
 tests/data/test_database_set_up.sql       |  13 ++
 tests/test_configuration.py               |  10 ++
 tests/test_database_queries.py            |  24 +++
 tests/test_processor.py                   | 105 ++++++++++++
 12 files changed, 497 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 558305c..449c565 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -14,6 +14,7 @@ variables:
   GDEIMPORTER_DB: ${POSTGRES_DB}
   GDEIMPORTER_USER: ${POSTGRES_USER}
   GDEIMPORTER_PASSWORD: ${POSTGRES_PASSWORD}
+  GDEIMPORTER_SOURCEID: 1
 
 cache:
   paths:
diff --git a/config_example.yml b/config_example.yml
index 3ce8db1..bd48285 100644
--- a/config_example.yml
+++ b/config_example.yml
@@ -14,3 +14,11 @@ database_obm_buildings:  # Database where info on the OBM buildings is stored
   port: port_number  # Leave empty if a port number is not needed
   username: username
   password: password_of_username
+database_completeness:  # Database where completeness per tile is stored
+  host: host_name
+  dbname: database_name
+  port: port_number  # Leave empty if a port number is not needed
+  username: username
+  password: password_of_username
+  sourceid: 1
+number_cores: 1  # Number of cores used for parallelisation
diff --git a/gdecore/configuration.py b/gdecore/configuration.py
index b6d3c7a..ff69775 100644
--- a/gdecore/configuration.py
+++ b/gdecore/configuration.py
@@ -75,6 +75,25 @@ class Configuration:
                     User name to connect to the SQL database.
                 password (str):
                     Password associated with self.username.
+        self.database_completeness (dict):
+            Dictionary containing the credentials needed to connect to the SQL database in which
+            the completeness per quadtile is stored. The exact parameters needed depend on the
+            database. They can be:
+                host (str):
+                    SQL database host address.
+                dbname (str):
+                    Name of the SQL database.
+                port (int):
+                    Port where the SQL database can be found.
+                username (str):
+                    User name to connect to the SQL database.
+                password (str):
+                    Password associated with self.username.
+                sourceid (int):
+                    ID of the built-up area source dataset (used to determine completeness)
+                    that will be searched for in the database.
+        self.number_cores (int):
+            Number of cores that will be used to run the code.
     """
 
     REQUIRES = [
@@ -84,6 +103,8 @@ class Configuration:
         "occupancies_to_run",
         "database_gde_tiles",
         "database_obm_buildings",
+        "database_completeness",
+        "number_cores",
     ]
 
     def __init__(self, filepath, force_config_over_hierarchies=False):
@@ -114,6 +135,13 @@ class Configuration:
             force_config_over_hierarchies,
         )
 
+        self.database_completeness = ConfigurationMethods.retrieve_database_credentials(
+            config,
+            "database_completeness",
+            "test_db_completeness.env",
+            force_config_over_hierarchies,
+        )
+
         self.exposure_entities_to_run = ConfigurationMethods.assign_listed_parameters(
             config, "exposure_entities_to_run"
         )
@@ -141,6 +169,10 @@ class Configuration:
             config, "occupancies_to_run"
         )
 
+        self.number_cores = ConfigurationMethods.assign_integer_parameter(
+            config, "number_cores"
+        )
+
         # Terminate if critical parameters are missing (not all parameters are critical)
         for key_parameter in self.REQUIRES:
             if getattr(self, key_parameter) is None:
diff --git a/gdecore/database_queries.py b/gdecore/database_queries.py
index d49e639..3ef8045 100644
--- a/gdecore/database_queries.py
+++ b/gdecore/database_queries.py
@@ -785,3 +785,74 @@ class DatabaseQueries:
         )
 
         return data_unit_tiles
+
+    @staticmethod
+    def get_automatic_completeness_of_quadkey(quadkey, db_completeness_config, db_table):
+        """This function retrieves the completeness status of a tile with respect to the
+        representation of buildings in OpenStreetMap (OSM). If a tile is OSM-complete, then all
+        buildings that exist in reality are represented in OSM. If a tile is OSM-incomplete,
+        then some buildings that exist in reality are not yet represented in OSM.
+
+        Args:
+            quadkey (str):
+                Quadkey of the tile for which the completeness status will be retrieved.
+            db_completeness_config (dict):
+                Dictionary containing the credentials needed to connect to the SQL database in
+                which completeness data are stored. The keys of the dictionary need to be:
+                    host (str):
+                        SQL database host address.
+                    dbname (str):
+                        Name of the SQL database.
+                    port (int):
+                        Port where the SQL database can be found.
+                    username (str):
+                        User name to connect to the SQL database.
+                    password (str):
+                        Password associated with 'username'.
+                    sourceid (int):
+                        ID of the automatic completeness source dataset to search for.
+            db_table (str):
+                Name of the table of the SQL database where the completeness data are stored. It
+                is assumed that this table contains, at least, the following fields:
+                    quadkey (str):
+                        String indicating the quadkey of a tile.
+                    completeness (int):
+                        Completeness code: 0 = incomplete, 1 = complete.
+                    source_id (int):
+                        ID of the source used to define the built-up area and completeness.
+
+        Returns:
+            completeness (bool):
+                True if the tile is OSM-complete, False if it is OSM-incomplete. A quadkey
+                not found in 'db_table' is treated as complete; duplicate entries are logged
+                as an error and treated as incomplete.
+        """
+
+        sql_query = "SELECT completeness FROM %s WHERE (quadkey='%s' AND source_id=%s);"
+
+        db_completeness = Database(**db_completeness_config)
+        db_completeness.create_connection_and_cursor()
+
+        db_completeness.cursor.execute(
+            sql_query % (db_table, quadkey, db_completeness_config["sourceid"])
+        )
+        exec_result = db_completeness.cursor.fetchall()
+
+        if len(exec_result) == 0:
+            # If quadkey not found => GHSL built-up area is zero => treat as complete
+            completeness_code = 1
+        elif len(exec_result) == 1:
+            completeness_code = exec_result[0][0]
+        else:  # More than one entry found, this is an error
+            # This should not happen, as the database should not allow two entries with the
+            # same primary key
+            logger.error(
+                "ERROR in get_automatic_completeness_of_quadkey: "
+                "more than one entry found for quadkey='%s' AND source_id='%s'"
+                % (quadkey, db_completeness_config["sourceid"])
+            )
+            completeness_code = 0
+
+        db_completeness.close_connection()
+
+        return bool(completeness_code)
diff --git a/gdecore/gdecore.py b/gdecore/gdecore.py
index b8a4a6f..93df43a 100644
--- a/gdecore/gdecore.py
+++ b/gdecore/gdecore.py
@@ -102,15 +102,16 @@ def main():
                     )
                 )
 
-            # Retrieve OBM buildings and assign building classes and probabilities to them
             for i, data_unit_id in enumerate(data_units_ids):
+                # Going by data unit so as to minimise intersection operations, to reduce the
+                # need to hold excessively large data in RAM, and because building classes are
+                # associated with specific data units
                 aux_log_string = (
                     "Data unit '%s' (of exposure entity '%s' and occupancy case '%s')"
                     % (data_unit_id, exposure_entity_code, occupancy_case)
                 )
-                # Going by data unit so as to minimise intersection operations and because
-                # building classes are associated with specific data units
 
+                # Retrieve OBM buildings and assign building classes and probabilities to them
                 # Retrieve OBM buildings
                 obm_buildings_raw = (
                     DatabaseQueries.get_OBM_buildings_in_data_unit_by_occupancy_case(
@@ -135,6 +136,13 @@ def main():
                     % (aux_log_string, str(obm_buildings.shape[0]))
                 )
 
+                del obm_buildings_raw
+
+                # Calculate number of OBM buildings per quadkey
+                obm_buildings_per_quadkey = GDEProcessor.calculate_buildings_per_quadkey(
+                    obm_buildings["quadkey"].to_numpy()
+                )
+
                 # Retrieve building classes of this data unit
                 data_unit_building_classes = DatabaseQueries.get_building_classes_of_data_unit(
                     data_unit_id,
@@ -172,6 +180,15 @@ def main():
                     % (aux_log_string, str(data_unit_tiles.shape[0]))
                 )
 
+                # Calculate remainder buildings in data-unit tiles
+                data_unit_tiles = GDEProcessor.process_group_data_unit_tiles(
+                    data_unit_tiles,
+                    obm_buildings_per_quadkey,
+                    config.database_completeness,
+                    "obm_built_area_assessments",
+                    config.number_cores,
+                )
+
     # Leave the program
     logger.info("gde-core has finished")
     sys.exit()
diff --git a/gdecore/processor.py b/gdecore/processor.py
index 58e7886..7beaf08 100644
--- a/gdecore/processor.py
+++ b/gdecore/processor.py
@@ -18,10 +18,13 @@
 
 import logging
 from copy import deepcopy
+from multiprocessing import Pool
+from functools import partial
 import numpy
 import mercantile
 import pandas
 import pyproj
+from gdecore.database_queries import DatabaseQueries
 
 
 logger = logging.getLogger()
@@ -683,3 +686,199 @@ class GDEProcessor:
             building_classes.loc[:, "proportions"] = old_proportions / old_proportions.sum()
 
         return building_classes
+
+    @staticmethod
+    def calculate_buildings_per_quadkey(quadkeys_of_buildings):
+        """This function counts how many times each quadkey occurs in 'quadkeys_of_buildings'.
+
+        The output is a Pandas DataFrame indexed by the distinct quadkeys found in the input
+        array, whose only column, "counts", holds the number of occurrences of each of those
+        quadkeys in the input array.
+
+        Args:
+            quadkeys_of_buildings (arr of str):
+                Array of quadkeys in which elements can be repeated.
+
+        Returns:
+            counts_per_quadkey (Pandas DataFrame):
+                DataFrame whose indices are the unique elements of 'quadkeys_of_buildings' and
+                whose column "counts" indicates the number of times each index (quadkey) is
+                present in 'quadkeys_of_buildings'.
+        """
+
+        # numpy.unique returns the distinct values together with their number of occurrences
+        distinct_quadkeys, occurrences = numpy.unique(quadkeys_of_buildings, return_counts=True)
+
+        return pandas.DataFrame(data={"counts": occurrences}, index=distinct_quadkeys)
+
+    @staticmethod
+    def process_group_data_unit_tiles(
+        data_unit_tiles,
+        obm_buildings_per_quadkey,
+        db_completeness_config,
+        db_table,
+        number_cores=1,
+    ):
+        """This function processes the data-unit tiles contained in 'data_unit_tiles' using as
+        many cores as indicated by 'number_cores'. The processing consists of retrieving the
+        OSM-completeness value of each data-unit tile and calculating the number of remainder
+        buildings as a function of the number of aggregated buildings, OBM buildings and
+        completeness.
+
+        Args:
+            data_unit_tiles (Pandas DataFrame):
+                Pandas DataFrame with data-unit tiles. It contains the following columns:
+                    quadkey (str):
+                        String indicating the quadkey of a tile.
+                    aggregated_buildings (float):
+                        Number of buildings in the data-unit tile as per the aggregated exposure
+                        model with ID 'aggregated_source_id'.
+            obm_buildings_per_quadkey (Pandas DataFrame):
+                Pandas DataFrame with number of OBM buildings ("counts" column) per quadkey
+                (index).
+            db_completeness_config (dict):
+                Dictionary containing the credentials needed to connect to the SQL database in
+                which completeness data are stored. The keys of the dictionary need to be:
+                    host (str):
+                        SQL database host address.
+                    dbname (str):
+                        Name of the SQL database.
+                    port (int):
+                        Port where the SQL database can be found.
+                    username (str):
+                        User name to connect to the SQL database.
+                    password (str):
+                        Password associated with 'username'.
+                    sourceid (int):
+                        ID of the automatic completeness source dataset to search for.
+            db_table (str):
+                Name of the table of the SQL database where the completeness data are stored. It
+                is assumed that this table contains, at least, the following fields:
+                    quadkey (str):
+                        String indicating the quadkey of a tile.
+                    completeness (int):
+                        Completeness code: 0 = incomplete, 1 = complete.
+                    source_id (int):
+                        ID of the source used to define the built-up area and completeness.
+            number_cores (int):
+                Number of CPU cores to be used to run this function. Default: 1.
+
+        Returns:
+            data_unit_tiles_full (Pandas DataFrame):
+                Pandas DataFrame with data-unit tiles. It contains the following columns:
+                    quadkey (str):
+                        String indicating the quadkey of a tile.
+                    aggregated_buildings (float):
+                        Number of buildings in the data-unit tile as per the aggregated exposure
+                        model with ID 'aggregated_source_id'.
+                    obm_buildings (int or float):
+                        Number of OBM buildings in the data-unit tile (zero if there are none).
+                    remainder_buildings (float):
+                        Number of remainder buildings in the data-unit tile.
+                    complete (bool):
+                        True if the tile is OSM-complete, False if it is OSM-incomplete.
+        """
+
+        # Combine 'data_unit_tiles' and 'obm_buildings_per_quadkey' to have numbers of
+        # aggregated and OBM buildings per data-unit tiles in the same DataFrame
+        data_unit_tiles_full = data_unit_tiles.join(obm_buildings_per_quadkey.counts, "quadkey")
+        data_unit_tiles_full = data_unit_tiles_full.rename(columns={"counts": "obm_buildings"})
+        # Tiles without OBM buildings are absent from 'obm_buildings_per_quadkey' and get NaN
+        # from the join: set them to zero, otherwise their remainder would wrongly become 0.0
+        data_unit_tiles_full["obm_buildings"] = data_unit_tiles_full["obm_buildings"].fillna(0)
+
+        # Prepare 'data_unit_tiles' for parallel processing:
+        # each tuple contains (quadkey, aggregated_buildings, obm_buildings) of a data-unit tile
+        data_unit_tiles_list = list(
+            zip(
+                data_unit_tiles_full["quadkey"].to_numpy(),
+                data_unit_tiles_full["aggregated_buildings"].to_numpy(),
+                data_unit_tiles_full["obm_buildings"].to_numpy(),
+            )
+        )
+
+        # Process data-unit tiles in parallel (workers are released even if a tile fails)
+        func = partial(
+            GDEProcessor.process_data_unit_tile,
+            db_completeness_config,
+            db_table,
+        )
+        with Pool(processes=number_cores) as p:
+            completeness_and_remainder = p.map(func, data_unit_tiles_list)
+
+        data_unit_tiles_full["remainder_buildings"] = [
+            result[0] for result in completeness_and_remainder
+        ]
+        data_unit_tiles_full["complete"] = [result[1] for result in completeness_and_remainder]
+
+        return data_unit_tiles_full
+
+    @staticmethod
+    def process_data_unit_tile(db_completeness_config, db_table, data_unit_tiles_attributes):
+        """This function calculates the number of buildings expected to exist in the data-unit
+        tile apart from the OBM buildings, i.e. the "remainder" buildings. If the tile is
+        complete, the number of remainder buildings is zero. If the tile is incomplete, the
+        number of remainder buildings is the difference between the number of aggregated
+        buildings and the number of OBM buildings, with a minimum value of zero.
+
+        Args:
+            db_completeness_config (dict):
+                Dictionary containing the credentials needed to connect to the SQL database in
+                which completeness data are stored. The keys of the dictionary need to be:
+                    host (str):
+                        SQL database host address.
+                    dbname (str):
+                        Name of the SQL database.
+                    port (int):
+                        Port where the SQL database can be found.
+                    username (str):
+                        User name to connect to the SQL database.
+                    password (str):
+                        Password associated with 'username'.
+                    sourceid (int):
+                        ID of the automatic completeness source dataset to search for.
+            db_table (str):
+                Name of the table of the SQL database where the completeness data are stored. It
+                is assumed that this table contains, at least, the following fields:
+                    quadkey (str):
+                        String indicating the quadkey of a tile.
+                    completeness (int):
+                        Completeness code: 0 = incomplete, 1 = complete.
+                    source_id (int):
+                        ID of the source used to define the built-up area and completeness.
+            data_unit_tiles_attributes (tuple of (str, float, int)):
+                Attributes of this data-unit tile. The elements of the tuple are:
+                    quadkey (str):
+                        String indicating the quadkey of a tile.
+                    aggregated_buildings (float):
+                        Number of buildings in the data-unit tile as per an aggregated exposure
+                        model.
+                    obm_buildings (int):
+                        Number of OBM buildings in the data-unit tile (NaN counts as zero).
+
+        Returns:
+            remainder_buildings, complete (tuple of (float, bool)):
+                The elements of the tuple are:
+                    remainder_buildings (float):
+                        Number of remainder buildings in the data-unit tile.
+                    complete (bool):
+                        True if the 'quadkey' in 'data_unit_tiles_attributes' is OSM-complete,
+                        False if it is OSM-incomplete.
+        """
+
+        # Split contents of data_unit_tiles_attributes
+        quadkey, aggregated_buildings, obm_buildings = data_unit_tiles_attributes[:3]
+        # A missing (NaN) count means no OBM buildings; NaN would make max() below return 0.0
+        obm_buildings = 0 if pandas.isna(obm_buildings) else obm_buildings
+
+        # Retrieve completeness value
+        complete = DatabaseQueries.get_automatic_completeness_of_quadkey(
+            quadkey, db_completeness_config, db_table
+        )
+
+        if complete:
+            remainder_buildings = 0.0
+        else:
+            remainder_buildings = max(0.0, aggregated_buildings - obm_buildings)
+
+        return (remainder_buildings, complete)
diff --git a/tests/data/config_for_testing_good.yml b/tests/data/config_for_testing_good.yml
index a4c957c..73010bc 100644
--- a/tests/data/config_for_testing_good.yml
+++ b/tests/data/config_for_testing_good.yml
@@ -12,3 +12,10 @@ database_obm_buildings:
   dbname: some_database_name
   username: some_username
   password: some_password
+database_completeness:
+  host: host.somewhere.xx
+  dbname: some_database_name
+  username: some_username
+  password: some_password
+  sourceid: 1
+number_cores: 1
diff --git a/tests/data/config_for_testing_missing.yml b/tests/data/config_for_testing_missing.yml
index d577ba0..3f7c4c3 100644
--- a/tests/data/config_for_testing_missing.yml
+++ b/tests/data/config_for_testing_missing.yml
@@ -11,3 +11,10 @@ database_obm_buildings:
   dbname: some_database_name
   username: some_username
   password: some_password
+database_completeness:
+  host: host.somewhere.xx
+  dbname: some_database_name
+  username: some_username
+  password: some_password
+  sourceid: 1
+number_cores: 1
diff --git a/tests/data/test_database_set_up.sql b/tests/data/test_database_set_up.sql
index be3a9da..c253762 100644
--- a/tests/data/test_database_set_up.sql
+++ b/tests/data/test_database_set_up.sql
@@ -3,6 +3,7 @@ DROP TABLE IF EXISTS data_units;
 DROP TABLE IF EXISTS obm_buildings;
 DROP TABLE IF EXISTS data_units_buildings;
 DROP TABLE IF EXISTS data_unit_tiles;
+DROP TABLE IF EXISTS obm_built_area_assessments;
 DROP TYPE IF EXISTS occupancycase;
 DROP TYPE IF EXISTS settlement;
 DROP EXTENSION IF EXISTS postgis;
@@ -197,3 +198,15 @@ VALUES ('122010321033023130', 2, 'residential', 'ABC', 'ABC_10269', 0.0, 0.0, 0.
 ('122010321033023120', 2, 'commercial', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 17.6),
 ('122010321033023132', 2, 'residential', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 34.4),
 ('122010321033023132', 2, 'commercial', 'ABC', 'ABC_10269', 0.0, 0.0, 0.0, 0.0, 11.5);
+
+CREATE TABLE obm_built_area_assessments
+(
+    quadkey char(18),
+    source_id SMALLINT,
+    completeness SMALLINT,
+    PRIMARY KEY (quadkey, source_id)
+);
+INSERT INTO obm_built_area_assessments(quadkey, source_id, completeness)
+VALUES ('122010321033023130', 1, 0),
+('122010321033023120', 1, 0),
+('122010321033023132', 1, 1);
diff --git a/tests/test_configuration.py b/tests/test_configuration.py
index 8d767c1..eabcd39 100644
--- a/tests/test_configuration.py
+++ b/tests/test_configuration.py
@@ -33,12 +33,22 @@ def test_Configuration():
     assert returned_config.database_gde_tiles["dbname"] == "some_database_name"
     assert returned_config.database_gde_tiles["username"] == "some_username"
     assert returned_config.database_gde_tiles["password"] == "some_password"
+    assert returned_config.database_obm_buildings["host"] == "host.somewhere.xx"
+    assert returned_config.database_obm_buildings["dbname"] == "some_database_name"
+    assert returned_config.database_obm_buildings["username"] == "some_username"
+    assert returned_config.database_obm_buildings["password"] == "some_password"
+    assert returned_config.database_completeness["host"] == "host.somewhere.xx"
+    assert returned_config.database_completeness["dbname"] == "some_database_name"
+    assert returned_config.database_completeness["username"] == "some_username"
+    assert returned_config.database_completeness["password"] == "some_password"
+    assert returned_config.database_completeness["sourceid"] == 1
     assert len(returned_config.exposure_entities_to_run) == 1
     assert returned_config.exposure_entities_to_run[0] == "Italy"
     assert returned_config.exposure_entities_code == "ISO3"
     assert len(returned_config.occupancies_to_run) == 2
     assert returned_config.occupancies_to_run[0] == "residential"
     assert returned_config.occupancies_to_run[1] == "commercial"
+    assert returned_config.number_cores == 1
 
     # Test case in which the file is not found
     with pytest.raises(OSError) as excinfo:
diff --git a/tests/test_database_queries.py b/tests/test_database_queries.py
index e534064..9f7eedd 100644
--- a/tests/test_database_queries.py
+++ b/tests/test_database_queries.py
@@ -379,3 +379,27 @@ def test_get_data_unit_tiles_of_data_unit_as_DataFrame(test_db):
     assert returned_data_unit_tiles.shape[0] == 0
     assert "quadkey" in returned_data_unit_tiles.columns
     assert "aggregated_buildings" in returned_data_unit_tiles.columns
+
+
+def test_get_automatic_completeness_of_quadkey(test_db):
+    # The Configuration class defines the database credentials depending on whether the
+    # code is running in the CI or locally
+    config_path = os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml")
+    config = Configuration(config_path)
+
+    # Expected completeness per quadkey, as per 'obm_built_area_assessments' in the test database
+    expected_completeness = {
+        "122010321033023130": False,
+        "122010321033023120": False,
+        "122010321033023132": True,
+        "999999999999999999": True,  # will not be found -> return complete
+    }
+
+    for quadkey, expected in expected_completeness.items():
+        returned_completeness = DatabaseQueries.get_automatic_completeness_of_quadkey(
+            quadkey=quadkey,
+            db_completeness_config=config.database_completeness,
+            db_table="obm_built_area_assessments",
+        )
+
+        assert returned_completeness == expected
diff --git a/tests/test_processor.py b/tests/test_processor.py
index f4622f8..2c5c2fa 100644
--- a/tests/test_processor.py
+++ b/tests/test_processor.py
@@ -408,3 +408,108 @@ def test_narrow_down_by_commercial_occupancy_details():
     assert returned_obm_building_classes.shape[0] == building_classes.shape[0]
     for bdg_class_name in building_classes["building_class_name"].to_numpy():
         assert bdg_class_name in returned_obm_building_classes["building_class_name"].to_numpy()
+
+
+def test_calculate_buildings_per_quadkey():
+    # Three distinct quadkeys, repeated 3, 2 and 4 times, in no particular order
+    quadkeys_of_buildings = numpy.array(
+        [
+            "122010321033023130",
+            "122010321033023120",
+            "122010321033023120",
+            "122010321033023130",
+            "122010321033023132",
+            "122010321033023132",
+            "122010321033023132",
+            "122010321033023132",
+            "122010321033023130",
+        ]
+    )
+
+    expected_counts = {"122010321033023130": 3, "122010321033023120": 2, "122010321033023132": 4}
+
+    counts_per_quadkey = GDEProcessor.calculate_buildings_per_quadkey(quadkeys_of_buildings)
+
+    # One row per distinct quadkey, each holding its number of occurrences
+    assert counts_per_quadkey.shape[0] == 3
+    for quadkey, expected in expected_counts.items():
+        assert counts_per_quadkey.loc[quadkey, "counts"] == expected
+
+
+def test_process_data_unit_tile(test_db):
+    # The Configuration class defines the database credentials depending on whether the
+    # code is running in the CI or locally
+    config_path = os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml")
+    config = Configuration(config_path)
+
+    # (quadkey, aggregated_buildings, obm_buildings) of each data-unit tile
+    all_tiles_attributes = [
+        ("122010321033023130", 15.7, 3),
+        ("122010321033023120", 39.1, 41),
+        ("122010321033023132", 34.4, 12),
+    ]
+    # Expected (remainder_buildings, complete) of each data-unit tile
+    expected_results = [
+        (12.7, False),
+        (0.0, False),
+        (0.0, True),
+    ]
+
+    for tile_attributes, expected in zip(all_tiles_attributes, expected_results):
+        returned_output = GDEProcessor.process_data_unit_tile(
+            config.database_completeness,
+            "obm_built_area_assessments",
+            tile_attributes,
+        )
+
+        assert round(returned_output[0], 2) == round(expected[0], 2)
+        assert returned_output[1] == expected[1]  # completeness
+
+
+def test_process_group_data_unit_tiles(test_db):
+    # The Configuration class defines the database credentials depending on whether the
+    # code is running in the CI or locally
+    config_path = os.path.join(os.path.dirname(__file__), "data", "config_for_testing_good.yml")
+    config = Configuration(config_path)
+
+    # Number of OBM buildings per quadkey (the quadkeys are the indices)
+    obm_buildings_per_quadkey = pandas.DataFrame(
+        {"counts": [3, 41, 12]},
+        index=["122010321033023130", "122010321033023120", "122010321033023132"],
+    )
+
+    # Data-unit tiles, listed in a different order than 'obm_buildings_per_quadkey'
+    data_unit_tiles = pandas.DataFrame(
+        {
+            "quadkey": ["122010321033023120", "122010321033023132", "122010321033023130"],
+            "aggregated_buildings": [39.1, 34.4, 15.7],
+        }
+    )
+
+    returned_data_unit_tiles_full = GDEProcessor.process_group_data_unit_tiles(
+        data_unit_tiles,
+        obm_buildings_per_quadkey,
+        config.database_completeness,
+        "obm_built_area_assessments",
+        number_cores=1,
+    )
+
+    # Expected (remainder_buildings, complete) per quadkey
+    expected_output = {
+        "122010321033023130": (12.7, False),
+        "122010321033023120": (0.0, False),
+        "122010321033023132": (0.0, True),
+    }
+
+    assert returned_data_unit_tiles_full.shape[0] == 3
+
+    for quadkey, expected in expected_output.items():
+        # Row of the output that corresponds to this quadkey
+        is_this_quadkey = returned_data_unit_tiles_full.quadkey == quadkey
+        returned_tile = returned_data_unit_tiles_full[is_this_quadkey]
+
+        returned_remainder = returned_tile["remainder_buildings"].to_numpy()[0]
+        returned_complete = returned_tile["complete"].to_numpy()[0]
+
+        assert returned_remainder == expected[0]
+        assert returned_complete == expected[1]
-- 
GitLab