Use dictionary to detect if a boundary already exists

d73beae9 · Laurens Oostwegel · 9cad7080 · d73beae9
Commit d73beae9 authored 1 year ago by Laurens Oostwegel
--- a/exposureinitializer/exposureinitializer.py
+++ b/exposureinitializer/exposureinitializer.py
@@ -205,17 +205,17 @@ class ExposureInitializer:
                asset_dict[taxonomy_id]["floorspace"] += asset["floorspace"]
        return asset_dict

-    def multiprocess_districts(self, districts, num_processors):
+    def multiprocess_districts(self, country_iso_code, district_assets, num_processors):
        """
        Initializes the queue and a pool of workers that run the `process_district` function on
        one core each.

        Args:
-            districts (list):
-                A list of all districts, containing the information below for each district:
-                    Boundary ID
-                    Country ISO code
-                    Asset dictionary
+            country_iso_code (str):
+                ISO 3166-1 alpha-3 code of the country
+            district_assets (dict):
+                A dictionary of all districts, with the boundary ID as key and the asset
+                dictionary as value.
            num_processors (int):
                Number of processors that are used in the multiprocessing pool.
        """
@@ -227,7 +227,7 @@ class ExposureInitializer:
        pool = Pool(num_processors - 1, self.worker, (queue,))

        # Fill the queue
-        for boundary_id, country_iso_code, asset_dict in districts:
+        for boundary_id, asset_dict in district_assets.items():
            queue.put([boundary_id, country_iso_code, asset_dict])

        # Wait until the queue is empty
@@ -397,28 +397,16 @@ class ExposureInitializer:
        """

        country_asset_dict = {}
-        districts = []
+        district_assets = {}
        # Iterate through all given exposure files
        for exposure_filepath in glob.glob(exposure_model_search_pattern):
            logger.info(f"Processing {exposure_filepath}")
            csv_reader = csv.DictReader(open(exposure_filepath), delimiter=",")
-            # Sort the exposure file by boundary ID to have all assets of one district being
-            # listed consecutively to avoid listing same taxonomies multiple times
-            sorted_exposure = sorted(csv_reader, key=lambda line: line["BOUNDARY_ID"])
-            # Prepare the control variables
-            last_boundary_id = None
-            location_count = 0
-            asset_dict = {}
-            boundary_id = None
-            for row in sorted_exposure:
-                # Check if the line starts the asset list of a new location
-                if not (last_boundary_id == row["BOUNDARY_ID"]):
-                    if location_count > 0:
-                        districts.append([boundary_id, country_iso_code, asset_dict])
-                    location_count += 1
+            for row in csv_reader:
+                # Check if the asset dict of the boundary already exists
                boundary_id = row["BOUNDARY_ID"]
-                    asset_dict = {}  # Reset the location-based asset dictionary
-                    last_boundary_id = row["BOUNDARY_ID"]
+                if boundary_id not in district_assets:
+                    district_assets[boundary_id] = {}

                # Read in an asset
                # Create the expanded taxonomy string and add the occupancy to it
@@ -447,21 +435,21 @@ class ExposureInitializer:
                    area_per_dwelling = float(row["AREA_PER_DWELLING_SQM"])
                    asset["floorspace"] = dwellings * area_per_dwelling

-                # Store the asset in a location-based list and country-based list
-                asset_dict = self.add_asset_to_dict(asset_dict, taxonomy_id, asset)
+                # Store the asset in the location-based and country-based dictionaries
+                district_assets[boundary_id] = self.add_asset_to_dict(
+                    district_assets[boundary_id], taxonomy_id, asset
+                )
                country_asset_dict = self.add_asset_to_dict(
                    country_asset_dict, taxonomy_id, asset
                )

-            districts.append([boundary_id, country_iso_code, asset_dict])
-
        # If there is more than one processor, process the districts with the
        # `multiprocess_districts` function
        if num_processors == 1:
-            for boundary_id, country_iso_code, asset_dict in districts:
+            for boundary_id, asset_dict in district_assets.items():
                self.process_district(boundary_id, country_iso_code, asset_dict)
        else:
-            self.multiprocess_districts(districts, num_processors)
+            self.multiprocess_districts(country_iso_code, district_assets, num_processors)

        logger.info("Assign the country-average assets")
        # Normalize the country-average asset distribution