From 1613ba3508ee47b64f7b2692a1b8b46667adf8dd Mon Sep 17 00:00:00 2001 From: Nicolas Garcia Ospina Date: Fri, 11 Jun 2021 08:28:05 +0200 Subject: [PATCH 1/5] Improved memory usage and removed geometry storage --- obmgapanalysis/fileprocessor.py | 33 +++----------------------------- obmgapanalysis/obmgapanalysis.py | 6 ++++-- obmgapanalysis/tileprocessor.py | 2 -- 3 files changed, 7 insertions(+), 34 deletions(-) diff --git a/obmgapanalysis/fileprocessor.py b/obmgapanalysis/fileprocessor.py index 390c1e9..9f3f5e7 100644 --- a/obmgapanalysis/fileprocessor.py +++ b/obmgapanalysis/fileprocessor.py @@ -19,7 +19,6 @@ import os import logging -import geopandas import pandas # Initialize log @@ -28,34 +27,7 @@ logger = logging.getLogger(__name__) class FileProcessor: @staticmethod - def write_tiles_to_csv( - list_of_dictionaries, output_pathname, column_geometry="built_area", crs="epsg:4326" - ): - """Write a csv file from a list of dictionaries. - - Args: - list_of_dictionaries (list): List of dictionaries with built-up areas to - write. - - output_pathname (str): Target path name for the csv file. - - column_geometry (str): Name of the field that contains geometries. - Default = "built_area" - - crs (str): EPSG code of the data projection. Default = "epsg:4326" - """ - - tiles_gdf = geopandas.GeoDataFrame( - list_of_dictionaries, geometry=column_geometry, crs=crs - ) - filepath_out = os.path.join( - output_pathname, "{}_{}.csv".format(tiles_gdf.quadkey.iloc[0], len(tiles_gdf.index)) - ) - logger.info("Creating {}".format(filepath_out)) - tiles_gdf.to_csv(filepath_out, index=False) - - @staticmethod - def write_obm_tiles_to_csv(list_of_dictionaries, output_pathname): + def write_tiles_to_csv_no_geom(list_of_dictionaries, output_pathname): """Write a csv file from a list of dictionaries without geometries. 
Args: @@ -66,9 +38,10 @@ class FileProcessor: """ tiles_df = pandas.DataFrame(list_of_dictionaries) + tiles_df = tiles_df.drop_duplicates(keep="first") filepath_out = os.path.join( output_pathname, - "OBM_{}_{}.csv".format(tiles_df.quadkey.iloc[0], len(tiles_df.index)), + "{}_{}.csv".format(tiles_df.quadkey.iloc[0], len(tiles_df.index)), ) logger.info("Creating {}".format(filepath_out)) tiles_df.to_csv(filepath_out, index=False) diff --git a/obmgapanalysis/obmgapanalysis.py b/obmgapanalysis/obmgapanalysis.py index 128bb42..93a7155 100644 --- a/obmgapanalysis/obmgapanalysis.py +++ b/obmgapanalysis/obmgapanalysis.py @@ -116,7 +116,8 @@ def multiprocess_built_estimation_batch(quadkey_batch): if built_up_areas: # Write output into a csv file - FileProcessor.write_tiles_to_csv(built_up_areas, output_pathname) + FileProcessor.write_tiles_to_csv_no_geom(built_up_areas, output_pathname) + del built_up_areas roads_database.connection.close() @@ -163,10 +164,11 @@ def multiprocess_buildings_batch(quadkey_batch): if obm_built_up_areas: # Write output into a csv file - FileProcessor.write_obm_tiles_to_csv( + FileProcessor.write_tiles_to_csv_no_geom( list_of_dictionaries=obm_built_up_areas, output_pathname=obm_output_pathname, ) + del obm_built_up_areas buildings_database.connection.close() diff --git a/obmgapanalysis/tileprocessor.py b/obmgapanalysis/tileprocessor.py index 8ae3bdd..96d9eed 100644 --- a/obmgapanalysis/tileprocessor.py +++ b/obmgapanalysis/tileprocessor.py @@ -278,7 +278,6 @@ class TileProcessor: Contains: quadkey (str): Tile quadkey source_id (int): Integer associated to a predefined method - built_area (str): Polygon string projected to WGS84 coordinates. built_area_size (float): Area measured in squared meters. last_update (str): Date when the pickle was generated. 
@@ -302,7 +301,6 @@ class TileProcessor: results = { "quadkey": tile.quadkey, "source_id": datasource.source_id, - "built_area": TileProcessor.reproject_polygon(built_polygon, tile.crs, "epsg:4326"), "built_area_size": TileProcessor.albers_area_calculation(built_polygon, tile.crs), "last_update": str(date.today()), } -- GitLab From 40281e0e5232bab9aa3b3057f14397cfa33943f2 Mon Sep 17 00:00:00 2001 From: Nicolas Garcia Ospina Date: Fri, 25 Jun 2021 10:11:10 +0200 Subject: [PATCH 2/5] Let geometry retrieval to be optional --- config-example.yml | 1 + obmgapanalysis/fileprocessor.py | 45 +++++++++++++++++++++++++------- obmgapanalysis/obmgapanalysis.py | 9 +++++-- obmgapanalysis/tileprocessor.py | 4 ++- setup.py | 1 + 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/config-example.yml b/config-example.yml index 2879f48..5bc2c4b 100644 --- a/config-example.yml +++ b/config-example.yml @@ -15,6 +15,7 @@ obm_output_pathname: ./obm_results import_pathname: ./results number_cores: 1 batch_size: 1000 +get_geometry: False roads_database: host: your_host.dir.request_data diff --git a/obmgapanalysis/fileprocessor.py b/obmgapanalysis/fileprocessor.py index 9f3f5e7..aba313e 100644 --- a/obmgapanalysis/fileprocessor.py +++ b/obmgapanalysis/fileprocessor.py @@ -20,6 +20,7 @@ import os import logging import pandas +import geopandas # Initialize log logger = logging.getLogger(__name__) @@ -27,21 +28,45 @@ logger = logging.getLogger(__name__) class FileProcessor: @staticmethod - def write_tiles_to_csv_no_geom(list_of_dictionaries, output_pathname): - """Write a csv file from a list of dictionaries without geometries. + def write_tiles_to_csv( + list_of_dictionaries, + output_pathname, + get_geometry=False, + column_geometry="built_area", + crs="epsg:4326", + ): + """Write a csv file from a list of dictionaries. Args: list_of_dictionaries (list): List of dictionaries with built-up areas to write. output_pathname (str): Target path name for the csv file. 
+ + get_geometry (bool): Set if the geometry will be written + + column_geometry (str): Name of the field that contains geometries. + Default = "built_area" + + crs (str): EPSG code of the data projection. Default = "epsg:4326" """ - tiles_df = pandas.DataFrame(list_of_dictionaries) - tiles_df = tiles_df.drop_duplicates(keep="first") - filepath_out = os.path.join( - output_pathname, - "{}_{}.csv".format(tiles_df.quadkey.iloc[0], len(tiles_df.index)), - ) - logger.info("Creating {}".format(filepath_out)) - tiles_df.to_csv(filepath_out, index=False) + if get_geometry is False: + tiles_df = pandas.DataFrame(list_of_dictionaries) + tiles_df = tiles_df.drop_duplicates(keep="first") + filepath_out = os.path.join( + output_pathname, + "{}_{}.csv".format(tiles_df.quadkey.iloc[0], len(tiles_df.index)), + ) + logger.info("Creating {}".format(filepath_out)) + tiles_df.to_csv(filepath_out, index=False) + else: + tiles_gdf = geopandas.GeoDataFrame( + list_of_dictionaries, geometry=column_geometry, crs=crs + ) + filepath_out = os.path.join( + output_pathname, + "{}_{}.csv".format(tiles_gdf.quadkey.iloc[0], len(tiles_gdf.index)), + ) + logger.info("Creating {}".format(filepath_out)) + tiles_gdf.to_csv(filepath_out, index=False) diff --git a/obmgapanalysis/obmgapanalysis.py b/obmgapanalysis/obmgapanalysis.py index 93a7155..e41ed64 100644 --- a/obmgapanalysis/obmgapanalysis.py +++ b/obmgapanalysis/obmgapanalysis.py @@ -65,6 +65,8 @@ if args.import_csv: target_db_config = config["target_database"] import_pathname = os.path.abspath(config["import_pathname"]) +get_geometry = config["get_geometry"] + def multiprocess_built_estimation_batch(quadkey_batch): """ @@ -116,7 +118,9 @@ def multiprocess_built_estimation_batch(quadkey_batch): if built_up_areas: # Write output into a csv file - FileProcessor.write_tiles_to_csv_no_geom(built_up_areas, output_pathname) + FileProcessor.write_tiles_to_csv( + built_up_areas, output_pathname, get_geometry=get_geometry + ) del built_up_areas 
roads_database.connection.close() @@ -164,7 +168,8 @@ def multiprocess_buildings_batch(quadkey_batch): if obm_built_up_areas: # Write output into a csv file - FileProcessor.write_tiles_to_csv_no_geom( + FileProcessor.write_tiles_to_csv( + get_geometry=get_geometry, list_of_dictionaries=obm_built_up_areas, output_pathname=obm_output_pathname, ) diff --git a/obmgapanalysis/tileprocessor.py b/obmgapanalysis/tileprocessor.py index 96d9eed..4f870e1 100644 --- a/obmgapanalysis/tileprocessor.py +++ b/obmgapanalysis/tileprocessor.py @@ -277,7 +277,8 @@ class TileProcessor: associated to the Tile and a given DataSource. Contains: quadkey (str): Tile quadkey - source_id (int): Integer associated to a predefined method + source_id (int): Integer associated to a predefined method. + built_area (str): Polygon string projected to WGS84 coordinates. built_area_size (float): Area measured in squared meters. last_update (str): Date when the pickle was generated. @@ -301,6 +302,7 @@ class TileProcessor: results = { "quadkey": tile.quadkey, "source_id": datasource.source_id, + "built_area": TileProcessor.reproject_polygon(built_polygon, tile.crs, "epsg:4326"), "built_area_size": TileProcessor.albers_area_calculation(built_polygon, tile.crs), "last_update": str(date.today()), } diff --git a/setup.py b/setup.py index 6f980ca..e2a81ef 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ setup( "babelgrid", "fiona", "rtree", + "pandas", "geopandas", "rasterio", "psycopg2-binary", -- GitLab From 89cf99e9f065e9d3130fb22cf83571b5990ac1d7 Mon Sep 17 00:00:00 2001 From: Nicolas Garcia Ospina Date: Fri, 25 Jun 2021 10:15:18 +0200 Subject: [PATCH 3/5] Removed geometry option from OBM --- obmgapanalysis/obmgapanalysis.py | 1 - 1 file changed, 1 deletion(-) diff --git a/obmgapanalysis/obmgapanalysis.py b/obmgapanalysis/obmgapanalysis.py index e41ed64..7edf8e0 100644 --- a/obmgapanalysis/obmgapanalysis.py +++ b/obmgapanalysis/obmgapanalysis.py @@ -169,7 +169,6 @@ def 
multiprocess_buildings_batch(quadkey_batch): if obm_built_up_areas: # Write output into a csv file FileProcessor.write_tiles_to_csv( - get_geometry=get_geometry, list_of_dictionaries=obm_built_up_areas, output_pathname=obm_output_pathname, ) -- GitLab From ff8023324aa4f26ada0190ceabf22e1ccc7b4ad7 Mon Sep 17 00:00:00 2001 From: Nicolas Garcia Ospina Date: Fri, 25 Jun 2021 10:22:59 +0200 Subject: [PATCH 4/5] Included optional geometry in dictionary creation --- obmgapanalysis/obmgapanalysis.py | 1 + obmgapanalysis/tileprocessor.py | 44 +++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/obmgapanalysis/obmgapanalysis.py b/obmgapanalysis/obmgapanalysis.py index 7edf8e0..affb1e6 100644 --- a/obmgapanalysis/obmgapanalysis.py +++ b/obmgapanalysis/obmgapanalysis.py @@ -108,6 +108,7 @@ def multiprocess_built_estimation_batch(quadkey_batch): database_crs_number=roads_database_crs_number, table_config=db_config["roads_table"], buffer_magnitude=db_config["process_buffer_magnitude"], + get_geometry=get_geometry, ) if result is not None: built_up_areas.append(result) diff --git a/obmgapanalysis/tileprocessor.py b/obmgapanalysis/tileprocessor.py index 4f870e1..9d077ce 100644 --- a/obmgapanalysis/tileprocessor.py +++ b/obmgapanalysis/tileprocessor.py @@ -272,7 +272,7 @@ class TileProcessor: return polygon.area @staticmethod - def build_dictionary(tile, datasource, built_polygon): + def build_dictionary(tile, datasource, built_polygon, get_geometry=False): """Returns a dictionary with the built-up area related attributes associated to the Tile and a given DataSource. 
Contains: @@ -298,22 +298,40 @@ class TileProcessor: if built_polygon.is_empty: logging.info("No built area found in {}".format(tile.quadkey)) return - - results = { - "quadkey": tile.quadkey, - "source_id": datasource.source_id, - "built_area": TileProcessor.reproject_polygon(built_polygon, tile.crs, "epsg:4326"), - "built_area_size": TileProcessor.albers_area_calculation(built_polygon, tile.crs), - "last_update": str(date.today()), - } + if get_geometry is False: + results = { + "quadkey": tile.quadkey, + "source_id": datasource.source_id, + "built_area_size": TileProcessor.albers_area_calculation( + built_polygon, tile.crs + ), + "last_update": str(date.today()), + } + else: + results = { + "quadkey": tile.quadkey, + "source_id": datasource.source_id, + "built_area": TileProcessor.reproject_polygon( + built_polygon, tile.crs, "epsg:4326" + ), + "built_area_size": TileProcessor.albers_area_calculation( + built_polygon, tile.crs + ), + "last_update": str(date.today()), + } if not results["source_id"]: del results["source_id"] - return results @staticmethod def get_built_up_area( - quadkey, datasource, database, database_crs_number, table_config, buffer_magnitude + quadkey, + datasource, + database, + database_crs_number, + table_config, + buffer_magnitude, + get_geometry=False, ): """Run the complete processing of a quadkey and returns a dictionary created with TileProcessor.build_dictionary. 
@@ -362,7 +380,9 @@ class TileProcessor: refined_built_area = TileProcessor.polygon_difference( clip_built_geometry, roads_processed ) - result = TileProcessor.build_dictionary(tile, datasource, refined_built_area) + result = TileProcessor.build_dictionary( + tile, datasource, refined_built_area, get_geometry=get_geometry + ) return result @staticmethod -- GitLab From 193bd9691b25f9d587b82d4f6e34b9f7795f2926 Mon Sep 17 00:00:00 2001 From: Nicolas Garcia Ospina Date: Mon, 28 Jun 2021 09:22:24 +0200 Subject: [PATCH 5/5] Included geometry in 03_Configuration_file.md and changed README.md --- README.md | 15 ++++++++++++++- docs/03_Configuration_file.md | 5 ++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 63cd11e..1af22cb 100644 --- a/README.md +++ b/README.md @@ -35,13 +35,26 @@ pip3 install . ## Running obmgapanalysis copy the config-example.yml to your working directory as config.yml and modify -the variables regarding the data source, database credentials and tiles for input quadkeys. +the variables regarding the data source, database credentials, multiprocessing +framework and tiles for input quadkeys. + +To assess built-up areas with a configured dataset: ```bash cd /your/working/directory obmgapanalysis ``` +To assess built-up areas based on a `buildings_database`: +```bash +obmgapanalysis --obm_built_up +``` + +To insert entries from the `import_pathname` into the `target_database`: +```bash +obmgapanalysis --import_csv +``` + ## Copyright and copyleft Copyright (C) 2021 diff --git a/docs/03_Configuration_file.md b/docs/03_Configuration_file.md index 8f367b2..1261005 100644 --- a/docs/03_Configuration_file.md +++ b/docs/03_Configuration_file.md @@ -35,8 +35,11 @@ amount of tiles to be handled per process. Each CSV file may contain maximum thi all of them provide built areas. output_pathname (str): Target path name for the csv file writing and reading. 
+ obm_output_pathname (str): Target path name for the OBM csv file writing and reading. + import_pathname (str): Target path name with csv files to import. number_cores (int): Desired maximum number of parallel processes to execute. - batch_size (int): Maximum amount of tiles to be handled per process + batch_size (int): Maximum amount of tiles to be handled per process. + get_geometry (bool): If True, geometries will be stored in the output csv files. The last sections refer to database connections. `database` holds a database from which roads can be extracted to refine built areas, also it may contain buildings if the program wants to calculate a -- GitLab