From 2e3d6255ba5241e2336560138d1058c5104c3984 Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 16:20:44 +0100 Subject: [PATCH 1/9] Only run poly.buffer(0) on invalid polygons. --- py_tools_ds/geo/raster/conversion.py | 29 ++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index fbe28db..f51fbbd 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -147,19 +147,28 @@ def raster2polygon(array: np.ndarray, # convert JSON string to dict gjs = json.loads(content_str) - gc = GeometryCollection([shape(f["geometry"]).buffer(0) - for f in gjs['features'] if f['properties']['DN'] == str(DN2extract)]) - - # convert GeometryCollection into MultiPolygon + def get_valid_polys(val: Union[Polygon, MultiPolygon, GeometryCollection]): + if isinstance(val, Polygon): + val = val if val.is_valid else val.buffer(0) + if isinstance(val, Polygon): + return val + for g in val.geoms: + return get_valid_polys(g) + + # extract polygons from GeoJSON dict polys = [] - for i in gc.geoms: - if isinstance(i, Polygon): - polys.append(i) - else: - polys.extend(list(i.geoms)) + for f in gjs['features']: + if f['properties']['DN'] == str(DN2extract): + geom = shape(f["geometry"]) + geom = get_valid_polys(geom) + + if isinstance(geom, list): + polys.extend(geom) + else: + polys.append(geom) # drop polygons with an area below npx - if min_npx > 1: + if min_npx: area_1px = gt[1] * abs(gt[5]) area_min = min_npx * area_1px -- GitLab From 39da52e75bf311bd810c204e41b80ed7d98762fd Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 20:45:58 +0100 Subject: [PATCH 2/9] Implement gdal.Sieve to remove small polygons (speedup) --- py_tools_ds/geo/raster/conversion.py | 25 +++++++-- tests/test_geo/test_raster/test_conversion.py | 54 ++++++++++++------- 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index f51fbbd..12abe91 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -31,7 +31,7 @@ import json from typing import Union from shapely.geometry import shape, Polygon, MultiPolygon, GeometryCollection -from osgeo import gdal, osr, ogr +from osgeo import gdal, osr, ogr # noqa import numpy as np from ...io.raster.gdal import get_GDAL_ds_inmem @@ -60,7 +60,7 @@ def raster2polygon(array: np.ndarray, :param exact: whether to compute the exact footprint polygon or a simplified one for speed :param maxfeatCount: the maximum expected number of polygons. If more polygons are found, every further processing is cancelled and a RunTimeError is raised. - :param min_npx: minmal polygon area to be included in the result (in numbers of pixels) + :param min_npx: minmal polygon area to be included in the result (in numbers of pixels; default: 1) :param timeout: breaks the process after a given time in seconds :param progress: show progress bars (default: True) :param q: quiet mode (default: False) @@ -79,7 +79,7 @@ def raster2polygon(array: np.ndarray, assert array.ndim == 2, "Only 2D arrays are supported. Got a %sD array." % array.ndim gt_orig = gt - shape_orig = array.shape + rows, cols = shape_orig = array.shape # downsample input array in case it has more than 1e8 pixels to prevent crash if not exact and array.size > 1e8: # 10000 x 10000 px @@ -91,6 +91,25 @@ def raster2polygon(array: np.ndarray, rspAlg='near', q=True) + # remove raster polygons smaller than min_npx + if array.size > min_npx > 1: + drv = gdal.GetDriverByName('GTiff') + path_tmp = '/vsimem/raster2polygon_gdalsieve.tif' + + ds = drv.Create(path_tmp, cols, rows, 1, gdal.GDT_Int16) + band = ds.GetRasterBand(1) + band.WriteArray(array) + + callback = gdal.TermProgress_nocb if array.size > 1e8 else None + gdal.SieveFilter(srcBand=band, maskBand=None, dstBand=band, threshold=min_npx, + # connectedness=4 if exact else 8, + connectedness=4, + callback=callback) + + array = ds.ReadAsArray() + del ds, band + gdal.Unlink(path_tmp) + src_ds = get_GDAL_ds_inmem(array, gt, prj) src_band = src_ds.GetRasterBand(1) diff --git a/tests/test_geo/test_raster/test_conversion.py b/tests/test_geo/test_raster/test_conversion.py index 4f191fe..2c2364b 100644 --- a/tests/test_geo/test_raster/test_conversion.py +++ b/tests/test_geo/test_raster/test_conversion.py @@ -46,54 +46,72 @@ tests_path = os.path.abspath(os.path.join(py_tools_ds.__file__, "..", "..", "tes class Test_raster2polygon(TestCase): - def setUp(self) -> None: + @classmethod + def setUpClass(cls) -> None: try: # load a large mask containing an irregular binary shape with a complex footprint (27408x17604) ds_l = gdal.Open(os.path.join(tests_path, 'data', 'mask_large.tif')) - self.mask_l = ds_l.ReadAsArray().astype(bool) - self.gt_l = ds_l.GetGeoTransform() - self.prj_l = ds_l.GetProjection() + cls.mask_l = ds_l.ReadAsArray().astype(bool) + cls.gt_l = ds_l.GetGeoTransform() + cls.prj_l = ds_l.GetProjection() finally: del ds_l try: - # load a large mask containing 239 features + # load a large mask containing 192 features ds_mf = gdal.Open(os.path.join(tests_path, 'data', 'mask_many_features.tif')) - self.mask_mf = ds_mf.ReadAsArray().astype(bool) - self.gt_mf = ds_mf.GetGeoTransform() - self.prj_mf = ds_mf.GetProjection() + cls.mask_mf = ds_mf.ReadAsArray().astype(bool) + cls.gt_mf = ds_mf.GetGeoTransform() + cls.prj_mf = ds_mf.GetProjection() finally: del ds_mf - self.testarr = np.array([[2, 0, 2, 0, 0], - [0, 2, 0, 1, 2], - [1, 2, 0, 1, 0], - [1, 2, 2, 2, 1], - [1, 1, 2, 1, 1]]) + try: + # load a large mask containing 550 features including many very small ones + ds_msf = gdal.Open(os.path.join(tests_path, 'data', 'mask_many_small_features.tif')) + cls.mask_msf = ds_msf.ReadAsArray().astype(bool) + cls.gt_msf = ds_msf.GetGeoTransform() + cls.prj_msf = ds_msf.GetProjection() + + finally: + del ds_msf + + cls.testarr = np.array([[2, 0, 2, 0, 0], + [0, 2, 0, 1, 2], + [1, 2, 0, 1, 0], + [1, 2, 2, 2, 1], + [1, 1, 2, 1, 1]]) def test_mask_large(self): - poly = raster2polygon(self.mask_l, self.gt_l, self.prj_l, exact=False) + poly = raster2polygon(self.mask_l, self.gt_l, self.prj_l, exact=False, min_npx=1) self.assertIsInstance(poly, MultiPolygon) def test_mask_many_features(self): - poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False) + poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=True, min_npx=1) self.assertIsInstance(poly, MultiPolygon) self.assertEqual(len(poly.geoms), 239) def test_min_npx(self): - poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False, min_npx=5) + poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False, min_npx=20) self.assertIsInstance(poly, MultiPolygon) self.assertLess(len(poly.geoms), 239) + def test_mask_many_small_features(self): + # the issue here is that due to the many small features, the largest polygon gets >100k vertices + poly = raster2polygon(self.mask_msf, self.gt_msf, self.prj_msf, exact=False, min_npx=5) + self.assertIsInstance(poly, MultiPolygon) + self.assertLess(len(poly.geoms), 550) + def test_handle_multipart_polygons(self): - # this causes gdal.Polygonize to produce some MultiPolygons instead pf Polygons which has to be handled + # this causes gdal.Polygonize to produce some MultiPolygons instead of Polygons which has to be handled poly = raster2polygon(~self.mask_mf.astype(bool), self.gt_mf, self.prj_mf, exact=True) self.assertIsInstance(poly, MultiPolygon) def test_DN2extract(self): for DN, nexp in zip(range(3), [5, 3, 4]): with self.subTest(DN=DN): - poly = raster2polygon(self.testarr, self.gt_mf, self.prj_mf, exact=False, DN2extract=DN) + poly = raster2polygon(self.testarr, self.gt_mf, self.prj_mf, exact=True, DN2extract=DN, min_npx=1) + self.assertIsInstance(poly, MultiPolygon) self.assertEqual(len(poly.geoms), nexp) -- GitLab From 1704173cf629bc46f57c312a797f321743d1cefc Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 20:47:43 +0100 Subject: [PATCH 3/9] Exact polygonize output requires 4 connectedness. --- py_tools_ds/geo/raster/conversion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index 12abe91..3344d0e 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -137,7 +137,8 @@ def raster2polygon(array: np.ndarray, src_band, # .GetMaskBand(), dst_layer, 0, - ["8CONNECTED=8"] if exact else [], + [] if exact else ["8CONNECTED=8"], # 4-connectedness for exact output + # callback=gdal.TermProgress_nocb) callback=callback) del dst_layer, dst_ds -- GitLab From e92b3d40115e09a45357fc0cd0352dee7ae56fc7 Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 21:00:57 +0100 Subject: [PATCH 4/9] Ensure raster2polygon uses uint8 internally (faster, less memory). --- py_tools_ds/geo/raster/conversion.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index 3344d0e..65526e8 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -66,16 +66,14 @@ def raster2polygon(array: np.ndarray, :param q: quiet mode (default: False) :return: """ - array = array.astype(int) - # TODO if maxfeatCount is not None: warnings.warn("'maxfeatCount' is deprecated and will be removed soon.", DeprecationWarning) # pragma: no cover - # gdal.Polygonize ignores the 0 - if DN2extract == 0: - array = array == 0 - DN2extract = 1 + if not isinstance(array.dtype, np.integer): + array = array.astype(int) + + array = (array == DN2extract).astype(np.uint8) assert array.ndim == 2, "Only 2D arrays are supported. Got a %sD array." % array.ndim gt_orig = gt @@ -96,7 +94,7 @@ def raster2polygon(array: np.ndarray, drv = gdal.GetDriverByName('GTiff') path_tmp = '/vsimem/raster2polygon_gdalsieve.tif' - ds = drv.Create(path_tmp, cols, rows, 1, gdal.GDT_Int16) + ds = drv.Create(path_tmp, cols, rows, 1, gdal.GDT_Byte) band = ds.GetRasterBand(1) band.WriteArray(array) @@ -178,7 +176,7 @@ def raster2polygon(array: np.ndarray, # extract polygons from GeoJSON dict polys = [] for f in gjs['features']: - if f['properties']['DN'] == str(DN2extract): + if f['properties']['DN'] == str(1): geom = shape(f["geometry"]) geom = get_valid_polys(geom) -- GitLab From 9121e4260835d03c8e5273f935329c41015b983f Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 21:04:51 +0100 Subject: [PATCH 5/9] Add comment. --- py_tools_ds/geo/raster/conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index 65526e8..e7400e4 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -101,7 +101,7 @@ def raster2polygon(array: np.ndarray, callback = gdal.TermProgress_nocb if array.size > 1e8 else None gdal.SieveFilter(srcBand=band, maskBand=None, dstBand=band, threshold=min_npx, # connectedness=4 if exact else 8, - connectedness=4, + connectedness=4, # 4-connectedness is 30% faster callback=callback) array = ds.ReadAsArray() -- GitLab From 36586fc1fa4478b933ff8925026f8693797d0dc7 Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 21:30:04 +0100 Subject: [PATCH 6/9] Add missing test data (mask file). --- tests/data/mask_many_small_features.tif | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/data/mask_many_small_features.tif diff --git a/tests/data/mask_many_small_features.tif b/tests/data/mask_many_small_features.tif new file mode 100644 index 0000000..d0df1eb --- /dev/null +++ b/tests/data/mask_many_small_features.tif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957e95563d592b94aeb2c91a7cf007d9d16f97b08a4bacc437dcb8e9288eab8c +size 1353916 -- GitLab From 68472af856155db0a01e632a1be0035abc0497c2 Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 21:47:33 +0100 Subject: [PATCH 7/9] Only use a callback if progress is True. --- py_tools_ds/geo/raster/conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index e7400e4..aa49e33 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -98,7 +98,7 @@ def raster2polygon(array: np.ndarray, band = ds.GetRasterBand(1) band.WriteArray(array) - callback = gdal.TermProgress_nocb if array.size > 1e8 else None + callback = gdal.TermProgress_nocb if array.size > 1e8 and progress else None gdal.SieveFilter(srcBand=band, maskBand=None, dstBand=band, threshold=min_npx, # connectedness=4 if exact else 8, connectedness=4, # 4-connectedness is 30% faster -- GitLab From af25b9daf59f29e735878e8c61e1051a518db60f Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 21:56:29 +0100 Subject: [PATCH 8/9] Use multiprocessing for downsampling. --- py_tools_ds/geo/raster/conversion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index aa49e33..909b5ce 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -87,6 +87,7 @@ def raster2polygon(array: np.ndarray, out_gsd=(gt[1] / zoom_factor, gt[5] / zoom_factor), rspAlg='near', + CPUs=None, # all CPUs q=True) # remove raster polygons smaller than min_npx -- GitLab From af3e0359de05e731a56984764da46a90faab98de Mon Sep 17 00:00:00 2001 From: Daniel Scheffler Date: Fri, 3 Dec 2021 22:09:46 +0100 Subject: [PATCH 9/9] Updated HISTORY.rst. --- HISTORY.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 1aebe2a..a526b99 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,14 @@ History ======= +0.20.1 (2021-12-03) +------------------- + +* `!35`_: Added gdal.Sieve to raster2polygon and polygon buffering only runs for invalid polygons. + It is now much faster for masks that contain many small features if min_npx>1. + +.. _!35: https://git.gfz-potsdam.de/danschef/py_tools_ds/-/merge_requests/33] + 0.20.0 (2021-11-24) ------------------- @@ -10,7 +18,6 @@ History .coveragerc. * `!34`_: Fixed AttributeError in ProgressBar in case sys.stderr is None. Added tests for processing module. - .. _!33: https://git.gfz-potsdam.de/danschef/py_tools_ds/-/merge_requests/33] .. _!34: https://git.gfz-potsdam.de/danschef/py_tools_ds/-/merge_requests/34] -- GitLab