diff --git a/HISTORY.rst b/HISTORY.rst index 1aebe2abb7983eba79bc72b82a58b87bd97a05f5..a526b99036d4c9eef5ffea50ec5449fd203d3b37 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,14 @@ History ======= +0.20.1 (2021-12-03) +------------------- + +* `!35`_: Added gdal.Sieve to raster2polygon and polygon buffering only runs for invalid polygons. + It is now much faster for masks that contain many small features if min_npx>1. + +.. _!35: https://git.gfz-potsdam.de/danschef/py_tools_ds/-/merge_requests/33] + 0.20.0 (2021-11-24) ------------------- @@ -10,7 +18,6 @@ History .coveragerc. * `!34`_: Fixed AttributeError in ProgressBar in case sys.stderr is None. Added tests for processing module. - .. _!33: https://git.gfz-potsdam.de/danschef/py_tools_ds/-/merge_requests/33] .. _!34: https://git.gfz-potsdam.de/danschef/py_tools_ds/-/merge_requests/34] diff --git a/py_tools_ds/geo/raster/conversion.py b/py_tools_ds/geo/raster/conversion.py index fbe28db481a9c0445d14eec1b17258cbccfb4187..909b5ced9b82eaca4cbec2bd0c5db7154fc4cabc 100644 --- a/py_tools_ds/geo/raster/conversion.py +++ b/py_tools_ds/geo/raster/conversion.py @@ -31,7 +31,7 @@ import json from typing import Union from shapely.geometry import shape, Polygon, MultiPolygon, GeometryCollection -from osgeo import gdal, osr, ogr +from osgeo import gdal, osr, ogr # noqa import numpy as np from ...io.raster.gdal import get_GDAL_ds_inmem @@ -60,26 +60,24 @@ def raster2polygon(array: np.ndarray, :param exact: whether to compute the exact footprint polygon or a simplified one for speed :param maxfeatCount: the maximum expected number of polygons. If more polygons are found, every further processing is cancelled and a RunTimeError is raised. - :param min_npx: minmal polygon area to be included in the result (in numbers of pixels) + :param min_npx: minmal polygon area to be included in the result (in numbers of pixels; default: 1) :param timeout: breaks the process after a given time in seconds :param progress: show progress bars (default: True) :param q: quiet mode (default: False) :return: """ - array = array.astype(int) - # TODO if maxfeatCount is not None: warnings.warn("'maxfeatCount' is deprecated and will be removed soon.", DeprecationWarning) # pragma: no cover - # gdal.Polygonize ignores the 0 - if DN2extract == 0: - array = array == 0 - DN2extract = 1 + if not isinstance(array.dtype, np.integer): + array = array.astype(int) + + array = (array == DN2extract).astype(np.uint8) assert array.ndim == 2, "Only 2D arrays are supported. Got a %sD array." % array.ndim gt_orig = gt - shape_orig = array.shape + rows, cols = shape_orig = array.shape # downsample input array in case it has more than 1e8 pixels to prevent crash if not exact and array.size > 1e8: # 10000 x 10000 px @@ -89,8 +87,28 @@ def raster2polygon(array: np.ndarray, out_gsd=(gt[1] / zoom_factor, gt[5] / zoom_factor), rspAlg='near', + CPUs=None, # all CPUs q=True) + # remove raster polygons smaller than min_npx + if array.size > min_npx > 1: + drv = gdal.GetDriverByName('GTiff') + path_tmp = '/vsimem/raster2polygon_gdalsieve.tif' + + ds = drv.Create(path_tmp, cols, rows, 1, gdal.GDT_Byte) + band = ds.GetRasterBand(1) + band.WriteArray(array) + + callback = gdal.TermProgress_nocb if array.size > 1e8 and progress else None + gdal.SieveFilter(srcBand=band, maskBand=None, dstBand=band, threshold=min_npx, + # connectedness=4 if exact else 8, + connectedness=4, # 4-connectedness is 30% faster + callback=callback) + + array = ds.ReadAsArray() + del ds, band + gdal.Unlink(path_tmp) + src_ds = get_GDAL_ds_inmem(array, gt, prj) src_band = src_ds.GetRasterBand(1) @@ -118,7 +136,8 @@ def raster2polygon(array: np.ndarray, src_band, # .GetMaskBand(), dst_layer, 0, - ["8CONNECTED=8"] if exact else [], + [] if exact else ["8CONNECTED=8"], # 4-connectedness for exact output + # callback=gdal.TermProgress_nocb) callback=callback) del dst_layer, dst_ds @@ -147,19 +166,28 @@ def raster2polygon(array: np.ndarray, # convert JSON string to dict gjs = json.loads(content_str) - gc = GeometryCollection([shape(f["geometry"]).buffer(0) - for f in gjs['features'] if f['properties']['DN'] == str(DN2extract)]) + def get_valid_polys(val: Union[Polygon, MultiPolygon, GeometryCollection]): + if isinstance(val, Polygon): + val = val if val.is_valid else val.buffer(0) + if isinstance(val, Polygon): + return val + for g in val.geoms: + return get_valid_polys(g) - # convert GeometryCollection into MultiPolygon + # extract polygons from GeoJSON dict polys = [] - for i in gc.geoms: - if isinstance(i, Polygon): - polys.append(i) - else: - polys.extend(list(i.geoms)) + for f in gjs['features']: + if f['properties']['DN'] == str(1): + geom = shape(f["geometry"]) + geom = get_valid_polys(geom) + + if isinstance(geom, list): + polys.extend(geom) + else: + polys.append(geom) # drop polygons with an area below npx - if min_npx > 1: + if min_npx: area_1px = gt[1] * abs(gt[5]) area_min = min_npx * area_1px diff --git a/tests/data/mask_many_small_features.tif b/tests/data/mask_many_small_features.tif new file mode 100644 index 0000000000000000000000000000000000000000..d0df1eb9f8d57959aead6a1faacc4e46ded9e338 --- /dev/null +++ b/tests/data/mask_many_small_features.tif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957e95563d592b94aeb2c91a7cf007d9d16f97b08a4bacc437dcb8e9288eab8c +size 1353916 diff --git a/tests/test_geo/test_raster/test_conversion.py b/tests/test_geo/test_raster/test_conversion.py index 4f191fea21aa132c91062b9fa04827aada669907..2c2364ba796b5a1d26d4b455454d8ccea6f8bec3 100644 --- a/tests/test_geo/test_raster/test_conversion.py +++ b/tests/test_geo/test_raster/test_conversion.py @@ -46,54 +46,72 @@ tests_path = os.path.abspath(os.path.join(py_tools_ds.__file__, "..", "..", "tes class Test_raster2polygon(TestCase): - def setUp(self) -> None: + @classmethod + def setUpClass(cls) -> None: try: # load a large mask containing an irregular binary shape with a complex footprint (27408x17604) ds_l = gdal.Open(os.path.join(tests_path, 'data', 'mask_large.tif')) - self.mask_l = ds_l.ReadAsArray().astype(bool) - self.gt_l = ds_l.GetGeoTransform() - self.prj_l = ds_l.GetProjection() + cls.mask_l = ds_l.ReadAsArray().astype(bool) + cls.gt_l = ds_l.GetGeoTransform() + cls.prj_l = ds_l.GetProjection() finally: del ds_l try: - # load a large mask containing 239 features + # load a large mask containing 192 features ds_mf = gdal.Open(os.path.join(tests_path, 'data', 'mask_many_features.tif')) - self.mask_mf = ds_mf.ReadAsArray().astype(bool) - self.gt_mf = ds_mf.GetGeoTransform() - self.prj_mf = ds_mf.GetProjection() + cls.mask_mf = ds_mf.ReadAsArray().astype(bool) + cls.gt_mf = ds_mf.GetGeoTransform() + cls.prj_mf = ds_mf.GetProjection() finally: del ds_mf - self.testarr = np.array([[2, 0, 2, 0, 0], - [0, 2, 0, 1, 2], - [1, 2, 0, 1, 0], - [1, 2, 2, 2, 1], - [1, 1, 2, 1, 1]]) + try: + # load a large mask containing 550 features including many very small ones + ds_msf = gdal.Open(os.path.join(tests_path, 'data', 'mask_many_small_features.tif')) + cls.mask_msf = ds_msf.ReadAsArray().astype(bool) + cls.gt_msf = ds_msf.GetGeoTransform() + cls.prj_msf = ds_msf.GetProjection() + + finally: + del ds_msf + + cls.testarr = np.array([[2, 0, 2, 0, 0], + [0, 2, 0, 1, 2], + [1, 2, 0, 1, 0], + [1, 2, 2, 2, 1], + [1, 1, 2, 1, 1]]) def test_mask_large(self): - poly = raster2polygon(self.mask_l, self.gt_l, self.prj_l, exact=False) + poly = raster2polygon(self.mask_l, self.gt_l, self.prj_l, exact=False, min_npx=1) self.assertIsInstance(poly, MultiPolygon) def test_mask_many_features(self): - poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False) + poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=True, min_npx=1) self.assertIsInstance(poly, MultiPolygon) self.assertEqual(len(poly.geoms), 239) def test_min_npx(self): - poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False, min_npx=5) + poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False, min_npx=20) self.assertIsInstance(poly, MultiPolygon) self.assertLess(len(poly.geoms), 239) + def test_mask_many_small_features(self): + # the issue here is that due to the many small features, the largest polygon gets >100k vertices + poly = raster2polygon(self.mask_msf, self.gt_msf, self.prj_msf, exact=False, min_npx=5) + self.assertIsInstance(poly, MultiPolygon) + self.assertLess(len(poly.geoms), 550) + def test_handle_multipart_polygons(self): - # this causes gdal.Polygonize to produce some MultiPolygons instead pf Polygons which has to be handled + # this causes gdal.Polygonize to produce some MultiPolygons instead of Polygons which has to be handled poly = raster2polygon(~self.mask_mf.astype(bool), self.gt_mf, self.prj_mf, exact=True) self.assertIsInstance(poly, MultiPolygon) def test_DN2extract(self): for DN, nexp in zip(range(3), [5, 3, 4]): with self.subTest(DN=DN): - poly = raster2polygon(self.testarr, self.gt_mf, self.prj_mf, exact=False, DN2extract=DN) + poly = raster2polygon(self.testarr, self.gt_mf, self.prj_mf, exact=True, DN2extract=DN, min_npx=1) + self.assertIsInstance(poly, MultiPolygon) self.assertEqual(len(poly.geoms), nexp)