Commit a9a33146 authored by Daniel Scheffler's avatar Daniel Scheffler
Browse files

Merge branch 'enhancement/revise_polygonize' into 'master'

Enhancement/revise polygonize

See merge request !32
parents b7177478 d4108888
Pipeline #30320 passed with stages
in 4 minutes and 34 seconds
......@@ -2,6 +2,14 @@
History
=======
0.19.0 (2021-11-11)
-------------------
* Revised raster2polygon() and fill_holes_within_poly() to be much faster and robust in case of MultiPolygons
with many features.
* Removed 'geopandas' as requirement.
0.18.1 (2021-10-20)
-------------------
......
......@@ -39,7 +39,6 @@ to resolve the following dependencies before the pip installer is run:
* gdal
* geopandas
* numpy
* pyproj >=2.1.0
* shapely
......
......@@ -26,8 +26,15 @@
__author__ = "Daniel Scheffler"
from shapely.wkb import loads
import os
import warnings
from tempfile import TemporaryDirectory
import json
from typing import Union
from shapely.geometry import shape, Polygon, MultiPolygon, GeometryCollection
from osgeo import gdal, osr, ogr
import numpy as np
from ...io.raster.gdal import get_GDAL_ds_inmem
from ...processing.progress_mon import ProgressBar, Timer
......@@ -35,8 +42,17 @@ from ..raster.reproject import warp_ndarray
from ..vector.topology import get_bounds_polygon, polyVertices_outside_poly, get_overlap_polygon
def raster2polygon(array, gt, prj, DN2extract=1, exact=True, maxfeatCount=None,
timeout=None, progress=True, q=False):
def raster2polygon(array: np.ndarray,
gt: Union[list, tuple],
prj: Union[str, int],
DN2extract: int = 1,
exact: bool = True,
maxfeatCount: int = None,
min_npx: int = 1,
timeout: float = None,
progress: bool = True,
q: bool = False
) -> Union[Polygon, MultiPolygon]:
"""Calculate a footprint polygon for the given array.
:param array: 2D numpy array
......@@ -44,13 +60,25 @@ def raster2polygon(array, gt, prj, DN2extract=1, exact=True, maxfeatCount=None,
:param prj: projection as WKT string, 'EPSG:1234' or <EPSG_int>
:param DN2extract: <int, float> pixel value to create polygons for
:param exact: whether to compute the exact footprint polygon or a simplified one for speed
:param maxfeatCount: <int> the maximum expected number of polygons. If more polygons are found, every further
:param maxfeatCount: the maximum expected number of polygons. If more polygons are found, every further
processing is cancelled and a RunTimeError is raised.
:param min_npx: minmal polygon area to be included in the result (in numbers of pixels)
:param timeout: breaks the process after a given time in seconds
:param progress: show progress bars (default: True)
:param q: quiet mode (default: False)
:return:
"""
array = array.astype(int)
# TODO
if maxfeatCount is not None:
warnings.warn("'maxfeatCount' is deprecated and will be removed soon.", DeprecationWarning) # pragma: no cover
# gdal.Polygonize ignores the 0
if DN2extract == 0:
array = array == 0
DN2extract = 1
assert array.ndim == 2, "Only 2D arrays are supported. Got a %sD array." % array.ndim
gt_orig = gt
shape_orig = array.shape
......@@ -68,97 +96,76 @@ def raster2polygon(array, gt, prj, DN2extract=1, exact=True, maxfeatCount=None,
src_ds = get_GDAL_ds_inmem(array, gt, prj)
src_band = src_ds.GetRasterBand(1)
# Create a memory OGR datasource to put results in.
mem_drv = ogr.GetDriverByName('Memory')
mem_ds = mem_drv.CreateDataSource('out')
# Create a GeoJSON OGR datasource to put results in.
srs = osr.SpatialReference()
srs.ImportFromWkt(prj)
mem_layer = mem_ds.CreateLayer('poly', srs, ogr.wkbPolygon)
fd = ogr.FieldDefn('DN', ogr.OFTInteger)
mem_layer.CreateField(fd)
# set callback
callback = \
ProgressBar(prefix='Polygonize progress ',
suffix='Complete',
barLength=50,
timeout=timeout,
use_as_callback=True) \
if progress and not q else Timer(timeout, use_as_callback=True) if timeout else None
# run the algorithm
status = gdal.Polygonize(src_band,
src_band.GetMaskBand(),
mem_layer,
0,
["8CONNECTED=8"] if exact else [],
callback=callback)
# handle exit status other than 0 (fail)
if status != 0:
errMsg = gdal.GetLastErrorMsg()
# Catch the KeyboardInterrupt raised in case of a timeout within the callback. It seems like there is no other
# way to catch exceptions within callbacks.
if errMsg == 'User terminated':
raise TimeoutError('raster2polygon timed out!')
raise Exception(errMsg)
# extract polygon
mem_layer.SetAttributeFilter('DN = %s' % DN2extract)
from geopandas import GeoDataFrame
featCount = mem_layer.GetFeatureCount()
if not featCount:
raise RuntimeError('No features with DN=%s found in the input image.' % DN2extract)
if maxfeatCount and featCount > maxfeatCount:
raise RuntimeError('Found %s features with DN=%s but maximum feature count was set to %s.'
% (featCount, DN2extract, maxfeatCount))
# tmp = np.full((featCount,2), DN, geoArr.dtype)
# tmp[:,0] = range(featCount)
# GDF = GeoDataFrame(tmp, columns=['idx','DN'])
# def get_shplyPoly(GDF_row):
# if not is_timed_out(3):
# element = mem_layer.GetNextFeature()
# shplyPoly = loads(element.GetGeometryRef().ExportToWkb()).buffer(0)
# element = None
# return shplyPoly
# else:
# raise TimeoutError
# GDF['geometry'] = GDF.apply(get_shplyPoly, axis=1)
GDF = GeoDataFrame(columns=['geometry', 'DN'])
GDF.DN = GDF.DN.astype(float)
timer = Timer(timeout)
for i in range(featCount):
if not timer.timed_out:
element = mem_layer.GetNextFeature()
wkb = bytes(element.GetGeometryRef().ExportToWkb())
GDF.loc[i] = [loads(wkb).buffer(0), DN2extract]
del element
with TemporaryDirectory() as td:
drv = ogr.GetDriverByName("GeoJSON")
path_geojson = os.path.join(td, "polygonize_result.geojson")
dst_ds = drv.CreateDataSource(path_geojson)
dst_layer = dst_ds.CreateLayer('polygonize_result', srs=srs)
dst_layer.CreateField(ogr.FieldDefn("DN", 4))
# set callback
callback = \
ProgressBar(prefix='Polygonize progress ',
suffix='Complete',
barLength=50,
timeout=timeout,
use_as_callback=True) \
if progress and not q else Timer(timeout, use_as_callback=True) if timeout else None
# run the algorithm
status = gdal.Polygonize(src_band,
src_band, # .GetMaskBand(),
dst_layer,
0,
["8CONNECTED=8"] if exact else [],
callback=callback)
del dst_layer, dst_ds
# handle exit status other than 0 (fail)
if status != 0:
errMsg = gdal.GetLastErrorMsg()
# Catch the KeyboardInterrupt raised in case of a timeout within the callback.
# It seems like there is no other way to catch exceptions within callbacks.
if errMsg == 'User terminated':
raise TimeoutError('raster2polygon timed out!')
raise Exception(errMsg)
with open(path_geojson) as f:
gjs = json.load(f)
gc = GeometryCollection([shape(f["geometry"]).buffer(0)
for f in gjs['features'] if f['properties']['DN'] == str(DN2extract)])
# convert GeometryCollection into MultiPolygon
polys = []
for i in gc:
if isinstance(i, Polygon):
polys.append(i)
else:
raise TimeoutError('raster2polygon timed out!')
polys.extend(list(i))
GDF = GDF.dissolve(by='DN')
# drop polygons with an area below npx
if min_npx:
area_1px = gt[1] * abs(gt[5])
area_min = min_npx * area_1px
del mem_ds, mem_layer
polys = [p for p in polys if p.area >= area_min]
shplyPoly = GDF.loc[1, 'geometry']
poly = MultiPolygon(polys)
# the downsampling in case exact=False may cause vertices of shplyPoly to be outside the input array bounds
# -> clip shplyPoly with bounds_poly in that case
# the downsampling in case exact=False may cause vertices of poly to be outside the input array bounds
# -> clip poly with bounds_poly in that case
if not exact:
bounds_poly = get_bounds_polygon(gt_orig, *shape_orig)
if polyVertices_outside_poly(shplyPoly, bounds_poly, 1e-5):
shplyPoly = get_overlap_polygon(shplyPoly, bounds_poly)['overlap poly']
if polyVertices_outside_poly(poly, bounds_poly, 1e-5):
poly = get_overlap_polygon(poly, bounds_poly)['overlap poly']
return shplyPoly
return poly
......@@ -29,7 +29,6 @@ import warnings
import numpy as np
from typing import Union # noqa F401 # flake8 issue
from geopandas import GeoDataFrame
from shapely.geometry import shape, Polygon, box
from shapely.geometry import MultiPolygon # noqa F401 # flake8 issue
from ..coord_trafo import mapYX2imYX
......@@ -169,13 +168,19 @@ def polyVertices_outside_poly(inner_poly, outer_poly, tolerance=0):
return not outer_poly.buffer(tolerance).contains(inner_poly)
def fill_holes_within_poly(poly):
# type: (Union[Polygon, MultiPolygon]) -> Polygon
def fill_holes_within_poly(poly: Union[Polygon, MultiPolygon]
) -> Polygon:
"""Fill the holes within a shapely Polygon or MultiPolygon and return a Polygon with only the outer boundary.
:param poly: <shapely.geometry.Polygon, shapely.geometry.MultiPolygon>, shapely.geometry.GeometryCollection>
:return:
"""
def close_holes(polygon: Polygon) -> Polygon:
if polygon.interiors:
return Polygon(list(polygon.exterior.coords))
else:
return polygon
if poly.geom_type not in ['Polygon', 'MultiPolygon']:
raise ValueError("Unexpected geometry type %s." % poly.geom_type)
......@@ -184,30 +189,17 @@ def fill_holes_within_poly(poly):
filled_poly = Polygon(poly.exterior)
else: # 'MultiPolygon'
gdf = GeoDataFrame(columns=['geometry'])
gdf['geometry'] = poly.geoms
# get the area of each polygon of the multipolygon EXCLUDING the gaps in it
# noinspection PyTypeChecker
gdf['area_filled'] = gdf.apply(
lambda GDF_row: Polygon(np.swapaxes(np.array(GDF_row.geometry.exterior.coords.xy), 0, 1)).area, axis=1)
largest_poly_filled = gdf.loc[gdf['area_filled'].astype(np.float64).idxmax()]['geometry']
multipoly_closed = MultiPolygon([close_holes(p) for p in poly])
polys_areasorted = list(sorted(multipoly_closed, key=lambda a: a.area, reverse=True))
poly_largest = polys_areasorted[0]
# noinspection PyTypeChecker
gdf['within_or_equal'] = gdf.apply(
lambda GDF_row:
GDF_row.geometry.within(largest_poly_filled.buffer(1e-5)) or
GDF_row.geometry.equals(largest_poly_filled),
axis=1)
polys_disjunct = [p for p in polys_areasorted[1:] if p.disjoint(poly_largest)]
if False in gdf.within_or_equal.values:
n_disjunct_polys = int(np.sum(~gdf.within_or_equal))
if polys_disjunct:
warnings.warn(RuntimeWarning('The given MultiPolygon contains %d disjunct polygone(s) outside of the '
'largest polygone. fill_holes_within_poly() will only return the largest '
'polygone as a filled version.' % n_disjunct_polys))
'polygone as a filled version.' % len(polys_disjunct)))
# return the outer boundary of the largest polygon
filled_poly = Polygon(np.swapaxes(np.array(largest_poly_filled.exterior.coords.xy), 0, 1))
filled_poly = poly_largest
return filled_poly
......@@ -22,5 +22,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = '0.18.1'
__versionalias__ = '20211020_01'
__version__ = '0.19.0'
__versionalias__ = '20211111_01'
......@@ -41,7 +41,6 @@ with open("py_tools_ds/version.py") as version_file:
requirements = [
'gdal>=2.1.0',
'geopandas',
'numpy',
'pandas',
'pyproj>=2.5.0',
......
......@@ -7,7 +7,7 @@ dependencies:
- python=3.*.*
- pip # avoids that conda uses the wrong pip
- gdal>=2.1.0
- geopandas
- numpy
- pyproj>=2.5.0
- shapely
......
......@@ -36,6 +36,8 @@ import os
from unittest import TestCase
from osgeo import gdal
from shapely.geometry import MultiPolygon
import numpy as np
from py_tools_ds.geo.raster.conversion import raster2polygon
import py_tools_ds
......@@ -47,13 +49,51 @@ class Test_raster2polygon(TestCase):
def setUp(self) -> None:
try:
# load a large mask containing an irregular binary shape with a complex footprint (27408x17604)
ds = gdal.Open(os.path.join(tests_path, 'data', 'mask_large.tif'))
self.mask = ds.ReadAsArray().astype(bool)
self.gt = ds.GetGeoTransform()
self.prj = ds.GetProjection()
ds_l = gdal.Open(os.path.join(tests_path, 'data', 'mask_large.tif'))
self.mask_l = ds_l.ReadAsArray().astype(bool)
self.gt_l = ds_l.GetGeoTransform()
self.prj_l = ds_l.GetProjection()
finally:
del ds
del ds_l
def test_raster2polygon(self):
raster2polygon(self.mask, self.gt, self.prj, exact=False)
try:
# load a large mask containing 239 features
ds_mf = gdal.Open(os.path.join(tests_path, 'data', 'mask_many_features.tif'))
self.mask_mf = ds_mf.ReadAsArray().astype(bool)
self.gt_mf = ds_mf.GetGeoTransform()
self.prj_mf = ds_mf.GetProjection()
finally:
del ds_mf
self.testarr = np.array([[2, 0, 2, 0, 0],
[0, 2, 0, 1, 2],
[1, 2, 0, 1, 0],
[1, 2, 2, 2, 1],
[1, 1, 2, 1, 1]])
def test_mask_large(self):
poly = raster2polygon(self.mask_l, self.gt_l, self.prj_l, exact=False)
self.assertIsInstance(poly, MultiPolygon)
def test_mask_many_features(self):
poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False)
self.assertIsInstance(poly, MultiPolygon)
self.assertEqual(len(poly), 239)
def test_min_npx(self):
poly = raster2polygon(self.mask_mf, self.gt_mf, self.prj_mf, exact=False, min_npx=5)
self.assertIsInstance(poly, MultiPolygon)
self.assertLess(len(poly), 239)
def test_handle_multipart_polygons(self):
# this causes gdal.Polygonize to produce some MultiPolygons instead pf Polygons which has to be handled
poly = raster2polygon(~self.mask_mf.astype(bool), self.gt_mf, self.prj_mf, exact=True)
self.assertIsInstance(poly, MultiPolygon)
def test_DN2extract(self):
for DN, nexp in zip(range(3), [5, 3, 4]):
with self.subTest(DN=DN):
poly = raster2polygon(self.testarr, self.gt_mf, self.prj_mf, exact=False, DN2extract=DN)
self.assertEqual(len(poly), nexp)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment