processor.py 14 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python3

# Copyright (C) 2022:
#   Helmholtz-Zentrum Potsdam Deutsches GeoForschungsZentrum GFZ
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
# General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.

import logging
from copy import deepcopy
import numpy
import mercantile
import pandas
import pyproj


logger = logging.getLogger()


class GDEProcessor:
    """This class contains methods that are fundamental to the calculations of `gde-core`."""

    @staticmethod
    def post_process_obm_relations(obm_buildings):
        """This function processes the contents of 'obm_buildings' to identify entries that
        correspond to individual buildings and entries that correspond to parts of buildings
        that belong to the same OpenStreetMap (OSM) relation ID. Data on the former stays the
        same. For the latter (i.e. parts of relations), the function gathers the parts and
        transforms them into one individual entry in which:
        - the 'osm_id' becomes that of the 'relation_id',
        - the number of storeys becomes the maximum of all individual parts,
        - the quadkey of the ensemble is identified,
        - the occupancy of the ensemble is identified.

        Args:
            obm_buildings (GeoPandas GeoDataFrame):
                GeoDataFrame with data on OBM buildings. It comprises the following columns:
                    osm_id (int):
                        OpenStreetMap (OSM) ID of the building. It cannot contain missing values
                        (by definition).
                    relation_id (str):
                        OpenStreetMap (OSM) ID of the relation to which an osm_id belongs, if
                        any. Missing values can be "nan" or "None".
                    quadkey (str):
                        String indicating the quadkey of the tile to which the centroid of the
                        building belongs. Missing values can be "nan" or "None".
                    storeys (float):
                        Number of storeys of the building. Treated as floats so as to be able to
                        use numpy.nan for missing values.
                    occupancy (str):
                        Occupancy of the building as per the GEM Building Taxonomy v3.0. Missing
                        values can be "nan" or "None".
                    geometry (Shapely Polygon or MultiPolygon):
                        Geometry (footprint) of the building, defined in EPSG:4326. It cannot
                        contain missing values (by definition).

        Returns:
            obm_buildings_adjusted (Pandas DataFrame):
                Processed version of 'obm_buildings' in which individual parts of the same
                'relation_id' have been grouped together and relevant values of the other fields
                have been identified. It comprises the following columns:
                    osm_id (int):
                        OpenStreetMap (OSM) ID of the building. If the building is represented
                        by a relation, this is the ID of the relation.
                    quadkey (str):
                        String indicating the quadkey of the tile to which the centroid of the
                        building belongs.
                    storeys (float):
                        Number of storeys of the building (maximum of all components if building
                        is an OSM relation). Treated as floats so as to be able to use numpy.nan
                        for missing values.
                    occupancy (str):
                        Occupancy of the building as per the GEM Building Taxonomy v3.0. Missing
                        values can be "nan" or "None".
        """

        # Identify unique relation IDs in 'obm_buildings'
        unique_relations = numpy.unique(obm_buildings["relation_id"].to_numpy())

        # Get rid of "nan" and "None"
        unique_relations = unique_relations[
            numpy.logical_and(unique_relations != "nan", unique_relations != "None")
        ]

        # Identify rows that contain buildings that are not part of a relation
        which_no_relation = numpy.where(
            numpy.logical_or(
                obm_buildings["relation_id"].to_numpy() == "nan",
                obm_buildings["relation_id"].to_numpy() == "None",
            )
        )[0]

        if len(which_no_relation) > 0:
            # Start output DataFrame with buildings that are not part of relations
            obm_buildings_adjusted = obm_buildings.iloc[which_no_relation, :]
            obm_buildings_adjusted = obm_buildings_adjusted.drop(
                columns=["relation_id", "geometry"]
            )

        # Process buildings that are part of relations
        add_osm_ids = []
        add_quadkey = []
        add_storeys = []
        add_occupancy = []

        for relation_id in unique_relations:
            # Identify components of this relation_id
            which = numpy.where(obm_buildings["relation_id"].to_numpy() == relation_id)[0]
            obm_buildings_of_relation = deepcopy(obm_buildings.iloc[which, :])

            add_osm_ids.append(int(float(relation_id)))

            add_quadkey.append(
                GDEProcessor._identify_unique_quadkey_L18(obm_buildings_of_relation)
            )

            add_storeys.append(
                GDEProcessor._select_max_of_array(
                    obm_buildings_of_relation["storeys"].to_numpy()
                )
            )

            add_occupancy.append(
                GDEProcessor._ensure_unique_occupancy(obm_buildings_of_relation)
            )

        relation_buildings = pandas.DataFrame(
            {
                "osm_id": pandas.Series(numpy.array(add_osm_ids).astype(int), dtype="int"),
                "quadkey": pandas.Series(numpy.array(add_quadkey).astype(str), dtype="str"),
                "storeys": pandas.Series(numpy.array(add_storeys).astype(float), dtype="float"),
                "occupancy": pandas.Series(numpy.array(add_occupancy).astype(str), dtype="str"),
            }
        )

        if len(which_no_relation) > 0:
            obm_buildings_adjusted = pandas.concat([obm_buildings_adjusted, relation_buildings])
        else:
            obm_buildings_adjusted = deepcopy(relation_buildings)

        return obm_buildings_adjusted

    @staticmethod
    def _identify_unique_quadkey_L18(obm_buildings):
        """This function identifies the unique level 18 quadkey associated with the (parts of)
        buildings contained in 'obm_buildings'.

        It is asssumed that 'obm_buildings' contains different OpenStreetMap (OSM) building
        polygons that are linked by the same OSM relation ID. If all values of 'quadkey' within
        'obm_buildings' are the same, this is identified as the sought unique level 18 quadkey.
        If 'obm_buildings' contains different values of 'quadkey', then all geometries of
        'obm_buildings' are aggregated ("dissolved") into one, whose centroid is calculated and
        used to identify the corresponding level 18 quadkey.

        If 'obm_buildings' contains more than one value of 'relation_id', this function returns
        an empty string.

        Args:
            obm_buildings (GeoPandas GeoDataFrame):
                GeoDataFrame with data on OBM buildings. It comprises the following columns:
                    osm_id (int):
                        OpenStreetMap (OSM) ID of the building.
                    relation_id (str):
                        OpenStreetMap (OSM) ID of the relation to which an osm_id belongs, if
                        any.
                    quadkey (str):
                        String indicating the quadkey of the tile to which the centroid of the
                        building belongs.
                    storeys (float):
                        Number of storeys of the building.
                    occupancy (str):
                        Occupancy of the building as per the GEM Building Taxonomy v3.0.
                    geometry (Shapely Polygon or MultiPolygon):
                        Geometry (footprint) of the building, defined in EPSG:4326.

        Returns:
            unique_quadkey (str):
                String indicating the unique quadkey to which the centroid of the geometries in
                'obm_buildings' belongs.
        """

        if len(numpy.unique(obm_buildings["relation_id"].to_numpy())) != 1:
            logger.error(
                "'obm_buildings' passed on to GDEProcessor._identify_unique_quadkey_L18() "
                "contains more than one unique value of 'relation_id'. The program cannot run."
            )
            return ""

        unique_quadkeys = numpy.unique(obm_buildings["quadkey"].to_numpy())

        if len(unique_quadkeys) == 1:
            unique_quadkey = unique_quadkeys[0]
        else:
            obm_buildings_aux = deepcopy(obm_buildings)
            obm_buildings_aux = obm_buildings_aux.drop(
                columns=["osm_id", "quadkey", "storeys", "occupancy"]
            )
            obm_buildings_dissolved = obm_buildings_aux.dissolve(by="relation_id")

            # Project 'obm_buildings_dissolved' onto Albers Equal Area
            lon_w = obm_buildings_dissolved.total_bounds[0]
            lat_s = obm_buildings_dissolved.total_bounds[1]
            lon_e = obm_buildings_dissolved.total_bounds[2]
            lat_n = obm_buildings_dissolved.total_bounds[3]
            projection_string = "+proj=aea +lat_1={} +lat_2={} +lat_0={} +lon_0={}".format(
                lat_s, lat_n, (lat_s + lat_n) / 2.0, (lon_w + lon_e) / 2.0
            )
            obm_buildings_dissolved = obm_buildings_dissolved.to_crs(
                pyproj.CRS(projection_string)
            )

            centroid = obm_buildings_dissolved["geometry"].centroid.to_crs("EPSG:4326")[0]
            unique_quadkey = mercantile.quadkey(mercantile.tile(centroid.x, centroid.y, 18))

        return unique_quadkey

    @staticmethod
    def _select_max_of_array(numbers):
        """This function returns the largest value in 'numbers'. If all elements of 'numbers'
        are NaNs, it returns numpy.nan. If some elements of 'numbers' are NaNs, the NaNs are
        ignored and the maximum of the rest of the elements is returned.

        Note: The purpose of this function is to be able to handle the case in which all
        elements of 'numbers' are NaNs, as numpy.nanmax() returns a RuntimeWarning in this case.

        Args:
            numbers (array of int or float):
                Array of integers or floats. May contain NaNs.

        Returns:
            max_number (int, float or numpy.nan):
                Maximum value of 'numbers'.
        """

        if numpy.all(numpy.isnan(numbers)):
            max_number = numpy.nan
        else:
            max_number = numpy.nanmax(numbers)

        return max_number

    @staticmethod
    def _ensure_unique_occupancy(obm_buildings):
        """This function identifies the unique occupancy associated with the (parts of)
        buildings contained in 'obm_buildings'.

        It is asssumed that 'obm_buildings' contains different OpenStreetMap (OSM) building
        polygons that are linked by the same OSM relation ID. If all values of 'occupancy'
        within 'obm_buildings' are the same, this is identified as the sought unique occupancy.
        If 'obm_buildings' contains different values of 'occupancy', or if 'obm_buildings'
        contains more than one value of 'relation_id', an empty string is returned.

        NOTE: This function assumes that all parts of the same relation are assigned the same
        occupancy by `rabotnik-obm`.

        Args:
            obm_buildings (GeoPandas GeoDataFrame):
                GeoDataFrame with data on OBM buildings. It comprises the following columns:
                    osm_id (int):
                        OpenStreetMap (OSM) ID of the building.
                    relation_id (str):
                        OpenStreetMap (OSM) ID of the relation to which an osm_id belongs, if
                        any.
                    quadkey (str):
                        String indicating the quadkey of the tile to which the centroid of the
                        building belongs.
                    storeys (float):
                        Number of storeys of the building.
                    occupancy (str):
                        Occupancy of the building as per the GEM Building Taxonomy v3.0.
                    geometry (Shapely Polygon or MultiPolygon):
                        Geometry (footprint) of the building, defined in EPSG:4326.

        Returns:
            unique_occupancy (str):
                String indicating the unique occupancy of all the parts of the building
                contained in 'obm_buildings' as per the GEM Building Taxonomy v3.0.
        """

        if len(numpy.unique(obm_buildings["relation_id"].to_numpy())) != 1:
            logger.error(
                "'obm_buildings' passed on to GDEProcessor._ensure_unique_occupancy() contains "
                "more than one unique value of 'relation_id'. The program cannot run."
            )
            return ""

        unique_occupancies = numpy.unique(obm_buildings["occupancy"].to_numpy())

        if len(unique_occupancies) == 1:
            unique_occupancy = unique_occupancies[0]
        else:
            unique_occupancy = ""
            logger.warning(
                "GDEProcessor._ensure_unique_occupancy(): individual parts of 'relation_id' %s "
                "have different values of 'occupancy'. A unique occupancy value could not be "
                "determined." % (numpy.unique(obm_buildings["relation_id"].to_numpy())[0])
            )

        return unique_occupancy