Commit 428cf5c2 authored by Daniel Scheffler
Browse files

added array caching: GeoArray and all subclasses now remember the last...

Added array caching: GeoArray and all its subclasses now remember the last array position read from disk and return it from memory when the same position is requested again, which speeds up repeated reads.

io.raster.GeoArray.GeoArray:
- arr.setter: now flushes cache if shape changes
- from_path(): added array caching
- revised cache_array_subset()

- updated __version__
parent be91c0d5
......@@ -15,7 +15,7 @@ __all__=[#'compatibility',
'similarity',
'GeoArray']
__version__ = '20170120_01'
__version__ = '20170120_02'
__author__='Daniel Scheffler'
# Validate GDAL version
......
......@@ -103,7 +103,7 @@ class GeoArray(object):
self.basename = os.path.splitext(os.path.basename(self.filePath))[0] if not self.is_inmem else 'IN_MEM'
self.progress = progress
self.q = q
self._arr_cache = None
self._arr_cache = None # dict containing key 'pos' and 'arr_cached'
self._geotransform = None
self._projection = None
self._shape = None
......@@ -138,6 +138,10 @@ class GeoArray(object):
# "If you need to change the dimensions, create a new instance of %s." \
# %(self.shape, ndarray.shape, self.__class__.__name__)
# THIS would avoid warping like this: geoArr.arr, geoArr.gt, geoArr.prj = warp(...)
if ndarray.shape != self.shape:
self.flush_cache() # the cached array is not useful anymore
self._arr = ndarray
......@@ -497,8 +501,6 @@ class GeoArray(object):
def __getitem__(self, given):
# TODO check if array cache contains the needed slice and return data from there
if isinstance(given, (int,float,slice)) and self.ndim==3:
# handle 'given' as index for 3rd (bands) dimension
if self.is_inmem:
......@@ -656,6 +658,7 @@ class GeoArray(object):
def from_path(self, path, getitem_params=None):
# type: (str, list) -> np.ndarray
"""Read a GDAL compatible raster image from disk, with respect to the given image position.
NOTE: If the requested array position is already in cache, it is returned from there.
:param path: <str> the file path of the image to read
:param getitem_params: <list> a list of slices in the form [row_slice, col_slice, band_slice]
......@@ -729,37 +732,49 @@ class GeoArray(object):
for val, axIdx, axSize in zip([rS,rE,cS,cE,bS,bE], [0,0,1,1,2,2], [R,R,C,C,B,B]):
if not 0 <= val <= axSize - 1: raise ValueError(msg(val,axIdx,axSize))
# read subset area
if bL == list(range(0, B)):
tempArr = gdalnumeric.LoadFile(path, cS, rS, cE - cS + 1, rE - rS + 1)
if tempArr is None:
raise Exception('Error reading file: ' + gdal.GetLastErrorMsg())
out_arr = np.swapaxes(np.swapaxes(tempArr, 0, 2), 0, 1) if B > 1 else tempArr
# summarize requested array position in arr_pos
#NOTE: # bandlist must be string because truth value of an array with more than one element is ambiguous
arr_pos = dict(rS=rS, rE=rE, cS=cS, cE=cE, bS=bS, bE=bE, bL=bL)
# check if the requested array position is already in cache -> if yes, return it from there
if self._arr_cache is not None and self._arr_cache['pos']==arr_pos:
out_arr = self._arr_cache['arr_cached']
else:
ds = gdal.Open(path)
if ds is None:
raise Exception('Error reading file: ' + gdal.GetLastErrorMsg())
if len(bL) == 1:
band = ds.GetRasterBand(bL[0] + 1)
out_arr= band.ReadAsArray(cS, rS, cE - cS + 1, rE - rS + 1)
band = None
# TODO insert a multiprocessing.Lock here in order to prevent IO bottlenecks?
# read subset area from disk
if bL == list(range(0, B)):
tempArr = gdalnumeric.LoadFile(path, cS, rS, cE - cS + 1, rE - rS + 1)
if tempArr is None:
raise Exception('Error reading file: ' + gdal.GetLastErrorMsg())
out_arr = np.swapaxes(np.swapaxes(tempArr, 0, 2), 0, 1) if B > 1 else tempArr
else:
out_arr = np.empty((rE - rS + 1, cE - cS + 1, len(bL)))
for i, bIdx in enumerate(bL):
band = ds.GetRasterBand(bIdx + 1)
out_arr[:, :, i] = band.ReadAsArray(cS, rS, cE - cS + 1, rE - rS + 1)
ds = gdal.Open(path)
if ds is None:
raise Exception('Error reading file: ' + gdal.GetLastErrorMsg())
if len(bL) == 1:
band = ds.GetRasterBand(bL[0] + 1)
out_arr= band.ReadAsArray(cS, rS, cE - cS + 1, rE - rS + 1)
band = None
else:
out_arr = np.empty((rE - rS + 1, cE - cS + 1, len(bL)))
for i, bIdx in enumerate(bL):
band = ds.GetRasterBand(bIdx + 1)
out_arr[:, :, i] = band.ReadAsArray(cS, rS, cE - cS + 1, rE - rS + 1)
band = None
ds = None
ds = None
if out_arr is None:
raise Exception('Error reading file: ' + gdal.GetLastErrorMsg())
if out_arr is None:
raise Exception('Error reading file: ' + gdal.GetLastErrorMsg())
# only set self.arr if the whole cube has been read (in order to avoid sudden shape changes)
if out_arr.shape==self.shape:
self.arr = out_arr
# only set self.arr if the whole cube has been read (in order to avoid sudden shape changes)
if out_arr.shape==self.shape:
self.arr = out_arr
self._arr_cache = out_arr
# write _arr_cache
self._arr_cache = dict(pos=arr_pos, arr_cached=out_arr)
return out_arr # TODO implement check of returned datatype (e.g. NoDataMask should always return np.bool
# TODO -> would be np.int8 if an int8 file is read from disk
......@@ -1231,10 +1246,18 @@ class GeoArray(object):
self.arr = temp # deep copy: converts view to its own array in order to avoid wrong output
def cache_array_subset(self, subarray):
def cache_array_subset(self, arr_pos):
# type: (list) -> None
"""Sets the array cache of the GeoArray instance to the given array in order to speed up calculations
afterwards."""
self._arr_cache = subarray
afterwards.
:param arr_pos: a list of array indices as passed to __getitem__
"""
if not self.is_inmem:
self[arr_pos] # runs __getitem__ and sets self._arr_cache
else:
pass # no array cache needed because array is in memory anyways
def flush_cache(self):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment