Commit d7702099 authored by Daniela Rabe's avatar Daniela Rabe
Browse files

Merge branch 'carsten_optimization' into 'master'

Carsten optimization

See merge request !26
parents 8edae516 ba94edfd
Pipeline #26154 passed with stages
in 8 minutes and 22 seconds
......@@ -22,12 +22,11 @@ Imports:
randomForest,
e1071,
devtools,
velox,
fasterize,
leaflet,
leafem,
IRdisplay,
htmlwidgets
Remotes:url::https://cran.r-project.org/src/contrib/Archive/velox/velox_0.2.0.tar.gz
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
......
......@@ -10,8 +10,9 @@
#' @param buffer distance (in m) for new sample collection around initial samples (depends on pixel size)
#' @param reference reference spectra either SpatialPointsDataFrame (shape file) or data.frame with lines = classes, column = predictors]
#' @param model which machine learning classifier to use c("rf", "svm") for random forest or suppurt vector machine implementation
#' @param area extent where the the classification is happening
#' @param mtry number of predictor used at random forest splitting nodes (mtry << n predictors)
#' @param mod.error threshold for model error until which iteration is being executed
#' @param in.memory boolean for raster processing (memory = "TRUE", from disk = "FALSE")
#' @param last only true for one class classifier c("FALSE", TRUE")
#' @param seed set seed for reproducable results
#' @param init.seed "sample" for new or use run1@seeds to reproduce previous steps
......@@ -47,11 +48,12 @@ sample_nb <- function(raster,
buffer,
reference,
model,
area,
mtry,
mod.error,
last,
seed,
init.seed,
in.memory,
save_runs,
parallel_mode,
max_num_cores,
......@@ -60,15 +62,11 @@ sample_nb <- function(raster,
paste("models = ", nb_mean)))
###
n_channel <- length(names(raster))
###velox
rID = raster[[1]]
rID[] = 1:(nrow(rID) * ncol(rID))
r = raster::stack(rID, raster)
ras.vx <- velox::velox(r)
###raster in MEMORY or NOT
if (in.memory == TRUE && raster::fromDisk(raster) == TRUE) { rast <- raster::readAll(raster) }else { rast <-raster }
###
l <- 1 ###6. opt=260
pbtn1 <- matrix(1, nrow = 1, ncol = 1)
pbtn2 <- matrix(2, nrow = 1, ncol = 1)
m <- vector("numeric", length = length(nb_samples))
layer <- list()
for (r in nb_samples) {
......@@ -111,16 +109,15 @@ sample_nb <- function(raster,
sample_type = sample_type,
buffer = buffer,
model = model,
area = area,
seed = seed,
n = n,
sample_size = sample_size,
n_channel = n_channel,
seed2 = seed2,
mtry = mtry,
mod.error = mod.error,
pbtn1 = pbtn1,
pbtn2 = pbtn2,
ras_vx = ras.vx,
rast = rast,
max_samples_per_class = max_samples_per_class,
mc.cores = cores,
mc.preschedule = TRUE,
......@@ -138,7 +135,6 @@ sample_nb <- function(raster,
sample_type = sample_type,
buffer = buffer,
model = model,
area = area,
seed = seed,
k = k,
n = n,
......@@ -146,9 +142,9 @@ sample_nb <- function(raster,
n_channel = n_channel,
seed2 = seed2,
mtry = mtry,
mod.error = mod.error,
pbtn1 = pbtn1,
pbtn2 = pbtn2,
ras_vx = ras.vx,
rast = rast,
max_samples_per_class = max_samples_per_class
)
points_list[[k]] <- res$points
......
......@@ -7,16 +7,15 @@
#' @param sample_type distribution of spatial locations c("random","regular")
#' @param buffer distance (in m) for new sample collection around initial samples (depends on pixel size)
#' @param model which machine learning classifier to use c("rf", "svm") for random forest or support vector machine implementation
#' @param area extent where the the classification is happening
#' @param seed set seed for reproducible results
#' @param n number of iterations for model accuracy
#' @param sample_size number of spatial locations
#' @param n_channel number of channels
#' @param seed2 spatial points sample
#' @param mtry number of predictor used at random forest splitting nodes (mtry << n predictors)
#' @param mod.error threshold for model error until which iteration is being executed
#' @param pbtn1 matrix for points
#' @param pbtn2 matrix for points
#' @param ras_vx velox raster
#' @param rast raster
#' @param max_samples_per_class maximum number of samples per class
#'
#' @return a list with 4 elements:
......@@ -30,16 +29,15 @@ model_opt_r <- function(k,
sample_type,
buffer,
model,
area,
seed,
n,
sample_size,
n_channel,
seed2,
mtry,
mod.error,
pbtn1,
pbtn2,
ras_vx,
rast,
max_samples_per_class) {
points <- NULL
models <- NULL
......@@ -56,14 +54,12 @@ model_opt_r <- function(k,
if (sample_type == "regular") {
pbt <- raster::sampleRegular(raster, size = sample_size, sp = T)
}
pbt <- spatialEco::point.in.poly(pbt, area)[, 1:n_channel]
f <- which(is.na(pbt@data[1]))
if (length(f) != 0) {
f <- which(is.na(pbt@data[1]))
if (length(f) != 0) {
pbt <- pbt[-f,]
}
}
set.seed(seed2[k])
classes <-
as.factor(sample(c(1:2), size = nrow(pbt), replace = T))
......@@ -99,22 +95,21 @@ model_opt_r <- function(k,
model_pre <- model1
pbtn1_pre <- pbtn1
pbtn2_pre <- pbtn2
#if ( j > 1) {if (oobe[j,k] < 0.02 || abs(oobe[(j-1),k]-oobe[j,k]) <= 0.011 )
if (j > 1) {
if (oobe[j, 1] < 0.02) {
if (oobe[j, 1] < mod.error) {
models <- model1
points <- rbind(pbtn1, pbtn2)
points <- pbtn1
break
}
if (oobe[(j - 1), 1] <= oobe[j, 1]) {
models <- model_pre
points <- rbind(pbtn1_pre, pbtn2_pre)
points <- pbtn1_pre
break
}
if (j == n & oobe[j, 1] >= 0.02) {
if (j == n & oobe[j, 1] >= mod.error) {
models <- NULL
points <- NULL
break
......@@ -132,17 +127,19 @@ model_opt_r <- function(k,
}
########################################################################
which_classes_correct <- which(classes[correct] == 1)
if (length(which_classes_correct) == 0) {
which_classes_correct_2 <- which(classes[correct] == 2)
if (length(which_classes_correct) == 0 || length(which_classes_correct_2) == 0) {
if (j == 1) {
break
}
} else {
d1 <- correct[which_classes_correct]
d2 <- correct[which_classes_correct_2]
###generate new samples from only correctly classified samples [label 1]
p1 <- pbt@coords[d1, ]
p1 <- pbt@coords[append(d1,d2), ]
pbtn1 <-
as.data.frame(cbind(classes[d1], matrix(p1, ncol = 2)))
as.data.frame(cbind(c(classes[d1],classes[d2]), matrix(p1, ncol = 2)))
sp::coordinates(pbtn1) <- c("V2", "V3")
#sp::proj4string(pbtn1) <- sp::proj4string(pbt)
raster::crs(pbtn1) <- raster::crs(pbt)
......@@ -150,148 +147,63 @@ model_opt_r <- function(k,
poly <- rgeos::gBuffer(spgeom = pbtn1,
width = buffer,
byid = TRUE)
test <- ras_vx$extract(sp = poly)
for (i in 1:length(test)) {
s1 <- dim(test[[i]])[1]
#if (s1 <= 5) {
# test[[i]] <-
# test[[i]]
#} else {
if (s1 > 5) {
set.seed(seed)
test[[i]] <-
test[[i]][sample(c(1:s1), 5, replace = F), ]
}
}
for (i in 1:length(test)) {
if (i == 1) {
co <- raster::xyFromCell(raster, test[[i]][, 1])
} else {
co <- rbind(co, raster::xyFromCell(raster, test[[i]][, 1]))
}
}
pbtn1 <- as.data.frame(cbind(rep(1, nrow(co)), co))
test1<-na.omit(raster::as.matrix(fasterize::fasterize(sf::st_as_sf(poly), rast[[1]])*rast))
nam<-as.vector(fasterize::fasterize(sf::st_as_sf(poly), rast[[1]], field="V1"))[-attr(test1,"na.action")]
co<-raster::xyFromCell(rast, c(1:raster::ncell(rast))[-attr(test1,"na.action")])
pbtn1 <- as.data.frame(cbind(nam, co))
sp::coordinates(pbtn1) <- c("x", "y")
test1 <- as.matrix(do.call(rbind, test)[, -1])
if (ncol(test1) == 1) {
if (ncol(test1) == 1) {
test1 <- t(test1)
}
colnames(test1) <- names(raster)
if (length(which(is.na(test1))) > 0) {
pbtn1 <- pbtn1[complete.cases(test1), ]
test1 <- test1[complete.cases(test1), ]
}
}
}
if (class(test1)[1] == "numeric") {
test1 <- t(matrix(test1))
}
if (nrow(test1) == 0) {
break
}
##############################
##############################
which_classes_correct_2 <- which(classes[correct] == 2)
if (length(which_classes_correct_2) == 0) {
if (j == 1) {
break
}
} else {
d2 <- correct[which_classes_correct_2]
###generate new samples from only correctly classified samples [label 2]
p2 <- pbt@coords[d2, ]
pbtn2 <-
as.data.frame(cbind(classes[d2], matrix(p2, ncol = 2)))
sp::coordinates(pbtn2) <- c("V2", "V3")
#sp::proj4string(pbtn2) <- sp::proj4string(pbt)
raster::crs(pbtn2) <- raster::crs(pbt)
poly <- rgeos::gBuffer(spgeom = pbtn2,
width = buffer,
byid = TRUE)
test <- ras_vx$extract(sp = poly)
for (i in 1:length(test)) {
s1 <- dim(test[[i]])[1]
#if (s1 <= 5) {
# test[[i]] <-
# test[[i]]
#} else {
if (s1 > 5) {
set.seed(seed)
test[[i]] <-
test[[i]][sample(c(1:s1), 5, replace = F), ]
}
}
for (i in 1:length(test)) {
if (i == 1) {
co <- raster::xyFromCell(raster, test[[i]][, 1])
} else {
co <- rbind(co, raster::xyFromCell(raster, test[[i]][, 1]))
}
}
pbtn2 <- as.data.frame(cbind(rep(2, nrow(co)), co))
sp::coordinates(pbtn2) <- c("x", "y")
test2 <- as.matrix(do.call(rbind, test)[, -1])
if (ncol(test2) == 1) {
test2 <- t(test2)
}
colnames(test2) <- names(raster)
if (length(which(is.na(test2))) > 0) {
pbtn2 <- pbtn2[complete.cases(test2), ]
test2 <- test2[complete.cases(test2), ]
}
}
if (class(test2)[1] == "numeric") {
test2 <- t(matrix(test2))
}
if (nrow(test2) == 0) {
if (length(levels(as.factor(nam))) < 2) {
break
}
######################################
###Gleichverteilung samples in Klassen
di <- c(nrow(pbtn1), nrow(pbtn2))
if (abs(nrow(pbtn1) - nrow(pbtn2)) > min(di) * 0.3) {
###balancing sample size
di <- c(sum(pbtn1@data$nam==1), sum(pbtn1@data$nam==2))
if (abs(di[1] - di[2]) > min(di) * 0.3) {
if (which.min(di) == 2) {
set.seed(seed)
d3 <- sample(1:nrow(pbtn1), nrow(pbtn2), replace = F)
pbtn1 <- pbtn1[d3, ]
test1 <- test1[d3, ]
d3 <- sample(which(pbtn1@data$nam==1), di[1]-di[2], replace = F)
pbtn1 <- pbtn1[-d3, ]
test1 <- test1[-d3, ]
} else {
set.seed(seed)
d4 <- sample(1:nrow(pbtn2), nrow(pbtn1), replace = F)
pbtn2 <- pbtn2[d4, ]
test2 <- test2[d4, ]
d4 <- sample(which(pbtn1@data$nam==2), di[2]-di[1], replace = F)
pbtn1 <- pbtn1[-d4, ]
test1 <- test1[-d4, ]
}
}
#####################################
###max Klassenbelegungswert
if (nrow(pbtn1) > max_samples_per_class) {
###maximum sample size
if (sum(pbtn1@data$nam==1) > max_samples_per_class) {
set.seed(seed)
dr <-
sample(1:nrow(pbtn1), max_samples_per_class, replace = F)
pbtn1 <- pbtn1[dr, ]
test1 <- test1[dr, ]
sample(which(pbtn1@data$nam==1), sum(pbtn1@data$nam==1) - max_samples_per_class, replace = F)
pbtn1 <- pbtn1[-dr, ]
test1 <- test1[-dr, ]
}
if (nrow(pbtn2) > max_samples_per_class) {
if (sum(pbtn1@data$nam==2) > max_samples_per_class) {
set.seed(seed)
dr <-
sample(1:nrow(pbtn2), max_samples_per_class, replace = F)
pbtn2 <- pbtn2[dr, ]
test2 <- test2[dr, ]
sample(which(pbtn1@data$nam==2), sum(pbtn1@data$nam==2) - max_samples_per_class, replace = F)
pbtn1 <- pbtn1[-dr, ]
test1 <- test1[-dr, ]
}
########################################################################
data <-
as.data.frame(cbind(append(pbtn1@data$V1, pbtn2@data$V1),
rbind(test1, test2))) ##data
as.data.frame(cbind(pbtn1@data$nam, test1)) ##data
names(data)[1] <- "classes"
classes <- data$classes
pbt <- rbind(pbtn1, pbtn2)
pbt <- pbtn1
}
remove(pbt)
......
......@@ -11,6 +11,7 @@
#' @param reference reference spectra as a data.frame with (lines = classes, column = predictors)
#' @param model which machine learning classifier to use c("rf", "svm") for random forest or support vector machine implementation
#' @param mtry number of predictor used at random forest splitting nodes (mtry << n predictors)
#' @param mod.error threshold for model error until which iteration is being executed
#' @param last only true for one class classifier c("FALSE", TRUE")
#' @param seed set seed for reproducible results
#' @param init.seed "sample" for new or use run1@seeds to reproduce previous steps
......@@ -20,7 +21,8 @@
#' @param n_classes total number of classes (habitat types) to be separated
#' @param multiTest number of test runs to compare different probability outputs
#' @param RGB rgb channel numbers for image plot
#' @param color color pallet
#' @param in.memory boolean for raster processing (memory = "TRUE", from disk = "FALSE")
#' @param color single colors for continuous color palette interpolation
#' @param overwrite overwrite the KML and raster files from previous runs (default TRUE)
#' @param save_runs an Habitat object is saved into disk for each run (default TRUE)
#' @param parallel_mode run loops using all available cores (default FALSE)
......@@ -30,7 +32,7 @@
#' @return 4 files per step:
#' 1) Habitat type probability map as geocoded *.kmz file (with a *.kml layer and *.png image output), and *.tif raster file
#' 2) A Habitat object (only if save_runs is set to TRUE) consisting of 7 slots: \cr
#' run1@models - list of selcted classifiers \cr
#' run1@models - list of selected classifiers \cr
#' run1@ref_samples - list of SpatialPointsDataFrames with same length as run1@models holding reference labels [1,2] for each selected model \cr
#' run1@switch - vector of lenght run1@models indicating if target class equals 2, if not NA the labels need to be switched \cr
#' run1@layer - raster map of habitat type probability \cr
......@@ -76,6 +78,7 @@ multi_Class_Sampling <- function(in.raster,
reference,
model = "rf",
mtry = 10,
mod.error=0.02,
last = F,
seed = 3,
init.seed = "sample",
......@@ -85,6 +88,7 @@ multi_Class_Sampling <- function(in.raster,
n_classes,
multiTest = 1,
RGB = c(19, 20, 21),
in.memory = TRUE,
color = c("lightgrey", "orange", "yellow", "limegreen", "forestgreen"),
overwrite = TRUE,
save_runs = TRUE,
......@@ -94,7 +98,7 @@ multi_Class_Sampling <- function(in.raster,
# Checks if its a new or a resumed run and asks the user to remove all step_*.tif
# files from the results folder in case of a new run.
if(step == 1){
if (step == 1) {
if (length(list.files(
outPath,
pattern = "step_(.*).tif",
......@@ -113,11 +117,6 @@ multi_Class_Sampling <- function(in.raster,
}
input_raster <- in.raster
area <- as(raster::extent(in.raster), 'SpatialPolygons')
area <- sp::SpatialPolygonsDataFrame(area, data.frame(ID = 1:length(area)))
#sp::proj4string(area) <- sp::proj4string(in.raster)
raster::crs(area) <- raster::crs(in.raster)
col <- colorRampPalette(c("lightgrey",
"orange",
"yellow",
......@@ -168,10 +167,11 @@ multi_Class_Sampling <- function(in.raster,
buffer = buffer,
reference = reference,
model = model,
area = area,
mtry = mtry,
mod.error = mod.error,
last = last,
seed = seed,
in.memory = in.memory,
init.seed = init.seed,
save_runs = save_runs,
parallel_mode = parallel_mode,
......@@ -297,11 +297,12 @@ multi_Class_Sampling <- function(in.raster,
buffer = buffer,
reference = reference,
model = model,
area = area,
mtry = mtry,
mod.error = mod.error,
last = last,
seed = seed,
init.seed = init.seed,
in.memory = in.memory,
save_runs = save_runs,
parallel_mode = parallel_mode,
max_num_cores = max_num_cores
......@@ -355,7 +356,6 @@ multi_Class_Sampling <- function(in.raster,
}
}
}
dummy <- maFo_rf@layer[[1]]
iplot(
x = dummy,
......@@ -406,11 +406,12 @@ multi_Class_Sampling <- function(in.raster,
buffer = buffer,
reference = reference,
model = model,
area = area,
mtry = mtry,
mod.error = mod.error,
last = last,
seed = seed,
init.seed = init.seed,
in.memory = in.memory,
save_runs = save_runs,
parallel_mode = parallel_mode,
max_num_cores = max_num_cores
......@@ -463,7 +464,6 @@ multi_Class_Sampling <- function(in.raster,
}
}
}
dummy <- maFo_rf@layer[[1]]
iplot(
x = dummy,
......
......@@ -10,16 +10,15 @@ model_opt_r(
sample_type,
buffer,
model,
area,
seed,
n,
sample_size,
n_channel,
seed2,
mtry,
mod.error,
pbtn1,
pbtn2,
ras_vx,
rast,
max_samples_per_class
)
}
......@@ -34,8 +33,6 @@ model_opt_r(
\item{model}{which machine learning classifier to use c("rf", "svm") for random forest or support vector machine implementation}
\item{area}{extent where the the classification is happening}
\item{seed}{set seed for reproducible results}
\item{n}{number of iterations for model accuracy}
......@@ -48,11 +45,11 @@ model_opt_r(
\item{mtry}{number of predictor used at random forest splitting nodes (mtry << n predictors)}
\item{pbtn1}{matrix for points}
\item{mod.error}{threshold for model error until which iteration is being executed}
\item{pbtn2}{matrix for points}
\item{pbtn1}{matrix for points}
\item{ras_vx}{velox raster}
\item{rast}{raster}
\item{max_samples_per_class}{maximum number of samples per class}
}
......
......@@ -14,6 +14,7 @@ multi_Class_Sampling(
reference,
model = "rf",
mtry = 10,
mod.error = 0.02,
last = F,
seed = 3,
init.seed = "sample",
......@@ -23,6 +24,7 @@ multi_Class_Sampling(
n_classes,
multiTest = 1,
RGB = c(19, 20, 21),
in.memory = TRUE,
color = c("lightgrey", "orange", "yellow", "limegreen", "forestgreen"),
overwrite = TRUE,
save_runs = TRUE,
......@@ -50,6 +52,8 @@ multi_Class_Sampling(
\item{mtry}{number of predictor used at random forest splitting nodes (mtry << n predictors)}
\item{mod.error}{threshold for model error until which iteration is being executed}
\item{last}{only true for one class classifier c("FALSE", TRUE")}
\item{seed}{set seed for reproducible results}
......@@ -68,7 +72,9 @@ multi_Class_Sampling(
\item{RGB}{rgb channel numbers for image plot}
\item{color}{color pallet}
\item{in.memory}{boolean for raster processing (memory = "TRUE", from disk = "FALSE")}
\item{color}{single colors for continuous color palette interpolation}
\item{overwrite}{overwrite the KML and raster files from previous runs (default TRUE)}
......@@ -85,7 +91,7 @@ multi_Class_Sampling(
\enumerate{
\item Habitat type probability map as geocoded *.kmz file (with a *.kml layer and *.png image output), and *.tif raster file
\item A Habitat object (only if save_runs is set to TRUE) consisting of 7 slots: \cr
run1@models - list of selcted classifiers \cr
run1@models - list of selected classifiers \cr
run1@ref_samples - list of SpatialPointsDataFrames with same length as run1@models holding reference labels \link{1,2} for each selected model \cr
run1@switch - vector of lenght run1@models indicating if target class equals 2, if not NA the labels need to be switched \cr
run1@layer - raster map of habitat type probability \cr
......
......@@ -13,11 +13,12 @@ sample_nb(
buffer,
reference,
model,
area,
mtry,
mod.error,
last,
seed,
init.seed,
in.memory,
save_runs,
parallel_mode,