source("init.R")
# Settings shared by every random forest analysis in this report.

# Number of trees per forest; this value is also embedded in the names of the
# cached result files.
num.trees.RF <- 10000

# 287^(3/4), truncated to an integer.
# NOTE(review): 287 is presumably the number of features in the data set — confirm.
mtry <- as.integer(287^0.75)

# Forwarded unchanged as the `s` and `t` arguments of RF.analysis.all().
# (`t` shares its name with base::t(); calls such as t(x) still resolve to the function.)
s <- 15
t <- 5

# FALSE: load previously saved results; TRUE: recompute and save them again.
rerun <- FALSE
For all analyses, the following parameters were used for random forest (RF) classification, variable selection, and relation analysis:
# Load the stored experiment object and its per-sample annotation table.
se <- loadObject(file = file.path(project.dir, "/subprojects/01_pretreatment/data/sum_exp_norm.RData"))
samples <- colData(se)

# Draw a seeded random subset of `maxsamp` Scottish samples.
maxsamp <- 100
set.seed(42)
idx.scotland <- which(samples[, "Country"] == "Scotland")
samples.scot <- sample(idx.scotland, maxsamp)

# Country data set: the sampled Scottish columns first, followed by every
# sample from the other listed countries in original column order
# (per the original note: classes with at least 10 samples).
countries.other <- c("Germany", "Ireland", "USA", "Canada", "Japan")
idx.other <- which(samples[, "Country"] %in% countries.other)
se.country <- se[, c(samples.scot, idx.other)]

# Cached result file; its name encodes num.trees.RF, s and t.
file.res.country <- file.path(
  data.dir,
  paste0("res_country", num.trees.RF, "_s_", s, "_t_", t, ".RData")
)

if (!rerun) {
  # Reuse the stored result instead of refitting.
  res.all <- loadObject(file = file.res.country)
} else {
  set.seed(42)
  res.all <- RF.analysis.all(
    sum.exp = se.country,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Country",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.all, file = file.res.country)
}
Classification was conducted with an OOB prediction error of 0.2821577
| | Canada | Germany | Ireland | Japan | Scotland | USA |
---|---|---|---|---|---|---|
Canada | 8 | 0 | 1 | 1 | 0 | 1 |
Germany | 2 | 23 | 5 | 1 | 5 | 5 |
Ireland | 5 | 4 | 25 | 2 | 4 | 1 |
Japan | 1 | 0 | 5 | 2 | 2 | 0 |
Scotland | 0 | 5 | 8 | 2 | 83 | 2 |
USA | 0 | 1 | 2 | 0 | 3 | 32 |
# Region-level data sets, selected via the "Region" sample annotation.
# `which()` on `%in%` yields the matching columns in their original order.

# US samples from the two regions compared below.
# NOTE(review): " Kentucky" carries a leading space — presumably this mirrors
# the raw annotation value; confirm before normalising it.
regions.usa <- c(" Kentucky", "Tennessee")
se.usa <- se[, which(samples[, "Region"] %in% regions.usa)]

# Scottish samples, each listed region kept as its own class.
regions.scot <- c(
  "Speyside", "Campbeltown", "Highlands", "Islay",
  "Isle of Jura", "Isle of Mull", "Lowlands", "Orkney"
)
se.scotland <- se[, which(samples[, "Region"] %in% regions.scot)]

# Scottish samples with the islands pooled into one group: the regions above
# plus three further islands that are only used in the pooled variant.
regions.scot.extra <- c("Isle of Skye", "Isle of Rassay", "Isle of Arran")
se.scotland.island <- se[, which(samples[, "Region"] %in%
                                   c(regions.scot, regions.scot.extra))]

# Relabel the pooled island regions as "Islands" (Islay stays a separate
# class). The column is addressed by name instead of by position so the
# relabelling cannot hit the wrong column if the annotation layout changes.
regions.pooled <- c(
  "Isle of Jura", "Isle of Mull", "Isle of Skye",
  "Isle of Rassay", "Isle of Arran", "Orkney"
)
samples.island <- colData(se.scotland.island)
samples.island[which(samples.island[, "Region"] %in% regions.pooled), "Region"] <- "Islands"
colData(se.scotland.island) <- samples.island
# Region classification for the USA subset, the Scottish subset, and the
# Scottish subset with pooled islands. Results are cached on disk; the file
# names encode num.trees.RF, s and t.
res.suffix <- paste0(num.trees.RF, "_s_", s, "_t_", t, ".RData")
file.res.usa <- file.path(data.dir, paste0("res_region_usa", res.suffix))
file.res.scot <- file.path(data.dir, paste0("res_region_scotland", res.suffix))
file.res.scot.island <- file.path(data.dir, paste0("res_region_scotland_island", res.suffix))

if (rerun) {
  set.seed(42)
  res.usa <- RF.analysis.all(
    sum.exp = se.usa,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.usa, file = file.res.usa)

  set.seed(42)
  res.scot <- RF.analysis.all(
    sum.exp = se.scotland,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.scot, file = file.res.scot)

  # Seed this fit like every other analysis so its result does not depend on
  # the random-number state left behind by the preceding fit.
  # NOTE(review): a cached file produced before this seed was added may not be
  # reproduced exactly by a rerun.
  set.seed(42)
  res.scot.island <- RF.analysis.all(
    sum.exp = se.scotland.island,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.scot.island, file = file.res.scot.island)
} else {
  # Reuse the stored results instead of refitting.
  res.usa <- loadObject(file = file.res.usa)
  res.scot <- loadObject(file = file.res.scot)
  res.scot.island <- loadObject(file = file.res.scot.island)
}
Classification was conducted with an OOB prediction error of 0.2903226
| | Kentucky | Tennessee |
---|---|---|
Kentucky | 22 | 4 |
Tennessee | 5 | 0 |
Classification was conducted with an OOB prediction error of 0.497561
| | Campbeltown | Highlands | Islay | Isle of Jura | Isle of Mull | Lowlands | Orkney | Speyside |
---|---|---|---|---|---|---|---|---|
Campbeltown | 2 | 6 | 0 | 0 | 3 | 0 | 1 | 4 |
Highlands | 6 | 30 | 9 | 0 | 3 | 7 | 1 | 49 |
Islay | 1 | 5 | 75 | 1 | 7 | 2 | 1 | 7 |
Isle of Jura | 0 | 2 | 0 | 6 | 0 | 0 | 0 | 5 |
Isle of Mull | 4 | 1 | 6 | 0 | 6 | 0 | 0 | 2 |
Lowlands | 0 | 4 | 1 | 0 | 0 | 4 | 0 | 4 |
Orkney | 1 | 2 | 0 | 1 | 1 | 0 | 10 | 5 |
Speyside | 7 | 22 | 3 | 5 | 4 | 2 | 9 | 73 |
Classification was conducted with an OOB prediction error of 0.4800937
| | Campbeltown | Highlands | Islands | Islay | Lowlands | Speyside |
---|---|---|---|---|---|---|
Campbeltown | 3 | 5 | 5 | 1 | 0 | 2 |
Highlands | 6 | 33 | 6 | 9 | 4 | 47 |
Islands | 4 | 11 | 23 | 12 | 0 | 19 |
Islay | 1 | 4 | 8 | 75 | 2 | 9 |
Lowlands | 0 | 5 | 0 | 1 | 5 | 2 |
Speyside | 3 | 26 | 9 | 4 | 0 | 83 |
Only samples matured in Ex-Bourbon barrels, Ex-Sherry barrels, or a mixture of both were analysed. The analysis was carried out on all such samples, and separately on those from Speyside and those from Islay.
# Barrel data sets: keep only samples whose "Barell" annotation is Ex-Sherry,
# Ex-Bourbon, or the combination of both — once across all samples and once
# each within Speyside and Islay.
barell.types <- c("Ex-Sherry", "Ex-Bourbon", "Ex-Bourbon, Ex-Sherry")

# All samples.
se.barell <- se[, which(samples[, "Barell"] %in% barell.types)]

# Speyside: restrict to the region first, then filter on the barrel type.
# `samples.speyside` holds the annotation of the region-only subset.
se.speyside <- se[, which(samples[, "Region"] == "Speyside")]
samples.speyside <- colData(se.speyside)
keep.speyside <- which(samples.speyside[, "Barell"] %in% barell.types)
se.speyside <- se.speyside[, keep.speyside]

# Islay: same two-step selection.
# `samples.islay` holds the annotation of the region-only subset.
se.islay <- se[, which(samples[, "Region"] == "Islay")]
samples.islay <- colData(se.islay)
keep.islay <- which(samples.islay[, "Barell"] %in% barell.types)
se.islay <- se.islay[, keep.islay]
# Barrel classification for all samples, for Speyside, and for Islay.
# Results are cached on disk; the file names encode num.trees.RF, s and t.
res.suffix <- paste0(num.trees.RF, "_s_", s, "_t_", t, ".RData")
file.res.barell <- file.path(data.dir, paste0("res_barell", res.suffix))
file.res.speyside <- file.path(data.dir, paste0("res_speyside", res.suffix))
file.res.islay <- file.path(data.dir, paste0("res_islay", res.suffix))

if (!rerun) {
  # Reuse the stored results instead of refitting.
  res.barell <- loadObject(file = file.res.barell)
  res.speyside <- loadObject(file = file.res.speyside)
  res.islay <- loadObject(file = file.res.islay)
} else {
  # Each fit is seeded separately.
  set.seed(42)
  res.barell <- RF.analysis.all(
    sum.exp = se.barell,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.barell, file = file.res.barell)

  set.seed(42)
  res.speyside <- RF.analysis.all(
    sum.exp = se.speyside,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.speyside, file = file.res.speyside)

  set.seed(42)
  res.islay <- RF.analysis.all(
    sum.exp = se.islay,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.islay, file = file.res.islay)
}
Classification was conducted with an OOB prediction error of 0.3680556
| | Ex-Bourbon | Ex-Bourbon, Ex-Sherry | Ex-Sherry |
---|---|---|---|
Ex-Bourbon | 71 | 15 | 1 |
Ex-Bourbon, Ex-Sherry | 26 | 61 | 33 |
Ex-Sherry | 2 | 29 | 50 |
Classification was conducted with an OOB prediction error of 0.25
| | Ex-Bourbon | Ex-Bourbon, Ex-Sherry | Ex-Sherry |
---|---|---|---|
Ex-Bourbon | 11 | 0 | 1 |
Ex-Bourbon, Ex-Sherry | 3 | 12 | 5 |
Ex-Sherry | 2 | 4 | 22 |
Classification was conducted with an OOB prediction error of 0.4150943
| | Ex-Bourbon | Ex-Bourbon, Ex-Sherry | Ex-Sherry |
---|---|---|---|
Ex-Bourbon | 13 | 3 | 0 |
Ex-Bourbon, Ex-Sherry | 6 | 12 | 5 |
Ex-Sherry | 1 | 7 | 6 |
# Colour data set: samples annotated as "natural Colour" or "plain caramel".
colour.classes <- c("natural Colour", "plain caramel")
se.colour <- se[, which(samples[, "colour"] %in% colour.classes)]

# Cached result file; its name encodes num.trees.RF, s and t.
file.res.colour <- file.path(
  data.dir,
  paste0("res_colour", num.trees.RF, "_s_", s, "_t_", t, ".RData")
)

if (!rerun) {
  # Reuse the stored result instead of refitting.
  res.colour <- loadObject(file = file.res.colour)
} else {
  set.seed(42)
  res.colour <- RF.analysis.all(
    sum.exp = se.colour,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "colour",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.colour, file = file.res.colour)
}
Classification was conducted with an OOB prediction error of 0.0920245
| | natural Colour | plain caramel |
---|---|---|
natural Colour | 306 | 20 |
plain caramel | 25 | 138 |
# Age data set: drop every sample whose "Age" annotation equals "NAS".
# NOTE(review): "NAS" presumably stands for "no age statement" — confirm.
idx.age <- which(samples[, "Age"] != "NAS")
se.age <- se[, idx.age]

# Cached result file; its name encodes num.trees.RF, s and t.
file.res.age <- file.path(
  data.dir,
  paste0("res_age", num.trees.RF, "_s_", s, "_t_", t, ".RData")
)

if (!rerun) {
  # Reuse the stored result instead of refitting.
  res.age <- loadObject(file = file.res.age)
} else {
  set.seed(42)
  # Unlike the other analyses, this one is run with type = "regression".
  res.age <- RF.analysis.all(
    sum.exp = se.age,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Age",
    mtry = mtry,
    s = s,
    t = t,
    type = "regression",
    VarAnal = TRUE
  )
  saveObject(res.age, file = file.res.age)
}
A regression model was trained to predict the age in years. A root-mean-square error (RMSE) of 4.559628 and an R² of 0.4085824 were obtained.
# Print the R version, platform, locale and package versions of this session.
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##
## locale:
## [1] LC_CTYPE=de_DE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=de_DE.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=de_DE.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] knitr_1.46 SummarizedExperiment_1.32.0
## [3] Biobase_2.62.0 GenomicRanges_1.54.1
## [5] GenomeInfoDb_1.38.8 IRanges_2.36.0
## [7] S4Vectors_0.40.2 BiocGenerics_0.48.1
## [9] MatrixGenerics_1.14.0 matrixStats_1.3.0
## [11] summarytools_1.0.1 R.utils_2.12.3
## [13] R.oo_1.26.0 R.methodsS3_1.8.2
## [15] DT_0.33
##
## loaded via a namespace (and not attached):
## [1] xfun_0.43 bslib_0.7.0 htmlwidgets_1.6.4
## [4] lattice_0.22-5 vctrs_0.6.5 tools_4.3.3
## [7] bitops_1.0-7 generics_0.1.3 tibble_3.2.1
## [10] fansi_1.0.6 pkgconfig_2.0.3 Matrix_1.6-5
## [13] checkmate_2.3.1 pryr_0.1.6 lifecycle_1.0.4
## [16] GenomeInfoDbData_1.2.11 compiler_4.3.3 stringr_1.5.1
## [19] rapportools_1.1 codetools_0.2-19 htmltools_0.5.8.1
## [22] sass_0.4.9 RCurl_1.98-1.16 yaml_2.3.8
## [25] pillar_1.9.0 crayon_1.5.2 jquerylib_0.1.4
## [28] tidyr_1.3.1 MASS_7.3-60.0.1 DelayedArray_0.28.0
## [31] cachem_1.0.8 magick_2.8.4 abind_1.4-5
## [34] tidyselect_1.2.1 digest_0.6.35 stringi_1.8.4
## [37] dplyr_1.1.4 reshape2_1.4.4 pander_0.6.5
## [40] purrr_1.0.2 grid_4.3.3 fastmap_1.1.1
## [43] SparseArray_1.2.4 cli_3.6.2 magrittr_2.0.3
## [46] S4Arrays_1.2.1 base64enc_0.1-3 utf8_1.2.4
## [49] backports_1.5.0 lubridate_1.9.3 timechange_0.3.0
## [52] rmarkdown_2.26 XVector_0.42.0 evaluate_0.23
## [55] tcltk_4.3.3 rlang_1.1.3 Rcpp_1.0.12
## [58] glue_1.7.0 rstudioapi_0.16.0 jsonlite_1.8.8
## [61] R6_2.5.1 plyr_1.8.9 zlibbioc_1.48.2