source("init.R")

# Settings shared by every analysis in this report. All of them are passed
# on to RF.analysis.all(); num.trees.RF, s and t are also part of the names
# of the cached result files.
num.trees.RF <- 10000
# Evaluates to 69.
# NOTE(review): 287 is presumably the number of features -- confirm.
mtry <- as.integer(287^(3 / 4))
s <- 15
t <- 5
# FALSE: load cached results from disk; TRUE: recompute and overwrite them.
rerun <- FALSE

General information

For all analyses the following parameters were used for RF classification, variable selection and relation analysis:

  • num.trees.RF: 10^{4}
  • mtry: 69
  • s: 15
  • t: 5

Analysis of Country

  • Only countries with at least 10 samples were considered.
  • For Scotland, 100 randomly selected samples were used.
# Load the normalised data object and its per-sample annotation.
se <- loadObject(file = file.path(project.dir, "/subprojects/01_pretreatment/data/sum_exp_norm.RData"))
samples <- colData(se)
maxsamp <- 100

# For Scotland only `maxsamp` randomly drawn samples are used; the seed
# makes the draw reproducible.
set.seed(42)
samples.scot <- sample(which(samples[, "Country"] == "Scotland"), maxsamp)

# Keep the Scotland subsample plus all samples of the remaining countries
# with a class size of at least 10.
countries.other <- c("Germany", "Ireland", "USA", "Canada", "Japan")
samples.other <- which(samples[, "Country"] %in% countries.other)
se.country <- se[, c(samples.scot, samples.other)]

# Cache file of the country analysis; its name encodes the settings used.
res.country.file <- file.path(
  data.dir,
  paste0("res_country", num.trees.RF, "_s_", s, "_t_", t, ".RData")
)

if (rerun) {
  # Recompute the analysis and refresh the cache.
  set.seed(42)
  res.all <- RF.analysis.all(
    sum.exp = se.country,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Country",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.all, file = res.country.file)
} else {
  # Reuse the result stored by an earlier run.
  res.all <- loadObject(file = res.country.file)
}

Classification was conducted with an out-of-bag (OOB) prediction error of 0.2821577.

Classification results for geographical origin
Canada Germany Ireland Japan Scotland USA
Canada 8 0 1 1 0 1
Germany 2 23 5 1 5 5
Ireland 5 4 25 2 4 1
Japan 1 0 5 2 2 0
Scotland 0 5 8 2 83 2
USA 0 1 2 0 3 32

Analysis of regions

  • Only regions in the USA and Scotland were considered.
# Build one data subset per regional analysis. `which()` is kept around the
# matches so that the columns are selected by integer position, as before.

# USA: Kentucky vs. Tennessee.
# NOTE(review): " Kentucky" is matched WITH a leading space; presumably the
# annotation stores the value in exactly this form -- confirm before
# normalising it.
se.usa <- se[, which(samples[, "Region"] %in% c(" Kentucky", "Tennessee"))]

# Scotland: every listed region is a class of its own.
regions.scotland <- c(
  "Speyside", "Campbeltown", "Highlands", "Islay",
  "Isle of Jura", "Isle of Mull", "Lowlands", "Orkney"
)
se.scotland <- se[, which(samples[, "Region"] %in% regions.scotland)]

# Scotland with the islands summarised as one group: three further islands
# are added to the selection, and all islands except Islay are relabelled
# as the single class "Islands".
regions.island.extra <- c("Isle of Skye", "Isle of Rassay", "Isle of Arran")
regions.island <- c("Isle of Jura", "Isle of Mull", "Orkney", regions.island.extra)

se.scotland.island <- se[, which(
  samples[, "Region"] %in% c(regions.scotland, regions.island.extra)
)]

samples.island <- colData(se.scotland.island)
# Address the column by name, so the relabelling does not depend on the
# position of "Region" within the annotation table.
island.rows <- which(samples.island[, "Region"] %in% regions.island)
samples.island[island.rows, "Region"] <- "Islands"

colData(se.scotland.island) <- samples.island
       
 if (rerun) {
  # Recompute the three regional analyses and cache each result. The RNG is
  # re-seeded before EVERY analysis so that each cached result can be
  # reproduced on its own, independent of the runs before it.
  set.seed(42)
  res.usa <- RF.analysis.all(
    sum.exp = se.usa,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.usa, file = file.path(data.dir, paste0("res_region_usa", num.trees.RF, "_s_", s, "_t_", t, ".RData")))

  set.seed(42)
  res.scot <- RF.analysis.all(
    sum.exp = se.scotland,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.scot, file = file.path(data.dir, paste0("res_region_scotland", num.trees.RF, "_s_", s, "_t_", t, ".RData")))

  # This seed was missing; without it the result depended on the RNG state
  # left behind by the preceding analysis.
  set.seed(42)
  res.scot.island <- RF.analysis.all(
    sum.exp = se.scotland.island,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.scot.island, file = file.path(data.dir, paste0("res_region_scotland_island", num.trees.RF, "_s_", s, "_t_", t, ".RData")))
} else {
  # Reuse the results stored by an earlier run.
  res.usa <- loadObject(file = file.path(data.dir, paste0("res_region_usa", num.trees.RF, "_s_", s, "_t_", t, ".RData")))

  res.scot <- loadObject(file = file.path(data.dir, paste0("res_region_scotland", num.trees.RF, "_s_", s, "_t_", t, ".RData")))

  res.scot.island <- loadObject(file = file.path(data.dir, paste0("res_region_scotland_island", num.trees.RF, "_s_", s, "_t_", t, ".RData")))
}

USA

Classification was conducted with an OOB prediction error of 0.2903226.

Classification results for geographical origin
Kentucky Tennessee
Kentucky 22 4
Tennessee 5 0

Scotland: All regions

Classification was conducted with an OOB prediction error of 0.497561.

Classification results for geographical origin
Campbeltown Highlands Islay Isle of Jura Isle of Mull Lowlands Orkney Speyside
Campbeltown 2 6 0 0 3 0 1 4
Highlands 6 30 9 0 3 7 1 49
Islay 1 5 75 1 7 2 1 7
Isle of Jura 0 2 0 6 0 0 0 5
Isle of Mull 4 1 6 0 6 0 0 2
Lowlands 0 4 1 0 0 4 0 4
Orkney 1 2 0 1 1 0 10 5
Speyside 7 22 3 5 4 2 9 73

Scotland: Islands summarised

Classification was conducted with an OOB prediction error of 0.4800937.

Classification results for geographical origin
Campbeltown Highlands Islands Islay Lowlands Speyside
Campbeltown 3 5 5 1 0 2
Highlands 6 33 6 9 4 47
Islands 4 11 23 12 0 19
Islay 1 4 8 75 2 9
Lowlands 0 5 0 1 5 2
Speyside 3 26 9 4 0 83

Analysis of barrels

Only samples from Ex-Bourbon barrels, Ex-Sherry barrels, or a combination of both were analysed. The analysis was carried out on all such samples and, separately, on the subsets from Speyside and from Islay.

# Subsets for the barrel analyses: only samples whose "Barell" annotation is
# Ex-Sherry, Ex-Bourbon or the combination of both are kept -- once across
# all samples, once within Speyside and once within Islay.
barell.types <- c("Ex-Sherry", "Ex-Bourbon", "Ex-Bourbon, Ex-Sherry")

se.barell <- se[, which(samples[, "Barell"] %in% barell.types)]

# Speyside: restrict to the region first, then to the barrel types.
se.speyside <- se[, which(samples[, "Region"] == "Speyside")]
samples.speyside <- colData(se.speyside)
se.speyside <- se.speyside[, which(samples.speyside[, "Barell"] %in% barell.types)]

# Islay: restrict to the region first, then to the barrel types.
se.islay <- se[, which(samples[, "Region"] == "Islay")]
samples.islay <- colData(se.islay)
se.islay <- se.islay[, which(samples.islay[, "Barell"] %in% barell.types)]

# Cache files; all three share the suffix that encodes the settings used.
barell.suffix <- paste0(num.trees.RF, "_s_", s, "_t_", t, ".RData")
res.barell.file <- file.path(data.dir, paste0("res_barell", barell.suffix))
res.speyside.file <- file.path(data.dir, paste0("res_speyside", barell.suffix))
res.islay.file <- file.path(data.dir, paste0("res_islay", barell.suffix))

if (rerun) {
  # Recompute the three analyses, each from the same seed, and cache them.
  set.seed(42)
  res.barell <- RF.analysis.all(
    sum.exp = se.barell,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.barell, file = res.barell.file)

  set.seed(42)
  res.speyside <- RF.analysis.all(
    sum.exp = se.speyside,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.speyside, file = res.speyside.file)

  set.seed(42)
  res.islay <- RF.analysis.all(
    sum.exp = se.islay,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.islay, file = res.islay.file)
} else {
  # Reuse the results stored by an earlier run.
  res.barell <- loadObject(file = res.barell.file)
  res.speyside <- loadObject(file = res.speyside.file)
  res.islay <- loadObject(file = res.islay.file)
}

All

Classification was conducted with an OOB prediction error of 0.3680556.

Classification results for barrel type
Ex-Bourbon Ex-Bourbon, Ex-Sherry Ex-Sherry
Ex-Bourbon 71 15 1
Ex-Bourbon, Ex-Sherry 26 61 33
Ex-Sherry 2 29 50

Speyside

Classification was conducted with an OOB prediction error of 0.25.

Classification results for barrel type
Ex-Bourbon Ex-Bourbon, Ex-Sherry Ex-Sherry
Ex-Bourbon 11 0 1
Ex-Bourbon, Ex-Sherry 3 12 5
Ex-Sherry 2 4 22

Islay

Classification was conducted with an OOB prediction error of 0.4150943.

Classification results for barrel type
Ex-Bourbon Ex-Bourbon, Ex-Sherry Ex-Sherry
Ex-Bourbon 13 3 0
Ex-Bourbon, Ex-Sherry 6 12 5
Ex-Sherry 1 7 6

Colour

# Subset for the colour analysis: keep the samples annotated with one of
# the two colour classes below.
colour.classes <- c("natural Colour", "plain caramel")
se.colour <- se[, which(samples[, "colour"] %in% colour.classes)]

# Cache file of the colour analysis; its name encodes the settings used.
res.colour.file <- file.path(
  data.dir,
  paste0("res_colour", num.trees.RF, "_s_", s, "_t_", t, ".RData")
)

if (rerun) {
  # Recompute the analysis and refresh the cache.
  set.seed(42)
  res.colour <- RF.analysis.all(
    sum.exp = se.colour,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "colour",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(res.colour, file = res.colour.file)
} else {
  # Reuse the result stored by an earlier run.
  res.colour <- loadObject(file = res.colour.file)
}

Classification was conducted with an OOB prediction error of 0.0920245.

Classification results for colour
natural Colour plain caramel
natural Colour 306 20
plain caramel 25 138

Age

# Subset for the age regression: drop the samples whose "Age" entry is "NAS".
# NOTE(review): "NAS" presumably marks samples without an age statement --
# confirm. Entries that are NA are dropped as well, because which() ignores
# comparisons that evaluate to NA.
has.age <- which(samples[, "Age"] != "NAS")
se.age <- se[, has.age]

# Cache file of the age analysis; its name encodes the settings used.
res.age.file <- file.path(
  data.dir,
  paste0("res_age", num.trees.RF, "_s_", s, "_t_", t, ".RData")
)

if (rerun) {
  # Recompute the regression analysis and refresh the cache.
  set.seed(42)
  res.age <- RF.analysis.all(
    sum.exp = se.age,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Age",
    mtry = mtry,
    s = s,
    t = t,
    type = "regression",
    VarAnal = TRUE
  )
  saveObject(res.age, file = res.age.file)
} else {
  # Reuse the result stored by an earlier run.
  res.age <- loadObject(file = res.age.file)
}

A regression model was trained to predict the age in years. It achieved a root mean square error (RMSE) of 4.559628 and an R² of 0.4085824.

Session information

# Print the R version, platform and package versions of the current session.
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 24.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] knitr_1.46                  SummarizedExperiment_1.32.0
##  [3] Biobase_2.62.0              GenomicRanges_1.54.1       
##  [5] GenomeInfoDb_1.38.8         IRanges_2.36.0             
##  [7] S4Vectors_0.40.2            BiocGenerics_0.48.1        
##  [9] MatrixGenerics_1.14.0       matrixStats_1.3.0          
## [11] summarytools_1.0.1          R.utils_2.12.3             
## [13] R.oo_1.26.0                 R.methodsS3_1.8.2          
## [15] DT_0.33                    
## 
## loaded via a namespace (and not attached):
##  [1] xfun_0.43               bslib_0.7.0             htmlwidgets_1.6.4      
##  [4] lattice_0.22-5          vctrs_0.6.5             tools_4.3.3            
##  [7] bitops_1.0-7            generics_0.1.3          tibble_3.2.1           
## [10] fansi_1.0.6             pkgconfig_2.0.3         Matrix_1.6-5           
## [13] checkmate_2.3.1         pryr_0.1.6              lifecycle_1.0.4        
## [16] GenomeInfoDbData_1.2.11 compiler_4.3.3          stringr_1.5.1          
## [19] rapportools_1.1         codetools_0.2-19        htmltools_0.5.8.1      
## [22] sass_0.4.9              RCurl_1.98-1.16         yaml_2.3.8             
## [25] pillar_1.9.0            crayon_1.5.2            jquerylib_0.1.4        
## [28] tidyr_1.3.1             MASS_7.3-60.0.1         DelayedArray_0.28.0    
## [31] cachem_1.0.8            magick_2.8.4            abind_1.4-5            
## [34] tidyselect_1.2.1        digest_0.6.35           stringi_1.8.4          
## [37] dplyr_1.1.4             reshape2_1.4.4          pander_0.6.5           
## [40] purrr_1.0.2             grid_4.3.3              fastmap_1.1.1          
## [43] SparseArray_1.2.4       cli_3.6.2               magrittr_2.0.3         
## [46] S4Arrays_1.2.1          base64enc_0.1-3         utf8_1.2.4             
## [49] backports_1.5.0         lubridate_1.9.3         timechange_0.3.0       
## [52] rmarkdown_2.26          XVector_0.42.0          evaluate_0.23          
## [55] tcltk_4.3.3             rlang_1.1.3             Rcpp_1.0.12            
## [58] glue_1.7.0              rstudioapi_0.16.0       jsonlite_1.8.8         
## [61] R6_2.5.1                plyr_1.8.9              zlibbioc_1.48.2