source("init.R")

# Settings shared by every random-forest run below.
num.trees.RF <- 10000                    # handed to RF.analysis.all(num.trees.RF = )
mtry         <- as.integer(287^(3 / 4))  # truncates to 69; NOTE(review): 287 is presumably the number of variables - confirm
s            <- 15                       # handed to RF.analysis.all(s = )
t            <- 5                        # handed to RF.analysis.all(t = ); shares its name with base::t()
rerun        <- FALSE                    # FALSE: load stored results instead of refitting

General information

For all analyses, the following parameters were used for RF classification, variable selection and relation analysis: num.trees.RF = 10000, mtry = 69 (as.integer(287^(3/4))), s = 15 and t = 5.

Analysis of Country

# Load the stored experiment object and its per-sample annotation.
se <- loadObject(
  file = file.path(project.dir, "/subprojects/01_pretreatment/data/sum_exp_norm.RData")
)
samples <- colData(se)
maxsamp <- 100 # upper bound on the number of Scottish samples kept

# Randomly keep at most `maxsamp` Scottish samples.
# Indexing through sample.int() draws exactly what sample(idx, maxsamp) would
# draw for the same seed, but it neither fails when fewer than `maxsamp`
# Scottish samples exist nor treats a single matching index k as the range 1:k.
set.seed(42)
samples.scot <- local({
  idx <- which(samples[, "Country"] == "Scotland")
  idx[sample.int(length(idx), min(maxsamp, length(idx)))]
})

# Experiment restricted to the down-sampled Scottish samples plus all samples
# from the other listed countries (original note: classes with at least 10
# samples).
se.country <- se[, c(
  samples.scot,
  which(samples[, "Country"] %in% c("Germany", "Ireland", "USA", "Canada", "Japan"))
)]

# Either refit the country model and store it, or load the stored result.
if (rerun) {
  set.seed(42) # fixed seed before fitting
  res.all <- RF.analysis.all(
    sum.exp = se.country,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Country",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )

  saveObject(
    res.all,
    file = file.path(data.dir, paste0("res_country", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
} else {
  res.all <- loadObject(
    file = file.path(data.dir, paste0("res_country", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
}

Classification was conducted with an OOB prediction error of 0.2821577

Classification results for geographical origin
Canada Germany Ireland Japan Scotland USA
Canada 8 0 1 1 0 1
Germany 2 23 5 1 5 5
Ireland 5 4 25 2 4 1
Japan 1 0 5 2 2 0
Scotland 0 5 8 2 83 2
USA 0 1 2 0 3 32

Analysis of regions

# Region-level subsets (original note: classes with at least 10 samples -
# TODO confirm that this threshold was applied to regions as well).
# NOTE(review): " Kentucky" carries a leading space - presumably that is how
# the value is spelled in the sample annotation; confirm before changing it.
se.usa <- se[, which(samples[, "Region"] %in% c(" Kentucky", "Tennessee"))]

se.scotland <- se[, which(samples[, "Region"] %in% c(
  "Speyside", "Campbeltown", "Highlands", "Islay",
  "Isle of Jura", "Isle of Mull", "Lowlands", "Orkney"
))]

# Either refit both region models and store them, or load the stored results.
if (rerun) {
  # USA regions
  set.seed(42)
  res.usa <- RF.analysis.all(
    sum.exp = se.usa,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(
    res.usa,
    file = file.path(data.dir, paste0("res_region_usa", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )

  # Scottish regions; the seed is reset so this fit starts from the same RNG state
  set.seed(42)
  res.scot <- RF.analysis.all(
    sum.exp = se.scotland,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Region",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(
    res.scot,
    file = file.path(data.dir, paste0("res_region_scotland", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
} else {
  res.usa <- loadObject(
    file = file.path(data.dir, paste0("res_region_usa", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
  res.scot <- loadObject(
    file = file.path(data.dir, paste0("res_region_scotland", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
}

USA

Classification was conducted with an OOB prediction error of 0.2903226

Classification results for geographical origin
Kentucky Tennessee
Kentucky 22 4
Tennessee 5 0

Scotland

Classification was conducted with an OOB prediction error of 0.497561

Classification results for geographical origin
Campbeltown Highlands Islay Isle of Jura Isle of Mull Lowlands Orkney Speyside
Campbeltown 2 6 0 0 3 0 1 4
Highlands 6 30 9 0 3 7 1 49
Islay 1 5 75 1 7 2 1 7
Isle of Jura 0 2 0 6 0 0 0 5
Isle of Mull 4 1 6 0 6 0 0 2
Lowlands 0 4 1 0 0 4 0 4
Orkney 1 2 0 1 1 0 10 5
Speyside 7 22 3 5 4 2 9 73

Analysis of barrels

Only Ex-Bourbon barrels, Ex-Sherry barrels and mixtures of both were analysed, first for all samples and then separately for the samples from Speyside and from Islay.

# Barrel-type subsets: keep only samples whose "Barell" annotation is
# Ex-Sherry, Ex-Bourbon or the combination of both - once across all samples
# and once each for the Speyside and Islay regions.
se.barell <- se[, which(samples[, "Barell"] %in%
  c("Ex-Sherry", "Ex-Bourbon", "Ex-Bourbon, Ex-Sherry"))]

# Speyside: restrict to the region first, then filter by barrel type using the
# annotation of that regional subset.
se.speyside <- se[, which(samples[, "Region"] == "Speyside")]
samples.speyside <- colData(se.speyside)
se.speyside <- se.speyside[, which(samples.speyside[, "Barell"] %in%
  c("Ex-Sherry", "Ex-Bourbon", "Ex-Bourbon, Ex-Sherry"))]

# Islay: same two-step filtering.
se.islay <- se[, which(samples[, "Region"] == "Islay")]
samples.islay <- colData(se.islay)
se.islay <- se.islay[, which(samples.islay[, "Barell"] %in%
  c("Ex-Sherry", "Ex-Bourbon", "Ex-Bourbon, Ex-Sherry"))]

# Either refit the three barrel models and store them, or load the stored
# results. The seed is reset before every fit.
if (rerun) {
  # all samples
  set.seed(42)
  res.barell <- RF.analysis.all(
    sum.exp = se.barell,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(
    res.barell,
    file = file.path(data.dir, paste0("res_barell", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )

  # Speyside only
  set.seed(42)
  res.speyside <- RF.analysis.all(
    sum.exp = se.speyside,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(
    res.speyside,
    file = file.path(data.dir, paste0("res_speyside", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )

  # Islay only
  set.seed(42)
  res.islay <- RF.analysis.all(
    sum.exp = se.islay,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Barell",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(
    res.islay,
    file = file.path(data.dir, paste0("res_islay", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
} else {
  res.barell <- loadObject(
    file = file.path(data.dir, paste0("res_barell", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
  res.speyside <- loadObject(
    file = file.path(data.dir, paste0("res_speyside", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
  res.islay <- loadObject(
    file = file.path(data.dir, paste0("res_islay", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
}

All

Classification was conducted with an OOB prediction error of 0.3680556

Classification results for barrel type
Ex-Bourbon Ex-Bourbon, Ex-Sherry Ex-Sherry
Ex-Bourbon 71 15 1
Ex-Bourbon, Ex-Sherry 26 61 33
Ex-Sherry 2 29 50

Speyside

Classification was conducted with an OOB prediction error of 0.25

Classification results for barrel type
Ex-Bourbon Ex-Bourbon, Ex-Sherry Ex-Sherry
Ex-Bourbon 11 0 1
Ex-Bourbon, Ex-Sherry 3 12 5
Ex-Sherry 2 4 22

Islay

Classification was conducted with an OOB prediction error of 0.4150943

Classification results for barrel type
Ex-Bourbon Ex-Bourbon, Ex-Sherry Ex-Sherry
Ex-Bourbon 13 3 0
Ex-Bourbon, Ex-Sherry 6 12 5
Ex-Sherry 1 7 6

Colour

# Colour subset: samples annotated as "natural Colour" or "plain caramel".
se.colour <- se[, which(samples[, "colour"] %in% c("natural Colour", "plain caramel"))]

# Either refit the colour model and store it, or load the stored result.
if (rerun) {
  set.seed(42) # fixed seed before fitting
  res.colour <- RF.analysis.all(
    sum.exp = se.colour,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "colour",
    mtry = mtry,
    s = s,
    t = t,
    VarAnal = TRUE
  )
  saveObject(
    res.colour,
    file = file.path(data.dir, paste0("res_colour", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
} else {
  res.colour <- loadObject(
    file = file.path(data.dir, paste0("res_colour", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
}

Classification was conducted with an OOB prediction error of 0.0920245

Classification results for colour
natural Colour plain caramel
natural Colour 306 20
plain caramel 25 138

Age

# Age subset: drop samples whose Age entry is the string "NAS"
# (presumably "no age statement" - confirm). which() also leaves out samples
# for which the comparison yields NA.
se.age <- se[, which(samples[, "Age"] != "NAS")]

# Either refit the age model (regression instead of classification) and store
# it, or load the stored result.
if (rerun) {
  set.seed(42) # fixed seed before fitting
  res.age <- RF.analysis.all(
    sum.exp = se.age,
    num.trees.RF = num.trees.RF,
    assay = "original",
    var.y = "Age",
    mtry = mtry,
    s = s,
    t = t,
    type = "regression",
    VarAnal = TRUE
  )
  saveObject(
    res.age,
    file = file.path(data.dir, paste0("res_age", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
} else {
  res.age <- loadObject(
    file = file.path(data.dir, paste0("res_age", num.trees.RF, "_s_", s, "_t_", t, ".RData"))
  )
}

A regression model was trained to predict the age in years. A root-mean-square error (RMSE) of 4.559628 and an R² of 0.4085824 were obtained.