source("init.R")

[1] “loading 1 file(s) …”

Preprocessing of Data

  • separated data in Xdata and sampleinfo
  • grouped the Scandinavian countries (Norway, Sweden, Finland, Denmark) into a single group "Scandinavia"
  • normalised the data (each spectrum divided by its total sum)
  • created SE file
  • nothing conspicuous found in the sample info or the data
# ---- Load raw data ----
# Semicolon-separated file; as.is = TRUE keeps text columns as character.
# file.path() is used for the path, consistent with the save call below.
data = read.csv(file = file.path(input.dir, "20240905_Whisky_A.csv"),
                as.is = TRUE, sep = ";")

# The first column provides the row names and is then removed from the table.
rownames(data) = data[, 1]
data = data[, -1]

# Rename selected columns by position (positions refer to the table after
# the first column was removed).
# NOTE(review): "Barell" (sic) is kept as-is because it is a runtime column
# name that other code may rely on -- confirm before correcting the spelling.
colnames(data)[c(1, 3, 4, 6, 7, 11)] = c("Brand", "Variety", "Country",
                                         "Barell", "Age", "Alcohol")

# ---- Split into sample annotation and measurements ----
# Columns 1-11 are treated as sample information, all remaining columns as
# measurement data.
sample.info = data[, 1:11]
data.x = data[, 12:ncol(data)]

# ---- Normalise each row to its total sum ----
row.totals = rowSums(data.x)
# A zero (or missing) total would silently turn the whole row into NaN/NA,
# so make that visible instead of letting it propagate unnoticed.
if (any(is.na(row.totals) | row.totals == 0)) {
  warning("Some rows of data.x have a zero or missing total; ",
          "their normalised values will be NaN/NA.")
}
data.total = data.x / row.totals

# ---- Merge the Scandinavian countries into one group ----
scandinavian.countries = c("Norway", "Sweden", "Finland", "Denmark")
sample.info[sample.info[, "Country"] %in% scandinavian.countries,
            "Country"] = "Scandinavia"

# ---- SummarizedExperiment object ----
# Assays are transposed so that the rows of data.x become the columns of the
# object, matching the rows of sample.info passed as colData.
se = SummarizedExperiment(
  assays = list(original = t(data.x),
                norm.total = t(data.total)),
  colData = as.data.frame(sample.info))

# NOTE(review): saveObject() is not base R; presumably R.utils::saveObject
# (R.utils is attached in this session) -- confirm.
saveObject(se,
           file = file.path(data.dir, "sum_exp_norm.RData"))

Descriptive statistics about sample information

# Render a summary table of sample-information columns 3 to 11.
# max.distinct.values is an argument of dfSummary(), not of print(): passed to
# print() it had no effect and only the default number of distinct values was
# listed per variable. It is therefore set where the summary is computed.
print(dfSummary(sample.info[, 3:11],
                graph.magnif = 0.85,
                max.distinct.values = 42),
      method = 'render')

Data Frame Summary

Dimensions: 686 x 9
Duplicates: 45
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 Variety [character]
1. Single Malt
2. Blended
3. Blended Malt
4. Kentucky Straight Bourbon
5. Single Grain
6. Kentucky Straight Bourbon
7. Whisky
8. Canadian Whisky
9. Irish Whiskey
10. Rye
[ 21 others ]
506(73.8%)
45(6.6%)
28(4.1%)
12(1.7%)
12(1.7%)
8(1.2%)
8(1.2%)
7(1.0%)
7(1.0%)
7(1.0%)
46(6.7%)
686 (100.0%) 0 (0.0%)
2 Country [character]
1. Scotland
2. Germany
3. Ireland
4. USA
5. Scandinavia
6. Canada
7. Japan
8. Australia
9. India
10. Scotland + Germany
[ 16 others ]
487(71.0%)
41(6.0%)
41(6.0%)
38(5.5%)
16(2.3%)
11(1.6%)
10(1.5%)
4(0.6%)
4(0.6%)
4(0.6%)
30(4.4%)
686 (100.0%) 0 (0.0%)
3 Region [character]
1. (Empty string)
2. Speyside
3. Highlands
4. Islay
5. Blended
6.  
·
Kentucky
7. Orkney
8. Isle of Mull
9. Campbeltown
10. Isle of Jura
[ 22 others ]
132(19.2%)
125(18.2%)
105(15.3%)
99(14.4%)
73(10.6%)
26(3.8%)
20(2.9%)
19(2.8%)
16(2.3%)
13(1.9%)
58(8.5%)
686 (100.0%) 0 (0.0%)
4 Barell [character]
1. -
2. Ex-Bourbon, Ex-Sherry
3. Ex-Bourbon
4. Ex-Sherry
5. Virgin
6. Ex-Bourbon, Ex-Wine
7. Ex-Bourbon, Virgin
8. Ex-Bourbon, Ex-Portwine
9. Hogshead
10. Ex-Wine
[ 62 others ]
122(17.8%)
120(17.5%)
87(12.7%)
81(11.8%)
51(7.4%)
26(3.8%)
22(3.2%)
21(3.1%)
18(2.6%)
14(2.0%)
124(18.1%)
686 (100.0%) 0 (0.0%)
5 Age [character]
1. NAS
2. 12 Years
3. 10 Years
4. 15 Years
5. 11 Years
6. 3 Years
7. 18 Years
8. 5 Years
9. 8 Years
10. 14 Years
[ 24 others ]
290(42.3%)
83(12.1%)
39(5.7%)
31(4.5%)
22(3.2%)
21(3.1%)
20(2.9%)
20(2.9%)
19(2.8%)
18(2.6%)
123(17.9%)
686 (100.0%) 0 (0.0%)
6 chill.filtrated [character]
1. -
2. chill-filtered
3. non chill-filtered
378(55.1%)
5(0.7%)
303(44.2%)
686 (100.0%) 0 (0.0%)
7 colour [character]
1. -
2. natural Colour
3. plain caramel
197(28.7%)
326(47.5%)
163(23.8%)
686 (100.0%) 0 (0.0%)
8 peated [character]
1. (Empty string)
2. -
3. heavily peated
4. lightly peated
5. peated
6. unpeated
387(56.4%)
28(4.1%)
41(6.0%)
9(1.3%)
137(20.0%)
84(12.2%)
686 (100.0%) 0 (0.0%)
9 Alcohol [character]
1. 40.00%
2. 46.00%
3. 43.00%
4. 50.00%
5. 48.00%
6. 45.00%
7. 45.80%
8. 46.30%
9. 47.00%
10. 42.00%
[ 133 others ]
154(22.4%)
124(18.1%)
98(14.3%)
27(3.9%)
22(3.2%)
16(2.3%)
10(1.5%)
10(1.5%)
6(0.9%)
5(0.7%)
214(31.2%)
686 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.3.3)
2024-09-18

#QC sample level

# Proportion of values that are exactly zero in each row of data.x.
prop.zero.sample = rowSums(data.x == 0) / ncol(data.x)

# For every column of the raw data: position of the row holding the maximum.
# tab counts how many columns each row position is the maximum of.
ind.max = apply(data.x, MARGIN = 2, FUN = which.max)
tab = table(ind.max)

# Same computation on the sum-normalised data.
ind.max.total = apply(data.total, MARGIN = 2, FUN = which.max)
tab.total = table(ind.max.total)
  



#QC metabolite level

# Proportion of values that are exactly zero in each column of data.x.
prop.zero.var = colSums(data.x == 0) / nrow(data.x)

Session information

# Print the R version, platform, locale and the attached/loaded package
# versions of the current session.
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 24.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] SummarizedExperiment_1.32.0 Biobase_2.62.0             
##  [3] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
##  [5] IRanges_2.36.0              S4Vectors_0.40.2           
##  [7] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
##  [9] matrixStats_1.3.0           summarytools_1.0.1         
## [11] R.utils_2.12.3              R.oo_1.26.0                
## [13] R.methodsS3_1.8.2           DT_0.33                    
## 
## loaded via a namespace (and not attached):
##  [1] xfun_0.43               bslib_0.7.0             htmlwidgets_1.6.4      
##  [4] lattice_0.22-5          vctrs_0.6.5             tools_4.3.3            
##  [7] bitops_1.0-7            generics_0.1.3          tibble_3.2.1           
## [10] fansi_1.0.6             pkgconfig_2.0.3         Matrix_1.6-5           
## [13] checkmate_2.3.1         pryr_0.1.6              lifecycle_1.0.4        
## [16] GenomeInfoDbData_1.2.11 compiler_4.3.3          stringr_1.5.1          
## [19] rapportools_1.1         codetools_0.2-19        htmltools_0.5.8.1      
## [22] sass_0.4.9              RCurl_1.98-1.16         yaml_2.3.8             
## [25] pillar_1.9.0            crayon_1.5.2            jquerylib_0.1.4        
## [28] tidyr_1.3.1             MASS_7.3-60.0.1         DelayedArray_0.28.0    
## [31] cachem_1.0.8            magick_2.8.4            abind_1.4-5            
## [34] tidyselect_1.2.1        digest_0.6.35           stringi_1.8.4          
## [37] dplyr_1.1.4             reshape2_1.4.4          pander_0.6.5           
## [40] purrr_1.0.2             grid_4.3.3              fastmap_1.1.1          
## [43] SparseArray_1.2.4       cli_3.6.2               magrittr_2.0.3         
## [46] S4Arrays_1.2.1          base64enc_0.1-3         utf8_1.2.4             
## [49] backports_1.5.0         lubridate_1.9.3         timechange_0.3.0       
## [52] rmarkdown_2.26          XVector_0.42.0          evaluate_0.23          
## [55] knitr_1.46              tcltk_4.3.3             rlang_1.1.3            
## [58] Rcpp_1.0.12             glue_1.7.0              rstudioapi_0.16.0      
## [61] jsonlite_1.8.8          R6_2.5.1                plyr_1.8.9             
## [64] zlibbioc_1.48.2