diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000000000000000000000000000000000000..91114bf2f2bba5e0c5252e75018da19b869776f1
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
index f5b98bb2f80c40edf569bedfb4dc0982b9a92be7..56843bca41471d505619ca30a744bbf621e08f2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,36 +1,7 @@
-# History files
+.Rproj.user
 .Rhistory
-.Rapp.history
-
-# Session Data files
 .RData
-
-# User-specific files
 .Ruserdata
-
-# Example code in package build process
-*-Ex.R
-
-# Output files from R CMD build
-/*.tar.gz
-
-# Output files from R CMD check
-/*.Rcheck/
-
-# RStudio files
-.Rproj.user/
-
-# produced vignettes
-vignettes/*.html
-vignettes/*.pdf
-
-# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
-.httr-oauth
-
-# knitr and R markdown default cache directories
-*_cache/
-/cache/
-
-# Temporary files created by R markdown
-*.utf8.md
-*.knit.md
+src/*.o
+src/*.so
+src/*.dll
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000000000000000000000000000000000000..bec91b9ef14006b155c346c45a7ca5f006c15d3e
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,19 @@
+Package: RFSurrogates
+Type: Package
+Title: Surrogate minimal depth variable importance 
+Version: 0.3.1
+Author: Stephan Seifert, Sven Gundlach, Silke Szymczak
+Maintainer: Stephan Seifert <seifert@medinfo.uni-kiel.de>
+Description: This package provides functions to obtain surrogate splits applying ranger for random forests generation and using modified functions from the package rpart. Surrogate variables and corresponding adjusted agreement values are used for surrogate minimal depth variable importance and to investigate variable relations.  
+License: use_mit_license()
+Encoding: UTF-8
+LazyData: true
+Imports:
+	ranger (>= 0.12.1),
+	Rcpp,
+	randomForestSRC,
+	linkcomm,
+    parallel,
+    rlist
+RoxygenNote: 7.1.2
+LinkingTo: Rcpp
diff --git a/LICENCE.txt b/LICENCE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f628819d423808d478baa056ece268304f7da4b3
--- /dev/null
+++ b/LICENCE.txt
@@ -0,0 +1,7 @@
+Copyright 2023 Stephan Seifert
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/LICENSE.md b/LICENSE.md
deleted file mode 100644
index 312309984494e94c7d222c724b8e4e775770aacc..0000000000000000000000000000000000000000
--- a/LICENSE.md
+++ /dev/null
@@ -1,22 +0,0 @@
-
-The MIT License (MIT)
-
-Copyright (c) 2023 Florian Gärber
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000000000000000000000000000000000000..c2238d27843016c06a41c08ce16e8f1e8b5fc26b
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,17 @@
+# Generated by roxygen2: do not edit by hand
+
+export(addLayer)
+export(addSurrogates)
+export(build.clusters)
+export(count.surrogates)
+export(getTreeranger)
+export(meanAdjAgree)
+export(mindep)
+export(reduce.surrogates)
+export(surrmindep)
+export(var.relations)
+export(var.relations.mfi)
+export(var.select.md)
+export(var.select.mir)
+export(var.select.smd)
+useDynLib(SurrogateMinimalDepth, .registration = TRUE, .fixes = "C_")
diff --git a/R/SMD_example_data.R b/R/SMD_example_data.R
new file mode 100644
index 0000000000000000000000000000000000000000..ec046b1cfdb25f112fb758e6c14689dd96f3ac43
--- /dev/null
+++ b/R/SMD_example_data.R
@@ -0,0 +1,18 @@
+#' Example data set for the package SurrogateMinimalDepth
+#'
+#' A dataset containing 100 individuals (rows), 1 dependent variable (first column, y), and 200 additional variables (columns 2 to 201):
+#' For the simulation of the 200 additional variables nine variables X1,…,X9 called basic variables were simulated. X1 to X6 are causal
+#' with constant effect sizes of 1 and X7 to X9 are noncausal with effect size of 0. The outcome y is a linear combination of the causal
+#' predictor variables and a normally distributed error term. All basic variables were sampled from a normal distribution
+#' (N(0,1)) just like the noise (N(0,0.2)). For each of the six basic variables X1, X2, X3, X7, X8, and X9 ten variables
+#' with predefined correlations of 0.9 for X1 and X7, 0.6 for X2 and X8, and 0.3 for X3 and X9 were obtained by \link[WGCNA]{simulateModule} function of
+#' the R package WGCNA. The ten variables of each basis variable are labeled: Cp_basicvariable_number. Additional non-correlated and
+#' independent predictor variables (cgn) were simulated using the standard normal distribution to reach a total number of 200 variables.
+#'
+#'
+#' @docType data
+#' @keywords datasets
+#' @name SMD_example_data
+#' @usage data(SMD_example_data)
+#' @format A matrix with 100 rows and 1001 columns
+NULL
diff --git a/R/addLayer.R b/R/addLayer.R
new file mode 100644
index 0000000000000000000000000000000000000000..91be0b1a2c1382919d370045b0068f788833870c
--- /dev/null
+++ b/R/addLayer.R
@@ -0,0 +1,38 @@
+#'Add layer information to a forest that was created by getTreeranger
+#'
+#'This functions adds the layer information to each node in a list with trees that was obtained by getTreeranger.
+#'
+#' @param trees list of trees created by getTreeranger
+#' @return a list with trees. Each row of the list elements corresponds to a node of the respective tree and the columns correspond to:
+#' \itemize{
+#' \item nodeID: ID of the respective node (important for left and right daughters in the next columns)
+#' \item leftdaughter: ID of the left daughter of this node
+#' \item rightdaughter: ID of the right daughter of this node
+#' \item splitvariable: ID of the split variable
+#' \item splitpoint: splitpoint of the split variable
+#' \item status: "0" for terminal and "1" for non-terminal
+#' \item layer: layer information (0 means root node, 1 means 1 layer below root, etc)
+#' }
+#' @export
+
+addLayer=function(trees){
+  #This function adds the respective layer to the different nodes in a tree. The tree has to be prepared by getTree function
+  tree.layer=list()
+  num.trees= length(trees)
+  for (i in 1:num.trees){
+    tree=trees[[i]]
+    layer=rep(NA,nrow(tree))
+    layer[1]=0
+    t=1
+    while (anyNA(layer)){
+      r=unlist(tree[which(layer==(t-1)),2:3])
+      layer[r]=t
+      t=t+1
+    }
+    tree=cbind(tree,layer)
+    tree=tree[order(as.numeric(tree[,"layer"])),]
+    tree.layer[[i]]=tree
+  }
+  return(tree.layer)
+}
+
diff --git a/R/addSurrogates.R b/R/addSurrogates.R
new file mode 100644
index 0000000000000000000000000000000000000000..97ec16c00ef1ddfb769b36078b443b1827918d67
--- /dev/null
+++ b/R/addSurrogates.R
@@ -0,0 +1,146 @@
+#' Add surrogate information that was created by getTreeranger
+#'
+#' This function adds surrogate variables and adjusted agreement values to a forest that was created by getTreeranger.
+#'
+#' @useDynLib SurrogateMinimalDepth, .registration = TRUE, .fixes = "C_"
+#'
+#' @param RF random forest object created by ranger (with keep.inbag=TRUE).
+#' @param trees list of trees created by getTreeranger.
+#' @param s Predefined number of surrogate splits (it may happen that the actual number of surrogate splits differes in individual nodes). Default is 1 \% of no. of variables.
+#' @param Xdata data without the dependent variable.
+#' @param num.threads number of threads used for parallel execution. Default is number of CPUs available.
+#' @return a list with trees containing of lists of nodes with the elements:
+#' \itemize{
+#' \item nodeID: ID of the respective node (important for left and right daughters in the next columns)
+#' \item leftdaughter: ID of the left daughter of this node
+#' \item rightdaughter: ID of the right daughter of this node
+#' \item splitvariable: ID of the split variable
+#' \item splitpoint: splitpoint of the split variable
+#' \item status: "0" for terminal and "1" for non-terminal
+#' \item layer: layer information (0 means root node, 1 means 1 layer below root, etc)
+#' \item surrogate_i: numbered surrogate variables (number depending on s)
+#' \item adj_i: adjusted agreement of variable i
+#' }
+#' @export
+
+addSurrogates = function(RF,trees,s,Xdata,num.threads) {
+
+  num.trees = length(trees)
+  ncat = sapply(sapply(Xdata,levels),length)     # determine number of categories (o for continuous variables)
+  names(ncat) = colnames(Xdata)
+
+  if (is.null(num.threads)) {
+    num.threads = parallel::detectCores()
+  }
+
+  if (any(ncat) > 0) {
+  Xdata[,which(ncat > 0)] = sapply(Xdata[,which(ncat > 0)],unclass)
+  }
+
+  #variables to find surrogates (control file similar as in rpart)
+  controls = list(maxsurrogate = as.integer(s), sur_agree = 0)
+
+  trees.surr = parallel::mclapply(1:num.trees,
+                                  getSurrogate,
+                                  mc.cores = num.threads,
+                                  maxsurr = s,
+                                  surr.par = list(inbag.counts = RF$inbag.counts,
+                                                         Xdata = Xdata,
+                                                      controls = controls,
+                                                         trees = trees,
+                                                          ncat = ncat))
+  return(trees.surr)
+}
+
+#' getSurrogate
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+getSurrogate = function(surr.par, k = 1, maxsurr) {
+  #weights and trees are extracted
+ tree = surr.par$trees[[k]]
+ column.names = colnames(tree)
+ n.nodes = nrow(tree)
+ wt = surr.par$inbag.counts[[k]]
+ tree.surr = lapply(1:n.nodes,
+                    SurrTree,
+                    wt = wt,
+                    Xdata = surr.par$Xdata,
+                    controls = surr.par$controls,
+                    column.names, tree,maxsurr,
+                    ncat = surr.par$ncat)
+}
+#' SurrTree
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+SurrTree = function(j,wt,Xdata,controls,column.names,tree,maxsurr,ncat) {
+  node = tree[j,]
+  # for non-terminal nodes get surrogates
+  if (node["status"] == 1) {
+  #Handover to C
+  var = as.numeric(node[4]) # extract split variable
+
+  if (ncat[var] == 0) { # extract split information: split point for continuous variables and directions for qualitative variables
+    split = as.numeric(node[5])
+  } else {
+    right = as.numeric(strsplit(as.character(node[5]), ",")[[1]])
+    directions = rep(-1,ncat[var])
+    directions[right] = 1
+    split = as.numeric(c(ncat[var],directions))
+  }
+
+
+  surrogate.parameters = .Call(C_getSurrogates,
+                               ncat = as.integer(ncat),
+                               wt = as.numeric(wt),
+                               X = as.matrix(Xdata),
+                               controls = as.integer(unlist(controls)),
+                               var = as.integer(var),                      # node variables
+                               split = as.numeric(split))                    # split info
+
+  if (nrow(surrogate.parameters$isplit) > 1) {
+    surrogates = surrogate.parameters$isplit[2:nrow(surrogate.parameters$isplit),1]
+    surr.adj = round(surrogate.parameters$dsplit[2:nrow(surrogate.parameters$dsplit),1],2)
+    node.new = data.frame(matrix(nrow = 1, ncol = 7 + length(surrogates) + length(surr.adj)))
+    node.new[,1:7] = node[1:7]
+    node.new[,8:(7 + length(surrogates) + length(surr.adj))] = c(surrogates,surr.adj)
+    surrogate.names = NULL
+    adj.names = NULL
+    surrogate.names = sapply(1:length(surrogates),name.surr,surrogate.names)
+    adj.names = sapply(1:length(surrogates),name.adj,adj.names)
+    names(node.new) = c(column.names,surrogate.names,adj.names)
+  }
+
+  if (nrow(surrogate.parameters$isplit) == 1) {
+    node.new = node
+  }
+  }
+  if (node["status"] == 0) {
+    node.new = node
+  }
+return(node.new)
+}
+
+#' name.surr
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+name.surr = function(i,surrogate.names){
+surrogate.names = c(surrogate.names,paste0("surrogate_",i))
+return(surrogate.names)
+}
+
+#' name.adj
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+name.adj = function(i,adj.names){
+  adj.names = c(adj.names,paste0("adj_",i))
+  return(adj.names)
+}
+
diff --git a/R/build_clusters.R b/R/build_clusters.R
new file mode 100644
index 0000000000000000000000000000000000000000..af6627562906d0c4dbf3fdbb2ced391ee33646b0
--- /dev/null
+++ b/R/build_clusters.R
@@ -0,0 +1,48 @@
+#' Apply cluster analysis to build variable groups
+#'
+#'This function generates variables groups of relation information that was obtained by \link[SurrogateMinimalDepth]{var.relations} function applying
+#'\link[linkcomm]{getLinkCommunities}.
+#'
+#' @param rel a list containing variables, surr.res, threshold, and var. This is the output of \link[SurrogateMinimalDepth]{var.relations} function.
+#' @param hcmethod the hierarchical clustering method that is used. (see \link[linkcomm]{getLinkCommunities})
+#'
+#' @return a data frame containing the variable names and their associated clusters.
+#'
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#'
+#' \donttest{
+#'  # get trees and variable names
+#'  x = SMD_example_data[,2:ncol(SMD_example_data)]
+#'  y = SMD_example_data[,1]
+#'  allvariables = colnames(x)# extract variables names
+#'  nvar = length(allvariables)   # count number of variables
+#'  set.seed(42)
+#'  RF = ranger::ranger(data = SMD_example_data, dependent.variable.name = "y", num.trees = 10, keep.inbag = TRUE,mtry = floor(nvar^(3/4)), min.node.size = 1)
+#'  trees = getTreeranger(RF = RF, num.trees = 10)
+#'  trees.lay = addLayer(trees)
+#'  trees.surr = addSurrogates(RF = RF, trees = trees.lay, s = 10, Xdata = x, num.threads = NULL)
+#'
+#'  # investigate variable relations
+#'  rel=var.relations(forest = list(trees = trees.surr, allvariables = allvariables), variables = allvariables, candidates = allvariables, t = 10)
+#'  groups = build.clusters(rel)
+#' }
+#'
+#' @export
+
+
+build.clusters = function(rel,hcmethod = "ward.D") {
+
+  # create links matrix
+  links = matrix(nrow = length(unlist(rel$var)) ,ncol = 2)
+  links[,1] = unlist(sapply(1:length(rel$variables), function(x) {rep(rel$variables[x],length(rel$var[[x]]))}))
+  links[,2] = unlist(rel$var)
+
+  # cluster analysis to identify clusters
+  link.comm = linkcomm::getLinkCommunities(links,
+                                 hcmethod = hcmethod,
+                                 plot = FALSE)
+  cluster.info = link.comm$nodeclusters
+  return(cluster.info)
+}
diff --git a/R/count.surrogates.R b/R/count.surrogates.R
new file mode 100644
index 0000000000000000000000000000000000000000..670c46191aaf359f8ee04cbf2abe2434180370be
--- /dev/null
+++ b/R/count.surrogates.R
@@ -0,0 +1,50 @@
+#'Count surrogate variables
+#'
+#'This function counts surrogate variables and returns the total average number of surrogate variables and the average number of surrogate variables of the respective layers.
+#'This is necessary since the actual number of surrogate splits can be lower than the predefined number (when less surrogate splits outperform the majority rule).
+#'
+#'@param trees list of trees that was generated by getTreeranger function and layers, surrogate variables, and adjusted agreement values were added by addLayer and getSurrogates functions
+#' @return List with the following components:
+#' \itemize{
+#' \item s.a: total average number of surrogate variables
+#' \item s.l: average number of surrogate variables in the respective layers
+#' }
+#' @export
+count.surrogates=function(trees){
+  num.trees=length(trees)
+  surrogates.trees=lapply(1:num.trees,scount,trees)
+
+  s.a=mean(sapply(surrogates.trees,"[[","s.a"))
+  s.l.list=lapply(surrogates.trees,"[[","s.l")
+  s.l.all <- matrix(NA, nrow = 1000, ncol = num.trees)
+  for (u in 1:num.trees){
+   s.l.tree=s.l.list[[u]][,2]
+   s.l.all[1:length(s.l.tree),u]=s.l.tree
+  }
+  s.l=rowMeans(s.l.all,na.rm = TRUE)
+  names(s.l)=c(0:999)
+
+  return(list(s.a=s.a,s.l=s.l))
+}
+
+#' scount
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+scount=function(i=1,trees){
+  tree=trees[[i]]
+  nonterminal.nodes=tree[which(sapply(tree,"[[","status")==1)]
+  s.a=(mean(sapply((lapply(nonterminal.nodes,"[",-c(1:7))),length)))/2
+  maxlayer=as.numeric(unlist(nonterminal.nodes[length(nonterminal.nodes)])["layer"])
+  s.l=matrix(NA,maxlayer+1,2)
+  colnames(s.l)=c("layer","No. Surrogates")
+  s.l[,1]=0:maxlayer
+  for (u in 0:maxlayer){
+  nodes.at.layer=nonterminal.nodes[which(sapply(nonterminal.nodes,"[[","layer")==u)]
+  surr=lapply(nodes.at.layer,"[",-c(1:7))
+  s.u=(mean(sapply(surr,length)))/2
+  s.l[u+1,2]=s.u
+  }
+  return(list(s.a=s.a,s.l=s.l))
+}
diff --git a/R/getTreeranger.R b/R/getTreeranger.R
new file mode 100644
index 0000000000000000000000000000000000000000..e190c6491dc2767bde088ef97868caad374fd732
--- /dev/null
+++ b/R/getTreeranger.R
@@ -0,0 +1,51 @@
+#'Get a list of structured trees for ranger
+#'
+#'This functions creates a list of trees for ranger objects similar as getTree function does for random Forest objects.
+#'
+#' @param RF random forest object created by ranger (with keep.inbag=TRUE)
+#' @param num.trees number of trees
+#' @return a list with trees. Each row of the list elements corresponds to a node of the respective tree and the columns correspond to:
+#' \itemize{
+#' \item nodeID: ID of the respective node (important for left and right daughters in the next columns)
+#' \item leftdaughter: ID of the left daughter of this node
+#' \item rightdaughter: ID of the right daughter of this node
+#' \item splitvariable: ID of the split variable
+#' \item splitpoint: splitpoint of the split variable (for categorical variables this is a comma separated lists of values, representing the factor levels (in the original order) going to the right)
+#' \item status: "0" for terminal and "1" for non-terminal
+#' }
+#' @export
+
+
+
+getTreeranger=function(RF,num.trees) {
+  trees=lapply(1:num.trees,getsingletree,RF=RF)
+
+  return(trees)
+}
+
+
+#' getsingletree
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+getsingletree=function(RF,k=1){
+  # here we use the treeInfo function of the ranger package to create extract the trees, in an earlier version this was done with a self implemented function
+  tree.ranger = ranger::treeInfo(RF,tree = k)
+  ktree=data.frame(as.numeric(tree.ranger$nodeID+1),
+              as.numeric(tree.ranger$leftChild+1),
+              as.numeric(tree.ranger$rightChild+1),
+              as.numeric(tree.ranger$splitvarID+1),
+              tree.ranger$splitval,
+              tree.ranger$terminal)
+  if (is.factor(ktree[,5])) {
+  ktree[,5] = as.character(levels(ktree[,5] ))[ktree[,5]]
+  }
+  ktree[,6] = as.numeric(ktree[,6] == FALSE)
+
+ for (i in 2:4) {
+   ktree[,i][is.na(ktree[,i])] = 0
+ }
+  colnames(ktree)=c("nodeID","leftdaughter","rightdaughter","splitvariable","splitpoint","status")
+  return(ktree)
+}
diff --git a/R/meanAdjAgree.R b/R/meanAdjAgree.R
new file mode 100644
index 0000000000000000000000000000000000000000..fd14fb403f94cf052a8592b1122683913a34dd09
--- /dev/null
+++ b/R/meanAdjAgree.R
@@ -0,0 +1,151 @@
+#'Calculate mean adjusted agreement to investigate variables relations
+#'
+#'This is the main function of var.relations function.
+#'
+#' @param trees list of trees created by getTreeranger, addLayer and addSurrogate.
+#' @param variables vector of variable names.
+#' @param allvariables vector of all variable names (strings)
+#' @param candidates vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables)
+#' @param t variable to calculate threshold. Default is 3.
+#' @param s.a average number of surrogate variables (ideally calculated by count.surrogates function).
+#' @param select.var set False if only relations should be calculated and no related variables should be selected.
+#' @param num.threads number of threads used for parallel execution. Default is number of CPUs available.
+#' @return a list containing:
+#' \itemize{
+#' \item variables: the variables to which relations are investigated
+#' \item surr.res: matrix with mean adjusted agreement values and variables investigated in rows and candidate variables in columns
+#' \item threshold: the threshold used to create surr.var from surr.res
+#' \item surr.var: binary matrix showing if the variables are related (1) or non-related (0) with variables in rows and candidates in columns.
+#' }
+#' @export
+
+
+meanAdjAgree=function(trees,variables,allvariables,candidates,t,s.a,select.var,num.threads = NULL){
+  num.trees=length(trees)
+  index.variables=match(variables,allvariables)
+  index.candidates = match(candidates,allvariables)
+  if (is.null(num.threads)) {
+    num.threads = parallel::detectCores()
+  }
+  list.res = rlist::list.flatten(parallel::mclapply(trees,
+                                                    surr.tree,
+                                                    mc.cores = num.threads,
+                                                    variables,
+                                                    index.variables,
+                                                    allvariables,
+                                                    index.candidates))
+
+  results.allvar = matrix(unlist(lapply(1:length(index.variables),
+                                        mean.index,
+                                        list.res,
+                                        index.variables)),
+                          ncol=length(candidates),nrow=length(variables),byrow = TRUE)
+  colnames(results.allvar)=candidates
+  rownames(results.allvar)=variables
+
+  if(select.var) {
+    # calculate threshold and select variables according to it
+    adj.mean=mean(unlist(lapply((1:num.trees),adj.mean.trees,trees)),na.rm = TRUE)
+    threshold=((s.a*adj.mean)/(length(allvariables)-1))*t
+    SurrVar=ifelse(results.allvar>threshold, 1, 0)
+    result=list(surr.res=results.allvar,threshold=threshold,surr.var=SurrVar,variables=variables)
+  } else {
+    result=list(surr.res=results.allvar,variables=variables)
+  }
+  return(result)
+}
+
+#' mean.index
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+mean.index=function(i, list.res,index.variables){
+  list = list.res[which(names(list.res) == index.variables[i])]
+  mean.list = round(Reduce("+",list)/length(list),2)
+  if (length(mean.list) > 0) {
+  mean.list[index.variables[i]] = NA
+  return(mean.list)
+  } else {
+  return(rep(NA,length(index.variables)))
+  }
+}
+
+#' surr.tree
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+surr.tree=function(tree,variables,index.variables,allvariables,index.candidates){
+  allvar.num = length(allvariables)
+  nonterminal.nodes = tree[which(sapply(tree,"[[","status")==1)]
+  relevant.nodes = nonterminal.nodes[sapply(nonterminal.nodes,"[[","splitvariable") %in% index.variables]
+  if (length(relevant.nodes) > 0) {
+    list.nodes = lapply(1:length(relevant.nodes),adj.node,allvar.num,relevant.nodes,index.candidates)
+    splitvar = sapply(relevant.nodes,"[[","splitvariable")
+    names(list.nodes) = splitvar
+    return(list.nodes)
+  }
+}
+
+
+#' adj.node
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+adj.node = function(i,allvar.num,relevant.nodes,index.candidates) {
+  node = relevant.nodes[i]
+  adjnode = rep(0,allvar.num)
+  surr=unlist(sapply(node,"[",-c(1:7))) # extract surrogates
+  if ((length(node[[1]]))>7){
+    s=(length(surr))/2
+    adjnode[surr[1:s]]=surr[(s+1):(2*s)]
+  }
+  return(adjnode[index.candidates])
+}
+
+
+
+#' adj.mean
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+adj.mean=function(trees){
+  adj.trees=sapply(1:length(trees),adj.mean.trees,trees)
+
+}
+
+#' adj.mean.trees
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+adj.mean.trees=function(t,trees){
+  tree=trees[[t]]
+  nonterminal.nodes=tree[which(sapply(tree,"[[","status")==1)]
+  surr.nonterminal=lapply(nonterminal.nodes,"[",-c(1:7))
+  adj.tree=mean(unlist(lapply(1:length(surr.nonterminal),mean.adj.node,surr.nonterminal)),na.rm = TRUE)
+  if (adj.tree == "NaN") {
+    adj.tree = NA
+  }
+  return(adj.tree)
+}
+
+#' adj.node
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+mean.adj.node=function(m,surr.nonterminal){
+  surr=surr.nonterminal[[m]]
+  if (length(surr)!=0){
+    num.surr=length(surr)/2
+    adj=surr[(num.surr+1):(2*num.surr)]
+  }
+  if (length(surr)==0){
+    adj=NA
+  }
+  return(adj)
+}
diff --git a/R/mindep.R b/R/mindep.R
new file mode 100644
index 0000000000000000000000000000000000000000..45eed49cb092e6a4b6ad23f0875434c5e31b71f5
--- /dev/null
+++ b/R/mindep.R
@@ -0,0 +1,90 @@
+#'Execute minimal depth variable importance
+#'
+#'This function determines the minimal depth of variables from a forest that is created by getTreeranger, and addLayer functions.
+#'
+#' @param variables vector of variable names
+#' @param trees list of trees that was generated by getTreeranger and layers functions
+#' @return List with the following components:
+#' \itemize{
+#' \item depth: mean minimal depth for each variable
+#' \item selected: variables has been selected (1) or not (0),
+#' \item threshold: the threshold that is used for the selection
+#' }
+#' @export
+
+mindep=function(variables,trees){
+
+  num.trees= length(trees)
+# This function determines the minimal depth of variables from a tree with layers that is created by getTree and addLayer functions.
+# It is based on this paper: Ishwaran et. al. Journal of the American Statistical Accociation 2010 and used the conservative marginal
+# approximation to the minimal depth distribution
+
+var.num=length(variables)
+
+#prepare matrix for mindepth
+mindepth=matrix(NA,nrow=num.trees,ncol=var.num)
+colnames(mindepth)=variables
+
+MAX.DEPTH=10000
+#maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step
+maxdepth=rep(NA,num.trees)
+nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = num.trees)
+# get mindepth for every variable in every tree
+for (i in 1:num.trees){
+nodesAtDepth <- rep(NA, MAX.DEPTH)
+tree=trees[[i]]
+tree=tree[order(as.numeric(tree[,"layer"])),]
+# get layer information of the variables and save it in minimal depth file
+depth.tree=rep(NA,length(variables))
+  o=1
+  while (anyNA(depth.tree) && o<=nrow(tree)){
+    if (tree[o,"status"]==1){
+    if (is.na(depth.tree[tree[o,"splitvariable"]])) {
+      depth.tree[tree[o,"splitvariable"]]=tree[o,"layer"]
+      }
+  }
+  o=o+1
+  }
+  #variables with no split in the tree get maxdepth+1 as minimal depth
+  depth.tree[which(is.na(depth.tree))]=tree[nrow(tree),"layer"]
+  #save min and max depth information for every tree
+  mindepth[i,]=depth.tree
+  maxdepth[i]=tree[nrow(tree),"layer"]
+  #find the number of nodes in every layer
+
+  for (u in 1:(maxdepth[i])){
+
+    nodesAtDepth[u]=nrow(subset(tree, tree[,"layer"] == u & tree[,"status"] == 1 ))
+  }
+  nodesAtDepthMatrix[,i]=nodesAtDepth
+}
+
+# create mean values for the minimal depth of different variables
+mean.depth=colMeans(mindepth)
+
+# determine the mean depth of an uninformative variable called D star in Ishwaran et. al. Journal of the American Statistical
+# Accociation 2010 (treshhold for important variables)
+
+#the following determination of the treshold is taken from max.subtree of the randomForestSRC package
+treeHeight =maxdepth
+avgTreeHeight = mean(treeHeight, na.rm=TRUE)
+maxTreeHeight = max(treeHeight, na.rm=TRUE)
+nodes.at.depth.avg = apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE)
+l = nodes.at.depth.avg[1:avgTreeHeight]
+minDepthStatObj = randomForestSRC:::minDepthStat(var.num, l = l)
+threshold <- minDepthStatObj$first.moment
+
+
+# Decide if variables are important or unimportant
+Importances=rep(NA,var.num)
+for (p in 1:var.num){
+if (mean.depth[p]<threshold) {
+ Importances[p]=1}
+  else {
+    Importances[p]=0}
+}
+names(Importances)=variables
+results=list(depth=mean.depth,selected=Importances,threshold=threshold)
+return(results)
+}
+
diff --git a/R/reduce.surrogates.R b/R/reduce.surrogates.R
new file mode 100644
index 0000000000000000000000000000000000000000..a5745eb13fcbe393faba1c3b9775bba3f859cabe
--- /dev/null
+++ b/R/reduce.surrogates.R
@@ -0,0 +1,62 @@
+#' Reduce surrogate variables in a random forest.
+#'
+#'This function can be applied to reduce the surrogate variables in a forest that is created by getTreeranger, addLayer
+#'and getSurrogates functions. Hence, it can be applied to the forests that were used for surrogate minimal depth variable importance.
+#'
+#' @param forest a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions.
+#' @param s number of surrogate variables in the new forest (have to be less than in the RF in trees)
+#'
+#' @return
+#' forest with s surrogate variables.
+#'
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#' \donttest{
+#' ###### use result of SMD variable importance and reduce surrogate variables to 10
+#' # select variables with smd variable importance (usually more trees are needed)
+#' set.seed(42)
+#' res = var.select.smd(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], s = 100, num.trees = 10)
+#' forest.new = reduce.surrogates(forest = res$forest, s = 10)
+#'
+#' # execute SMD on tree with reduced number of surrogates
+#' res.new = var.select.smd(create.forest = FALSE, forest = forest.new)
+#' res.new$var
+#'
+#' #' # investigate variable relations
+#' rel = var.relations(forest = forest.new, variables=c("X1","X7"), candidates = res$forest[["allvariables"]][1:100], t = 5)
+#' rel$var
+#'}
+#' @export
+
+
+reduce.surrogates = function(forest, s = 10){
+
+  trees = forest[["trees"]]
+  num.trees = length(trees)
+  trees.new = lapply(1:num.trees, less.surrogates.trees, trees,s)
+  forest.new = list(trees = trees.new, allvariables = forest[["allvariables"]])
+return(forest.new)
+}
+
+less.surrogates.trees = function(i = 1, trees,s){
+  tree = trees[[i]]
+  n.nodes = length(tree)
+  surr.now = sapply((lapply(tree,"[",-c(1:7))),length)/2
+  surr.next = surr.now
+  surr.next[which(surr.now >= s)] = s
+  tree.new = lapply(1:n.nodes, less.surrogates.node, tree, surr.next,surr.now)
+  return(tree.new)
+}
+
+
+less.surrogates.node = function(j=1, tree,surr.next, surr.now){
+  node = tree[[j]]
+  if (length(node) == 7) {
+    node.new = node
+  }
+  if (length(node) > 7) {
+node.new = node[c(1:(7 + surr.next[j]), (8 + surr.now[j]):(7 + surr.now[j] + surr.next[j]))]
+}
+return(node.new)
+}
diff --git a/R/surrmindep.R b/R/surrmindep.R
new file mode 100644
index 0000000000000000000000000000000000000000..3dd2ac76b56a064b0f4340f39bd25d5f51a943a5
--- /dev/null
+++ b/R/surrmindep.R
@@ -0,0 +1,128 @@
+#'Execute surrogate minimal depth variable importance
+#'
+#'This function determines the surrogate minimal depth of variables from a forest that is created by getTreeranger, addLayer and getSurrogates functions.
+#'
+#' @param forest a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions.
+#' @param s.l Number of average surrogate variables in the respective layers. (use count.surrogate function to get it)
+#' @return List with the following components:
+#' \itemize{
+#' \item depth: mean surrogate minimal depth for each variable
+#' \item selected: variables has been selected (1) or not (0),
+#' \item threshold: the threshold that is used for the selection
+#' }
+#' @export
+
+surrmindep = function(forest, s.l){
+
+variables = forest[["allvariables"]]
+trees = forest[["trees"]]
+
+num.trees = length(trees)
+var.num = length(variables)
+
+#prepare matrix for mindepth
+mindepth = matrix(NA, nrow = num.trees, ncol = var.num)
+colnames(mindepth) = variables
+
+MAX.DEPTH = 10000
+#maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step
+maxdepth = rep(NA,num.trees)
+nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = num.trees)
+# get mindepth for every variable in every tree
+for (i in 1:num.trees) {
+nodesAtDepth <- rep(NA, MAX.DEPTH)
+tree = trees[[i]]
+# get layer information of the variables and save it in minimal depth file
+depth.tree = rep(NA,length(variables))
+  o = 1
+  while (anyNA(depth.tree) && o <= length(tree)) {
+    node = unlist(tree[o])
+    if (node["status"] == 1) {
+    if (is.na(depth.tree[node["splitvariable"]])) {
+      depth.tree[as.numeric(node["splitvariable"])] = as.numeric(node["layer"])
+    }
+      if (length(node) > 7) {
+      for (r in 8:(7 + (length(node) - 7)/2)) {
+        if (is.na(depth.tree[as.numeric(node[r])])) {
+      depth.tree[as.numeric(node[r])] = as.numeric(node["layer"])
+      }
+      }
+      }
+    }
+  o = o + 1
+  }
+  #variables with no split in the tree get maxdepth as minimal depth
+  depth.tree[which(is.na(depth.tree))] = as.numeric(unlist(tree[length(tree)])["layer"])
+  #save min and max depth information for every tree
+  mindepth[i,] = depth.tree
+  maxdepth[i] = as.numeric(unlist(tree[length(tree)])["layer"])
+  #find the number of nodes in every layer
+
+  laystat = cbind(sapply(tree,"[[","layer"),sapply(tree,"[[","status"))
+  colnames(laystat) = c("layer","status")
+  for (u in 1:(maxdepth[i])) {
+
+    nodesAtDepth[u] = nrow(subset(laystat, laystat[,"layer"] == u & laystat[,"status"] == 1 ))
+  }
+  nodesAtDepthMatrix[,i] = nodesAtDepth
+}
+
+# create mean values for the minimal depth of different variables
+mean.depth = colMeans(mindepth)
+
+# determine the mean depth of an uninformative variable similarly as in Ishwaran et. al. Journal of the American Statistical
+# Accociation 2010
+
+treeHeight = maxdepth
+avgTreeHeight = mean(treeHeight, na.rm = TRUE)
+maxTreeHeight = max(treeHeight, na.rm = TRUE)
+
+# threshold has to be calculated different when trees are small
+if ((avgTreeHeight) < 2) {
+  s.l.root = s.l[1]
+  p.root = (s.l.root + 1)/var.num
+  p.1 = 1 - p.root
+  threshold = p.root*0 + p.1*1
+  warning("Trees are very small! Threshold is defined based on trees with only root nodes.")
+}
+
+
+
+if ((avgTreeHeight) >= 2) {
+nodes.at.depth.avg = apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE)
+s.l.noroot = s.l[-1]
+s.l.root = s.l[1]
+p.root = (s.l.root + 1)/var.num
+var.at.depth = nodes.at.depth.avg[1:(avgTreeHeight - 1)]*(s.l.noroot[1:(avgTreeHeight - 1)] + 1)  # 1 time for the original tree and s-times for
+                                                                                                              # the surrogates
+                                                                                                              #-1 since the last layer will be added later
+p.depth = var.at.depth/var.num
+p.all = c(p.root,p.depth)
+prob.sum = 0
+probs.used = NULL
+for (u in 1:length(p.all)) {
+  p.u = p.all[u]
+  prob.sum = prob.sum + p.u
+  if (prob.sum >= 1) break
+  probs.used[u] = p.u
+}
+prob.last = 1 - sum(probs.used)   # the last layer (that was excluded before) is now used as minimal depth when the variable doesnt appear before
+probs.used = c(probs.used,prob.last)
+layers = c(0:(length(probs.used) - 1))
+threshold = sum(layers*probs.used)
+}
+
+
+# Decide if variables are important or unimportant
+Importances = rep(NA,var.num)
+for (p in 1:var.num) {
+if (mean.depth[p] < threshold) {
+ Importances[p] = 1}
+  else {
+    Importances[p] = 0}
+}
+names(Importances) = variables
+results = list(depth = mean.depth,selected = Importances,threshold = threshold)
+return(results)
+}
+
diff --git a/R/var.relations.R b/R/var.relations.R
new file mode 100644
index 0000000000000000000000000000000000000000..75fee1659742fceb732ca10e615bf80b821544ea
--- /dev/null
+++ b/R/var.relations.R
@@ -0,0 +1,166 @@
+#' Investigate variable relations of a specific variable with mean adjusted agreement
+#'
+#'This function uses the mean adjusted agreement to select variables that are related to a defined variable using a threshold T.
+#'The parameter t is used to calculate T: t=1 means that every variable with higher probability than "by chance" is identified
+#'as "important". t=2 means the probability has to be twice, etc.
+#'Based on the threshold a vector is created containing the related variables.
+#'
+#' @param variables variable names (string) for which related variables should be searched for (has to be contained in allvariables)
+#' @param candidates vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables)
+#' @param t variable to calculate threshold. Default is 5.
+#' @param select.rel set False if only relations should be calculated and no related variables should be selected.
+#' @param num.threads number of threads used for determination of relations. Default is number of CPUs available.
+#' @inheritParams var.select.smd
+#'
+#' @return a list containing:
+#' \itemize{
+#' \item variables: the variables to which relations are investigated.
+#' \item surr.res: a matrix with mean adjusted agreement values with variables in rows and candidates in columns.
+#' \item threshold: the threshold used to select related variables.
+#' \item var: a list with one vector for each variable containing related variables.
+#' \item ranger: ranger object.
+#' }
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#' x = SMD_example_data[,2:ncol(SMD_example_data)]
+#' y = SMD_example_data[,1]
+#' \donttest{
+#' # calculate variable relations
+#' set.seed(42)
+#' res = var.relations(x = x, y = y, s = 10, num.trees = 100, variables = c("X1","X7"), candidates = colnames(x)[1:100], t = 5)
+#' res$var
+#' }
+#'
+#' @export
+
+var.relations = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1,
+                         num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL,
+                         save.memory = FALSE, case.weights = NULL,
+                         variables, candidates, t = 5, select.rel = TRUE) {
+  if(!is.data.frame(x)){
+    stop("x has to be a data frame")
+  }
+  if (create.forest) {
+    ## check data
+    if (length(y) != nrow(x)) {
+      stop("length of y and number of rows in x are different")
+    }
+
+    if (any(is.na(x))) {
+      stop("missing values are not allowed")
+    }
+
+    allvariables = colnames(x)# extract variables names
+    nvar = length(allvariables)   # count number of variables
+    ## set global parameters
+    if (is.null(mtry)) {
+      mtry = floor((nvar)^(3/4))
+    }
+    if (mtry == "sqrt") {
+      mtry = floor(sqrt(nvar))
+    }
+    if (mtry == "0.5") {
+      mtry = floor(0.5*(nvar))
+    }
+    if (mtry == "^3/4") {
+      mtry = floor((nvar)^(3/4))
+    }
+
+
+    if (is.null(s)) {
+      s = ceiling(nvar*0.01)
+    }
+
+    if (s > (nvar - 2)) {
+      s = nvar - 1
+      warning("s was set to the maximum number that is reasonable (variables-1) ")
+    }
+
+    if (type == "classification") {
+      y = as.factor(y)
+      if (length(levels(y)) > 15) {
+        stop("Too much classes defined, classification might be the wrong choice")
+      }
+    }
+    if (type == "regression" && class(y) == "factor") {
+      stop("use factor variable for y only for classification! ")
+    }
+
+    data = data.frame(y, x)
+
+    if (type == "survival") {
+      if (is.null(status)) {
+        stop("a status variable named status has to be given for survival analysis")
+      }
+      data$status = status
+      RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory,
+                          case.weights = case.weights)
+    }
+    if (type == "classification" | type == "regression") {
+      RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights)
+
+    }
+
+    trees = getTreeranger(RF = RF,num.trees = num.trees)
+    trees.lay = addLayer(trees)
+    rm(trees)
+    ###AddSurrogates###
+    trees.surr = addSurrogates(RF = RF,trees = trees.lay,s = s,Xdata = data[,-1], num.threads = num.threads)
+    rm(trees.lay)
+    forest = list(trees = trees.surr, allvariables = colnames(data[,-1]))
+
+  }
+
+  if (!create.forest) {
+    if (is.null(forest)) {
+      stop("set create.forest to TRUE or analyze an existing random forest specified by parameter forest")
+    }
+  }
+  trees = forest[["trees"]]
+  allvariables = forest[["allvariables"]]
+
+  if (all(candidates %in% allvariables)) {
+  if (all(variables %in% allvariables)) {
+  # count surrogates
+  s = count.surrogates(trees)
+  results.meanAdjAgree = meanAdjAgree(trees, variables, allvariables, candidates, t = t, s$s.a, select.var = select.rel,num.threads = num.threads)
+  } else {
+    stop("allvariables do not contain the chosen variables")
+  }
+  } else {
+    stop("allvariables do not contain the candidate variables")
+  }
+  if(select.rel) {
+  surr.var = results.meanAdjAgree$surr.var
+  varlist = list()
+  for (i in 1:nrow(surr.var)) {
+    surr.var.var = surr.var[i,]
+    if (anyNA(surr.var.var)) {
+    surr.var.var = surr.var.var[-which(is.na(surr.var.var))]
+    }
+    var = names(surr.var.var[surr.var.var == 1])
+    name = variables[i]
+    varlist[[name]] = var
+  }
+
+  var = names(surr.var[surr.var == 1])
+  if (save.ranger) {
+ return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res,
+              threshold = results.meanAdjAgree$threshold, var = varlist, ranger = RF))
+  } else {
+    return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res,
+                threshold = results.meanAdjAgree$threshold, var = varlist))
+  }
+
+  } else {
+    if (save.ranger) {
+    return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res,ranger = RF))
+
+  } else {
+    return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res))
+  }
+  }
+}
diff --git a/R/var.relations.mfi.R b/R/var.relations.mfi.R
new file mode 100644
index 0000000000000000000000000000000000000000..bab78e8b16da9089e6c9c20fc559e4de6f64bb3c
--- /dev/null
+++ b/R/var.relations.mfi.R
@@ -0,0 +1,298 @@
+#' Investigate variable relations of a specific variable with mutual forest impact (corrected mean adjusted agreement).
+#'
+#'This function corrects the mean adjusted agreement by a permutation approach and generates the relation parameter mutual forest impact. Subsequently p-values are determined and related variables are selected.
+#'
+#' @param variables variable names (string) for which related variables should be searched for (has to be contained in allvariables)
+#' @param candidates vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables)
+#' @param p.t p.value threshold for selection of related variables. Default is 0.01.
+#' @param select.rel set False if only relations should be calculated and no related variables should be selected.
+#' @param method Method  to  compute  p-values.   Use  "janitza"  for  the  method  by  Janitza  et  al. (2016) or "permutation" to utilize permuted relations.
+#' @param num.threads number of threads used for determination of relations. Default is number of CPUs available.
+#' @inheritParams var.select.smd
+#'
+#' @return a list containing:
+#' \itemize{
+#' \item variables: the variables to which relations are investigated.
+#' \item surr.res: a matrix with the mutual forest impact values with variables in rows and candidates in columns.
+#' \item surr.perm: a matrix with the mutual forest impact values of the permuted variables with variables in rows and candidates in columns.
+#' \item p.rel: a list with the obtained p-values for the relation analysis of each variable.
+#' \item var.rel: a list with vectors of related variables for each variable.
+#' \item ranger: ranger objects.
+#' \item method: Method  to  compute  p-values: "janitza" or "permutation".
+#' \item p.t: p.value threshold for selection of related variables
+#'
+#'
+#' }
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#' x = SMD_example_data[,2:ncol(SMD_example_data)]
+#' y = SMD_example_data[,1]
+#' \donttest{
+#' # calculate variable relations
+#' set.seed(42)
+#' res = var.relations.mfi(x = x, y = y, s = 10, num.trees = 100, variables = c("X1","X7"), candidates = colnames(x)[1:100])
+#' res$var.rel[[1]]
+#' }
+#'
+#' @export
+
+var.relations.mfi = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1,
+                         num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL,
+                         save.memory = FALSE, case.weights = NULL,
+                         variables, candidates, p.t = 0.01, select.rel = TRUE, method = "janitza") {
+  if(!is.data.frame(x)){
+    stop("x has to be a data frame")
+  }
+  if (create.forest) {
+    ## check data
+    if (length(y) != nrow(x)) {
+      stop("length of y and number of rows in x are different")
+    }
+
+    if (any(is.na(x))) {
+      stop("missing values are not allowed")
+    }
+
+    allvariables = colnames(x)# extract variables names
+    nvar = length(allvariables)   # count number of variables
+    ## set global parameters
+    if (is.null(mtry)) {
+      mtry = floor((nvar)^(3/4))
+    }
+    if (mtry == "sqrt") {
+      mtry = floor(sqrt(nvar))
+    }
+    if (mtry == "0.5") {
+      mtry = floor(0.5*(nvar))
+    }
+    if (mtry == "^3/4") {
+      mtry = floor((nvar)^(3/4))
+    }
+
+
+    if (is.null(s)) {
+      s = ceiling(nvar*0.01)
+    }
+
+    if (s > (nvar - 2)) {
+      s = nvar - 1
+      warning("s was set to the maximum number that is reasonable (variables-1) ")
+    }
+
+    if (type == "classification") {
+      y = as.factor(y)
+      if (length(levels(y)) > 15) {
+        stop("Too much classes defined, classification might be the wrong choice")
+      }
+    }
+    if (type == "regression" && class(y) == "factor") {
+      stop("use factor variable for y only for classification! ")
+    }
+
+    # create shadow variables to correct the relation
+    x_perm = data.frame(lapply(1:ncol(x),permute.variable,x=x))
+    colnames(x_perm) = paste(allvariables,"_perm", sep = "")
+
+    data = data.frame(y, x)
+    data_perm = data.frame(y, x_perm)
+
+    if (type == "survival") {
+      if (is.null(status)) {
+        stop("a status variable named status has to be given for survival analysis")
+      }
+      data$status = status
+      RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry, min.node.size = min.node.size,
+                          keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory,
+                          case.weights = case.weights, respect.unordered.factors = "partition")
+      data_perm$status = status
+      RF_perm = ranger::ranger(data = data_perm,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory,
+                          case.weights = case.weights, respect.unordered.factors = "partition")
+    }
+    if (type == "classification" | type == "regression") {
+      RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition")
+
+      RF_perm = ranger::ranger(data = data_perm,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition")
+
+    }
+    trees = getTreeranger(RF = RF,num.trees = num.trees)
+    trees.lay = addLayer(trees)
+    rm(trees)
+    ###AddSurrogates###
+    trees.surr = addSurrogates(RF = RF,trees = trees.lay,s = s,Xdata = data[,-1], num.threads = num.threads)
+    rm(trees.lay)
+    forest = list(trees = trees.surr, allvariables = colnames(data[,-1]))
+
+    # do the same for the permutation forrest
+    trees_perm = getTreeranger(RF = RF_perm,num.trees = num.trees)
+    trees.lay_perm = addLayer(trees_perm)
+    rm(trees_perm)
+    ###AddSurrogates###
+    trees.surr_perm = addSurrogates(RF = RF_perm,trees = trees.lay_perm,s = s,Xdata = data_perm[,-1], num.threads = num.threads)
+    rm(trees.lay_perm)
+    forest_perm = list(trees = trees.surr_perm, allvariables = colnames(data_perm[,-1]))
+  }
+
+  if (!create.forest) {
+    if (is.null(forest)) {
+      stop("set create.forest to TRUE or analyze an existing random forest specified by parameter forest")
+    }
+  }
+
+  if (all(candidates %in% allvariables)) {
+  if (all(variables %in% allvariables)) {
+  # count surrogates
+  s = count.surrogates(forest$trees)
+  rel = meanAdjAgree(forest$trees, variables = allvariables, allvariables = allvariables, candidates = allvariables,
+                     t = t, s$s.a, select.var = FALSE, num.threads = num.threads)
+
+  allvariables_perm = colnames(x_perm)
+
+  rel_perm = meanAdjAgree(forest_perm$trees, variables = allvariables_perm , allvariables = allvariables_perm , candidates = allvariables_perm,
+                          t = t, s$s.a, select.var = FALSE, num.threads = num.threads)
+
+  } else {
+    stop("allvariables do not contain the chosen variables")
+  }
+  } else {
+    stop("allvariables do not contain the candidate variables")
+  }
+
+  adj.agree = rel$surr.res
+  adj.agree.perm = rel_perm$surr.res
+  diag(adj.agree) = diag(adj.agree.perm) = 1
+
+  if(anyNA(adj.agree)) {
+    no.na = length(which(rowSums(is.na(adj.agree)) != 0 ))
+    warning(paste0("Relations for ", no.na, " original variables were not calculated because they were never used as a primary split.
+            Affected relations are set to 0. "))
+    adj.agree[which(is.na(adj.agree))] = 0
+  }
+
+  if(anyNA(adj.agree.perm)) {
+    no.na = length(which(rowSums(is.na(adj.agree.perm)) != 0 ))
+    warning(paste0("Relations for ", no.na, " permuted variables were not calculated because they were not used as a primary split.
+            Affected relations are set to 0. "))
+    adj.agree.perm[which(is.na(adj.agree.perm))] = 0
+  }
+  adj.agree.corr = adj.agree - adj.agree.perm[1:nvar,1:nvar]
+  diag(adj.agree.corr) =  diag(adj.agree) = diag(adj.agree.perm) =  NA
+
+  adj.agree.corr.var = adj.agree.corr[variables,candidates]
+
+  if (select.rel) {
+
+    if (method == "janitza") {
+
+      adj.agree.1 = adj.agree.corr
+      diag(adj.agree.1) = 1
+      ## Mirrored VIMP (# This part is taken from ranger function)
+      m1 = adj.agree.1[adj.agree.1 < 0]
+      m2 = adj.agree.1[adj.agree.1 == 0]
+      null.rel = c(m1, -m1, m2)
+
+      if (length(m1) == 0) {
+        stop("No negative importance values found for selection of related variables. Consider the 'permutation' approach.")
+      }
+      if (length(m1) < 100) {
+        warning("Only few negative importance values found for selection of related variables, inaccurate p-values. Consider the 'permutation' approach.")
+      }
+
+      rel.p = lapply(1:length(variables),p.relation,
+                     null.rel = null.rel,
+                     adj.agree.corr = adj.agree.corr.var,
+                     candidates = candidates,
+                     variables = variables)
+      sel.rel = lapply(1:length(variables),select.related,
+                       rel.p,
+                       p.t)
+
+      names(rel.p) = names(sel.rel) = variables
+
+    }
+
+    if (method == "permutation") {
+
+    null.rel.plus = as.vector(adj.agree.perm)
+    null.rel.plus = null.rel.plus[!is.na(null.rel.plus)]
+
+    m1 = null.rel.plus[null.rel.plus > 0]
+    m2 = null.rel.plus[null.rel.plus == 0]
+    null.rel = c(m1, -m1, m2)
+
+
+    if (length(null.rel) < 100) {
+      warning("Only few null relations used. P-values could be inaccurate.")
+    }
+
+    rel.p = lapply(1:length(variables),p.relation,
+                   null.rel = null.rel,
+                   adj.agree.corr = adj.agree.corr.var,
+                   candidates = candidates,
+                   variables = variables)
+
+    sel.rel = lapply(1:length(variables),select.related,
+                     rel.p,
+                     p.t)
+
+    names(rel.p) = names(sel.rel) = variables
+  }
+
+  if (save.ranger) {
+ return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm,
+              p.rel = rel.p, var.rel = sel.rel, ranger = list(RF = RF,RF_perm = RF_perm), method = method, p.t = p.t))
+  } else {
+    return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, p.rel = rel.p, var.rel = sel.rel, method = method, p.t = p.t))
+  }
+  } else {
+    if (save.ranger) {
+    return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, ranger = list(RF = RF,RF_perm = RF_perm)))
+  } else {
+    return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm))
+  }
+  }
+}
+
+
+
+#' permute.variable
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+permute.variable=function(i=1,x){
+  var.perm = sample(x[,i],nrow(x))
+  return(var.perm)
+}
+
+#' p.relation
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+p.relation = function(l = 1,
+                      null.rel,
+                      adj.agree.corr,
+                      candidates,
+                      variables) {
+  relations = adj.agree.corr[l,]
+  pval <- 1 - ranger:::numSmaller(relations, null.rel) / length(null.rel)
+  names(pval) = candidates
+  pval[variables[l]] = NA
+  return(pval)
+}
+
+#' select.related
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+select.related = function(m=1,
+                          rel.p,
+                          p.t) {
+  rel.var = rel.p[[m]]
+  names(which(rel.var <= p.t))
+}
diff --git a/R/variable_selection_md.R b/R/variable_selection_md.R
new file mode 100644
index 0000000000000000000000000000000000000000..42ec1fdd765cdf3b8c12f2d02f66f644ab76eb76
--- /dev/null
+++ b/R/variable_selection_md.R
@@ -0,0 +1,73 @@
+#' Variable selection with Minimal Depth (MD)
+#'
+#' This function executes MD applying \link[ranger]{ranger} for random forests generation and is a reimplementation of \link[randomForestSRC]{var.select} from randomForestSRC package.
+#'
+#' @param x data.frame of predictor variables with variables in
+#'   columns and samples in rows. (Note: missing values are not allowed)
+#' @param y vector with values of phenotype variable (Note: will be converted to factor if
+#'   classification mode is used). For survival forests this is the time variable.
+#' @param num.trees Number of trees. Default is 500.
+#' @param mtry Number of variables to possibly split at in each node. Default is no. of variables^(3/4) as recommended by Ishwaran.
+#' @param type Mode of prediction ("regression","classification" or "survival"). Default is regression.
+#' @param min.node.size Minimal node size. Default is 1.
+#' @param num.threads number of threads used for parallel execution. Default is number of CPUs available.
+#' @param status status variable, only applicable to survival data. Use 1 for event and 0 for censoring.
+#' @param save.ranger Set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).
+#' @param create.forest set FALSE if you want to analyze an existing forest. Default is TRUE.
+#' @param forest the random forest that should be analyzed if create.forest is set to FALSE. (x and y still have to be given to obtain variable names)
+#' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)
+#' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.
+#'
+#' @return List with the following components:
+#' \itemize{
+#' \item info: list with results from mindep function:
+#' \itemize{
+#' \item depth: mean minimal depth for each variable.
+#' \item selected: variables has been selected (1) or not (0).
+#' \item threshold: the threshold that is used for the selection. (deviates slightly from the original implimentation)
+#' }
+#' \item var: vector of selected variables.
+#'
+#' \item forest: a list containing:
+#' #'\itemize{
+#' \item trees: list of trees that was created by getTreeranger, addLayer, and addSurrogates functions and that was used for surrogate minimal depth variable importance.
+#' \item allvariables: all variable names of the predictor variables that are present in x.
+#' }
+#'
+#' \item ranger: ranger object
+#'
+#'}
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#'
+#' \donttest{
+#' # select variables (usually more trees are needed)
+#' set.seed(42)
+#' res = var.select.md(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], num.trees = 10)
+#' res$var
+#' }
+#'
+#'@references
+##' \itemize{
+##'   \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103}
+##'   \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf}
+##'   }
+#'
+#' @export
+
+var.select.md = function(x = NULL, y = NULL, num.trees = 500, type = "regression", mtry = NULL, min.node.size = 1, num.threads = NULL,
+                         status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, save.memory = FALSE, case.weights = NULL) {
+
+  results.smd = var.select.smd(x = x, y = y ,num.trees = num.trees,type = type, mtry = mtry,min.node.size = min.node.size, num.threads = num.threads
+                               ,status = status, save.ranger = save.ranger, s = 0, create.forest = create.forest, forest = forest,
+                               save.memory = save.memory, case.weights = case.weights)
+  if (save.ranger) {
+    results = list(info = results.smd$info, var = results.smd$var, forest = results.smd$forest, ranger = results.smd$ranger)
+  }
+  else {
+    results = list(info = results.smd$info, var = results.smd$var, forest = results.smd$forest)
+  }
+  return(results)
+
+}
diff --git a/R/variable_selection_mir.R b/R/variable_selection_mir.R
new file mode 100644
index 0000000000000000000000000000000000000000..93d65aec60124210c95b483c8f52113b600478f7
--- /dev/null
+++ b/R/variable_selection_mir.R
@@ -0,0 +1,291 @@
+#' Variable selection with mutual impurity reduction (MIR)
+#'
+#' This function executes MIR applying \link[ranger]{ranger} for random forests generation and actual impurity reduction and a modified version of \link[rpart]{rpart} to find surrogate variables.
+#'
+#' @param x data.frame of predictor variables with variables in
+#'   columns and samples in rows (Note: missing values are not allowed)
+#' @param y vector with values of phenotype variable (Note: will be converted to factor if
+#'   classification mode is used). For survival forests this is the time variable.
+#' @param num.trees number of trees. Default is 500.
+#' @param mtry number of variables to possibly split at in each node. Default is no. of variables^(3/4) ("^3/4") as recommended by (Ishwaran 2011). Also possible is "sqrt" and "0.5" to use the square root or half of the no. of variables.
+#' @param type mode of prediction ("regression", "classification" or "survival"). Default is regression.
+#' @param min.node.size minimal node size. Default is 1.
+#' @param num.threads number of threads used for parallel execution. Default is number of CPUs available.
+#' @param s predefined number of surrogate splits (it may happen that the actual number of surrogate splits differs in individual nodes). Default is 1 \% of no. of variables.
+#' @param p.t.sel p.value threshold for selection of important variables. Default is 0.01.
+#' @param p.t.rel p.value threshold for selection of related variables. Default is 0.01.
+#' @param num.permutations number of permutations to determine p-values. Default is 100. (the relations are determined once based on the permuted X data and the utilized AIR values are permuted again for each permutation )
+#' @param status status variable, only applicable to survival data. Use 1 for event and 0 for censoring.
+#' @param save.ranger set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).
+#' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)
+#' @param select.var set False if only importance should be calculated and no variables should be selected.
+#' @param select.rel set False if only relations should be calculated and no variables should be selected.
+#' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.
+#' @param method.rel Method  to  compute  p-values for selection of related variables with var.relations.corr. Use  "janitza"  for  the  method  by  Janitza  et  al. (2016) or "permutation" to utilize permuted variables.
+#' @param method.sel Method  to  compute  p-values for selection of important variables. Use  "janitza"  for  the  method  by  Janitza  et  al. (2016) (can only be used when corrected variable relations are utilized) or "permutation" to utilize permuted variables.
+#' @param corr.rel set FALSE if non-corrected variable relations should be used for calculation of MIR. In this case the method "janitza" should not be used for selection of important variables
+#' @param t variable to calculate threshold for non-corrected relation analysis. Default is 5.
+#' @param save.rel set FALSE if relation information should not bet saved (default is TRUE)
+#'
+#'
+#' @return list with the following components:
+#' \itemize{
+#' \item info: list with results containing:
+#' \itemize{
+#' \item MIR: the calculated variable importance for each variable based on mutual impurity reduction.
+#' \item pvalue: the obtained p-values for each variable.
+#' \item selected: variables has been selected (1) or not (0).
+#' \item relations: a list containing the results of variable relation analysis.
+#' \item parameters: a list that contains the parameters s, type, mtry, p.t.sel, p.t.rel and method.sel that were used.
+#' }
+#' \item var: vector of selected variables.
+#'
+#'\item ranger: ranger object.
+#'
+#' }
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#'
+#' \donttest{
+#' # select variables (usually more trees are needed)
+#' set.seed(42)
+#' res = var.select.mir(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1],s = 10, num.trees = 10)
+#' res$var
+#' }
+#'@references
+##' \itemize{
+##'   \item Nembrini, S. et al. (2018) The revival of the Gini importance? Bioinformatics, 34, 3711–3718. \url{https://academic.oup.com/bioinformatics/article/34/21/3711/4994791}
+##'   \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013}
+##'   }
+#' @export
+
+var.select.mir = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1,
+                          num.threads = NULL, status = NULL, save.ranger = FALSE,
+                          save.memory = FALSE, num.permutations = 100, p.t.sel = 0.01, p.t.rel = 0.01, select.var = TRUE, select.rel = FALSE,
+                          case.weights = NULL, corr.rel = TRUE, t = 5, method.rel = "permutation", method.sel = "janitza", save.rel = TRUE) {
+  if(!is.data.frame(x)){
+    stop("x has to be a data frame")
+  }
+    ## check data
+    if (length(y) != nrow(x)) {
+      stop("length of y and number of rows in x are different")
+    }
+
+    if (any(is.na(x))) {
+      stop("missing values are not allowed")
+    }
+
+    allvariables = colnames(x)# extract variables names
+    nvar = length(allvariables)   # count number of variables
+    ## set global parameters
+    if (is.null(mtry)) {
+      mtry = floor((nvar)^(3/4))
+    }
+    if (mtry == "sqrt") {
+      mtry = floor(sqrt(nvar))
+    }
+    if (mtry == "0.5") {
+      mtry = floor(0.5*(nvar))
+    }
+    if (mtry == "^3/4") {
+      mtry = floor((nvar)^(3/4))
+    }
+
+
+    if (is.null(s)) {
+      s = ceiling(nvar*0.01)
+    }
+
+    if (s > (nvar - 2)) {
+      s = nvar - 1
+      warning("s was set to the maximum number that is reasonable (variables-1) ")
+    }
+
+    if (type == "classification") {
+      y = as.factor(y)
+      if (length(levels(y)) > 15) {
+        stop("Too much classes defined, classification might be the wrong choice")
+      }
+    }
+    if (type == "regression" && class(y) == "factor") {
+      stop("use factor variable for y only for classification! ")
+    }
+
+    data = data.frame(y, x)
+
+    if (type == "survival") {
+      if (is.null(status)) {
+        stop("a status variable named status has to be given for survival analysis")
+      }
+      data$status = status
+      RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          num.threads = num.threads, status.variable.name = "status", save.memory = save.memory,
+                          importance ="impurity_corrected", case.weights = case.weights, respect.unordered.factors = "partition")
+      if (corr.rel) {
+        rel = var.relations.mfi(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                                 num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables,
+                                 candidates = allvariables, p.t = p.t.rel, method = method.rel,select.rel = select.rel)
+      } else {
+        rel = var.relations(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                            num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables,
+                            candidates = allvariables, t = t, select.rel = select.rel)
+      }
+    }
+    if (type == "classification" | type == "regression") {
+      RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                          num.threads = num.threads, importance ="impurity_corrected", case.weights = case.weights, respect.unordered.factors = "partition")
+
+      if (corr.rel) {
+        rel = var.relations.mfi(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                                 num.threads = num.threads, case.weights = case.weights, variables = allvariables,
+                                 candidates = allvariables, p.t = p.t.rel, method = method.rel,select.rel = select.rel)
+      } else {
+        rel = var.relations(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                            num.threads = num.threads, case.weights = case.weights, variables = allvariables,
+                            candidates = allvariables, t = t,select.rel = select.rel)
+      }
+
+    }
+
+
+
+adj.agree = rel$surr.res
+diag(adj.agree) = 1
+
+  mir = colSums(adj.agree * RF$variable.importance)
+
+  if (select.var) {
+    if (method.sel == "janitza") {
+      if (corr.rel) {
+      ## Mirrored VIMP (# This part is taken from ranger function)
+      m1 = mir[mir< 0]
+      m2 = mir[mir == 0]
+      null.rel = c(m1, -m1, m2)
+
+      pval <- 1 - ranger:::numSmaller(mir, null.rel) / length(null.rel)
+      names(pval) = allvariables
+      selected = as.numeric(pval <= p.t.sel)
+      names(selected) = names(pval)
+
+      if (length(m1) == 0) {
+        stop("No negative importance values found for selection of important variables. Consider the 'permutation' approach.")
+      }
+      if (length(m1) < 100) {
+        warning("Only few negative importance values found for selection of important variables, inaccurate p-values. Consider the 'permutation' approach.")
+      }
+      } else {
+        stop("Janitza approach should only be conducted with corrected relations")
+}
+    }
+
+    if (method.sel == "permutation") {
+
+      if (corr.rel){
+        adj.agree_perm = rel$surr.perm
+      } else {
+      x_perm = sapply(1:ncol(x),permute.variable,x=x)
+      colnames(x_perm) = paste(allvariables,"_perm", sep = "")
+      data_perm = data.frame(y, x_perm)
+      allvariables_perm = colnames(x_perm)
+
+      if (type == "survival") {
+        if (is.null(status)) {
+          stop("a status variables has to be given for survival analysis")
+        }
+
+        if (corr.rel) {
+          rel_perm = var.relations.mfi(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                                   num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables,
+                                   candidates = allvariables, p.t = p.t.rel, method = method.rel, select.rel = select.rel)
+        } else {
+          rel_perm = var.relations(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                              num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables,
+                              candidates = allvariables, t = t, select.rel = select.rel)
+        }
+
+      }
+      if (type == "classification" | type == "regression") {
+
+        if (corr.rel) {
+          rel_perm = var.relations.mfi(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                                   num.threads = num.threads, case.weights = case.weights, variables = allvariables_perm,
+                                   candidates = allvariables_perm, p.t = p.t.rel, method = method.rel,select.rel = select.rel)
+        } else {
+          rel_perm = var.relations(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size,
+                              num.threads = num.threads, case.weights = case.weights, variables = allvariables_perm,
+                              candidates = allvariables_perm, t = t,select.rel = select.rel)
+        }
+        }
+
+      adj.agree_perm = rel_perm$surr.res
+      }
+
+      diag(adj.agree_perm) = 0
+
+      null.rel = unlist(lapply(1:num.permutations,calculate.mir.perm,
+                               adj.agree_perm = adj.agree_perm,
+                               air = RF$variable.importance,
+                               allvariables = allvariables))
+
+
+
+      pval <- 1 - ranger:::numSmaller(mir, null.rel) / length(null.rel)
+      names(pval) = allvariables
+      selected = as.numeric(pval <= p.t.sel)
+      names(selected) = names(pval)
+
+      }
+if(save.rel) {
+  info = list(MIR = mir,
+              pvalue = pval,
+              selected = selected,
+              relations = rel,
+              AIR = RF$variable.importance,
+              parameters = list(s = s, type = type, mtry = mtry, p.t.sel = p.t.sel, p.t.rel = p.t.rel, method.sel = method.sel))
+} else {
+  info = list(MIR = mir,
+              pvalue = pval,
+              selected = selected,
+              AIR = RF$variable.importance,
+              parameters = list(s = s, type = type, mtry = mtry, p.t.sel = p.t.sel, p.t.rel = p.t.rel, method.sel = method.sel))
+}
+  } else {
+    if(save.rel) {
+    info = list(MIR = mir,
+                relations = rel,
+                AIR = RF$variable.importance,
+                parameters = list(s = s, type = type, mtry = mtry))
+    } else {
+      info = list(MIR = mir,
+                  AIR = RF$variable.importance,
+                  parameters = list(s = s, type = type, mtry = mtry))
+    }
+  }
+
+  if (save.ranger) {
+    results = list(info = info,
+                   var = names(info$selected[info$selected == 1]),
+                   ranger = RF)
+  } else {
+    if (select.var) {
+    results = list(info = info,
+                   var = names(info$selected[info$selected == 1]))
+  } else {
+    results = list(info = info)
+  }
+  }
+  return(results)
+}
+
+
+#'
+#' This is an internal function
+#'
+#' @keywords internal
+calculate.mir.perm = function(r=1, adj.agree_perm, air, allvariables) {
+mir.perm = colSums(adj.agree_perm * sample(air,length(air)))
+return(mir.perm)
+}
+
+
diff --git a/R/variable_selection_smd.R b/R/variable_selection_smd.R
new file mode 100644
index 0000000000000000000000000000000000000000..e8c3ad4a16ac6dc31141d4e085d9488b3fc058f0
--- /dev/null
+++ b/R/variable_selection_smd.R
@@ -0,0 +1,160 @@
+#' Variable selection with Surrogate Minimal Depth (SMD) (MAIN FUNCTION)
+#'
+#' This function executes SMD applying \link[ranger]{ranger} for random forests generation and a modified version of \link[rpart]{rpart} to find surrogate variables.
+#'
+#' @param x data.frame of predictor variables with variables in
+#'   columns and samples in rows (Note: missing values are not allowed)
+#' @param y vector with values of phenotype variable (Note: will be converted to factor if
+#'   classification mode is used). For survival forests this is the time variable.
+#' @param num.trees number of trees. Default is 500.
+#' @param mtry number of variables to possibly split at in each node. Default is no. of variables^(3/4) ("^3/4") as recommended by (Ishwaran 2011). Also possible is "sqrt" and "0.5" to use the square root or half of the no. of variables.
+#' @param type mode of prediction ("regression", "classification" or "survival"). Default is regression.
+#' @param min.node.size minimal node size. Default is 1.
+#' @param num.threads number of threads used for parallel execution. Default is number of CPUs available.
+#' @param s predefined number of surrogate splits (it may happen that the actual number of surrogate splits differs in individual nodes). Default is 1 \% of no. of variables.
+#' @param status status variable, only applicable to survival data. Use 1 for event and 0 for censoring.
+#' @param save.ranger set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).
+#' @param create.forest set FALSE if you want to analyze an existing forest. Default is TRUE.
+#' @param forest the random forest that should be analyzed if create.forest is set to FALSE. (x and y still have to be given to obtain variable names)
+#' @param save.memory Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)
+#' @param case.weights Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.
+#'
+#' @return list with the following components:
+#' \itemize{
+#' \item info: list with results from surrmindep function:
+#' \itemize{
+#' \item depth: mean surrogate minimal depth for each variable.
+#' \item selected: variables has been selected (1) or not (0).
+#' \item threshold: the threshold that is used for the selection.
+#' }
+#' \item var: vector of selected variables.
+#'
+#'\item s: list with the results of count.surrogate function:
+#'\itemize{
+#' \item s.a: total average number of surrogate variables.
+#' \item s.l: average number of surrogate variables in the respective layers.
+#'}
+#' \item forest: a list containing:
+#' #'\itemize{
+#' \item trees: list of trees that was created by getTreeranger, addLayer, and addSurrogates functions and that was used for surrogate minimal depth variable importance.
+#' \item allvariables: all variable names of the predictor variables that are present in x.
+#' }
+#'\item ranger: ranger object.
+#'
+#' }
+#' @examples
+#' # read data
+#' data("SMD_example_data")
+#'
+#' \donttest{
+#' # select variables (usually more trees are needed)
+#' set.seed(42)
+#' res = var.select.smd(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1],s = 10, num.trees = 10)
+#' res$var
+#' }
+#'@references
+##' \itemize{
+##'   \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013}
+##'   \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103}
+##'   \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf}
+##'   }
+#' @export
+
+var.select.smd = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1,
+                          num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL,
+                          save.memory = FALSE, case.weights = NULL) {
+  if(!is.data.frame(x)){
+    stop("x has to be a data frame")
+  }
+  if (create.forest) {
+  ## check data
+  if (length(y) != nrow(x)) {
+    stop("length of y and number of rows in x are different")
+  }
+
+  if (any(is.na(x))) {
+    stop("missing values are not allowed")
+  }
+
+  variables = colnames(x)    # extract variables names
+  nvar = length(variables)   # count number of variables
+
+  ## set global parameters
+  if (is.null(mtry)) {
+    mtry = floor((nvar)^(3/4))
+  }
+  if (mtry == "sqrt") {
+    mtry = floor(sqrt(nvar))
+  }
+  if (mtry == "0.5") {
+    mtry = floor(0.5*nvar)
+  }
+  if (mtry == "^3/4") {
+    mtry = floor((nvar)^(3/4))
+  }
+
+
+
+  if (is.null(s)) {
+    s = ceiling(nvar*0.01)
+  }
+
+  if (s > (nvar - 1)) {
+    s = nvar - 1
+    warning("s was set to the maximum number that is reasonable (variables-1) ")
+  }
+
+  if (type == "classification") {
+    y = as.factor(y)
+    if (length(levels(y)) > 15) {
+      stop("Too much classes defined, classification might be the wrong choice")
+    }
+  }
+  if (type == "regression" && class(y) == "factor") {
+    stop("use factor variable for y only for classification! ")
+  }
+  data = data.frame(y, x)
+  if (type == "survival") {
+    if (is.null(status)) {
+      stop("a status variable named status has to be given for survival analysis")
+    }
+    data$status = status
+    RF = ranger::ranger(data = data, dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                        keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory,
+                        case.weights = case.weights, respect.unordered.factors = "partition")
+  }
+  if (type == "classification" | type == "regression") {
+  RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size,
+                      keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition")
+  }
+  trees = getTreeranger(RF = RF,num.trees = num.trees)
+  trees.lay = addLayer(trees)
+  rm(trees)
+  ###AddSurrogates###
+  trees.surr = addSurrogates(RF = RF,trees = trees.lay,s = s,Xdata = x, num.threads = num.threads)
+  rm(trees.lay)
+  # count surrogates
+  s = count.surrogates(trees.surr)
+  surrminimaldepth.s = surrmindep(forest = list(trees = trees.surr, allvariables = variables), s.l = s$s.l)
+  }
+  if (!create.forest) {
+  if (is.null(forest)) {
+    stop("set create.forest to TRUE or analyze an existing random forest specified by parameter forest")
+  }
+    allvariables = forest[["allvariables"]]
+    trees = forest[["trees"]]
+    s = count.surrogates(trees)
+    surrminimaldepth.s = surrmindep(forest, s.l = s$s.l)
+    trees.surr = forest[["trees"]]
+    variables = forest[["variables"]]
+  }
+  if (save.ranger) {
+  results = list(info = surrminimaldepth.s,var = names(surrminimaldepth.s$selected[surrminimaldepth.s$selected == 1]),s = s,
+                 forest = list(trees = trees.surr, allvariables = variables), ranger = RF)
+  }
+  else {
+    results = list(info = surrminimaldepth.s,var = names(surrminimaldepth.s$selected[surrminimaldepth.s$selected == 1]),s = s,
+                   forest = list(trees = trees.surr, allvariables = variables))
+  }
+  return(results)
+}
diff --git a/README.md b/README.md
index 76acceacd893cf1409002fcb1df7fd81011b69de..557a10d2595354a46a8b438d4e0bacc2b3277e25 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,191 @@
-rfsur
+# RFSurrogates
+In this R package, several functions are provided for applying approaches based on random forest. Minimal depth (MD), Surrogate minimal depth (SMD) and mutual impurity reduction (MIR), which is a corrected approach of SMD, can be applied to assess the importance of variables and to select important variables. In addition, the parameters mean adjusted agreement and mutual forest impact (MFI), a corrected approach of the previous, can be applied to investigate variable relations based on surrogate variables. 
+
+Please cite the following manuscripts if you use the package:  
+
+[1] S. Seifert, S. Gundlach, S. Szymczak, Surrogate minimal depth as an importance measure for variables in random forests, Bioinformatics 2019, 35, 3663-3671.
+
+[2] publication about MFI/MIR under preparation
+
+# Install
+```
+library(devtools)
+install_github("StephanSeifert/RFSurrogates")
+```
+
+# Example data
+The package contains an example data set which consists of a single replicate of the simulation study 1 in our manuscript. Please refer to the paper and the documentation of the SMD_example_data for further details on the simulation scenario.
+
+# Usage
+First the package and the example data are loaded:
+```
+library(RFSurrogates
+data("SMD_example_data")
+dim(SMD_example_data)
+[1] 100 201
+
+head(SMD_example_data[, 1:5])
+           y          X1         X2         X3           X4
+1  1.8222421 -0.02768266 -1.1019154  2.2659401  0.008021516
+2 -1.0401813  0.73258486 -0.4107975  0.7587792 -0.718752746
+3  2.7139607 -0.05399936  1.1851261  0.9743160 -2.563176970
+4 -0.7081372 -0.84838121 -0.8975802  0.5247899  1.180683275
+5 -1.0264429 -0.42219003  0.5439467 -0.1626504  0.682333020
+6  3.1871209  0.91722598  0.1974106  0.9571554  0.351634641
+
+```
+The data set has 100 observations in the rows and the columns contain the continuous outcome variable y and 200 continuous predictor variables in the columns.
+
+## Minimal depth
+
+First, we perform variable selection based on minimal depth using 1000 trees in the random forest. To make the analysis reproducible we set the seed first.
+```
+set.seed(42)
+res.md = var.select.md(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], ntree=1000)
+res.md$var
+[1] "X2"     "X3"     "X4"     "X5"     "X6"     "cp1_8"  "cp2_6"  "cp2_7"  "cp3_4"  "cp3_6"  "cp8_10" "cgn_68" "cgn_72" "cgn_81"
+```
+
+The selected variables are stored in res.md$var. In this analysis the relevant basic variables X2 to X6, as well as the relevant variables cp2_6, cp2_7, cp3_4, and cp3_6, and the non-relevant variables cp8_10, cgn_68, cgn_72, and cgn_81 are selected.
+
+The MD values for each predictor variable and the threshold to select variables can be extracted as follows:
+```
+md = res.md$info$depth
+head(md)
+   X1    X2    X3    X4    X5    X6 
+9.823 7.848 6.164 6.662 6.442 6.390 
+
+res.md$info$threshold
+[1] 9.23097
+```
+We can see that variables X2, …, X6 have MD values smaller than the threshold in contrast to X1.
+
+## Surrogate Minimal depth (SMD)
+
+Now we would like to analyze the example data with surrogate minimal depth which works similarly. However, we need to specify an additional parameter s, i.e. the number of surrogate variables that should be considered. In this analysis we use s = 10. Based on our simulation studies we recommend to set this parameter to approximately 1% of the predictor variables in larger datasets. 
+
+Variable selection with var.select.smd is conducted:
+```
+set.seed(42)
+res.smd = var.select.smd(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], s = 10, ntree = 1000)
+res.smd$var
+ [1] "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "cp1_1"  "cp1_2"  "cp1_3"  "cp1_4"  "cp1_5"  "cp1_6"  "cp1_7"  "cp1_8"  "cp1_9" 
+[16] "cp1_10" "cp2_4"  "cp2_6" 
+```
+
+
+The selected variables are stored in res.smd$var. In this analysis the relevant basic variables X1 to X6, as well as the relevant variables cp1_1 to cp1_10, cp2_4, and cp2_6 are selected. Compared to MD more of the relevant variables and none of the non-relevant variables are selected.
+
+
+The SMD values for each predictor variable and the threshold to select variables can be extracted as follows:
+
+```
+smd = res.smd$info$depth
+head(smd)
+  X1    X2    X3    X4    X5    X6 
+2.344 2.287 2.095 2.576 2.509 2.276 
+
+res.smd$info$threshold
+[1] 2.690082
+```
+
+We can see that variables X1, …, X6 have SMD values smaller than the threshold.
+
+
+
+## Variable relations based on the mean adjusted agreement of surrogate variables
+
+Now we want to investigate the relations of variables. We would like to identify which of the first 100 predictor variables are related to X1 and X7. We simulated 10 correlated predictor variables for each of these two basic variables.
+One possibility to investigate variable relations is to use the results from var.select.smd. Hence, first SMD is conducted like in the previous section:
+
+```
+res.smd = var.select.smd(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], s = 10, ntree = 1000)
+```
+Subsequently, variable relations are analyzed with var.relations. The parameter t can be adapted to either focus on strongly related variables only (high numbers) or to include also moderately related variables (low numbers):
+
+```
+candidates = colnames(SMD_example_data )[2:101]
+rel = var.relations(forest = res.smd$forest, variables = c("X1","X7"), candidates = candidates, t = 5)
+rel$var
+$X1
+ [1] "cp1_1"  "cp1_2"  "cp1_3"  "cp1_4"  "cp1_5"  "cp1_6"  "cp1_7"  "cp1_8"  "cp1_9"  "cp1_10"
+
+$X7
+ [1] "cp7_1"  "cp7_2"  "cp7_3"  "cp7_4"  "cp7_5"  "cp7_6"  "cp7_7"  "cp7_8"  "cp7_9"  "cp7_10"
+  
+```
+
+All of the variables that are correlated to X1 are correctly identified as related to X1 and all of the variables that are correlated to X7 are correcly identified as related to X7. 
+
+
+## Variable relations based on mutual forest impact (MFI)
+
+MFI is a corrected relation parameter calculated by the mean adjusted agreement of the variables and permuted versions of them. Related variables are selected by p-values obtained from a null distribution either determined by negative relation scores (based on the Janitza approach) or by permuted relations. 
+We use the default parameters for the selection here, which is a p-values threshold of 0.01 and the Janitza approach. 
+
+```
+set.seed(42)
+rel.mfi = var.relations.mfi(x = x, y = y, s = 10, ntree = 1000, variables = c("X1","X7"), candidates = colnames(x)[1:100], p.t = 0.01, method = "janitza" )
+rel.mfi$var.rel
+$X1
+ [1] "cp1_1"  "cp1_2"  "cp1_3"  "cp1_4"  "cp1_5"  "cp1_6"  "cp1_7"  "cp1_8"  "cp1_9"  "cp1_10"
+
+$X7
+ [1] "cp7_1"  "cp7_2"  "cp7_3"  "cp7_4"  "cp7_5"  "cp7_6"  "cp7_7"  "cp7_8"  "cp7_9"  "cp7_10"
+```
+
+Also by MFI, all of the variables that are correlated to X1 are correctly identified as related to X1 and all of the variables that are correlated to X7 are correcly identified as related to X7. 
+Also the matrix of determined relation (surr.res), permuted relations (surr.perm) and determined p-values (p.rel) can be extracted as followes: 
+
+```
+MFI = rel.mfi$surr.res
+surr.perm = rel.mfi$surr.perm
+p.rel = rel.mfi$p.rel
+```
+
+## Mutual impurity reduction (MIR)
+
+Now we would like to analyze the example data with MIR, which determines the variable importance by the actual impurity reduction combined with the relations determined by MFI. Different to MD and SMD, this approach calculates p-values for the selection of important variables. For this, the null distribution is obtained in a similar way as for MFI, either by negative importance scores called the Janitza approach or by permutation. Since this example dataset is comparatively small, we use the permutation approach. As a threshold for selection a value of 0.01 is applied (p.t.sel = 0.01).
+
+```
+set.seed(42)
+res.mir = var.select.mir(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], s = 10, ntree = 1000, method.sel = "permutation", p.t.sel = 0.01)
+res.mir$var
+ [1] "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "cp1_1"  "cp1_2"  "cp1_3"  "cp1_4"  "cp1_5"  "cp1_6" 
+[13] "cp1_7"  "cp1_8"  "cp1_9"  "cp1_10" "cp2_1"  "cp2_3"  "cp2_4"  "cp2_6"  "cp2_7"  "cp2_10" "cp3_1"  "cp3_4" 
+[25] "cp3_5"  "cgn_72" "cgn_81"
+```
+The selected variables are stored in res.mir$var. Here, the relevant variables cp1_1 to cp1_10, cp2_1, cp2_3, cp2_4, cp2_6, cp2_7, cp2_10, cp3_1, cp3_4, cp3_5, as well as the non-relevant variables cgn_72 and cgn_81 are selected. 
+
+The MIR values and p-values can be extracted as follows:
+
+```
+mir = res.mir$info$MIR
+head(mir)
+      X1       X2       X3       X4       X5       X6 
+10.68243 15.95674 27.09036 20.50233 23.16293 21.15731 
+
+pvalues = res.mir$info$pvalue
+head(pvalues)
+X1 X2 X3 X4 X5 X6 
+ 0  0  0  0  0  0 
+ 
+```
+
+ We can see that variables X1, …, X6 have a p-value of 0 and are selected.
+ 
+ Since this approach is based on the actual impurity reduction combined with the relations determined by MFI, both of these can also be extracted from the results:
+ 
+ 
+```
+air = res.mir$info$AIR
+head(air)
+       X1        X2        X3        X4        X5        X6 
+ 1.072849 13.133904 26.444900 19.155187 22.718355 20.782305 
+ 
+ res.mfi = res.mir$info$relations
+
+```
+res.mfi contains the results of var.relations.mfi conducted in MIR. 
+ 
+ 
diff --git a/SurrogateMinimalDepth.Rproj b/SurrogateMinimalDepth.Rproj
new file mode 100644
index 0000000000000000000000000000000000000000..398aa1438c2c45f6eb3efd75ef2d4052d5ae7923
--- /dev/null
+++ b/SurrogateMinimalDepth.Rproj
@@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: knitr
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/Version_info.txt b/Version_info.txt
new file mode 100644
index 0000000000000000000000000000000000000000..953d35e2e21f239acdda13835e2a7592f0113fc9
--- /dev/null
+++ b/Version_info.txt
@@ -0,0 +1,43 @@
+Version 0.3.1 - changed the name of the parameter ntree to num.trees
+
+
+Version 0.3.0 - round adjusted agreement and mean adjusted agreement to 2 digits
+              - speed up meanAdjAgree function 
+              - changed that relations to variables not used as primary plit are set as NA (before it was 0)
+              - renamed var.relations.corr to var.relations.mfi (as named in the paper)
+              - bugfix for MFI and MIR
+              - permutation approach to determine p-values was optimized for the selection of important and related variables
+              - default approach for p-value calculation was set to "permutation"
+
+Version 0.2.1 - included possibility to analyze categorical variables 
+
+Version 0.2.0 - added function var.sel.mir to select important variables based on mutual impurity reduction
+              - added function var.relations.corr to calculate unbiased relations
+              - added multicore calculation of variable relations 
+              - expand function inputs for var.relations 
+              - included possibility to only calculate relations and not select related variables for var.relations function 
+              - included the possibility to set case weights
+
+Version 0.1.10 - fixed bug not including first and last surrogate
+	       - added random selection of surrogates when adj_agree are the same
+	       - fixed bug in var.relation example
+	       - added build.clusters function to obtain variable groups  
+
+Version 0.1.9	- some specifics to set s in var.select.smd were adapted
+		- added save.memory parameter (to build the forest with ranger) to 	var.select.smd and var.select.md  
+
+Version 0.1.8	- MD now executes var.select.smd with s = 0 instead of using a separate function
+		- The function reduce.surrogates is included. 
+		- Included parameters create.forest and forest to use var.select.smd for 			  existing forests (e.g. created by reduce.surrogates)
+		- Changed the value "trees" to "forest" containing trees and variable names 
+		- The C-code is updated to enable multicore analysis
+
+Version 0.1.7	- Included parameter save.ranger in var.select.md and var.select.smd to save 			  ranger object
+
+Version 0.1.6	- Adapted threshold for low depth trees in var.select.smd
+
+Version 0.1.5 	- Added s as value in var.select.smd 
+		- Implemented survival function for var.select.smd and var.select.md
+
+Version 0.1.4 	- All errors and comments from coauthors implemented: first version uploaded on 		  github
+
diff --git a/data/SMD_example_data.RData b/data/SMD_example_data.RData
new file mode 100644
index 0000000000000000000000000000000000000000..9d00d0fb02b60d93a31861cf8c8eeb949e36fde4
Binary files /dev/null and b/data/SMD_example_data.RData differ
diff --git a/man/SMD_example_data.Rd b/man/SMD_example_data.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..f982040f5ce5a060f99a634533b9efe5e7f22e77
--- /dev/null
+++ b/man/SMD_example_data.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SMD_example_data.R
+\docType{data}
+\name{SMD_example_data}
+\alias{SMD_example_data}
+\title{Example data set for the package SurrogateMinimalDepth}
+\format{
+A matrix with 100 rows and 1001 columns
+}
+\usage{
+data(SMD_example_data)
+}
+\description{
+A dataset containing 100 individuals (rows), 1 dependent variable (first column, y), and 200 additional variables (columns 2 to 201):
+For the simulation of the 200 additional variables nine variables X1,…,X9 called basic variables were simulated. X1 to X6 are causal
+with constant effect sizes of 1 and X7 to X9 are noncausal with effect size of 0. The outcome y is a linear combination of the causal
+predictor variables and a normally distributed error term. All basic variables were sampled from a normal distribution
+(N(0,1)) just like the noise (N(0,0.2)). For each of the six basic variables X1, X2, X3, X7, X8, and X9 ten variables
+with predefined correlations of 0.9 for X1 and X7, 0.6 for X2 and X8, and 0.3 for X3 and X9 were obtained by \link[WGCNA]{simulateModule} function of
+the R package WGCNA. The ten variables of each basis variable are labeled: Cp_basicvariable_number. Additional non-correlated and
+independent predictor variables (cgn) were simulated using the standard normal distribution to reach a total number of 200 variables.
+}
+\keyword{datasets}
diff --git a/man/SurrTree.Rd b/man/SurrTree.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..33b8dfbedc80dbd62cbad16555feea79132bcd5e
--- /dev/null
+++ b/man/SurrTree.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/addSurrogates.R
+\name{SurrTree}
+\alias{SurrTree}
+\title{SurrTree}
+\usage{
+SurrTree(j, wt, Xdata, controls, column.names, tree, maxsurr, ncat)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/addLayer.Rd b/man/addLayer.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..2ff20437c6f62aab51aedc23e04d5ae3a21cb859
--- /dev/null
+++ b/man/addLayer.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/addLayer.R
+\name{addLayer}
+\alias{addLayer}
+\title{Add layer information to a forest that was created by getTreeranger}
+\usage{
+addLayer(trees)
+}
+\arguments{
+\item{trees}{list of trees created by getTreeranger}
+}
+\value{
+a list with trees. Each row of the list elements corresponds to a node of the respective tree and the columns correspond to:
+\itemize{
+\item nodeID: ID of the respective node (important for left and right daughters in the next columns)
+\item leftdaughter: ID of the left daughter of this node
+\item rightdaughter: ID of the right daughter of this node
+\item splitvariable: ID of the split variable
+\item splitpoint: splitpoint of the split variable
+\item status: "0" for terminal and "1" for non-terminal
+\item layer: layer information (0 means root node, 1 means 1 layer below root, etc)
+}
+}
+\description{
+This functions adds the layer information to each node in a list with trees that was obtained by getTreeranger.
+}
diff --git a/man/addSurrogates.Rd b/man/addSurrogates.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..95af49c22118cbb4ff50f0deae5ee8ee3395546f
--- /dev/null
+++ b/man/addSurrogates.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/addSurrogates.R
+\name{addSurrogates}
+\alias{addSurrogates}
+\title{Add surrogate information that was created by getTreeranger}
+\usage{
+addSurrogates(RF, trees, s, Xdata, num.threads)
+}
+\arguments{
+\item{RF}{random forest object created by ranger (with keep.inbag=TRUE).}
+
+\item{trees}{list of trees created by getTreeranger.}
+
+\item{s}{Predefined number of surrogate splits (it may happen that the actual number of surrogate splits differes in individual nodes). Default is 1 \% of no. of variables.}
+
+\item{Xdata}{data without the dependent variable.}
+
+\item{num.threads}{number of threads used for parallel execution. Default is number of CPUs available.}
+}
+\value{
+a list with trees containing of lists of nodes with the elements:
+\itemize{
+\item nodeID: ID of the respective node (important for left and right daughters in the next columns)
+\item leftdaughter: ID of the left daughter of this node
+\item rightdaughter: ID of the right daughter of this node
+\item splitvariable: ID of the split variable
+\item splitpoint: splitpoint of the split variable
+\item status: "0" for terminal and "1" for non-terminal
+\item layer: layer information (0 means root node, 1 means 1 layer below root, etc)
+\item surrogate_i: numbered surrogate variables (number depending on s)
+\item adj_i: adjusted agreement of variable i
+}
+}
+\description{
+This function adds surrogate variables and adjusted agreement values to a forest that was created by getTreeranger.
+}
diff --git a/man/adj.mean.Rd b/man/adj.mean.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..03aa37911c0958ece28cc14729465efa7d2ca25d
--- /dev/null
+++ b/man/adj.mean.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{adj.mean}
+\alias{adj.mean}
+\title{adj.mean}
+\usage{
+adj.mean(trees)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/adj.mean.trees.Rd b/man/adj.mean.trees.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..7a161340a108981c5e1aeaa0a02880b3b1efee7c
--- /dev/null
+++ b/man/adj.mean.trees.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{adj.mean.trees}
+\alias{adj.mean.trees}
+\title{adj.mean.trees}
+\usage{
+adj.mean.trees(t, trees)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/adj.node.Rd b/man/adj.node.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..b750c5479c7758f54368faf6790f70ec791a4063
--- /dev/null
+++ b/man/adj.node.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{adj.node}
+\alias{adj.node}
+\title{adj.node}
+\usage{
+adj.node(i, allvar.num, relevant.nodes, index.candidates)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/build.clusters.Rd b/man/build.clusters.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..965c0c19b42d12d9b86635202f9e13dfdd843b31
--- /dev/null
+++ b/man/build.clusters.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/build_clusters.R
+\name{build.clusters}
+\alias{build.clusters}
+\title{Apply cluster analysis to build variable groups}
+\usage{
+build.clusters(rel, hcmethod = "ward.D")
+}
+\arguments{
+\item{rel}{a list containing variables, surr.res, threshold, and var. This is the output of \link[SurrogateMinimalDepth]{var.relations} function.}
+
+\item{hcmethod}{the hierarchical clustering method that is used. (see \link[linkcomm]{getLinkCommunities})}
+}
+\value{
+a data frame containing the variable names and their associated clusters.
+}
+\description{
+This function generates variables groups of relation information that was obtained by \link[SurrogateMinimalDepth]{var.relations} function applying
+\link[linkcomm]{getLinkCommunities}.
+}
+\examples{
+# read data
+data("SMD_example_data")
+
+\donttest{
+ # get trees and variable names
+ x = SMD_example_data[,2:ncol(SMD_example_data)]
+ y = SMD_example_data[,1]
+ allvariables = colnames(x)# extract variables names
+ nvar = length(allvariables)   # count number of variables
+ set.seed(42)
+ RF = ranger::ranger(data = SMD_example_data, dependent.variable.name = "y", num.trees = 10, keep.inbag = TRUE,mtry = floor(nvar^(3/4)), min.node.size = 1)
+ trees = getTreeranger(RF = RF, num.trees = 10)
+ trees.lay = addLayer(trees)
+ trees.surr = addSurrogates(RF = RF, trees = trees.lay, s = 10, Xdata = x, num.threads = NULL)
+
+ # investigate variable relations
+ rel=var.relations(forest = list(trees = trees.surr, allvariables = allvariables), variables = allvariables, candidates = allvariables, t = 10)
+ groups = build.clusters(rel)
+}
+
+}
diff --git a/man/calculate.mir.perm.Rd b/man/calculate.mir.perm.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..8001ca03946c11c78b3067cc6b49561b14463c0b
--- /dev/null
+++ b/man/calculate.mir.perm.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/variable_selection_mir.R
+\name{calculate.mir.perm}
+\alias{calculate.mir.perm}
+\title{This is an internal function}
+\usage{
+calculate.mir.perm(r = 1, adj.agree_perm, air, allvariables)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/count.surrogates.Rd b/man/count.surrogates.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..50d91b08a73264a7bf052f70944682e83af6a48e
--- /dev/null
+++ b/man/count.surrogates.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/count.surrogates.R
+\name{count.surrogates}
+\alias{count.surrogates}
+\title{Count surrogate variables}
+\usage{
+count.surrogates(trees)
+}
+\arguments{
+\item{trees}{list of trees that was generated by getTreeranger function and layers, surrogate variables, and adjusted agreement values were added by addLayer and getSurrogates functions}
+}
+\value{
+List with the following components:
+\itemize{
+\item s.a: total average number of surrogate variables
+\item s.l: average number of surrogate variables in the respective layers
+}
+}
+\description{
+This function counts surrogate variables and returns the total average number of surrogate variables and the average number of surrogate variables of the respective layers.
+This is necessary since the actual number of surrogate splits can be lower than the predefined number (when less surrogate splits outperform the majority rule).
+}
diff --git a/man/getSurrogate.Rd b/man/getSurrogate.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..83f310f7a10c5278955bdd9b58e1b2c9cc85e528
--- /dev/null
+++ b/man/getSurrogate.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/addSurrogates.R
+\name{getSurrogate}
+\alias{getSurrogate}
+\title{getSurrogate}
+\usage{
+getSurrogate(surr.par, k = 1, maxsurr)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/getTreeranger.Rd b/man/getTreeranger.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..bb103e82b9de3019d3b0fe5bc049f3f40708b48b
--- /dev/null
+++ b/man/getTreeranger.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/getTreeranger.R
+\name{getTreeranger}
+\alias{getTreeranger}
+\title{Get a list of structured trees for ranger}
+\usage{
+getTreeranger(RF, num.trees)
+}
+\arguments{
+\item{RF}{random forest object created by ranger (with keep.inbag=TRUE)}
+
+\item{num.trees}{number of trees}
+}
+\value{
+a list with trees. Each row of the list elements corresponds to a node of the respective tree and the columns correspond to:
+\itemize{
+\item nodeID: ID of the respective node (important for left and right daughters in the next columns)
+\item leftdaughter: ID of the left daughter of this node
+\item rightdaughter: ID of the right daughter of this node
+\item splitvariable: ID of the split variable
+\item splitpoint: splitpoint of the split variable (for categorical variables this is a comma separated lists of values, representing the factor levels (in the original order) going to the right)
+\item status: "0" for terminal and "1" for non-terminal
+}
+}
+\description{
+This functions creates a list of trees for ranger objects similar as getTree function does for random Forest objects.
+}
diff --git a/man/getsingletree.Rd b/man/getsingletree.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..1cf262f7f1b76ae1d3c942c0690052749f729ff3
--- /dev/null
+++ b/man/getsingletree.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/getTreeranger.R
+\name{getsingletree}
+\alias{getsingletree}
+\title{getsingletree}
+\usage{
+getsingletree(RF, k = 1)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/mean.adj.node.Rd b/man/mean.adj.node.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..7f2759f27dbccacbcbd4e4571cb1a132c2de0b25
--- /dev/null
+++ b/man/mean.adj.node.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{mean.adj.node}
+\alias{mean.adj.node}
+\title{adj.node}
+\usage{
+\method{mean}{adj.node}(m, surr.nonterminal)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/mean.index.Rd b/man/mean.index.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..098eaf14d6e22585b79c7ebfe471425702a7b508
--- /dev/null
+++ b/man/mean.index.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{mean.index}
+\alias{mean.index}
+\title{mean.index}
+\usage{
+\method{mean}{index}(i, list.res, index.variables)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/meanAdjAgree.Rd b/man/meanAdjAgree.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..86fea4daf37963c233893859b22f6726645835cb
--- /dev/null
+++ b/man/meanAdjAgree.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{meanAdjAgree}
+\alias{meanAdjAgree}
+\title{Calculate mean adjusted agreement to investigate variables relations}
+\usage{
+meanAdjAgree(
+  trees,
+  variables,
+  allvariables,
+  candidates,
+  t,
+  s.a,
+  select.var,
+  num.threads = NULL
+)
+}
+\arguments{
+\item{trees}{list of trees created by getTreeranger, addLayer and addSurrogate.}
+
+\item{variables}{vector of variable names.}
+
+\item{allvariables}{vector of all variable names (strings)}
+
+\item{candidates}{vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables)}
+
+\item{t}{variable to calculate threshold. Default is 3.}
+
+\item{s.a}{average number of surrogate variables (ideally calculated by count.surrogates function).}
+
+\item{select.var}{set False if only relations should be calculated and no related variables should be selected.}
+
+\item{num.threads}{number of threads used for parallel execution. Default is number of CPUs available.}
+}
+\value{
+a list containing:
+\itemize{
+\item variables: the variables to which relations are investigated
+\item surr.res: matrix with mean adjusted agreement values and variables investigated in rows and candidate variables in columns
+\item threshold: the threshold used to create surr.var from surr.res
+\item surr.var: binary matrix showing if the variables are related (1) or non-related (0) with variables in rows and candidates in columns.
+}
+}
+\description{
+This is the main function of var.relations function.
+}
diff --git a/man/mindep.Rd b/man/mindep.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..44f05d5334e223d3d40b9b50bcbd3a9961791238
--- /dev/null
+++ b/man/mindep.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mindep.R
+\name{mindep}
+\alias{mindep}
+\title{Execute minimal depth variable importance}
+\usage{
+mindep(variables, trees)
+}
+\arguments{
+\item{variables}{vector of variable names}
+
+\item{trees}{list of trees that was generated by getTreeranger and layers functions}
+}
+\value{
+List with the following components:
+\itemize{
+\item depth: mean minimal depth for each variable
+\item selected: variables has been selected (1) or not (0),
+\item threshold: the threshold that is used for the selection
+}
+}
+\description{
+This function determines the minimal depth of variables from a forest that is created by getTreeranger, and addLayer functions.
+}
diff --git a/man/name.adj.Rd b/man/name.adj.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..11c56286b1278ac1ca5ffb097c593e36f435122a
--- /dev/null
+++ b/man/name.adj.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/addSurrogates.R
+\name{name.adj}
+\alias{name.adj}
+\title{name.adj}
+\usage{
+name.adj(i, adj.names)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/name.surr.Rd b/man/name.surr.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..c7295bddd8f4dab431996a050b47effea3eace12
--- /dev/null
+++ b/man/name.surr.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/addSurrogates.R
+\name{name.surr}
+\alias{name.surr}
+\title{name.surr}
+\usage{
+name.surr(i, surrogate.names)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/p.relation.Rd b/man/p.relation.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..e123f011d8521c5f5688ab354c7ecc8f86d4c55e
--- /dev/null
+++ b/man/p.relation.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/var.relations.mfi.R
+\name{p.relation}
+\alias{p.relation}
+\title{p.relation}
+\usage{
+p.relation(l = 1, null.rel, adj.agree.corr, candidates, variables)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/permute.variable.Rd b/man/permute.variable.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..427d6b95d2f1228821e6aaa0cbd46c1b290c2518
--- /dev/null
+++ b/man/permute.variable.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/var.relations.mfi.R
+\name{permute.variable}
+\alias{permute.variable}
+\title{permute.variable}
+\usage{
+permute.variable(i = 1, x)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/reduce.surrogates.Rd b/man/reduce.surrogates.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..b21ab58007a5eaac58c5e5c7bb352578de03b626
--- /dev/null
+++ b/man/reduce.surrogates.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/reduce.surrogates.R
+\name{reduce.surrogates}
+\alias{reduce.surrogates}
+\title{Reduce surrogate variables in a random forest.}
+\usage{
+reduce.surrogates(forest, s = 10)
+}
+\arguments{
+\item{forest}{a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions.}
+
+\item{s}{number of surrogate variables in the new forest (have to be less than in the RF in trees)}
+}
+\value{
+forest with s surrogate variables.
+}
+\description{
+This function can be applied to reduce the surrogate variables in a forest that is created by getTreeranger, addLayer
+and getSurrogates functions. Hence, it can be applied to the forests that were used for surrogate minimal depth variable importance.
+}
+\examples{
+# read data
+data("SMD_example_data")
+\donttest{
+###### use result of SMD variable importance and reduce surrogate variables to 10
+# select variables with smd variable importance (usually more trees are needed)
+set.seed(42)
+res = var.select.smd(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], s = 100, num.trees = 10)
+forest.new = reduce.surrogates(forest = res$forest, s = 10)
+
+# execute SMD on tree with reduced number of surrogates
+res.new = var.select.smd(create.forest = FALSE, forest = forest.new)
+res.new$var
+
+#' # investigate variable relations
+rel = var.relations(forest = forest.new, variables=c("X1","X7"), candidates = res$forest[["allvariables"]][1:100], t = 5)
+rel$var
+}
+}
diff --git a/man/scount.Rd b/man/scount.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..40760d3826f8c952e64679c752a74b3ae61aac58
--- /dev/null
+++ b/man/scount.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/count.surrogates.R
+\name{scount}
+\alias{scount}
+\title{scount}
+\usage{
+scount(i = 1, trees)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/select.related.Rd b/man/select.related.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..a1b3151047fce1509722829d049c73622a7cb293
--- /dev/null
+++ b/man/select.related.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/var.relations.mfi.R
+\name{select.related}
+\alias{select.related}
+\title{select.related}
+\usage{
+select.related(m = 1, rel.p, p.t)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/surr.tree.Rd b/man/surr.tree.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..142ae927e90832a3df3b8e9105bbdf53f195d818
--- /dev/null
+++ b/man/surr.tree.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/meanAdjAgree.R
+\name{surr.tree}
+\alias{surr.tree}
+\title{surr.tree}
+\usage{
+surr.tree(tree, variables, index.variables, allvariables, index.candidates)
+}
+\description{
+This is an internal function
+}
+\keyword{internal}
diff --git a/man/surrmindep.Rd b/man/surrmindep.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..27e6b0606a2de3a6cf80c0f8d7b55aff44f2634b
--- /dev/null
+++ b/man/surrmindep.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/surrmindep.R
+\name{surrmindep}
+\alias{surrmindep}
+\title{Execute surrogate minimal depth variable importance}
+\usage{
+surrmindep(forest, s.l)
+}
+\arguments{
+\item{forest}{a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions.}
+
+\item{s.l}{Number of average surrogate variables in the respective layers. (use count.surrogate function to get it)}
+}
+\value{
+List with the following components:
+\itemize{
+\item depth: mean surrogate minimal depth for each variable
+\item selected: variables has been selected (1) or not (0),
+\item threshold: the threshold that is used for the selection
+}
+}
+\description{
+This function determines the surrogate minimal depth of variables from a forest that is created by getTreeranger, addLayer and getSurrogates functions.
+}
diff --git a/man/var.relations.Rd b/man/var.relations.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..f056eae8b86c766b88482156c2fc56b4b5bb2c57
--- /dev/null
+++ b/man/var.relations.Rd
@@ -0,0 +1,95 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/var.relations.R
+\name{var.relations}
+\alias{var.relations}
+\title{Investigate variable relations of a specific variable with mean adjusted agreement}
+\usage{
+var.relations(
+  x = NULL,
+  y = NULL,
+  num.trees = 500,
+  type = "regression",
+  s = NULL,
+  mtry = NULL,
+  min.node.size = 1,
+  num.threads = NULL,
+  status = NULL,
+  save.ranger = FALSE,
+  create.forest = TRUE,
+  forest = NULL,
+  save.memory = FALSE,
+  case.weights = NULL,
+  variables,
+  candidates,
+  t = 5,
+  select.rel = TRUE
+)
+}
+\arguments{
+\item{x}{data.frame of predictor variables with variables in
+columns and samples in rows (Note: missing values are not allowed)}
+
+\item{y}{vector with values of phenotype variable (Note: will be converted to factor if
+classification mode is used). For survival forests this is the time variable.}
+
+\item{num.trees}{number of trees. Default is 500.}
+
+\item{type}{mode of prediction ("regression", "classification" or "survival"). Default is regression.}
+
+\item{s}{predefined number of surrogate splits (it may happen that the actual number of surrogate splits differs in individual nodes). Default is 1 \% of no. of variables.}
+
+\item{mtry}{number of variables to possibly split at in each node. Default is no. of variables^(3/4) ("^3/4") as recommended by (Ishwaran 2011). Also possible is "sqrt" and "0.5" to use the square root or half of the no. of variables.}
+
+\item{min.node.size}{minimal node size. Default is 1.}
+
+\item{num.threads}{number of threads used for determination of relations. Default is number of CPUs available.}
+
+\item{status}{status variable, only applicable to survival data. Use 1 for event and 0 for censoring.}
+
+\item{save.ranger}{set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).}
+
+\item{create.forest}{set FALSE if you want to analyze an existing forest. Default is TRUE.}
+
+\item{forest}{the random forest that should be analyzed if create.forest is set to FALSE. (x and y still have to be given to obtain variable names)}
+
+\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)}
+
+\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}
+
+\item{variables}{variable names (string) for which related variables should be searched for (has to be contained in allvariables)}
+
+\item{candidates}{vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables)}
+
+\item{t}{variable to calculate threshold. Default is 5.}
+
+\item{select.rel}{set False if only relations should be calculated and no related variables should be selected.}
+}
+\value{
+a list containing:
+\itemize{
+\item variables: the variables to which relations are investigated.
+\item surr.res: a matrix with mean adjusted agreement values with variables in rows and candidates in columns.
+\item threshold: the threshold used to select related variables.
+\item var: a list with one vector for each variable containing related variables.
+\item ranger: ranger object.
+}
+}
+\description{
+This function uses the mean adjusted agreement to select variables that are related to a defined variable using a threshold T.
+The parameter t is used to calculate T: t=1 means that every variable with higher probability than "by chance" is identified
+as "important". t=2 means the probability has to be twice, etc.
+Based on the threshold a vector is created containing the related variables.
+}
+\examples{
+# read data
+data("SMD_example_data")
+x = SMD_example_data[,2:ncol(SMD_example_data)]
+y = SMD_example_data[,1]
+\donttest{
+# calculate variable relations
+set.seed(42)
+res = var.relations(x = x, y = y, s = 10, num.trees = 100, variables = c("X1","X7"), candidates = colnames(x)[1:100], t = 5)
+res$var
+}
+
+}
diff --git a/man/var.relations.mfi.Rd b/man/var.relations.mfi.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..7757aa30fa9621f68854a37cbd5806c8efdd5111
--- /dev/null
+++ b/man/var.relations.mfi.Rd
@@ -0,0 +1,103 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/var.relations.mfi.R
+\name{var.relations.mfi}
+\alias{var.relations.mfi}
+\title{Investigate variable relations of a specific variable with mutual forest impact (corrected mean adjusted agreement).}
+\usage{
+var.relations.mfi(
+  x = NULL,
+  y = NULL,
+  num.trees = 500,
+  type = "regression",
+  s = NULL,
+  mtry = NULL,
+  min.node.size = 1,
+  num.threads = NULL,
+  status = NULL,
+  save.ranger = FALSE,
+  create.forest = TRUE,
+  forest = NULL,
+  save.memory = FALSE,
+  case.weights = NULL,
+  min.var.p = 200,
+  variables,
+  candidates,
+  p.t = 0.01,
+  select.rel = TRUE,
+  method = "janitza"
+)
+}
+\arguments{
+\item{x}{data.frame of predictor variables with variables in
+columns and samples in rows (Note: missing values are not allowed)}
+
+\item{y}{vector with values of phenotype variable (Note: will be converted to factor if
+classification mode is used). For survival forests this is the time variable.}
+
+\item{num.trees}{number of trees. Default is 500.}
+
+\item{type}{mode of prediction ("regression", "classification" or "survival"). Default is regression.}
+
+\item{s}{predefined number of surrogate splits (it may happen that the actual number of surrogate splits differs in individual nodes). Default is 1 \% of no. of variables.}
+
+\item{mtry}{number of variables to possibly split at in each node. Default is no. of variables^(3/4) ("^3/4") as recommended by (Ishwaran 2011). Also possible is "sqrt" and "0.5" to use the square root or half of the no. of variables.}
+
+\item{min.node.size}{minimal node size. Default is 1.}
+
+\item{num.threads}{number of threads used for determination of relations. Default is number of CPUs available.}
+
+\item{status}{status variable, only applicable to survival data. Use 1 for event and 0 for censoring.}
+
+\item{save.ranger}{set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).}
+
+\item{create.forest}{set FALSE if you want to analyze an existing forest. Default is TRUE.}
+
+\item{forest}{the random forest that should be analyzed if create.forest is set to FALSE. (x and y still have to be given to obtain variable names)}
+
+\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)}
+
+\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}
+
+\item{min.var.p}{minimum number of permuted variables used to determine p-value. Default is 200.}
+
+\item{variables}{variable names (string) for which related variables should be searched for (has to be contained in allvariables)}
+
+\item{candidates}{vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables)}
+
+\item{p.t}{p.value threshold for selection of related variables. Default is 0.01.}
+
+\item{select.rel}{set False if only relations should be calculated and no related variables should be selected.}
+
+\item{method}{Method  to  compute  p-values.   Use  "janitza"  for  the  method  by  Janitza  et  al. (2016) or "permutation" to utilize importance values of permuted variables.}
+}
+\value{
+a list containing:
+\itemize{
+\item variables: the variables to which relations are investigated.
+\item surr.res: a matrix with the mutual forest impact values with variables in rows and candidates in columns.
+\item surr.perm: a matrix with the mutual forest impact values of the permuted variables with variables in rows and candidates in columns.
+\item p.rel: a list with the obtained p-values for the relation analysis of each variable.
+\item var.rel: a list with vectors of related variables for each variable.
+\item ranger: ranger objects.
+\item method: Method  to  compute  p-values: "janitza" or "permutation".
+\item p.t: p.value threshold for selection of related variables
+
+
+}
+}
+\description{
+This function corrects the mean adjusted agreement by a permutation approach and generates the relation parameter mutual forest impact. Subsequently p-values are determined and related variables are selected.
+}
+\examples{
+# read data
+data("SMD_example_data")
+x = SMD_example_data[,2:ncol(SMD_example_data)]
+y = SMD_example_data[,1]
+\donttest{
+# calculate variable relations
+set.seed(42)
+res = var.relations.mfi(x = x, y = y, s = 10, num.trees = 100, variables = c("X1","X7"), candidates = colnames(x)[1:100])
+res$var.rel[[1]]
+}
+
+}
diff --git a/man/var.select.md.Rd b/man/var.select.md.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..f0dcc70151b7942a675ecfca22349e12531cee71
--- /dev/null
+++ b/man/var.select.md.Rd
@@ -0,0 +1,93 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/variable_selection_md.R
+\name{var.select.md}
+\alias{var.select.md}
+\title{Variable selection with Minimal Depth (MD)}
+\usage{
+var.select.md(
+  x = NULL,
+  y = NULL,
+  num.trees = 500,
+  type = "regression",
+  mtry = NULL,
+  min.node.size = 1,
+  num.threads = NULL,
+  status = NULL,
+  save.ranger = FALSE,
+  create.forest = TRUE,
+  forest = NULL,
+  save.memory = FALSE,
+  case.weights = NULL
+)
+}
+\arguments{
+\item{x}{data.frame of predictor variables with variables in
+columns and samples in rows. (Note: missing values are not allowed)}
+
+\item{y}{vector with values of phenotype variable (Note: will be converted to factor if
+classification mode is used). For survival forests this is the time variable.}
+
+\item{num.trees}{Number of trees. Default is 500.}
+
+\item{type}{Mode of prediction ("regression","classification" or "survival"). Default is regression.}
+
+\item{mtry}{Number of variables to possibly split at in each node. Default is no. of variables^(3/4) as recommended by Ishwaran.}
+
+\item{min.node.size}{Minimal node size. Default is 1.}
+
+\item{num.threads}{number of threads used for parallel execution. Default is number of CPUs available.}
+
+\item{status}{status variable, only applicable to survival data. Use 1 for event and 0 for censoring.}
+
+\item{save.ranger}{Set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).}
+
+\item{create.forest}{set FALSE if you want to analyze an existing forest. Default is TRUE.}
+
+\item{forest}{the random forest that should be analyzed if create.forest is set to FALSE. (x and y still have to be given to obtain variable names)}
+
+\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)}
+
+\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}
+}
+\value{
+List with the following components:
+\itemize{
+\item info: list with results from mindep function:
+\itemize{
+\item depth: mean minimal depth for each variable.
+\item selected: variables has been selected (1) or not (0).
+\item threshold: the threshold that is used for the selection. (deviates slightly from the original implimentation)
+}
+\item var: vector of selected variables.
+
+\item forest: a list containing:
+#'\itemize{
+\item trees: list of trees that was created by getTreeranger, addLayer, and addSurrogates functions and that was used for surrogate minimal depth variable importance.
+\item allvariables: all variable names of the predictor variables that are present in x.
+}
+
+\item ranger: ranger object
+
+}
+}
+\description{
+This function executes MD applying \link[ranger]{ranger} for random forests generation and is a reimplementation of \link[randomForestSRC]{var.select} from randomForestSRC package.
+}
+\examples{
+# read data
+data("SMD_example_data")
+
+\donttest{
+# select variables (usually more trees are needed)
+set.seed(42)
+res = var.select.md(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1], num.trees = 10)
+res$var
+}
+
+}
+\references{
+\itemize{
+  \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103}
+  \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf}
+  }
+}
diff --git a/man/var.select.mir.Rd b/man/var.select.mir.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..42f04064a8a6d55db92cfd7d6dcdf039f74dac28
--- /dev/null
+++ b/man/var.select.mir.Rd
@@ -0,0 +1,115 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/variable_selection_mir.R
+\name{var.select.mir}
+\alias{var.select.mir}
+\title{Variable selection with mutual impurity reduction (MIR)}
+\usage{
+var.select.mir(
+  x = NULL,
+  y = NULL,
+  num.trees = 500,
+  type = "regression",
+  s = NULL,
+  mtry = NULL,
+  min.node.size = 1,
+  num.threads = NULL,
+  status = NULL,
+  save.ranger = FALSE,
+  save.memory = FALSE,
+  num.permutations = 100,
+  p.t.sel = 0.01,
+  p.t.rel = 0.01,
+  select.var = TRUE,
+  select.rel = FALSE,
+  case.weights = NULL,
+  corr.rel = TRUE,
+  t = 5,
+  method.rel = "janitza",
+  method.sel = "janitza",
+  save.rel = TRUE
+)
+}
+\arguments{
+\item{x}{data.frame of predictor variables with variables in
+columns and samples in rows (Note: missing values are not allowed)}
+
+\item{y}{vector with values of phenotype variable (Note: will be converted to factor if
+classification mode is used). For survival forests this is the time variable.}
+
+\item{num.trees}{number of trees. Default is 500.}
+
+\item{type}{mode of prediction ("regression", "classification" or "survival"). Default is regression.}
+
+\item{s}{predefined number of surrogate splits (it may happen that the actual number of surrogate splits differs in individual nodes). Default is 1 \% of no. of variables.}
+
+\item{mtry}{number of variables to possibly split at in each node. Default is no. of variables^(3/4) ("^3/4") as recommended by (Ishwaran 2011). Also possible is "sqrt" and "0.5" to use the square root or half of the no. of variables.}
+
+\item{min.node.size}{minimal node size. Default is 1.}
+
+\item{num.threads}{number of threads used for parallel execution. Default is number of CPUs available.}
+
+\item{status}{status variable, only applicable to survival data. Use 1 for event and 0 for censoring.}
+
+\item{save.ranger}{set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).}
+
+\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)}
+
+\item{num.permutations}{number of permutations to determine p-values. Default is 100. (the relations are determined once based on the permuted X data and the utilized AIR values are permuted again for each permutation )}
+
+\item{p.t.sel}{p.value threshold for selection of important variables. Default is 0.01.}
+
+\item{p.t.rel}{p.value threshold for selection of related variables. Default is 0.01.}
+
+\item{select.var}{set False if only importance should be calculated and no variables should be selected.}
+
+\item{select.rel}{set False if only relations should be calculated and no variables should be selected.}
+
+\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}
+
+\item{corr.rel}{set FALSE if non-corrected variable relations should be used for calculation of MIR. In this case the method "janitza" should not be used for selection of important variables}
+
+\item{t}{variable to calculate threshold for non-corrected relation analysis. Default is 5.}
+
+\item{method.rel}{Method  to  compute  p-values for selection of related variables with var.relations.corr. Use  "janitza"  for  the  method  by  Janitza  et  al. (2016) or "permutation" to utilize permuted variables.}
+
+\item{method.sel}{Method  to  compute  p-values for selection of important variables. Use  "janitza"  for  the  method  by  Janitza  et  al. (2016) (can only be used when corrected variable relations are utilized) or "permutation" to utilize permuted variables.}
+
+\item{save.rel}{set FALSE if relation information should not bet saved (default is TRUE)}
+}
+\value{
+list with the following components:
+\itemize{
+\item info: list with results containing:
+\itemize{
+\item MIR: the calculated variable importance for each variable based on mutual impurity reduction.
+\item pvalue: the obtained p-values for each variable.
+\item selected: variables has been selected (1) or not (0).
+\item relations: a list containing the results of variable relation analysis.
+\item parameters: a list that contains the parameters s, type, mtry, p.t.sel, p.t.rel and method.sel that were used.
+}
+\item var: vector of selected variables.
+
+\item ranger: ranger object.
+
+}
+}
+\description{
+This function executes MIR applying \link[ranger]{ranger} for random forests generation and actual impurity reduction and a modified version of \link[rpart]{rpart} to find surrogate variables.
+}
+\examples{
+# read data
+data("SMD_example_data")
+
+\donttest{
+# select variables (usually more trees are needed)
+set.seed(42)
+res = var.select.mir(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1],s = 10, num.trees = 10)
+res$var
+}
+}
+\references{
+\itemize{
+  \item Nembrini, S. et al. (2018) The revival of the Gini importance? Bioinformatics, 34, 3711–3718. \url{https://academic.oup.com/bioinformatics/article/34/21/3711/4994791}
+  \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013}
+  }
+}
diff --git a/man/var.select.smd.Rd b/man/var.select.smd.Rd
new file mode 100644
index 0000000000000000000000000000000000000000..488d2a9d20574e7e6efc19912d2648a3868c8aca
--- /dev/null
+++ b/man/var.select.smd.Rd
@@ -0,0 +1,100 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/variable_selection_smd.R
+\name{var.select.smd}
+\alias{var.select.smd}
+\title{Variable selection with Surrogate Minimal Depth (SMD) (MAIN FUNCTION)}
+\usage{
+var.select.smd(
+  x = NULL,
+  y = NULL,
+  num.trees = 500,
+  type = "regression",
+  s = NULL,
+  mtry = NULL,
+  min.node.size = 1,
+  num.threads = NULL,
+  status = NULL,
+  save.ranger = FALSE,
+  create.forest = TRUE,
+  forest = NULL,
+  save.memory = FALSE,
+  case.weights = NULL
+)
+}
+\arguments{
+\item{x}{data.frame of predictor variables with variables in
+columns and samples in rows (Note: missing values are not allowed)}
+
+\item{y}{vector with values of phenotype variable (Note: will be converted to factor if
+classification mode is used). For survival forests this is the time variable.}
+
+\item{num.trees}{number of trees. Default is 500.}
+
+\item{type}{mode of prediction ("regression", "classification" or "survival"). Default is regression.}
+
+\item{s}{predefined number of surrogate splits (it may happen that the actual number of surrogate splits differs in individual nodes). Default is 1 \% of no. of variables.}
+
+\item{mtry}{number of variables to possibly split at in each node. Default is no. of variables^(3/4) ("^3/4") as recommended by (Ishwaran 2011). Also possible is "sqrt" and "0.5" to use the square root or half of the no. of variables.}
+
+\item{min.node.size}{minimal node size. Default is 1.}
+
+\item{num.threads}{number of threads used for parallel execution. Default is number of CPUs available.}
+
+\item{status}{status variable, only applicable to survival data. Use 1 for event and 0 for censoring.}
+
+\item{save.ranger}{set TRUE if ranger object should be saved. Default is that ranger object is not saved (FALSE).}
+
+\item{create.forest}{set FALSE if you want to analyze an existing forest. Default is TRUE.}
+
+\item{forest}{the random forest that should be analyzed if create.forest is set to FALSE. (x and y still have to be given to obtain variable names)}
+
+\item{save.memory}{Use memory saving (but slower) splitting mode. No effect for survival and GWAS data. Warning: This option slows down the tree growing, use only if you encounter memory problems. (This parameter is transfered to ranger)}
+
+\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}
+}
+\value{
+list with the following components:
+\itemize{
+\item info: list with results from surrmindep function:
+\itemize{
+\item depth: mean surrogate minimal depth for each variable.
+\item selected: variables has been selected (1) or not (0).
+\item threshold: the threshold that is used for the selection.
+}
+\item var: vector of selected variables.
+
+\item s: list with the results of count.surrogate function:
+\itemize{
+\item s.a: total average number of surrogate variables.
+\item s.l: average number of surrogate variables in the respective layers.
+}
+\item forest: a list containing:
+#'\itemize{
+\item trees: list of trees that was created by getTreeranger, addLayer, and addSurrogates functions and that was used for surrogate minimal depth variable importance.
+\item allvariables: all variable names of the predictor variables that are present in x.
+}
+\item ranger: ranger object.
+
+}
+}
+\description{
+This function executes SMD applying \link[ranger]{ranger} for random forests generation and a modified version of \link[rpart]{rpart} to find surrogate variables.
+}
+\examples{
+# read data
+data("SMD_example_data")
+
+\donttest{
+# select variables (usually more trees are needed)
+set.seed(42)
+res = var.select.smd(x = SMD_example_data[,2:ncol(SMD_example_data)], y = SMD_example_data[,1],s = 10, num.trees = 10)
+res$var
+}
+}
+\references{
+\itemize{
+  \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013}
+  \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103}
+  \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf}
+  }
+}
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a242c9ceed5bfe335ff326570219351fb0c01541
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,4 @@
+*.o
+*.so
+*.dll
+.vscode
diff --git a/src/choose_surg.c b/src/choose_surg.c
new file mode 100644
index 0000000000000000000000000000000000000000..36412149fa6303436da6ba2b3169613cf58c24cd
--- /dev/null
+++ b/src/choose_surg.c
@@ -0,0 +1,301 @@
+/*
+ * This routine is from rpart
+ *
+ * A particular split routine, optimized for the surrogate variable
+ *  search.  The "goodness" of a split is the total weights of concordant
+ *  observations between the surrogate and the primary split.
+ *  Note that the CART folks use the %concordance, which factors missing
+ *  values into the equations somewhat differently.
+ *
+ *  y is coded as  +1=left, -1=right, 0=missing
+ *
+ */
+#include "get_surg.h"
+#include "rpartproto.h"
+
+void choose_surg(int n1, int n2, int *y, double *x, int *order, int ncat, double *agreement, double *split, int *csplit_for_cat, double tleft, double tright, double *adj) {
+	//printf("01: Hello from choose_surg()!\n");
+	//printf("i: %d\n", i);
+
+	/* sum of weights for each */
+	double llwt, lrwt, rrwt, rlwt;
+	double agree, majority, total_wt;
+
+	int *left = rp.left;
+	int *right = rp.right;
+	double *lwt = rp.lwt;
+	double *rwt = rp.rwt;
+
+
+	/* set to 1 when something worthwhile is found */
+	int success = 0;
+	/*
+	 * I enforce that at least 2 obs must go each way, to avoid having an
+	 *  uncorrelated surrogate beat the "null" surrogate too easily
+	 * Observations with 0 weight don't count in this total
+	 */
+	/* continuous case */
+	/*
+	 * ll = y's that go left that are also sent left by my split
+	 * lr = y's that go left that I send right
+	 * rl = y's that go right that I send to the left
+	 * rr = y's that go right that I send to the right
+	 *
+	 * The agreement is max(ll+rr, lr+rl), if weights were = 1;
+	 *   actually max(llwt + rrwt, lrwt + rlwt) / denominator
+	 */
+	if (ncat == 0) {
+		//printf("Hello from choose_surg() ncat == 0\n");
+
+	double lastx = 0.0;
+	int ll, lr, rr, rl;
+	ll = rl = 0;
+	llwt = 0;
+	rlwt = 0;
+	// printf("n2: %d\n", n2); -> 100
+	// printf("n1: %d\n", n1); -> 0
+	for (int i = n2 - 1; i >= n1; i--) {
+		/* start with me sending all to the left */
+		int j = order[i];
+		if (j >= 0) {
+			/* this is why I run the loop backwards */
+			lastx = x[j];
+			//printf("03:		lastx: %f\n", lastx);
+			switch (y[j]) {
+			case LEFT:
+				if (rp.wt[j] > 0)
+					ll++;
+				llwt += rp.wt[j];
+				break;
+			case RIGHT:
+				if (rp.wt[j] > 0)
+					rl++;
+				rlwt += rp.wt[j];
+				break;
+			default:
+				;
+			}
+		}
+	}
+
+	// printf("ll: %d\n", ll); -> 13
+	// printf("rl: %d\n", rl); -> 87
+
+	if (llwt > rlwt)
+		agree = llwt;
+	else
+		agree = rlwt;
+
+	/*
+	 *  Now we have the total agreement.  Calculate the %agreement and
+	 *    the adjusted agreement
+	 *  For both, do I use the total y vector as my denominator (my
+	 *    preference), or only the y's for non-missing x (CART book)?
+	 *    If the former, need to reset some totals.
+	 */
+	
+		/* worst possible agreement */
+		majority = agree;
+		total_wt = llwt + rlwt;
+	
+
+	lr = rr = 0;
+	lrwt = 0;
+	rrwt = 0;
+	/*
+	 *  March across, moving things from the right to the left
+	 *    the "lastx" code is caring for ties in the x var
+	 *    (The loop above sets it to the first unique x value).
+	 */
+	/* NB: might never set csplit or split */
+	//*csplit = LEFT;
+	csplit_for_cat[0] = LEFT; 
+	/* a valid splitting value */
+	*split = lastx;
+	for (int i = n1; (ll + rl) >= 2; i++) {
+		int j = order[i];
+		//length_of_order_i = length_of_order_i + 1;
+		//printf("order[i]: %d\n", order[i]);
+		if (j >= 0) {
+			/* not a missing value */
+			if ((lr + rr) >= 2 && x[j] != lastx) {
+				/* new x found, evaluate the split */
+				if ((llwt + rrwt) > agree) {
+					success = 1;
+					agree = llwt + rrwt;
+					/* < goes to the right */
+					//*csplit = RIGHT;
+					csplit_for_cat[0] = RIGHT;
+					*split = (x[j] + lastx) / 2;
+				} else if ((lrwt + rlwt) > agree) {
+					success = 1;
+					agree = lrwt + rlwt;
+					//*csplit = LEFT;
+					csplit_for_cat[0] = LEFT;
+					*split = (x[j] + lastx) / 2;
+				}
+			}
+
+			switch (y[j]) {
+			/* update numbers */
+			case LEFT:
+				if (rp.wt[j] > 0) {
+					ll--;
+					lr++;
+				}
+				llwt -= rp.wt[j];
+				lrwt += rp.wt[j];
+				break;
+			case RIGHT:
+				if (rp.wt[j] > 0) {
+					rl--;
+					rr++;
+				}
+				rlwt -= rp.wt[j];
+				rrwt += rp.wt[j];
+				break;
+			default:
+				/* ignore missing y's */
+				;
+			}
+			lastx = x[j];
+		}
+	}
+	} else {
+		
+		int defdir;
+		int lcount = 0;
+		int rcount = 0;
+		for (int i = 0; i < ncat; i++) {
+			left[i] = 0;
+			right[i] = 0;
+			lwt[i] = 0;
+			rwt[i] = 0;
+		}
+
+		/* First step:
+	 *  left = table(x[y goes left]), right= table(x[y goes right])
+	 *  so left[2] will be the number of x == 2's that went left,
+	 *  and lwt[2] the sum of the weights for those observations.
+	 * Only those with weight > 0 count in the totals
+	 */
+	for (int i = n1; i < n2; i++) {
+	    int j = order[i];
+	    if (j >= 0) {
+		int k = (int) x[j] - 1;
+
+		
+		
+		switch (y[j]) {
+		case LEFT:
+		    if (rp.wt[j] > 0)
+			left[k]++;
+			
+		    lwt[k] += rp.wt[j];
+		    break;
+		case RIGHT:
+		    if (rp.wt[j] > 0)
+			right[k]++;
+		    rwt[k] += rp.wt[j];
+		    break;
+		default:;
+		}
+	    }
+	}
+	
+
+	
+
+	
+	
+	
+
+	  /*
+	*  Compute which is better: everyone to the right or left
+	*/
+	llwt = 0;
+	rrwt = 0;
+	for (int i = 0; i < ncat; i++) {
+	    llwt += lwt[i];
+	    rrwt += rwt[i];
+	}
+	if (llwt > rrwt) {
+	    defdir = LEFT;
+	    majority = llwt;
+	} else {
+	    defdir = RIGHT;
+	    majority = rrwt;
+	}
+	total_wt = llwt + rrwt;
+
+	/*
+	 * We can calculate the best split category by category--- send each
+	 *  x value individually to its better direction
+	 */
+	agree = 0.0;
+	for (int i = 0; i < ncat; i++) {
+	    if (left[i] == 0 && right[i] == 0)
+		{
+			csplit_for_cat[i] = 0;
+		}
+	    else {
+		if (lwt[i] < rwt[i] || (lwt[i] == rwt[i] && defdir == RIGHT)) {
+		    agree += rwt[i];
+		    csplit_for_cat[i] = RIGHT;
+		    lcount += left[i];
+		    rcount += right[i];
+		} else {
+		    agree += lwt[i];
+		    csplit_for_cat[i] = LEFT;
+		    lcount += right[i];
+		    rcount += left[i];
+		}
+	    }
+	}
+	
+
+
+	//success = lcount > 1 && rcount > 1; /* sends at least 2 each way */
+	success = lcount > 1 || rcount > 1;
+	
+
+
+	}
+	
+
+
+	/*
+	 * success = 0 means no split was found that had at least 2 sent each
+	 *   way (not counting weights of zero), and for continuous splits also had
+	 *   an improvement in agreement.
+	 *  Due to round-off error such a split could still appear to have adj>0
+	 *  in the calculation further below; avoid this.
+	 */
+
+	if (!success) {
+		*agreement = 0.0;
+		*adj = 0.0;
+		return;
+	}
+
+
+	/*
+     *  Now we have the total agreement.  Calculate the %agreement and
+     *    the adjusted agreement
+     *  For both, do I use the total y vector as my denominator (my
+     *    preference), or only the y's for non-missing x (CART book)?
+     *    If the former, need to reset some totals.
+     */
+    if (rp.sur_agree == 0) {    /* use total table */
+	total_wt = tleft + tright;
+	if (tleft > tright)
+	    majority = tleft;
+	else
+	    majority = tright;
+    }
+
+	*agreement = agree / total_wt;
+	majority /= total_wt;
+	*adj = (*agreement - majority) / (1. - majority);
+	
+}
\ No newline at end of file
diff --git a/src/free_tree.c b/src/free_tree.c
new file mode 100644
index 0000000000000000000000000000000000000000..d5425783f6a75ff1acdee070a400fddd8bc86c37
--- /dev/null
+++ b/src/free_tree.c
@@ -0,0 +1,29 @@
+// This routine is from rpart
+//
+// free up all of the memory associated with a tree
+
+#include "get_surg.h"
+#include "node.h"
+#include "rpartproto.h"
+
+void free_split(pSplit spl) {
+	if (spl) {
+		free_split(spl->nextsplit);
+		free(spl);
+	}
+}
+
+// use freenode if the tree was CALLOC-ed, from xval.c
+void free_tree(pNode node, int freenode) {
+	free_split(node->surrogate);
+	free_split(node->primary);
+
+	if (freenode == 1)
+		if (node)
+			free(node);
+		else {
+			// don't point to things I just freed
+			node->primary = (pSplit) NULL;
+			node->surrogate = (pSplit) NULL;
+		}
+}
\ No newline at end of file
diff --git a/src/get_surg.c b/src/get_surg.c
new file mode 100644
index 0000000000000000000000000000000000000000..2789523389238608a5c04198bad69c3a3dfb4736
--- /dev/null
+++ b/src/get_surg.c
@@ -0,0 +1,261 @@
+// This routine is originally from rpart.
+//
+// The main entry point for R to find surrogate variables for a node.
+//
+// Input variables:
+//      wt      = vector of case weights
+//      xmat    = matrix of continuous variables
+//      opt     = vector of options.  Same order as get_surg.control, as a vector
+//                of doubles.
+//      node    = primary node
+//
+// Returned: a list with elements
+//      dsplit = for each split, numeric variables (doubles)
+//      isplit = for each split, integer variables
+//      isplitcntSum =  for each node, integer variables
+//
+// Naming convention: ... = pointer to an integer vector, ...R = the
+// input R object (SEXP) containing that vector
+
+#define MAINRP
+#include <math.h>
+#include "get_surg.h"
+#include "node.h"
+#include "rpartproto.h"
+
+#ifdef DEBUG
+#include <unistd.h>	//for using the function sleep
+#endif
+
+SEXP getSurrogates(SEXP ncat2, SEXP wt, SEXP xmat, SEXP opt, SEXP var, SEXP split) {
+	
+
+	double *dptr;
+	int *iptr;
+	int i, j, k, n;
+	pNode tree;
+
+	// LCJ
+	int *ncat;
+	int maxcat;
+	int *cat_direction;
+
+	// return objects for R
+	// end in "3" to avoid overlap with internal names
+
+	SEXP dsplit3, isplit3;
+
+	
+	// output variables
+	int nsplit[2];
+	double *ddsplit[3];
+	int *iisplit[3];
+
+
+	//cat_direction = INTEGER(cat_direction2);
+
+	ncat = INTEGER(ncat2);
+
+	int test2 = 5;
+	int *prim_var_num;
+	prim_var_num = INTEGER(var);
+
+	double *splitpoint;
+	double *cat_dir_numeric;
+
+	int *cat_dir_int = (int *) calloc(ncat[prim_var_num[0] - 1] + 1, sizeof(int));
+
+	if (ncat[prim_var_num[0] - 1] == 0)
+	{
+		splitpoint = REAL(split);
+	} 
+	else
+		{
+			cat_dir_numeric = REAL(split);
+			for (i = 0; i < cat_dir_numeric[0] + 1; i++)
+			{
+				cat_dir_int[i] = (int) cat_dir_numeric[i];
+			}
+		}
+
+	// hand over arguments
+
+	rp.numcat = INTEGER(ncat2);
+	rp.n = nrows(xmat);
+	n = rp.n;
+	rp.nvar = ncols(xmat);
+	rp.wt = REAL(wt);
+	iptr = INTEGER(opt);
+	rp.maxsur = (int) iptr[0];
+	rp.sur_agree = (int) iptr[1];
+
+	
+
+
+
+	// create pointers to the matrix
+	// x and missmat are in column major order
+	// y is in row major order
+	dptr = REAL(xmat);
+	rp.xdata = (double **) calloc(rp.nvar, sizeof(double *));
+
+	for (i = 0; i < rp.nvar; i++) {
+		rp.xdata[i] = dptr;
+		dptr += n;
+	}
+
+	// create matrix of sort indices, for surrogate
+	// one for each continuous variable
+	// This sort is "once and for all".
+	// I don't have to sort the categories.
+
+	rp.sorts = (int **) calloc(rp.nvar, sizeof(int *));
+	rp.sorts[0] = (int *) calloc(n * rp.nvar, sizeof(int));
+
+	maxcat = 0;
+
+
+		int *tempvec = (int *) calloc(n, sizeof(int));
+		double *xtemp = (double *) calloc(n, sizeof(double));
+
+		for (i = 0; i < rp.nvar; i++) {
+			rp.sorts[i] = rp.sorts[0] + i * n;
+			for (k = 0; k < n; k++) {
+				if (!R_FINITE(rp.xdata[i][k])) {
+					// this variable is missing (NA)
+					tempvec[k] = -(k + 1);
+					xtemp[k] = 0;
+				} else {
+					tempvec[k] = k;
+					xtemp[k] = rp.xdata[i][k];
+				}
+			}
+			if(ncat[i] == 0)
+			{
+				sort_vec(0, n - 1, xtemp, tempvec, i);
+			} else if(ncat[i] > maxcat)
+			{
+				maxcat = ncat[i];
+			}
+			for (k = 0; k < n; k++)
+			{
+				rp.sorts[i][k] = tempvec[k];
+
+				if (i == 8)
+				{
+					// printf("rp.sorts[i][k]: %d, xtemp[k]: %f\n", rp.sorts[i][k], xtemp[k]);
+				}
+			}	
+			
+		}
+		//clear DMA
+		free(tempvec);
+		free(xtemp);
+
+
+	if (maxcat > 0) {
+		rp.csplit_for_cat = (int *) ALLOC(3 * maxcat, sizeof(int));
+		rp.lwt = (double *) ALLOC(2 * maxcat, sizeof(double));
+		rp.left = rp.csplit_for_cat + maxcat;
+		rp.right = rp.left + maxcat;
+		rp.rwt = rp.lwt + maxcat;
+	} else {
+		rp.csplit_for_cat = (int *) ALLOC(1, sizeof(int));
+	}
+
+	// finalize tree object
+	nodesize = sizeof(Node);
+	tree = (pNode) calloc(1, nodesize);
+
+	//dptr = REAL(node);
+	// the split structure is sized for 2 categories.
+	int splitsize = sizeof(Split);
+	tree->primary = (pSplit) calloc(1, splitsize);
+
+	
+
+	tree->primary->var_num = prim_var_num[0] - 1;
+
+
+	if (rp.numcat[tree->primary->var_num] == 0)
+	{
+		tree->primary->csplit[0] = 1;
+		tree->primary->spoint = splitpoint[0];
+	} else 
+	{
+		for (i = 1; i < cat_dir_int[0]+1; i++)
+		{
+			tree->primary->csplit[i-1] = cat_dir_int[i];
+		}
+	}
+
+	// free(cat_dir_int);
+
+	tree->primary->nextsplit = NULL;
+
+
+/*
+
+	if (rp.numcat[tree->primary->var_num] == 0)
+	{
+		tree->primary->spoint = splitpoint[0];
+	}
+*/
+
+	if (0 < rp.maxsur)
+		surrogate(tree, 0, n);
+
+
+	// Return the body of the tree
+	// For each component we first create a vector to hold the
+	// result, then a ragged array index into the vector.
+	// The rpmatrix routine then fills everything in.
+
+	rpcountup(tree, nsplit);
+	int splitcnt = nsplit[0] + nsplit[1];
+
+	dsplit3 = PROTECT(allocMatrix(REALSXP, splitcnt, 2));
+	dptr = REAL(dsplit3);
+
+	for (i = 0; i < 2; i++) {
+		ddsplit[i] = dptr;
+		dptr += splitcnt;
+		for (j = 0; j < splitcnt; j++) {
+			ddsplit[i][j] = 0.0;
+		}
+	}
+
+
+	isplit3 = PROTECT(allocMatrix(INTSXP, splitcnt, 2));
+	iptr = INTEGER(isplit3);
+
+	for (i = 0; i < 2; i++) {
+		iisplit[i] = iptr;
+		iptr += splitcnt;
+	}
+
+	rpmatrix(tree, rp.numcat, ddsplit, iisplit, nsplit);
+
+	// Create the output list
+	int nout = 2;
+	SEXP rlist = PROTECT(allocVector(VECSXP, nout));
+	SEXP rname = allocVector(STRSXP, nout);
+	setAttrib(rlist, R_NamesSymbol, rname);
+	SET_VECTOR_ELT(rlist, 0, dsplit3);
+	SET_STRING_ELT(rname, 0, mkChar("dsplit"));
+	SET_VECTOR_ELT(rlist, 1, isplit3);
+	SET_STRING_ELT(rname, 1, mkChar("isplit"));
+
+	UNPROTECT(1 + nout);
+
+	// let the memory go
+	free_tree(tree, 0);
+
+	// clear DMA
+	free(rp.xdata);
+	free(rp.sorts[0]);
+	free(rp.sorts);
+	free(tree);
+
+	return rlist;
+}
\ No newline at end of file
diff --git a/src/get_surg.h b/src/get_surg.h
new file mode 100644
index 0000000000000000000000000000000000000000..231f15a0ab37196e6de1927b91547bac3f77a13c
--- /dev/null
+++ b/src/get_surg.h
@@ -0,0 +1,103 @@
+// This routine is originally from rpart.
+//
+// common variables for the rpart routine
+//
+// Start with things that depend on R.h
+
+
+#include <R.h>
+#include <Rinternals.h>
+
+#ifdef ENABLE_NLS
+#include <libintl.h>
+#define _(String) dgettext ("SurrogateMinimalDepth", String)
+#else
+#define _(String) (String)
+#endif
+
+// Header for OpenMP
+#include <omp.h>
+
+// Most compilers with openMP support supply a
+//  pre-defined compiler macro _OPENMP. Following
+//  facilitates selective turning off (by testing
+//  value or defining multiple versions OPENMP_ON1,
+//  OPENMP_ON2...)
+//  note also that there is no actual *need* to
+//  protect #pragmas with #ifdef OPENMP_ON, since C
+//  ignores undefined pragmas, but failing to do so
+//  may produce warnings if openMP is not supported.
+//  In contrast functions from omp.h must be protected.
+
+
+// TODO DEBUG
+//#define DEBUG
+////#ifdef DEBUG
+////Rprintf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
+////#endif
+
+
+#ifdef DEBUG
+//for using the function sleep
+#include <unistd.h>
+// Header for measuring execution time
+//#include "stopwatch.h"
+#endif
+
+#if defined _OPENMP
+#define OPENMP_ON 1
+#endif
+
+
+// Memory defined with R_alloc is removed automatically
+// That with "CALLOC" I have to remove myself. Use the
+// latter for objects that need to persist between the
+// s_to_rp1 and s_to_rp2 calls
+
+#define ALLOC(a,b) R_alloc(a,b)
+#define CALLOC(a,b) R_chk_calloc((size_t)(a), b)
+
+// done with the R internals
+// used for the variable "extra" in nodes
+#define LEFT  (-1)
+#define RIGHT  1
+
+#ifdef MAINRP
+#define EXTERN
+#else
+#define EXTERN extern
+#endif
+
+
+// As a sop to S, I need to keep the total number of external symbols
+// somewhat smaller. So, pack most of them all into a structure.
+
+EXTERN struct {
+	double **xdata;
+	double *wt;
+	// matrix of sort indices
+	int **sorts;
+	// total number of subjects
+	int n;
+	// number of predictors
+	int nvar;
+	// max # of primary or surrogate splits to use
+	int maxsur;
+	// 0 = my style, 1=CART style
+	int sur_agree;
+	// to be allocated by the mainline, of length n
+	int csplit;
+	int *csplit_for_cat;
+	double *lwt;
+	double *rwt;
+	int *left;
+	int *right;
+	int nthreads;
+	int *numcat;
+} rp;
+
+EXTERN int nodesize;
+
+
+// Categorical variables must be coded as 1,2,3, ...., and there may be
+// missing categories. The upper limit is determined on the fly.
diff --git a/src/init.c b/src/init.c
new file mode 100644
index 0000000000000000000000000000000000000000..79629773f2081d485344e11da42b0ab21b889302
--- /dev/null
+++ b/src/init.c
@@ -0,0 +1,23 @@
+// This routine export the c function for R
+#include "get_surg.h"
+#include "R_ext/Rdynload.h"
+#include "node.h"
+#include "rpartproto.h"
+
+SEXP getSurrogates(SEXP ncat2, SEXP wt, SEXP xmat, SEXP opt, SEXP var, SEXP split);
+
+static const R_CallMethodDef CallEntries[] = {
+	// registering native routines http://www.hep.by/gnu/r-patched/r-exts/R-exts_95.html
+	{ "getSurrogates", (DL_FUNC) & getSurrogates, 6 }, { NULL, NULL, 0 }
+};
+
+#include <Rversion.h>
+void
+// registering native routines http://www.hep.by/gnu/r-patched/r-exts/R-exts_95.html
+R_init_SurrogateMinimalDepth(DllInfo * dll) {
+	R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+	R_useDynamicSymbols(dll, FALSE);
+#if defined(R_VERSION) && R_VERSION >= R_Version(2, 16, 0)
+	R_forceSymbols(dll, TRUE);
+#endif
+}
diff --git a/src/insert_split.c b/src/insert_split.c
new file mode 100644
index 0000000000000000000000000000000000000000..09bca80bf7bc0b9d79fb97cd8a5fe6a2397bd2d2
--- /dev/null
+++ b/src/insert_split.c
@@ -0,0 +1,87 @@
+
+// This routine is from rpart
+//
+// sort a new split into a linked list, based on its "improvement"
+//
+//  allocates new memory as needed
+//   returns 0 if the new element isn't good enough,
+//   the address of the new element otherwise
+
+#include "get_surg.h"
+#include "node.h"
+#include "rpartproto.h"
+
+pSplit insert_split(pSplit *listhead, int ncat, double improve, int max) {
+	int nlist;
+	pSplit s1, s2, s3 = NULL, s4;
+
+	if (ncat == 0) {
+		ncat = 1;
+	}
+	int splitsize = sizeof(Split) + (ncat - 20) * sizeof(int);
+
+	// The Split structure is sized for 2 categories.
+	if (*listhead == 0) {
+		// first call to a new list
+		// csplit gets used even for continuous splits.
+		s3 = (pSplit) CALLOC(1, splitsize);
+		s3->nextsplit = NULL;
+		*listhead = s3;
+		return s3;
+	}
+	if (max < 2) {
+		// user asked for only 1 to be retained!
+		s3 = *listhead;
+		if (improve <= s3->improve)
+			return NULL;
+		if (ncat > 1) {
+			Free(s3);
+			s3 = (pSplit) CALLOC(1, splitsize);
+			s3->nextsplit = NULL;
+			*listhead = s3;
+		}
+		return s3;
+	}
+	// set up --- nlist = length of list, s4=last element, s3=next to last
+	nlist = 1;
+	for (s4 = *listhead; s4->nextsplit; s4 = s4->nextsplit) {
+		s3 = s4;
+		nlist++;
+	}
+
+	// now set up so that the "to be added" is between s1 and s2
+	s1 = *listhead;
+	for (s2 = *listhead; s2; s2 = s2->nextsplit) {
+		if (improve > s2->improve)
+			break;
+		s1 = s2;
+	}
+	if (nlist == max) {
+		if (s2 == 0)
+			// not good enough
+			return NULL;
+		if (ncat > 1) {
+			Free(s4);
+			s4 = (pSplit) CALLOC(1, splitsize);
+		}
+
+		// redefine element s4 as new element
+		if (s1 == s3) {
+			s4->nextsplit = NULL;
+		}
+		else {
+			s3->nextsplit = NULL;
+			s4->nextsplit = s2;
+		}
+	} else {
+		// reuse pointer s4 for new element
+		// csplit gets used even for continuous splits.
+		s4 = (pSplit) CALLOC(1, splitsize);
+		s4->nextsplit = s2;
+	}
+	if (s2 == *listhead)
+		*listhead = s4;
+	else
+		s1->nextsplit = s4;
+	return s4;
+}
\ No newline at end of file
diff --git a/src/node.h b/src/node.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dc10d2400e487dd3a03f9361a52c88e5f5a44cb
--- /dev/null
+++ b/src/node.h
@@ -0,0 +1,76 @@
+#ifndef RPART_NODE_H
+#define RPART_NODE_H
+
+// This routine is originally from rpart.
+//
+// definition of a node in the tree
+//
+// The actual size of these structures when allocated in insert_split.c
+// depends on the split.
+// csplit[0] gets used even for continuous splits.
+
+typedef struct split {
+	double improve;
+	// for surrogates only, adjusted agreement
+	double adj;
+	// only used if it is continuous
+	double spoint;
+	struct split *nextsplit;
+	int var_num;
+	int count;
+	// the actual length depends on splitting rule
+	int csplit[20];
+} Split, *pSplit;
+
+typedef struct node {
+	// risk for the node
+	double risk;
+	pSplit primary, surrogate;
+	int lastsurrogate;
+} Node, *pNode;
+
+//
+//  Split:
+//      variable number of the split; 0 = no more surrogates (or primaries)
+//
+//      split point: the actual split point for a continuous
+//
+//      improve:  For primary splits, the improvement index returned by the
+//                bsplit routine. This is the measure that determines the
+//                winning split.
+//                For surrogate splits, this holds the error rate, i.e., the
+//                % incorrect guesses of the primary by using this surrogate.
+//
+//      count: The number of observations split using this variable. For the
+//             first primary, this will = the number of non-missing values.
+//             For surrogates, it will be the number missing in the primary
+//             and all earlier surrogates but not missing on this one. (For
+//             all primaries but the first, the number is theoretical).
+//
+//   adj:  Let "maj" be the %agreement for going with the majority,
+//         and "agree" the %agreement for this surrogate. The
+//         adjusted value is (agree - maj)/(1-maj); the amount of
+//         the potential improvement actually realized. The denominator
+//         for both percents depends on the sur_agree option.
+//
+//      csplit[0]:   For a continuous variable, we also need to know the
+//                   direction of the split. We use this "extra" variable
+//                   as 1: <x to the left, -1: <x to the right.
+//
+//      csplit[]:    For a categorical, the labels are LEFT, RIGHT, and
+//                   0=missing. (Even if a particular category is not empty,
+//                   there may be no subjects from that category present
+//                   at a particular split further down the tree).
+//
+//
+//  Node:
+//      num_obs: Number of observations in the node.
+//
+//      risk: From the eval routine. Estimate of risk, if this node were
+//            terminal.
+//
+//      lastsurrogate: Which direction to send obs for which the primary and
+//              all the surrogates are missing. (The child with the greatest
+//             sum of weights).
+
+#endif
diff --git a/src/rpartproto.h b/src/rpartproto.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3fd1d18c4b25defb35671ec063d7538bfdc31bb
--- /dev/null
+++ b/src/rpartproto.h
@@ -0,0 +1,23 @@
+// This routine is originally from rpart
+//
+// prototypes for all of the functions
+//   This helps the ansi compiler do tight checking.
+//
+
+#include "node.h"
+
+void choose_surg(int n1, int n2, int *y, double *x, int *order, int ncat, double *agreement, double *split, int *csplit_for_cat, double ltot, double rtot, double *adj);
+
+void free_split(pSplit spl);
+
+void free_tree(pNode node, int freenode);
+
+pSplit insert_split(pSplit *listhead, int ncat, double improve, int max);
+
+void sort_vec(int start, int stop, double *x, int *cvec, int var);
+
+void rpcountup(pNode me, int *nsplit);
+
+void rpmatrix(pNode me, int *numcat, double **dsplit, int **isplit,  int *splits);
+
+void surrogate(pNode me, int n1, int n2);
\ No newline at end of file
diff --git a/src/rpcountup.c b/src/rpcountup.c
new file mode 100644
index 0000000000000000000000000000000000000000..519f218aae93bba5ff7377d12b2316d4518caa2c
--- /dev/null
+++ b/src/rpcountup.c
@@ -0,0 +1,27 @@
+// This routine is originally from rpart.
+//
+// count up the number of nodes and splits in the final result
+//
+
+#include "get_surg.h"
+#include "node.h"
+#include "rpartproto.h"
+
+void rpcountup(pNode me, int *nsplit) {
+	int i, j;
+	pSplit ss;
+
+	i = 0;
+	j = 0;
+
+		for (ss = me->primary; ss; ss = ss->nextsplit) {
+			i++;
+		}
+
+		for (ss = me->surrogate; ss; ss = ss->nextsplit) {
+			j++;
+		}
+
+	nsplit[0] = i;
+	nsplit[1] = j;
+}
\ No newline at end of file
diff --git a/src/rpmatrix.c b/src/rpmatrix.c
new file mode 100644
index 0000000000000000000000000000000000000000..03a4e4041ccd3247809e3ebbf38981ce688b76d4
--- /dev/null
+++ b/src/rpmatrix.c
@@ -0,0 +1,64 @@
+// This routine is originally from rpart.
+//
+// For S's usage, convert the linked list data into matrix form
+
+#include "get_surg.h"
+#include "node.h"
+#include "rpartproto.h"
+
+void rpmatrix(pNode me, int *numcat, double **dsplit, int **isplit,  int *nsplit) {
+
+	// dsplit  0: improvement
+	//         1: split point for continuous
+	//         2: surrogate: adjusted agreement,  primary: nothing
+	// isplit  0: variable #
+	//         1: count
+	//         2: continuous: direction -1=left, 1=right
+
+
+	int scnt;
+	int j;
+	pSplit spl;
+
+	scnt = 0;
+
+
+			spl = me->primary;
+			for (scnt = 0; scnt < nsplit[0]; scnt++) {
+				j = spl->var_num;
+				
+				if (numcat[j] == 0)
+				{
+					dsplit[1][scnt] = spl->spoint;
+					isplit[1][scnt] = -1;
+				} else
+				{
+					isplit[1][scnt] = numcat[j];
+				}
+
+				isplit[0][scnt] = spl->var_num + 1;
+				
+
+				spl = spl->nextsplit;
+			}
+
+			spl = me->surrogate;
+			int splitcnt = nsplit[0] + nsplit[1];
+			for (scnt = nsplit[0]; scnt < splitcnt; scnt++) {
+				j = spl->var_num;
+				
+				dsplit[0][scnt] = spl->adj;
+				if (numcat[j] == 0) {
+					dsplit[1][scnt] = spl->spoint;
+					isplit[1][scnt] = -1;
+				} else {
+					isplit[1][scnt] = numcat[j];
+				}
+				isplit[0][scnt] = j + 1;
+				
+
+
+				spl = spl->nextsplit;
+			}
+
+}
\ No newline at end of file
diff --git a/src/sort_vec.c b/src/sort_vec.c
new file mode 100644
index 0000000000000000000000000000000000000000..8307df85f97ec1eefd0ec89ea1cb1a82f4d61832
--- /dev/null
+++ b/src/sort_vec.c
@@ -0,0 +1,131 @@
+/*
+ * This routine is from rpart
+ *
+ * quick sort routine : sort a vector of floats, and carry along an int
+ *
+ *  x:     vector to sort on
+ *  start: first element of x to sort
+ *  stop:  last element of x to sort
+ *  cvec:  a vector to carry along
+ */
+#include "get_surg.h"
+#include "rpartproto.h"
+
+void sort_vec(int start, int stop, double *x, int *cvec, int var) {
+	int i, j, k;
+	double temp, median;
+	int tempd;
+	while (start < stop) {
+		/*
+		 * first-- if the list is short, do an ordinary insertion sort
+		 */
+		if ((stop - start) < 11) {
+			for (i = start + 1; i <= stop; i++) {
+				temp = x[i];
+				tempd = cvec[i];
+				j = i - 1;
+
+				while (j >= start && x[j] > temp) {
+					x[j + 1] = x[j];
+					cvec[j + 1] = cvec[j];
+					j--;
+				}
+				x[j + 1] = temp;
+				cvec[j + 1] = tempd;
+			}
+			return;
+		}
+		/*
+		 * list is longer -- split it into two
+		 * I use the median of 3 values as the split point
+		 */
+		i = start;
+		j = stop;
+		k = (start + stop) / 2;
+
+		median = x[k];
+		if (x[i] >= x[k]) {
+			/* one of j or k is smallest */
+			if (x[j] > x[k]) {
+				/* k is smallest */
+				if (x[i] > x[j])
+					median = x[j];
+				else
+					median = x[i];
+			}
+		} else {
+			if (x[j] < x[k]) {
+				if (x[i] > x[j])
+					median = x[i];
+				else
+					median = x[j];
+			}
+		}
+
+		/*
+		 *  Now actually do the partitioning
+		 *   Because we must have at least one element >= median, "i"
+		 *   will never run over the end of the array.  Similar logic
+		 *   applies to j.
+		 * A note on the use of "<" rather than "<=".  If a list has lots
+		 *   of identical elements, e.g. 80/100 are "3.5", then we will
+		 *   often go to the swap step with x[i]=x[j]=median.  But we will
+		 *   get the pointers i and j to meet approximately in the middle of
+		 *   the list, and that is THE important condition for speed in a
+		 *   quicksort.
+		 *
+		 */
+		while (i < j) {
+			/* top pointer down till it points at something too large */
+			while (x[i] < median)
+				i++;
+
+			/* bottom pointer up until it points at something too small */
+			while (x[j] > median)
+				j--;
+
+			if (i < j) {
+				if (x[i] > x[j]) {
+					/* swap */
+					temp = x[i];
+					x[i] = x[j];
+					x[j] = temp;
+					tempd = cvec[i];
+					cvec[i] = cvec[j];
+					cvec[j] = tempd;
+
+				}
+				i++;
+				j--;
+			}
+		}
+
+		/*
+		 * The while() step helps if there are lots of ties.  It will break
+		 *  the list into 3 parts: < median, ==median, >=median, of which only
+		 *  the top and bottom ones need further attention.
+		 * The ">=" is needed because i may be  == to j
+		 */
+		while (x[i] >= median && i > start)
+			i--;
+		while (x[j] <= median && j < stop)
+			j++;
+
+		/*
+		 * list has been split, now do a recursive call
+		 *   always recur on the shorter list, as this keeps the total
+		 *       depth of nested calls to less than log_base2(n).
+		 */
+		if ((i - start) < (stop - j)) {
+			/* top list is shorter */
+			if ((i - start) > 0)
+				sort_vec(start, i, x, cvec, var);
+			start = j;
+		} else {
+			/* bottom list is shorter */
+			if ((stop - j) > 0)
+				sort_vec(j, stop, x, cvec, var);
+			stop = i;
+		}
+	}
+}
diff --git a/src/surrogate.c b/src/surrogate.c
new file mode 100644
index 0000000000000000000000000000000000000000..db2b4ca02aada745ea747e808b03e6ac92a0ffb2
--- /dev/null
+++ b/src/surrogate.c
@@ -0,0 +1,162 @@
+// This routine is from rpart
+//
+// Calculate the surrogate splits for a node and its primary
+//    (This routine is an awful lot like bsplit)
+//
+// Input :      node
+//              start and stop indices for the arrays (which obs apply)
+//
+// Output:      Fills in the node's
+//                      surrogate splits
+//                      lastsurrogate value
+//
+// Uses:        The global vector tempvec (integer) as a temporary, assumed
+//                to be of length n.
+
+#include "get_surg.h"
+#include "node.h"
+#include "rpartproto.h"
+
+void surrogate(pNode me, int n1, int n2) {
+	int i, j, k;
+	// the primary split variable
+	int var;
+	double split;
+	double improve;
+	// weight sent left and right by primary
+	double lcount, rcount;
+	int extra;
+	pSplit ss;
+	pSplit *ss_list;
+	int *index;
+	int *tempy;
+	int **sorts;
+	double **xdata;
+	double adj_agree;
+	int ncat;
+
+	// DMA 
+	tempy = (int *) calloc(rp.n, sizeof(int));
+	sorts = rp.sorts;
+	xdata = rp.xdata;
+
+	// First construct, in tempy, the "y" variable for this calculation.
+	// It will be LEFT:goes left, 0:missing, RIGHT:goes right.
+	// Count up the number of obs the primary sends to the left, as my
+	// last surrogate (or to the right, if larger).
+
+	var = (me->primary)->var_num;
+
+	// LCJ
+	// continuous variable
+	if (rp.numcat[var] == 0)
+	{
+		split = (me->primary)->spoint;
+		extra = (me->primary)->csplit[0];
+		for (i = n1; i < n2; i++)
+		{	
+			// lcj: j is the variable num out of the sorted matrix of variables "sorts", whereby "sorts" is sorted in ascending order
+			j = sorts[var][i];
+			// printf("j (von primvar == 8): %d\n", j);
+			// -> vgl. j mit logfile branch_2_sorts.txt: variable numbers stimmen überein
+			if (j < 0)
+			{
+				tempy[-(j + 1)] = 0;
+			} else
+			{
+				tempy[j] = (xdata[var][j] < split) ? extra : -extra;
+			}
+		}
+	} else
+	{ /* categorial variable */
+	// lcj: index is a list of directions for the cats, whereby the index of "index" represents the categorial and the corresponding value the direction
+	// example: index = [1, -1, -1, 1]: cat 0 goes 1 (right), cat 1 goes -1 (left), cat 2 goes -1 (left), cat 3 goes 1 (right)
+	index = (me->primary)->csplit;
+	
+	for (i = n1; i < n2; i++) {
+		j = sorts[var][i];
+	    if (j < 0) {
+		tempy[-(j + 1)] = 0;
+		}
+	    else {
+		tempy[j] = index[(int) xdata[var][j] - 1];
+		}
+	
+	}
+	
+	}
+
+
+	lcount = 0;
+	rcount = 0;
+	// lcj: var: PrimVar (see line 48)
+	for (i = n1; i < n2; i++) {
+		j = sorts[var][i];
+		//if (j < 0)
+		//	j = -(j + 1);
+		j = j < 0 ? -(j + 1) : j;
+		switch (tempy[j]) {
+		case LEFT:
+			lcount += rp.wt[j];
+			break;
+		case RIGHT:
+			rcount += rp.wt[j];
+			break;
+		default:
+			break;
+		}
+	}
+	// end parallel section
+
+	if (lcount < rcount)
+		me->lastsurrogate = RIGHT;
+	else {
+		if (lcount > rcount)
+			me->lastsurrogate = LEFT;
+		else
+			// no default
+			me->lastsurrogate = 0;
+	}
+
+	// Now walk through the variables
+	me->surrogate = (pSplit) NULL;
+	//int splitLR = rp.csplit;
+	int nthreads;
+
+		for (i = 0; i < rp.nvar; i++) {
+			if (var == i)
+				continue;
+			ncat = rp.numcat[i];
+
+			
+
+			choose_surg(n1, n2, tempy, xdata[i], sorts[i], ncat, &improve, &split, rp.csplit_for_cat, lcount, rcount, &adj_agree);
+
+			// org comment: was 0
+			if (adj_agree <= 1e-10)
+				// no better than default
+				continue;
+			
+
+			// sort it onto the list of surrogates
+
+			ss = insert_split(&(me->surrogate), ncat, improve, rp.maxsur);
+			if (ss) {
+				ss->improve = improve;
+				ss->var_num = i;
+				// corrected by nodesplit()
+				ss->count = 0;
+				ss->adj = adj_agree;
+				if ( rp.numcat[i] == 0) {
+					ss->spoint = split;
+					ss->csplit[0] = rp.csplit_for_cat[0];
+				} else {
+					for (k = 0; k < rp.numcat[i]; k++) {
+						ss->csplit[k] = rp.csplit_for_cat[k];
+					}
+				}
+				
+			}
+
+		}
+}