From f863ca6007d633ba3a2bf68780854ea8a04dc29b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20G=C3=A4rber?= <florian.gaerber@uni-hamburg.de> Date: Thu, 20 Jul 2023 14:06:14 +0200 Subject: [PATCH] fix: Lint code using `styler::style_pkg()` --- R/SMD_example_data.R | 1 - R/addLayer.R | 38 ++-- R/addSurrogates.R | 151 +++++++------- R/build_clusters.R | 82 ++++---- R/count.surrogates.R | 58 +++--- R/getTreeranger.R | 44 ++--- R/meanAdjAgree.R | 132 +++++++------ R/mindep.R | 132 ++++++------- R/reduce.surrogates.R | 71 ++++--- R/surrmindep.R | 208 ++++++++++---------- R/var.relations.R | 151 +++++++------- R/var.relations.mfi.R | 291 ++++++++++++++------------- R/variable_selection_md.R | 35 ++-- R/variable_selection_mir.R | 393 ++++++++++++++++++++----------------- R/variable_selection_smd.R | 188 +++++++++--------- man/build.clusters.Rd | 56 +++--- man/reduce.surrogates.Rd | 23 ++- man/var.relations.Rd | 11 +- man/var.relations.mfi.Rd | 13 +- man/var.select.md.Rd | 9 +- man/var.select.mir.Rd | 9 +- man/var.select.smd.Rd | 9 +- 22 files changed, 1073 insertions(+), 1032 deletions(-) diff --git a/R/SMD_example_data.R b/R/SMD_example_data.R index ec046b1..8feabec 100644 --- a/R/SMD_example_data.R +++ b/R/SMD_example_data.R @@ -9,7 +9,6 @@ #' the R package WGCNA. The ten variables of each basis variable are labeled: Cp_basicvariable_number. Additional non-correlated and #' independent predictor variables (cgn) were simulated using the standard normal distribution to reach a total number of 200 variables. #' -#' #' @docType data #' @keywords datasets #' @name SMD_example_data diff --git a/R/addLayer.R b/R/addLayer.R index 91be0b1..dae06cf 100644 --- a/R/addLayer.R +++ b/R/addLayer.R @@ -1,6 +1,6 @@ -#'Add layer information to a forest that was created by getTreeranger +#' Add layer information to a forest that was created by getTreeranger #' -#'This functions adds the layer information to each node in a list with trees that was obtained by getTreeranger. +#' This functions adds the layer information to each node in a list with trees that was obtained by getTreeranger. #' #' @param trees list of trees created by getTreeranger #' @return a list with trees. Each row of the list elements corresponds to a node of the respective tree and the columns correspond to: @@ -14,25 +14,23 @@ #' \item layer: layer information (0 means root node, 1 means 1 layer below root, etc) #' } #' @export - -addLayer=function(trees){ - #This function adds the respective layer to the different nodes in a tree. The tree has to be prepared by getTree function - tree.layer=list() - num.trees= length(trees) - for (i in 1:num.trees){ - tree=trees[[i]] - layer=rep(NA,nrow(tree)) - layer[1]=0 - t=1 - while (anyNA(layer)){ - r=unlist(tree[which(layer==(t-1)),2:3]) - layer[r]=t - t=t+1 +addLayer <- function(trees) { + # This function adds the respective layer to the different nodes in a tree. The tree has to be prepared by getTree function + tree.layer <- list() + num.trees <- length(trees) + for (i in 1:num.trees) { + tree <- trees[[i]] + layer <- rep(NA, nrow(tree)) + layer[1] <- 0 + t <- 1 + while (anyNA(layer)) { + r <- unlist(tree[which(layer == (t - 1)), 2:3]) + layer[r] <- t + t <- t + 1 } - tree=cbind(tree,layer) - tree=tree[order(as.numeric(tree[,"layer"])),] - tree.layer[[i]]=tree + tree <- cbind(tree, layer) + tree <- tree[order(as.numeric(tree[, "layer"])), ] + tree.layer[[i]] <- tree } return(tree.layer) } - diff --git a/R/addSurrogates.R b/R/addSurrogates.R index ed3fafd..8116c81 100644 --- a/R/addSurrogates.R +++ b/R/addSurrogates.R @@ -20,32 +20,34 @@ #' \item adj_i: adjusted agreement of variable i #' } #' @export -addSurrogates = function(RF,trees,s,Xdata,num.threads) { - - num.trees = length(trees) - ncat = sapply(sapply(Xdata,levels),length) # determine number of categories (o for continuous variables) - names(ncat) = colnames(Xdata) +addSurrogates <- function(RF, trees, s, Xdata, num.threads) { + num.trees <- length(trees) + ncat <- sapply(sapply(Xdata, levels), length) # determine number of categories (o for continuous variables) + names(ncat) <- colnames(Xdata) if (is.null(num.threads)) { - num.threads = parallel::detectCores() + num.threads <- parallel::detectCores() } if (any(ncat) > 0) { - Xdata[,which(ncat > 0)] = sapply(Xdata[,which(ncat > 0)],unclass) + Xdata[, which(ncat > 0)] <- sapply(Xdata[, which(ncat > 0)], unclass) } - #variables to find surrogates (control file similar as in rpart) - controls = list(maxsurrogate = as.integer(s), sur_agree = 0) + # variables to find surrogates (control file similar as in rpart) + controls <- list(maxsurrogate = as.integer(s), sur_agree = 0) - trees.surr = parallel::mclapply(1:num.trees, - getSurrogate, - mc.cores = num.threads, - maxsurr = s, - surr.par = list(inbag.counts = RF$inbag.counts, - Xdata = Xdata, - controls = controls, - trees = trees, - ncat = ncat)) + trees.surr <- parallel::mclapply(1:num.trees, + getSurrogate, + mc.cores = num.threads, + maxsurr = s, + surr.par = list( + inbag.counts = RF$inbag.counts, + Xdata = Xdata, + controls = controls, + trees = trees, + ncat = ncat + ) + ) return(trees.surr) } @@ -54,20 +56,22 @@ addSurrogates = function(RF,trees,s,Xdata,num.threads) { #' This is an internal function #' #' @keywords internal -getSurrogate = function(surr.par, k = 1, maxsurr) { - #weights and trees are extracted - tree = surr.par$trees[[k]] - column.names = colnames(tree) - n.nodes = nrow(tree) - wt = surr.par$inbag.counts[[k]] - tree.surr = lapply(1:n.nodes, - SurrTree, - wt = wt, - Xdata = surr.par$Xdata, - controls = surr.par$controls, - column.names, tree,maxsurr, - ncat = surr.par$ncat) +getSurrogate <- function(surr.par, k = 1, maxsurr) { + # weights and trees are extracted + tree <- surr.par$trees[[k]] + column.names <- colnames(tree) + n.nodes <- nrow(tree) + wt <- surr.par$inbag.counts[[k]] + tree.surr <- lapply(1:n.nodes, + SurrTree, + wt = wt, + Xdata = surr.par$Xdata, + controls = surr.par$controls, + column.names, tree, maxsurr, + ncat = surr.par$ncat + ) } + #' SurrTree #' #' This is an internal function @@ -75,52 +79,52 @@ getSurrogate = function(surr.par, k = 1, maxsurr) { #' @useDynLib RFSurrogates, .registration = TRUE #' #' @keywords internal -SurrTree = function(j,wt,Xdata,controls,column.names,tree,maxsurr,ncat) { - node = tree[j,] +SurrTree <- function(j, wt, Xdata, controls, column.names, tree, maxsurr, ncat) { + node <- tree[j, ] # for non-terminal nodes get surrogates if (node["status"] == 1) { - #Handover to C - var = as.numeric(node[4]) # extract split variable + # Handover to C + var <- as.numeric(node[4]) # extract split variable - if (ncat[var] == 0) { # extract split information: split point for continuous variables and directions for qualitative variables - split = as.numeric(node[5]) - } else { - right = as.numeric(strsplit(as.character(node[5]), ",")[[1]]) - directions = rep(-1,ncat[var]) - directions[right] = 1 - split = as.numeric(c(ncat[var],directions)) - } + if (ncat[var] == 0) { # extract split information: split point for continuous variables and directions for qualitative variables + split <- as.numeric(node[5]) + } else { + right <- as.numeric(strsplit(as.character(node[5]), ",")[[1]]) + directions <- rep(-1, ncat[var]) + directions[right] <- 1 + split <- as.numeric(c(ncat[var], directions)) + } + surrogate.parameters <- .Call("getSurrogates", + ncat = as.integer(ncat), + wt = as.numeric(wt), + X = as.matrix(Xdata), + controls = as.integer(unlist(controls)), + var = as.integer(var), # node variables + split = as.numeric(split) + ) # split info - surrogate.parameters = .Call("getSurrogates", - ncat = as.integer(ncat), - wt = as.numeric(wt), - X = as.matrix(Xdata), - controls = as.integer(unlist(controls)), - var = as.integer(var), # node variables - split = as.numeric(split)) # split info + if (nrow(surrogate.parameters$isplit) > 1) { + surrogates <- surrogate.parameters$isplit[2:nrow(surrogate.parameters$isplit), 1] + surr.adj <- round(surrogate.parameters$dsplit[2:nrow(surrogate.parameters$dsplit), 1], 2) + node.new <- data.frame(matrix(nrow = 1, ncol = 7 + length(surrogates) + length(surr.adj))) + node.new[, 1:7] <- node[1:7] + node.new[, 8:(7 + length(surrogates) + length(surr.adj))] <- c(surrogates, surr.adj) + surrogate.names <- NULL + adj.names <- NULL + surrogate.names <- sapply(1:length(surrogates), name.surr, surrogate.names) + adj.names <- sapply(1:length(surrogates), name.adj, adj.names) + names(node.new) <- c(column.names, surrogate.names, adj.names) + } - if (nrow(surrogate.parameters$isplit) > 1) { - surrogates = surrogate.parameters$isplit[2:nrow(surrogate.parameters$isplit),1] - surr.adj = round(surrogate.parameters$dsplit[2:nrow(surrogate.parameters$dsplit),1],2) - node.new = data.frame(matrix(nrow = 1, ncol = 7 + length(surrogates) + length(surr.adj))) - node.new[,1:7] = node[1:7] - node.new[,8:(7 + length(surrogates) + length(surr.adj))] = c(surrogates,surr.adj) - surrogate.names = NULL - adj.names = NULL - surrogate.names = sapply(1:length(surrogates),name.surr,surrogate.names) - adj.names = sapply(1:length(surrogates),name.adj,adj.names) - names(node.new) = c(column.names,surrogate.names,adj.names) - } - - if (nrow(surrogate.parameters$isplit) == 1) { - node.new = node - } + if (nrow(surrogate.parameters$isplit) == 1) { + node.new <- node + } } if (node["status"] == 0) { - node.new = node + node.new <- node } -return(node.new) + return(node.new) } #' name.surr @@ -128,9 +132,9 @@ return(node.new) #' This is an internal function #' #' @keywords internal -name.surr = function(i,surrogate.names){ -surrogate.names = c(surrogate.names,paste0("surrogate_",i)) -return(surrogate.names) +name.surr <- function(i, surrogate.names) { + surrogate.names <- c(surrogate.names, paste0("surrogate_", i)) + return(surrogate.names) } #' name.adj @@ -138,8 +142,7 @@ return(surrogate.names) #' This is an internal function #' #' @keywords internal -name.adj = function(i,adj.names){ - adj.names = c(adj.names,paste0("adj_",i)) +name.adj <- function(i, adj.names) { + adj.names <- c(adj.names, paste0("adj_", i)) return(adj.names) } - diff --git a/R/build_clusters.R b/R/build_clusters.R index dcf428b..a1df9b6 100644 --- a/R/build_clusters.R +++ b/R/build_clusters.R @@ -1,7 +1,7 @@ #' Apply cluster analysis to build variable groups #' -#'This function generates variables groups of relation information that was obtained by \link[SurrogateMinimalDepth]{var.relations} function applying -#'\link[linkcomm]{getLinkCommunities}. +#' This function generates variables groups of relation information that was obtained by \link[SurrogateMinimalDepth]{var.relations} function applying +#' \link[linkcomm]{getLinkCommunities}. #' #' @param rel a list containing variables, surr.res, threshold, and var. This is the output of \link[SurrogateMinimalDepth]{var.relations} function. #' @param hcmethod the hierarchical clustering method that is used. (see \link[linkcomm]{getLinkCommunities}) @@ -13,50 +13,52 @@ #' data("SMD_example_data") #' #' \donttest{ -#' # get trees and variable names -#' x = SMD_example_data[,2:ncol(SMD_example_data)] -#' y = SMD_example_data[,1] -#' allvariables = colnames(x)# extract variables names -#' nvar = length(allvariables) # count number of variables -#' set.seed(42) -#' RF = ranger::ranger( -#' data = SMD_example_data, -#' dependent.variable.name = "y", -#' num.trees = 10, -#' keep.inbag = TRUE, -#' mtry = floor(nvar^(3/4)), -#' min.node.size = 1, -#' num.threads = 1) -#' trees = getTreeranger(RF = RF, num.trees = 10) -#' trees.lay = addLayer(trees) -#' trees.surr = addSurrogates(RF = RF, trees = trees.lay, s = 10, Xdata = x, num.threads = 1) +#' # get trees and variable names +#' x <- SMD_example_data[, 2:ncol(SMD_example_data)] +#' y <- SMD_example_data[, 1] +#' allvariables <- colnames(x) # extract variables names +#' nvar <- length(allvariables) # count number of variables +#' set.seed(42) +#' RF <- ranger::ranger( +#' data = SMD_example_data, +#' dependent.variable.name = "y", +#' num.trees = 10, +#' keep.inbag = TRUE, +#' mtry = floor(nvar^(3 / 4)), +#' min.node.size = 1, +#' num.threads = 1 +#' ) +#' trees <- getTreeranger(RF = RF, num.trees = 10) +#' trees.lay <- addLayer(trees) +#' trees.surr <- addSurrogates(RF = RF, trees = trees.lay, s = 10, Xdata = x, num.threads = 1) #' -#' # investigate variable relations -#' rel=var.relations( -#' x = data.frame(), -#' create.forest = FALSE, -#' forest = list(trees = trees.surr, allvariables = allvariables), -#' variables = allvariables, -#' candidates = allvariables, -#' t = 10, -#' num.threads = 1) -#' groups = build.clusters(rel) +#' # investigate variable relations +#' rel <- var.relations( +#' x = data.frame(), +#' create.forest = FALSE, +#' forest = list(trees = trees.surr, allvariables = allvariables), +#' variables = allvariables, +#' candidates = allvariables, +#' t = 10, +#' num.threads = 1 +#' ) +#' groups <- build.clusters(rel) #' } #' #' @export - - -build.clusters = function(rel,hcmethod = "ward.D") { - +build.clusters <- function(rel, hcmethod = "ward.D") { # create links matrix - links = matrix(nrow = length(unlist(rel$var)) ,ncol = 2) - links[,1] = unlist(sapply(1:length(rel$variables), function(x) {rep(rel$variables[x],length(rel$var[[x]]))})) - links[,2] = unlist(rel$var) + links <- matrix(nrow = length(unlist(rel$var)), ncol = 2) + links[, 1] <- unlist(sapply(1:length(rel$variables), function(x) { + rep(rel$variables[x], length(rel$var[[x]])) + })) + links[, 2] <- unlist(rel$var) # cluster analysis to identify clusters - link.comm = linkcomm::getLinkCommunities(links, - hcmethod = hcmethod, - plot = FALSE) - cluster.info = link.comm$nodeclusters + link.comm <- linkcomm::getLinkCommunities(links, + hcmethod = hcmethod, + plot = FALSE + ) + cluster.info <- link.comm$nodeclusters return(cluster.info) } diff --git a/R/count.surrogates.R b/R/count.surrogates.R index 670c461..18658a8 100644 --- a/R/count.surrogates.R +++ b/R/count.surrogates.R @@ -1,30 +1,30 @@ -#'Count surrogate variables +#' Count surrogate variables #' -#'This function counts surrogate variables and returns the total average number of surrogate variables and the average number of surrogate variables of the respective layers. -#'This is necessary since the actual number of surrogate splits can be lower than the predefined number (when less surrogate splits outperform the majority rule). +#' This function counts surrogate variables and returns the total average number of surrogate variables and the average number of surrogate variables of the respective layers. +#' This is necessary since the actual number of surrogate splits can be lower than the predefined number (when less surrogate splits outperform the majority rule). #' -#'@param trees list of trees that was generated by getTreeranger function and layers, surrogate variables, and adjusted agreement values were added by addLayer and getSurrogates functions +#' @param trees list of trees that was generated by getTreeranger function and layers, surrogate variables, and adjusted agreement values were added by addLayer and getSurrogates functions #' @return List with the following components: #' \itemize{ #' \item s.a: total average number of surrogate variables #' \item s.l: average number of surrogate variables in the respective layers #' } #' @export -count.surrogates=function(trees){ - num.trees=length(trees) - surrogates.trees=lapply(1:num.trees,scount,trees) +count.surrogates <- function(trees) { + num.trees <- length(trees) + surrogates.trees <- lapply(1:num.trees, scount, trees) - s.a=mean(sapply(surrogates.trees,"[[","s.a")) - s.l.list=lapply(surrogates.trees,"[[","s.l") + s.a <- mean(sapply(surrogates.trees, "[[", "s.a")) + s.l.list <- lapply(surrogates.trees, "[[", "s.l") s.l.all <- matrix(NA, nrow = 1000, ncol = num.trees) - for (u in 1:num.trees){ - s.l.tree=s.l.list[[u]][,2] - s.l.all[1:length(s.l.tree),u]=s.l.tree + for (u in 1:num.trees) { + s.l.tree <- s.l.list[[u]][, 2] + s.l.all[1:length(s.l.tree), u] <- s.l.tree } - s.l=rowMeans(s.l.all,na.rm = TRUE) - names(s.l)=c(0:999) + s.l <- rowMeans(s.l.all, na.rm = TRUE) + names(s.l) <- c(0:999) - return(list(s.a=s.a,s.l=s.l)) + return(list(s.a = s.a, s.l = s.l)) } #' scount @@ -32,19 +32,19 @@ count.surrogates=function(trees){ #' This is an internal function #' #' @keywords internal -scount=function(i=1,trees){ - tree=trees[[i]] - nonterminal.nodes=tree[which(sapply(tree,"[[","status")==1)] - s.a=(mean(sapply((lapply(nonterminal.nodes,"[",-c(1:7))),length)))/2 - maxlayer=as.numeric(unlist(nonterminal.nodes[length(nonterminal.nodes)])["layer"]) - s.l=matrix(NA,maxlayer+1,2) - colnames(s.l)=c("layer","No. Surrogates") - s.l[,1]=0:maxlayer - for (u in 0:maxlayer){ - nodes.at.layer=nonterminal.nodes[which(sapply(nonterminal.nodes,"[[","layer")==u)] - surr=lapply(nodes.at.layer,"[",-c(1:7)) - s.u=(mean(sapply(surr,length)))/2 - s.l[u+1,2]=s.u +scount <- function(i = 1, trees) { + tree <- trees[[i]] + nonterminal.nodes <- tree[which(sapply(tree, "[[", "status") == 1)] + s.a <- (mean(sapply((lapply(nonterminal.nodes, "[", -c(1:7))), length))) / 2 + maxlayer <- as.numeric(unlist(nonterminal.nodes[length(nonterminal.nodes)])["layer"]) + s.l <- matrix(NA, maxlayer + 1, 2) + colnames(s.l) <- c("layer", "No. Surrogates") + s.l[, 1] <- 0:maxlayer + for (u in 0:maxlayer) { + nodes.at.layer <- nonterminal.nodes[which(sapply(nonterminal.nodes, "[[", "layer") == u)] + surr <- lapply(nodes.at.layer, "[", -c(1:7)) + s.u <- (mean(sapply(surr, length))) / 2 + s.l[u + 1, 2] <- s.u } - return(list(s.a=s.a,s.l=s.l)) + return(list(s.a = s.a, s.l = s.l)) } diff --git a/R/getTreeranger.R b/R/getTreeranger.R index e190c64..2e71d7b 100644 --- a/R/getTreeranger.R +++ b/R/getTreeranger.R @@ -1,6 +1,6 @@ -#'Get a list of structured trees for ranger +#' Get a list of structured trees for ranger #' -#'This functions creates a list of trees for ranger objects similar as getTree function does for random Forest objects. +#' This functions creates a list of trees for ranger objects similar as getTree function does for random Forest objects. #' #' @param RF random forest object created by ranger (with keep.inbag=TRUE) #' @param num.trees number of trees @@ -14,38 +14,36 @@ #' \item status: "0" for terminal and "1" for non-terminal #' } #' @export - - - -getTreeranger=function(RF,num.trees) { - trees=lapply(1:num.trees,getsingletree,RF=RF) +getTreeranger <- function(RF, num.trees) { + trees <- lapply(1:num.trees, getsingletree, RF = RF) return(trees) } - #' getsingletree #' #' This is an internal function #' #' @keywords internal -getsingletree=function(RF,k=1){ +getsingletree <- function(RF, k = 1) { # here we use the treeInfo function of the ranger package to create extract the trees, in an earlier version this was done with a self implemented function - tree.ranger = ranger::treeInfo(RF,tree = k) - ktree=data.frame(as.numeric(tree.ranger$nodeID+1), - as.numeric(tree.ranger$leftChild+1), - as.numeric(tree.ranger$rightChild+1), - as.numeric(tree.ranger$splitvarID+1), - tree.ranger$splitval, - tree.ranger$terminal) - if (is.factor(ktree[,5])) { - ktree[,5] = as.character(levels(ktree[,5] ))[ktree[,5]] + tree.ranger <- ranger::treeInfo(RF, tree = k) + ktree <- data.frame( + as.numeric(tree.ranger$nodeID + 1), + as.numeric(tree.ranger$leftChild + 1), + as.numeric(tree.ranger$rightChild + 1), + as.numeric(tree.ranger$splitvarID + 1), + tree.ranger$splitval, + tree.ranger$terminal + ) + if (is.factor(ktree[, 5])) { + ktree[, 5] <- as.character(levels(ktree[, 5]))[ktree[, 5]] } - ktree[,6] = as.numeric(ktree[,6] == FALSE) + ktree[, 6] <- as.numeric(ktree[, 6] == FALSE) - for (i in 2:4) { - ktree[,i][is.na(ktree[,i])] = 0 - } - colnames(ktree)=c("nodeID","leftdaughter","rightdaughter","splitvariable","splitpoint","status") + for (i in 2:4) { + ktree[, i][is.na(ktree[, i])] <- 0 + } + colnames(ktree) <- c("nodeID", "leftdaughter", "rightdaughter", "splitvariable", "splitpoint", "status") return(ktree) } diff --git a/R/meanAdjAgree.R b/R/meanAdjAgree.R index fd14fb4..f9a67a1 100644 --- a/R/meanAdjAgree.R +++ b/R/meanAdjAgree.R @@ -1,6 +1,6 @@ -#'Calculate mean adjusted agreement to investigate variables relations +#' Calculate mean adjusted agreement to investigate variables relations #' -#'This is the main function of var.relations function. +#' This is the main function of var.relations function. #' #' @param trees list of trees created by getTreeranger, addLayer and addSurrogate. #' @param variables vector of variable names. @@ -18,39 +18,42 @@ #' \item surr.var: binary matrix showing if the variables are related (1) or non-related (0) with variables in rows and candidates in columns. #' } #' @export - - -meanAdjAgree=function(trees,variables,allvariables,candidates,t,s.a,select.var,num.threads = NULL){ - num.trees=length(trees) - index.variables=match(variables,allvariables) - index.candidates = match(candidates,allvariables) +meanAdjAgree <- function(trees, variables, allvariables, candidates, t, s.a, select.var, num.threads = NULL) { + num.trees <- length(trees) + index.variables <- match(variables, allvariables) + index.candidates <- match(candidates, allvariables) if (is.null(num.threads)) { - num.threads = parallel::detectCores() + num.threads <- parallel::detectCores() } - list.res = rlist::list.flatten(parallel::mclapply(trees, - surr.tree, - mc.cores = num.threads, - variables, - index.variables, - allvariables, - index.candidates)) + list.res <- rlist::list.flatten(parallel::mclapply(trees, + surr.tree, + mc.cores = num.threads, + variables, + index.variables, + allvariables, + index.candidates + )) - results.allvar = matrix(unlist(lapply(1:length(index.variables), - mean.index, - list.res, - index.variables)), - ncol=length(candidates),nrow=length(variables),byrow = TRUE) - colnames(results.allvar)=candidates - rownames(results.allvar)=variables + results.allvar <- matrix( + unlist(lapply( + 1:length(index.variables), + mean.index, + list.res, + index.variables + )), + ncol = length(candidates), nrow = length(variables), byrow = TRUE + ) + colnames(results.allvar) <- candidates + rownames(results.allvar) <- variables - if(select.var) { + if (select.var) { # calculate threshold and select variables according to it - adj.mean=mean(unlist(lapply((1:num.trees),adj.mean.trees,trees)),na.rm = TRUE) - threshold=((s.a*adj.mean)/(length(allvariables)-1))*t - SurrVar=ifelse(results.allvar>threshold, 1, 0) - result=list(surr.res=results.allvar,threshold=threshold,surr.var=SurrVar,variables=variables) + adj.mean <- mean(unlist(lapply((1:num.trees), adj.mean.trees, trees)), na.rm = TRUE) + threshold <- ((s.a * adj.mean) / (length(allvariables) - 1)) * t + SurrVar <- ifelse(results.allvar > threshold, 1, 0) + result <- list(surr.res = results.allvar, threshold = threshold, surr.var = SurrVar, variables = variables) } else { - result=list(surr.res=results.allvar,variables=variables) + result <- list(surr.res = results.allvar, variables = variables) } return(result) } @@ -60,14 +63,14 @@ meanAdjAgree=function(trees,variables,allvariables,candidates,t,s.a,select.var,n #' This is an internal function #' #' @keywords internal -mean.index=function(i, list.res,index.variables){ - list = list.res[which(names(list.res) == index.variables[i])] - mean.list = round(Reduce("+",list)/length(list),2) +mean.index <- function(i, list.res, index.variables) { + list <- list.res[which(names(list.res) == index.variables[i])] + mean.list <- round(Reduce("+", list) / length(list), 2) if (length(mean.list) > 0) { - mean.list[index.variables[i]] = NA - return(mean.list) + mean.list[index.variables[i]] <- NA + return(mean.list) } else { - return(rep(NA,length(index.variables))) + return(rep(NA, length(index.variables))) } } @@ -76,14 +79,14 @@ mean.index=function(i, list.res,index.variables){ #' This is an internal function #' #' @keywords internal -surr.tree=function(tree,variables,index.variables,allvariables,index.candidates){ - allvar.num = length(allvariables) - nonterminal.nodes = tree[which(sapply(tree,"[[","status")==1)] - relevant.nodes = nonterminal.nodes[sapply(nonterminal.nodes,"[[","splitvariable") %in% index.variables] +surr.tree <- function(tree, variables, index.variables, allvariables, index.candidates) { + allvar.num <- length(allvariables) + nonterminal.nodes <- tree[which(sapply(tree, "[[", "status") == 1)] + relevant.nodes <- nonterminal.nodes[sapply(nonterminal.nodes, "[[", "splitvariable") %in% index.variables] if (length(relevant.nodes) > 0) { - list.nodes = lapply(1:length(relevant.nodes),adj.node,allvar.num,relevant.nodes,index.candidates) - splitvar = sapply(relevant.nodes,"[[","splitvariable") - names(list.nodes) = splitvar + list.nodes <- lapply(1:length(relevant.nodes), adj.node, allvar.num, relevant.nodes, index.candidates) + splitvar <- sapply(relevant.nodes, "[[", "splitvariable") + names(list.nodes) <- splitvar return(list.nodes) } } @@ -94,13 +97,13 @@ surr.tree=function(tree,variables,index.variables,allvariables,index.candidates) #' This is an internal function #' #' @keywords internal -adj.node = function(i,allvar.num,relevant.nodes,index.candidates) { - node = relevant.nodes[i] - adjnode = rep(0,allvar.num) - surr=unlist(sapply(node,"[",-c(1:7))) # extract surrogates - if ((length(node[[1]]))>7){ - s=(length(surr))/2 - adjnode[surr[1:s]]=surr[(s+1):(2*s)] +adj.node <- function(i, allvar.num, relevant.nodes, index.candidates) { + node <- relevant.nodes[i] + adjnode <- rep(0, allvar.num) + surr <- unlist(sapply(node, "[", -c(1:7))) # extract surrogates + if ((length(node[[1]])) > 7) { + s <- (length(surr)) / 2 + adjnode[surr[1:s]] <- surr[(s + 1):(2 * s)] } return(adjnode[index.candidates]) } @@ -112,9 +115,8 @@ adj.node = function(i,allvar.num,relevant.nodes,index.candidates) { #' This is an internal function #' #' @keywords internal -adj.mean=function(trees){ - adj.trees=sapply(1:length(trees),adj.mean.trees,trees) - +adj.mean <- function(trees) { + adj.trees <- sapply(1:length(trees), adj.mean.trees, trees) } #' adj.mean.trees @@ -122,13 +124,13 @@ adj.mean=function(trees){ #' This is an internal function #' #' @keywords internal -adj.mean.trees=function(t,trees){ - tree=trees[[t]] - nonterminal.nodes=tree[which(sapply(tree,"[[","status")==1)] - surr.nonterminal=lapply(nonterminal.nodes,"[",-c(1:7)) - adj.tree=mean(unlist(lapply(1:length(surr.nonterminal),mean.adj.node,surr.nonterminal)),na.rm = TRUE) +adj.mean.trees <- function(t, trees) { + tree <- trees[[t]] + nonterminal.nodes <- tree[which(sapply(tree, "[[", "status") == 1)] + surr.nonterminal <- lapply(nonterminal.nodes, "[", -c(1:7)) + adj.tree <- mean(unlist(lapply(1:length(surr.nonterminal), mean.adj.node, surr.nonterminal)), na.rm = TRUE) if (adj.tree == "NaN") { - adj.tree = NA + adj.tree <- NA } return(adj.tree) } @@ -138,14 +140,14 @@ adj.mean.trees=function(t,trees){ #' This is an internal function #' #' @keywords internal -mean.adj.node=function(m,surr.nonterminal){ - surr=surr.nonterminal[[m]] - if (length(surr)!=0){ - num.surr=length(surr)/2 - adj=surr[(num.surr+1):(2*num.surr)] +mean.adj.node <- function(m, surr.nonterminal) { + surr <- surr.nonterminal[[m]] + if (length(surr) != 0) { + num.surr <- length(surr) / 2 + adj <- surr[(num.surr + 1):(2 * num.surr)] } - if (length(surr)==0){ - adj=NA + if (length(surr) == 0) { + adj <- NA } return(adj) } diff --git a/R/mindep.R b/R/mindep.R index 45eed49..39da6ad 100644 --- a/R/mindep.R +++ b/R/mindep.R @@ -1,6 +1,6 @@ -#'Execute minimal depth variable importance +#' Execute minimal depth variable importance #' -#'This function determines the minimal depth of variables from a forest that is created by getTreeranger, and addLayer functions. +#' This function determines the minimal depth of variables from a forest that is created by getTreeranger, and addLayer functions. #' #' @param variables vector of variable names #' @param trees list of trees that was generated by getTreeranger and layers functions @@ -11,80 +11,76 @@ #' \item threshold: the threshold that is used for the selection #' } #' @export +mindep <- function(variables, trees) { + num.trees <- length(trees) + # This function determines the minimal depth of variables from a tree with layers that is created by getTree and addLayer functions. + # It is based on this paper: Ishwaran et. al. Journal of the American Statistical Accociation 2010 and used the conservative marginal + # approximation to the minimal depth distribution -mindep=function(variables,trees){ + var.num <- length(variables) - num.trees= length(trees) -# This function determines the minimal depth of variables from a tree with layers that is created by getTree and addLayer functions. -# It is based on this paper: Ishwaran et. al. Journal of the American Statistical Accociation 2010 and used the conservative marginal -# approximation to the minimal depth distribution + # prepare matrix for mindepth + mindepth <- matrix(NA, nrow = num.trees, ncol = var.num) + colnames(mindepth) <- variables -var.num=length(variables) - -#prepare matrix for mindepth -mindepth=matrix(NA,nrow=num.trees,ncol=var.num) -colnames(mindepth)=variables - -MAX.DEPTH=10000 -#maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step -maxdepth=rep(NA,num.trees) -nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = num.trees) -# get mindepth for every variable in every tree -for (i in 1:num.trees){ -nodesAtDepth <- rep(NA, MAX.DEPTH) -tree=trees[[i]] -tree=tree[order(as.numeric(tree[,"layer"])),] -# get layer information of the variables and save it in minimal depth file -depth.tree=rep(NA,length(variables)) - o=1 - while (anyNA(depth.tree) && o<=nrow(tree)){ - if (tree[o,"status"]==1){ - if (is.na(depth.tree[tree[o,"splitvariable"]])) { - depth.tree[tree[o,"splitvariable"]]=tree[o,"layer"] + MAX.DEPTH <- 10000 + # maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step + maxdepth <- rep(NA, num.trees) + nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = num.trees) + # get mindepth for every variable in every tree + for (i in 1:num.trees) { + nodesAtDepth <- rep(NA, MAX.DEPTH) + tree <- trees[[i]] + tree <- tree[order(as.numeric(tree[, "layer"])), ] + # get layer information of the variables and save it in minimal depth file + depth.tree <- rep(NA, length(variables)) + o <- 1 + while (anyNA(depth.tree) && o <= nrow(tree)) { + if (tree[o, "status"] == 1) { + if (is.na(depth.tree[tree[o, "splitvariable"]])) { + depth.tree[tree[o, "splitvariable"]] <- tree[o, "layer"] + } } - } - o=o+1 - } - #variables with no split in the tree get maxdepth+1 as minimal depth - depth.tree[which(is.na(depth.tree))]=tree[nrow(tree),"layer"] - #save min and max depth information for every tree - mindepth[i,]=depth.tree - maxdepth[i]=tree[nrow(tree),"layer"] - #find the number of nodes in every layer - - for (u in 1:(maxdepth[i])){ + o <- o + 1 + } + # variables with no split in the tree get maxdepth+1 as minimal depth + depth.tree[which(is.na(depth.tree))] <- tree[nrow(tree), "layer"] + # save min and max depth information for every tree + mindepth[i, ] <- depth.tree + maxdepth[i] <- tree[nrow(tree), "layer"] + # find the number of nodes in every layer - nodesAtDepth[u]=nrow(subset(tree, tree[,"layer"] == u & tree[,"status"] == 1 )) + for (u in 1:(maxdepth[i])) { + nodesAtDepth[u] <- nrow(subset(tree, tree[, "layer"] == u & tree[, "status"] == 1)) + } + nodesAtDepthMatrix[, i] <- nodesAtDepth } - nodesAtDepthMatrix[,i]=nodesAtDepth -} - -# create mean values for the minimal depth of different variables -mean.depth=colMeans(mindepth) -# determine the mean depth of an uninformative variable called D star in Ishwaran et. al. Journal of the American Statistical -# Accociation 2010 (treshhold for important variables) + # create mean values for the minimal depth of different variables + mean.depth <- colMeans(mindepth) -#the following determination of the treshold is taken from max.subtree of the randomForestSRC package -treeHeight =maxdepth -avgTreeHeight = mean(treeHeight, na.rm=TRUE) -maxTreeHeight = max(treeHeight, na.rm=TRUE) -nodes.at.depth.avg = apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE) -l = nodes.at.depth.avg[1:avgTreeHeight] -minDepthStatObj = randomForestSRC:::minDepthStat(var.num, l = l) -threshold <- minDepthStatObj$first.moment + # determine the mean depth of an uninformative variable called D star in Ishwaran et. al. Journal of the American Statistical + # Accociation 2010 (treshhold for important variables) + # the following determination of the treshold is taken from max.subtree of the randomForestSRC package + treeHeight <- maxdepth + avgTreeHeight <- mean(treeHeight, na.rm = TRUE) + maxTreeHeight <- max(treeHeight, na.rm = TRUE) + nodes.at.depth.avg <- apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE) + l <- nodes.at.depth.avg[1:avgTreeHeight] + minDepthStatObj <- randomForestSRC:::minDepthStat(var.num, l = l) + threshold <- minDepthStatObj$first.moment -# Decide if variables are important or unimportant -Importances=rep(NA,var.num) -for (p in 1:var.num){ -if (mean.depth[p]<threshold) { - Importances[p]=1} - else { - Importances[p]=0} -} -names(Importances)=variables -results=list(depth=mean.depth,selected=Importances,threshold=threshold) -return(results) + # Decide if variables are important or unimportant + Importances <- rep(NA, var.num) + for (p in 1:var.num) { + if (mean.depth[p] < threshold) { + Importances[p] <- 1 + } else { + Importances[p] <- 0 + } + } + names(Importances) <- variables + results <- list(depth = mean.depth, selected = Importances, threshold = threshold) + return(results) } - diff --git a/R/reduce.surrogates.R b/R/reduce.surrogates.R index 18abf0f..1635e94 100644 --- a/R/reduce.surrogates.R +++ b/R/reduce.surrogates.R @@ -1,7 +1,7 @@ #' Reduce surrogate variables in a random forest. #' -#'This function can be applied to reduce the surrogate variables in a forest that is created by getTreeranger, addLayer -#'and getSurrogates functions. Hence, it can be applied to the forests that were used for surrogate minimal depth variable importance. +#' This function can be applied to reduce the surrogate variables in a forest that is created by getTreeranger, addLayer +#' and getSurrogates functions. Hence, it can be applied to the forests that were used for surrogate minimal depth variable importance. #' #' @param forest a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions. #' @param s number of surrogate variables in the new forest (have to be less than in the RF in trees) @@ -16,63 +16,62 @@ #' ###### use result of SMD variable importance and reduce surrogate variables to 10 #' # select variables with smd variable importance (usually more trees are needed) #' set.seed(42) -#' res = var.select.smd( -#' x = as.data.frame(SMD_example_data[,2:ncol(SMD_example_data)]), -#' y = SMD_example_data[,1], +#' res <- var.select.smd( +#' x = as.data.frame(SMD_example_data[, 2:ncol(SMD_example_data)]), +#' y = SMD_example_data[, 1], #' s = 100, #' num.trees = 10, -#' num.threads = 1) -#' forest.new = reduce.surrogates(forest = res$forest, s = 10) +#' num.threads = 1 +#' ) +#' forest.new <- reduce.surrogates(forest = res$forest, s = 10) #' #' # execute SMD on tree with reduced number of surrogates -#' res.new = var.select.smd( +#' res.new <- var.select.smd( #' x = data.frame(), #' create.forest = FALSE, #' forest = forest.new, -#' num.threads = 1) +#' num.threads = 1 +#' ) #' res.new$var #' #' #' # investigate variable relations -#' rel = var.relations( +#' rel <- var.relations( #' x = data.frame(), #' create.forest = FALSE, #' forest = forest.new, -#' variables=c("X1","X7"), +#' variables = c("X1", "X7"), #' candidates = res$forest[["allvariables"]][1:100], #' t = 5, -#' num.threads = 1) +#' num.threads = 1 +#' ) #' rel$var -#'} +#' } #' @export - - -reduce.surrogates = function(forest, s = 10){ - - trees = forest[["trees"]] - num.trees = length(trees) - trees.new = lapply(1:num.trees, less.surrogates.trees, trees,s) - forest.new = list(trees = trees.new, allvariables = forest[["allvariables"]]) -return(forest.new) +reduce.surrogates <- function(forest, s = 10) { + trees <- forest[["trees"]] + num.trees <- length(trees) + trees.new <- lapply(1:num.trees, less.surrogates.trees, trees, s) + forest.new <- list(trees = trees.new, allvariables = forest[["allvariables"]]) + return(forest.new) } -less.surrogates.trees = function(i = 1, trees,s){ - tree = trees[[i]] - n.nodes = length(tree) - surr.now = sapply((lapply(tree,"[",-c(1:7))),length)/2 - surr.next = surr.now - surr.next[which(surr.now >= s)] = s - tree.new = lapply(1:n.nodes, less.surrogates.node, tree, surr.next,surr.now) +less.surrogates.trees <- function(i = 1, trees, s) { + tree <- trees[[i]] + n.nodes <- length(tree) + surr.now <- sapply((lapply(tree, "[", -c(1:7))), length) / 2 + surr.next <- surr.now + surr.next[which(surr.now >= s)] <- s + tree.new <- lapply(1:n.nodes, less.surrogates.node, tree, surr.next, surr.now) return(tree.new) } - -less.surrogates.node = function(j=1, tree,surr.next, surr.now){ - node = tree[[j]] +less.surrogates.node <- function(j = 1, tree, surr.next, surr.now) { + node <- tree[[j]] if (length(node) == 7) { - node.new = node + node.new <- node } if (length(node) > 7) { -node.new = node[c(1:(7 + surr.next[j]), (8 + surr.now[j]):(7 + surr.now[j] + surr.next[j]))] -} -return(node.new) + node.new <- node[c(1:(7 + surr.next[j]), (8 + surr.now[j]):(7 + surr.now[j] + surr.next[j]))] + } + return(node.new) } diff --git a/R/surrmindep.R b/R/surrmindep.R index 3dd2ac7..7246f28 100644 --- a/R/surrmindep.R +++ b/R/surrmindep.R @@ -1,6 +1,6 @@ -#'Execute surrogate minimal depth variable importance +#' Execute surrogate minimal depth variable importance #' -#'This function determines the surrogate minimal depth of variables from a forest that is created by getTreeranger, addLayer and getSurrogates functions. +#' This function determines the surrogate minimal depth of variables from a forest that is created by getTreeranger, addLayer and getSurrogates functions. #' #' @param forest a list containing allvariables and trees. Allvariables is a vector of all variable names in the original data set (strings). Trees is a list of trees that was generated by getTreeranger, addLayer, and getSurrogates functions. #' @param s.l Number of average surrogate variables in the respective layers. (use count.surrogate function to get it) @@ -11,118 +11,112 @@ #' \item threshold: the threshold that is used for the selection #' } #' @export - -surrmindep = function(forest, s.l){ - -variables = forest[["allvariables"]] -trees = forest[["trees"]] - -num.trees = length(trees) -var.num = length(variables) - -#prepare matrix for mindepth -mindepth = matrix(NA, nrow = num.trees, ncol = var.num) -colnames(mindepth) = variables - -MAX.DEPTH = 10000 -#maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step -maxdepth = rep(NA,num.trees) -nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = num.trees) -# get mindepth for every variable in every tree -for (i in 1:num.trees) { -nodesAtDepth <- rep(NA, MAX.DEPTH) -tree = trees[[i]] -# get layer information of the variables and save it in minimal depth file -depth.tree = rep(NA,length(variables)) - o = 1 - while (anyNA(depth.tree) && o <= length(tree)) { - node = unlist(tree[o]) - if (node["status"] == 1) { - if (is.na(depth.tree[node["splitvariable"]])) { - depth.tree[as.numeric(node["splitvariable"])] = as.numeric(node["layer"]) - } - if (length(node) > 7) { - for (r in 8:(7 + (length(node) - 7)/2)) { - if (is.na(depth.tree[as.numeric(node[r])])) { - depth.tree[as.numeric(node[r])] = as.numeric(node["layer"]) - } - } +surrmindep <- function(forest, s.l) { + variables <- forest[["allvariables"]] + trees <- forest[["trees"]] + + num.trees <- length(trees) + var.num <- length(variables) + + # prepare matrix for mindepth + mindepth <- matrix(NA, nrow = num.trees, ncol = var.num) + colnames(mindepth) <- variables + + MAX.DEPTH <- 10000 + # maximal Depth of trees and the number of nodes in every layer is saved to calculate treshold in a following step + maxdepth <- rep(NA, num.trees) + nodesAtDepthMatrix <- matrix(NA, nrow = MAX.DEPTH, ncol = num.trees) + # get mindepth for every variable in every tree + for (i in 1:num.trees) { + nodesAtDepth <- rep(NA, MAX.DEPTH) + tree <- trees[[i]] + # get layer information of the variables and save it in minimal depth file + depth.tree <- rep(NA, length(variables)) + o <- 1 + while (anyNA(depth.tree) && o <= length(tree)) { + node <- unlist(tree[o]) + if (node["status"] == 1) { + if (is.na(depth.tree[node["splitvariable"]])) { + depth.tree[as.numeric(node["splitvariable"])] <- as.numeric(node["layer"]) + } + if (length(node) > 7) { + for (r in 8:(7 + (length(node) - 7) / 2)) { + if (is.na(depth.tree[as.numeric(node[r])])) { + depth.tree[as.numeric(node[r])] <- as.numeric(node["layer"]) + } + } + } } + o <- o + 1 } - o = o + 1 - } - #variables with no split in the tree get maxdepth as minimal depth - depth.tree[which(is.na(depth.tree))] = as.numeric(unlist(tree[length(tree)])["layer"]) - #save min and max depth information for every tree - mindepth[i,] = depth.tree - maxdepth[i] = as.numeric(unlist(tree[length(tree)])["layer"]) - #find the number of nodes in every layer - - laystat = cbind(sapply(tree,"[[","layer"),sapply(tree,"[[","status")) - colnames(laystat) = c("layer","status") - for (u in 1:(maxdepth[i])) { - - nodesAtDepth[u] = nrow(subset(laystat, laystat[,"layer"] == u & laystat[,"status"] == 1 )) + # variables with no split in the tree get maxdepth as minimal depth + depth.tree[which(is.na(depth.tree))] <- as.numeric(unlist(tree[length(tree)])["layer"]) + # save min and max depth information for every tree + mindepth[i, ] <- depth.tree + maxdepth[i] <- as.numeric(unlist(tree[length(tree)])["layer"]) + # find the number of nodes in every layer + + laystat <- cbind(sapply(tree, "[[", "layer"), sapply(tree, "[[", "status")) + colnames(laystat) <- c("layer", "status") + for (u in 1:(maxdepth[i])) { + nodesAtDepth[u] <- nrow(subset(laystat, laystat[, "layer"] == u & laystat[, "status"] == 1)) + } + nodesAtDepthMatrix[, i] <- nodesAtDepth } - nodesAtDepthMatrix[,i] = nodesAtDepth -} -# create mean values for the minimal depth of different variables -mean.depth = colMeans(mindepth) + # create mean values for the minimal depth of different variables + mean.depth <- colMeans(mindepth) -# determine the mean depth of an uninformative variable similarly as in Ishwaran et. al. Journal of the American Statistical -# Accociation 2010 + # determine the mean depth of an uninformative variable similarly as in Ishwaran et. al. Journal of the American Statistical + # Accociation 2010 -treeHeight = maxdepth -avgTreeHeight = mean(treeHeight, na.rm = TRUE) -maxTreeHeight = max(treeHeight, na.rm = TRUE) + treeHeight <- maxdepth + avgTreeHeight <- mean(treeHeight, na.rm = TRUE) + maxTreeHeight <- max(treeHeight, na.rm = TRUE) -# threshold has to be calculated different when trees are small -if ((avgTreeHeight) < 2) { - s.l.root = s.l[1] - p.root = (s.l.root + 1)/var.num - p.1 = 1 - p.root - threshold = p.root*0 + p.1*1 - warning("Trees are very small! Threshold is defined based on trees with only root nodes.") -} - - - -if ((avgTreeHeight) >= 2) { -nodes.at.depth.avg = apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE) -s.l.noroot = s.l[-1] -s.l.root = s.l[1] -p.root = (s.l.root + 1)/var.num -var.at.depth = nodes.at.depth.avg[1:(avgTreeHeight - 1)]*(s.l.noroot[1:(avgTreeHeight - 1)] + 1) # 1 time for the original tree and s-times for - # the surrogates - #-1 since the last layer will be added later -p.depth = var.at.depth/var.num -p.all = c(p.root,p.depth) -prob.sum = 0 -probs.used = NULL -for (u in 1:length(p.all)) { - p.u = p.all[u] - prob.sum = prob.sum + p.u - if (prob.sum >= 1) break - probs.used[u] = p.u -} -prob.last = 1 - sum(probs.used) # the last layer (that was excluded before) is now used as minimal depth when the variable doesnt appear before -probs.used = c(probs.used,prob.last) -layers = c(0:(length(probs.used) - 1)) -threshold = sum(layers*probs.used) -} + # threshold has to be calculated different when trees are small + if ((avgTreeHeight) < 2) { + s.l.root <- s.l[1] + p.root <- (s.l.root + 1) / var.num + p.1 <- 1 - p.root + threshold <- p.root * 0 + p.1 * 1 + warning("Trees are very small! Threshold is defined based on trees with only root nodes.") + } + if ((avgTreeHeight) >= 2) { + nodes.at.depth.avg <- apply(nodesAtDepthMatrix, 1, mean, na.rm = TRUE) + s.l.noroot <- s.l[-1] + s.l.root <- s.l[1] + p.root <- (s.l.root + 1) / var.num + var.at.depth <- nodes.at.depth.avg[1:(avgTreeHeight - 1)] * (s.l.noroot[1:(avgTreeHeight - 1)] + 1) # 1 time for the original tree and s-times for + # the surrogates + #-1 since the last layer will be added later + p.depth <- var.at.depth / var.num + p.all <- c(p.root, p.depth) + prob.sum <- 0 + probs.used <- NULL + for (u in 1:length(p.all)) { + p.u <- p.all[u] + prob.sum <- prob.sum + p.u + if (prob.sum >= 1) break + probs.used[u] <- p.u + } + prob.last <- 1 - sum(probs.used) # the last layer (that was excluded before) is now used as minimal depth when the variable doesnt appear before + probs.used <- c(probs.used, prob.last) + layers <- c(0:(length(probs.used) - 1)) + threshold <- sum(layers * probs.used) + } -# Decide if variables are important or unimportant -Importances = rep(NA,var.num) -for (p in 1:var.num) { -if (mean.depth[p] < threshold) { - Importances[p] = 1} - else { - Importances[p] = 0} -} -names(Importances) = variables -results = list(depth = mean.depth,selected = Importances,threshold = threshold) -return(results) + # Decide if variables are important or unimportant + Importances <- rep(NA, var.num) + for (p in 1:var.num) { + if (mean.depth[p] < threshold) { + Importances[p] <- 1 + } else { + Importances[p] <- 0 + } + } + names(Importances) <- variables + results <- list(depth = mean.depth, selected = Importances, threshold = threshold) + return(results) } - diff --git a/R/var.relations.R b/R/var.relations.R index f67f644..fbb31de 100644 --- a/R/var.relations.R +++ b/R/var.relations.R @@ -1,9 +1,9 @@ #' Investigate variable relations of a specific variable with mean adjusted agreement #' -#'This function uses the mean adjusted agreement to select variables that are related to a defined variable using a threshold T. -#'The parameter t is used to calculate T: t=1 means that every variable with higher probability than "by chance" is identified -#'as "important". t=2 means the probability has to be twice, etc. -#'Based on the threshold a vector is created containing the related variables. +#' This function uses the mean adjusted agreement to select variables that are related to a defined variable using a threshold T. +#' The parameter t is used to calculate T: t=1 means that every variable with higher probability than "by chance" is identified +#' as "important". t=2 means the probability has to be twice, etc. +#' Based on the threshold a vector is created containing the related variables. #' #' @param variables variable names (string) for which related variables should be searched for (has to be contained in allvariables) #' @param candidates vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables) @@ -23,30 +23,30 @@ #' @examples #' # read data #' data("SMD_example_data") -#' x = SMD_example_data[,2:ncol(SMD_example_data)] -#' y = SMD_example_data[,1] +#' x <- SMD_example_data[, 2:ncol(SMD_example_data)] +#' y <- SMD_example_data[, 1] #' \donttest{ #' # calculate variable relations #' set.seed(42) -#' res = var.relations( +#' res <- var.relations( #' x = x, #' y = y, #' s = 10, #' num.trees = 10, -#' variables = c("X1","X7"), +#' variables = c("X1", "X7"), #' candidates = colnames(x)[1:100], #' t = 5, -#' num.threads = 1) +#' num.threads = 1 +#' ) #' res$var #' } #' #' @export - -var.relations = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, - num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, - save.memory = FALSE, case.weights = NULL, - variables, candidates, t = 5, select.rel = TRUE) { - if(!is.data.frame(x)){ +var.relations <- function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, + num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, + save.memory = FALSE, case.weights = NULL, + variables, candidates, t = 5, select.rel = TRUE) { + if (!is.data.frame(x)) { stop("x has to be a data frame") } if (create.forest) { @@ -59,34 +59,33 @@ var.relations = function(x = NULL, y = NULL, num.trees = 500, type = "regression stop("missing values are not allowed") } - allvariables = colnames(x)# extract variables names - nvar = length(allvariables) # count number of variables + allvariables <- colnames(x) # extract variables names + nvar <- length(allvariables) # count number of variables ## set global parameters if (is.null(mtry)) { - mtry = floor((nvar)^(3/4)) + mtry <- floor((nvar)^(3 / 4)) } if (mtry == "sqrt") { - mtry = floor(sqrt(nvar)) + mtry <- floor(sqrt(nvar)) } if (mtry == "0.5") { - mtry = floor(0.5*(nvar)) + mtry <- floor(0.5 * (nvar)) } if (mtry == "^3/4") { - mtry = floor((nvar)^(3/4)) + mtry <- floor((nvar)^(3 / 4)) } - if (is.null(s)) { - s = ceiling(nvar*0.01) + s <- ceiling(nvar * 0.01) } if (s > (nvar - 2)) { - s = nvar - 1 + s <- nvar - 1 warning("s was set to the maximum number that is reasonable (variables-1) ") } if (type == "classification") { - y = as.factor(y) + y <- as.factor(y) if (length(levels(y)) > 15) { stop("Too much classes defined, classification might be the wrong choice") } @@ -95,31 +94,33 @@ var.relations = function(x = NULL, y = NULL, num.trees = 500, type = "regression stop("use factor variable for y only for classification! ") } - data = data.frame(y, x) + data <- data.frame(y, x) if (type == "survival") { if (is.null(status)) { stop("a status variable named status has to be given for survival analysis") } - data$status = status - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, - case.weights = case.weights) + data$status <- status + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, + case.weights = case.weights + ) } if (type == "classification" | type == "regression") { - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights) - + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights + ) } - trees = getTreeranger(RF = RF,num.trees = num.trees) - trees.lay = addLayer(trees) + trees <- getTreeranger(RF = RF, num.trees = num.trees) + trees.lay <- addLayer(trees) rm(trees) - ###AddSurrogates### - trees.surr = addSurrogates(RF = RF,trees = trees.lay,s = s,Xdata = data[,-1], num.threads = num.threads) + ### AddSurrogates### + trees.surr <- addSurrogates(RF = RF, trees = trees.lay, s = s, Xdata = data[, -1], num.threads = num.threads) rm(trees.lay) - forest = list(trees = trees.surr, allvariables = colnames(data[,-1])) - + forest <- list(trees = trees.surr, allvariables = colnames(data[, -1])) } if (!create.forest) { @@ -127,48 +128,50 @@ var.relations = function(x = NULL, y = NULL, num.trees = 500, type = "regression stop("set create.forest to TRUE or analyze an existing random forest specified by parameter forest") } } - trees = forest[["trees"]] - allvariables = forest[["allvariables"]] + trees <- forest[["trees"]] + allvariables <- forest[["allvariables"]] if (all(candidates %in% allvariables)) { - if (all(variables %in% allvariables)) { - # count surrogates - s = count.surrogates(trees) - results.meanAdjAgree = meanAdjAgree(trees, variables, allvariables, candidates, t = t, s$s.a, select.var = select.rel,num.threads = num.threads) - } else { - stop("allvariables do not contain the chosen variables") - } + if (all(variables %in% allvariables)) { + # count surrogates + s <- count.surrogates(trees) + results.meanAdjAgree <- meanAdjAgree(trees, variables, allvariables, candidates, t = t, s$s.a, select.var = select.rel, num.threads = num.threads) + } else { + stop("allvariables do not contain the chosen variables") + } } else { stop("allvariables do not contain the candidate variables") } - if(select.rel) { - surr.var = results.meanAdjAgree$surr.var - varlist = list() - for (i in 1:nrow(surr.var)) { - surr.var.var = surr.var[i,] - if (anyNA(surr.var.var)) { - surr.var.var = surr.var.var[-which(is.na(surr.var.var))] - } - var = names(surr.var.var[surr.var.var == 1]) - name = variables[i] - varlist[[name]] = var - } - - var = names(surr.var[surr.var == 1]) - if (save.ranger) { - return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res, - threshold = results.meanAdjAgree$threshold, var = varlist, ranger = RF)) - } else { - return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res, - threshold = results.meanAdjAgree$threshold, var = varlist)) - } + if (select.rel) { + surr.var <- results.meanAdjAgree$surr.var + varlist <- list() + for (i in 1:nrow(surr.var)) { + surr.var.var <- surr.var[i, ] + if (anyNA(surr.var.var)) { + surr.var.var <- surr.var.var[-which(is.na(surr.var.var))] + } + var <- names(surr.var.var[surr.var.var == 1]) + name <- variables[i] + varlist[[name]] <- var + } - } else { + var <- names(surr.var[surr.var == 1]) if (save.ranger) { - return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res,ranger = RF)) - + return(list( + variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res, + threshold = results.meanAdjAgree$threshold, var = varlist, ranger = RF + )) + } else { + return(list( + variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res, + threshold = results.meanAdjAgree$threshold, var = varlist + )) + } } else { - return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res)) - } + if (save.ranger) { + return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res, ranger = RF)) + } else { + return(list(variables = results.meanAdjAgree$variables, surr.res = results.meanAdjAgree$surr.res)) + } } } diff --git a/R/var.relations.mfi.R b/R/var.relations.mfi.R index 043c9c4..ee64c21 100644 --- a/R/var.relations.mfi.R +++ b/R/var.relations.mfi.R @@ -1,6 +1,6 @@ #' Investigate variable relations of a specific variable with mutual forest impact (corrected mean adjusted agreement). #' -#'This function corrects the mean adjusted agreement by a permutation approach and generates the relation parameter mutual forest impact. Subsequently p-values are determined and related variables are selected. +#' This function corrects the mean adjusted agreement by a permutation approach and generates the relation parameter mutual forest impact. Subsequently p-values are determined and related variables are selected. #' #' @param variables variable names (string) for which related variables should be searched for (has to be contained in allvariables) #' @param candidates vector of variable names (strings) that are candidates to be related to the variables (has to be contained in allvariables) @@ -20,35 +20,33 @@ #' \item ranger: ranger objects. #' \item method: Method to compute p-values: "janitza" or "permutation". #' \item p.t: p.value threshold for selection of related variables -#' -#' #' } #' @examples #' # read data #' data("SMD_example_data") -#' x = SMD_example_data[,2:ncol(SMD_example_data)] -#' y = SMD_example_data[,1] +#' x <- SMD_example_data[, 2:ncol(SMD_example_data)] +#' y <- SMD_example_data[, 1] #' \donttest{ #' # calculate variable relations #' set.seed(42) -#' res = var.relations.mfi( +#' res <- var.relations.mfi( #' x = x, #' y = y, #' s = 10, #' num.trees = 10, -#' variables = c("X1","X7"), +#' variables = c("X1", "X7"), #' candidates = colnames(x)[1:100], -#' num.threads = 1) +#' num.threads = 1 +#' ) #' res$var.rel[[1]] #' } #' #' @export - -var.relations.mfi = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, - num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, - save.memory = FALSE, case.weights = NULL, - variables, candidates, p.t = 0.01, select.rel = TRUE, method = "janitza") { - if(!is.data.frame(x)){ +var.relations.mfi <- function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, + num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, + save.memory = FALSE, case.weights = NULL, + variables, candidates, p.t = 0.01, select.rel = TRUE, method = "janitza") { + if (!is.data.frame(x)) { stop("x has to be a data frame") } if (create.forest) { @@ -61,34 +59,33 @@ var.relations.mfi = function(x = NULL, y = NULL, num.trees = 500, type = "regres stop("missing values are not allowed") } - allvariables = colnames(x)# extract variables names - nvar = length(allvariables) # count number of variables + allvariables <- colnames(x) # extract variables names + nvar <- length(allvariables) # count number of variables ## set global parameters if (is.null(mtry)) { - mtry = floor((nvar)^(3/4)) + mtry <- floor((nvar)^(3 / 4)) } if (mtry == "sqrt") { - mtry = floor(sqrt(nvar)) + mtry <- floor(sqrt(nvar)) } if (mtry == "0.5") { - mtry = floor(0.5*(nvar)) + mtry <- floor(0.5 * (nvar)) } if (mtry == "^3/4") { - mtry = floor((nvar)^(3/4)) + mtry <- floor((nvar)^(3 / 4)) } - if (is.null(s)) { - s = ceiling(nvar*0.01) + s <- ceiling(nvar * 0.01) } if (s > (nvar - 2)) { - s = nvar - 1 + s <- nvar - 1 warning("s was set to the maximum number that is reasonable (variables-1) ") } if (type == "classification") { - y = as.factor(y) + y <- as.factor(y) if (length(levels(y)) > 15) { stop("Too much classes defined, classification might be the wrong choice") } @@ -98,49 +95,56 @@ var.relations.mfi = function(x = NULL, y = NULL, num.trees = 500, type = "regres } # create shadow variables to correct the relation - x_perm = data.frame(lapply(1:ncol(x),permute.variable,x=x)) - colnames(x_perm) = paste(allvariables,"_perm", sep = "") + x_perm <- data.frame(lapply(1:ncol(x), permute.variable, x = x)) + colnames(x_perm) <- paste(allvariables, "_perm", sep = "") - data = data.frame(y, x) - data_perm = data.frame(y, x_perm) + data <- data.frame(y, x) + data_perm <- data.frame(y, x_perm) if (type == "survival") { if (is.null(status)) { stop("a status variable named status has to be given for survival analysis") } - data$status = status - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry, min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, - case.weights = case.weights, respect.unordered.factors = "partition") - data_perm$status = status - RF_perm = ranger::ranger(data = data_perm,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, - case.weights = case.weights, respect.unordered.factors = "partition") + data$status <- status + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, + case.weights = case.weights, respect.unordered.factors = "partition" + ) + data_perm$status <- status + RF_perm <- ranger::ranger( + data = data_perm, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, + case.weights = case.weights, respect.unordered.factors = "partition" + ) } if (type == "classification" | type == "regression") { - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition") - - RF_perm = ranger::ranger(data = data_perm,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition") - + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition" + ) + + RF_perm <- ranger::ranger( + data = data_perm, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition" + ) } - trees = getTreeranger(RF = RF,num.trees = num.trees) - trees.lay = addLayer(trees) + trees <- getTreeranger(RF = RF, num.trees = num.trees) + trees.lay <- addLayer(trees) rm(trees) - ###AddSurrogates### - trees.surr = addSurrogates(RF = RF,trees = trees.lay,s = s,Xdata = data[,-1], num.threads = num.threads) + ### AddSurrogates### + trees.surr <- addSurrogates(RF = RF, trees = trees.lay, s = s, Xdata = data[, -1], num.threads = num.threads) rm(trees.lay) - forest = list(trees = trees.surr, allvariables = colnames(data[,-1])) + forest <- list(trees = trees.surr, allvariables = colnames(data[, -1])) # do the same for the permutation forrest - trees_perm = getTreeranger(RF = RF_perm,num.trees = num.trees) - trees.lay_perm = addLayer(trees_perm) + trees_perm <- getTreeranger(RF = RF_perm, num.trees = num.trees) + trees.lay_perm <- addLayer(trees_perm) rm(trees_perm) - ###AddSurrogates### - trees.surr_perm = addSurrogates(RF = RF_perm,trees = trees.lay_perm,s = s,Xdata = data_perm[,-1], num.threads = num.threads) + ### AddSurrogates### + trees.surr_perm <- addSurrogates(RF = RF_perm, trees = trees.lay_perm, s = s, Xdata = data_perm[, -1], num.threads = num.threads) rm(trees.lay_perm) - forest_perm = list(trees = trees.surr_perm, allvariables = colnames(data_perm[,-1])) + forest_perm <- list(trees = trees.surr_perm, allvariables = colnames(data_perm[, -1])) } if (!create.forest) { @@ -150,56 +154,57 @@ var.relations.mfi = function(x = NULL, y = NULL, num.trees = 500, type = "regres } if (all(candidates %in% allvariables)) { - if (all(variables %in% allvariables)) { - # count surrogates - s = count.surrogates(forest$trees) - rel = meanAdjAgree(forest$trees, variables = allvariables, allvariables = allvariables, candidates = allvariables, - t = t, s$s.a, select.var = FALSE, num.threads = num.threads) - - allvariables_perm = colnames(x_perm) - - rel_perm = meanAdjAgree(forest_perm$trees, variables = allvariables_perm , allvariables = allvariables_perm , candidates = allvariables_perm, - t = t, s$s.a, select.var = FALSE, num.threads = num.threads) - - } else { - stop("allvariables do not contain the chosen variables") - } + if (all(variables %in% allvariables)) { + # count surrogates + s <- count.surrogates(forest$trees) + rel <- meanAdjAgree(forest$trees, + variables = allvariables, allvariables = allvariables, candidates = allvariables, + t = t, s$s.a, select.var = FALSE, num.threads = num.threads + ) + + allvariables_perm <- colnames(x_perm) + + rel_perm <- meanAdjAgree(forest_perm$trees, + variables = allvariables_perm, allvariables = allvariables_perm, candidates = allvariables_perm, + t = t, s$s.a, select.var = FALSE, num.threads = num.threads + ) + } else { + stop("allvariables do not contain the chosen variables") + } } else { stop("allvariables do not contain the candidate variables") } - adj.agree = rel$surr.res - adj.agree.perm = rel_perm$surr.res - diag(adj.agree) = diag(adj.agree.perm) = 1 + adj.agree <- rel$surr.res + adj.agree.perm <- rel_perm$surr.res + diag(adj.agree) <- diag(adj.agree.perm) <- 1 - if(anyNA(adj.agree)) { - no.na = length(which(rowSums(is.na(adj.agree)) != 0 )) + if (anyNA(adj.agree)) { + no.na <- length(which(rowSums(is.na(adj.agree)) != 0)) warning(paste0("Relations for ", no.na, " original variables were not calculated because they were never used as a primary split. Affected relations are set to 0. ")) - adj.agree[which(is.na(adj.agree))] = 0 + adj.agree[which(is.na(adj.agree))] <- 0 } - if(anyNA(adj.agree.perm)) { - no.na = length(which(rowSums(is.na(adj.agree.perm)) != 0 )) + if (anyNA(adj.agree.perm)) { + no.na <- length(which(rowSums(is.na(adj.agree.perm)) != 0)) warning(paste0("Relations for ", no.na, " permuted variables were not calculated because they were not used as a primary split. Affected relations are set to 0. ")) - adj.agree.perm[which(is.na(adj.agree.perm))] = 0 + adj.agree.perm[which(is.na(adj.agree.perm))] <- 0 } - adj.agree.corr = adj.agree - adj.agree.perm[1:nvar,1:nvar] - diag(adj.agree.corr) = diag(adj.agree) = diag(adj.agree.perm) = NA + adj.agree.corr <- adj.agree - adj.agree.perm[1:nvar, 1:nvar] + diag(adj.agree.corr) <- diag(adj.agree) <- diag(adj.agree.perm) <- NA - adj.agree.corr.var = adj.agree.corr[variables,candidates] + adj.agree.corr.var <- adj.agree.corr[variables, candidates] if (select.rel) { - if (method == "janitza") { - - adj.agree.1 = adj.agree.corr - diag(adj.agree.1) = 1 + adj.agree.1 <- adj.agree.corr + diag(adj.agree.1) <- 1 ## Mirrored VIMP (# This part is taken from ranger function) - m1 = adj.agree.1[adj.agree.1 < 0] - m2 = adj.agree.1[adj.agree.1 == 0] - null.rel = c(m1, -m1, m2) + m1 <- adj.agree.1[adj.agree.1 < 0] + m2 <- adj.agree.1[adj.agree.1 == 0] + null.rel <- c(m1, -m1, m2) if (length(m1) == 0) { stop("No negative importance values found for selection of related variables. Consider the 'permutation' approach.") @@ -208,70 +213,74 @@ var.relations.mfi = function(x = NULL, y = NULL, num.trees = 500, type = "regres warning("Only few negative importance values found for selection of related variables, inaccurate p-values. Consider the 'permutation' approach.") } - rel.p = lapply(1:length(variables),p.relation, - null.rel = null.rel, - adj.agree.corr = adj.agree.corr.var, - candidates = candidates, - variables = variables) - sel.rel = lapply(1:length(variables),select.related, - rel.p, - p.t) - - names(rel.p) = names(sel.rel) = variables - + rel.p <- lapply(1:length(variables), p.relation, + null.rel = null.rel, + adj.agree.corr = adj.agree.corr.var, + candidates = candidates, + variables = variables + ) + sel.rel <- lapply( + 1:length(variables), select.related, + rel.p, + p.t + ) + + names(rel.p) <- names(sel.rel) <- variables } if (method == "permutation") { + null.rel.plus <- as.vector(adj.agree.perm) + null.rel.plus <- null.rel.plus[!is.na(null.rel.plus)] - null.rel.plus = as.vector(adj.agree.perm) - null.rel.plus = null.rel.plus[!is.na(null.rel.plus)] - - m1 = null.rel.plus[null.rel.plus > 0] - m2 = null.rel.plus[null.rel.plus == 0] - null.rel = c(m1, -m1, m2) + m1 <- null.rel.plus[null.rel.plus > 0] + m2 <- null.rel.plus[null.rel.plus == 0] + null.rel <- c(m1, -m1, m2) - if (length(null.rel) < 100) { - warning("Only few null relations used. P-values could be inaccurate.") - } + if (length(null.rel) < 100) { + warning("Only few null relations used. P-values could be inaccurate.") + } - rel.p = lapply(1:length(variables),p.relation, - null.rel = null.rel, - adj.agree.corr = adj.agree.corr.var, - candidates = candidates, - variables = variables) + rel.p <- lapply(1:length(variables), p.relation, + null.rel = null.rel, + adj.agree.corr = adj.agree.corr.var, + candidates = candidates, + variables = variables + ) - sel.rel = lapply(1:length(variables),select.related, - rel.p, - p.t) + sel.rel <- lapply( + 1:length(variables), select.related, + rel.p, + p.t + ) - names(rel.p) = names(sel.rel) = variables - } + names(rel.p) <- names(sel.rel) <- variables + } - if (save.ranger) { - return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, - p.rel = rel.p, var.rel = sel.rel, ranger = list(RF = RF,RF_perm = RF_perm), method = method, p.t = p.t)) - } else { - return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, p.rel = rel.p, var.rel = sel.rel, method = method, p.t = p.t)) - } - } else { if (save.ranger) { - return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, ranger = list(RF = RF,RF_perm = RF_perm))) + return(list( + variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, + p.rel = rel.p, var.rel = sel.rel, ranger = list(RF = RF, RF_perm = RF_perm), method = method, p.t = p.t + )) + } else { + return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, p.rel = rel.p, var.rel = sel.rel, method = method, p.t = p.t)) + } } else { - return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm)) - } + if (save.ranger) { + return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm, ranger = list(RF = RF, RF_perm = RF_perm))) + } else { + return(list(variables = variables, surr.res = adj.agree.corr.var, surr.perm = adj.agree.perm)) + } } } - - #' permute.variable #' #' This is an internal function #' #' @keywords internal -permute.variable=function(i=1,x){ - var.perm = sample(x[,i],nrow(x)) +permute.variable <- function(i = 1, x) { + var.perm <- sample(x[, i], nrow(x)) return(var.perm) } @@ -280,15 +289,15 @@ permute.variable=function(i=1,x){ #' This is an internal function #' #' @keywords internal -p.relation = function(l = 1, - null.rel, - adj.agree.corr, - candidates, - variables) { - relations = adj.agree.corr[l,] +p.relation <- function(l = 1, + null.rel, + adj.agree.corr, + candidates, + variables) { + relations <- adj.agree.corr[l, ] pval <- 1 - ranger:::numSmaller(relations, null.rel) / length(null.rel) - names(pval) = candidates - pval[variables[l]] = NA + names(pval) <- candidates + pval[variables[l]] <- NA return(pval) } @@ -297,9 +306,9 @@ p.relation = function(l = 1, #' This is an internal function #' #' @keywords internal -select.related = function(m=1, - rel.p, - p.t) { - rel.var = rel.p[[m]] +select.related <- function(m = 1, + rel.p, + p.t) { + rel.var <- rel.p[[m]] names(which(rel.var <= p.t)) } diff --git a/R/variable_selection_md.R b/R/variable_selection_md.R index d847fcf..c8a7c44 100644 --- a/R/variable_selection_md.R +++ b/R/variable_selection_md.R @@ -36,7 +36,7 @@ #' #' \item ranger: ranger object #' -#'} +#' } #' @examples #' # read data #' data("SMD_example_data") @@ -44,32 +44,31 @@ #' \donttest{ #' # select variables (usually more trees are needed) #' set.seed(42) -#' res = var.select.md( -#' x = SMD_example_data[,2:ncol(SMD_example_data)], -#' y = SMD_example_data[,1], num.trees = 10, num.threads = 1) +#' res <- var.select.md( +#' x = SMD_example_data[, 2:ncol(SMD_example_data)], +#' y = SMD_example_data[, 1], num.trees = 10, num.threads = 1 +#' ) #' res$var #' } #' -#'@references +#' @references ##' \itemize{ ##' \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103} ##' \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf} -##' } +##' } #' #' @export - -var.select.md = function(x = NULL, y = NULL, num.trees = 500, type = "regression", mtry = NULL, min.node.size = 1, num.threads = NULL, - status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, save.memory = FALSE, case.weights = NULL) { - - results.smd = var.select.smd(x = x, y = y ,num.trees = num.trees,type = type, mtry = mtry,min.node.size = min.node.size, num.threads = num.threads - ,status = status, save.ranger = save.ranger, s = 0, create.forest = create.forest, forest = forest, - save.memory = save.memory, case.weights = case.weights) +var.select.md <- function(x = NULL, y = NULL, num.trees = 500, type = "regression", mtry = NULL, min.node.size = 1, num.threads = NULL, + status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, save.memory = FALSE, case.weights = NULL) { + results.smd <- var.select.smd( + x = x, y = y, num.trees = num.trees, type = type, mtry = mtry, min.node.size = min.node.size, num.threads = num.threads, + status = status, save.ranger = save.ranger, s = 0, create.forest = create.forest, forest = forest, + save.memory = save.memory, case.weights = case.weights + ) if (save.ranger) { - results = list(info = results.smd$info, var = results.smd$var, forest = results.smd$forest, ranger = results.smd$ranger) - } - else { - results = list(info = results.smd$info, var = results.smd$var, forest = results.smd$forest) + results <- list(info = results.smd$info, var = results.smd$var, forest = results.smd$forest, ranger = results.smd$ranger) + } else { + results <- list(info = results.smd$info, var = results.smd$var, forest = results.smd$forest) } return(results) - } diff --git a/R/variable_selection_mir.R b/R/variable_selection_mir.R index f8ea84e..18cade9 100644 --- a/R/variable_selection_mir.R +++ b/R/variable_selection_mir.R @@ -40,7 +40,7 @@ #' } #' \item var: vector of selected variables. #' -#'\item ranger: ranger object. +#' \item ranger: ranger object. #' #' } #' @examples @@ -50,233 +50,258 @@ #' \donttest{ #' # select variables (usually more trees are needed) #' set.seed(42) -#' res = var.select.mir( -#' x = SMD_example_data[,2:ncol(SMD_example_data)], -#' y = SMD_example_data[,1],s = 10, num.trees = 10, num.threads = 1) +#' res <- var.select.mir( +#' x = SMD_example_data[, 2:ncol(SMD_example_data)], +#' y = SMD_example_data[, 1], s = 10, num.trees = 10, num.threads = 1 +#' ) #' res$var #' } -#'@references +#' @references ##' \itemize{ ##' \item Nembrini, S. et al. (2018) The revival of the Gini importance? Bioinformatics, 34, 3711–3718. \url{https://academic.oup.com/bioinformatics/article/34/21/3711/4994791} ##' \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013} -##' } +##' } #' @export - -var.select.mir = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, - num.threads = NULL, status = NULL, save.ranger = FALSE, - save.memory = FALSE, num.permutations = 100, p.t.sel = 0.01, p.t.rel = 0.01, select.var = TRUE, select.rel = FALSE, - case.weights = NULL, corr.rel = TRUE, t = 5, method.rel = "permutation", method.sel = "janitza", save.rel = TRUE) { - if(!is.data.frame(x)){ +var.select.mir <- function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, + num.threads = NULL, status = NULL, save.ranger = FALSE, + save.memory = FALSE, num.permutations = 100, p.t.sel = 0.01, p.t.rel = 0.01, select.var = TRUE, select.rel = FALSE, + case.weights = NULL, corr.rel = TRUE, t = 5, method.rel = "permutation", method.sel = "janitza", save.rel = TRUE) { + if (!is.data.frame(x)) { stop("x has to be a data frame") } - ## check data - if (length(y) != nrow(x)) { - stop("length of y and number of rows in x are different") - } + ## check data + if (length(y) != nrow(x)) { + stop("length of y and number of rows in x are different") + } - if (any(is.na(x))) { - stop("missing values are not allowed") - } + if (any(is.na(x))) { + stop("missing values are not allowed") + } - allvariables = colnames(x)# extract variables names - nvar = length(allvariables) # count number of variables - ## set global parameters - if (is.null(mtry)) { - mtry = floor((nvar)^(3/4)) - } - if (mtry == "sqrt") { - mtry = floor(sqrt(nvar)) - } - if (mtry == "0.5") { - mtry = floor(0.5*(nvar)) - } - if (mtry == "^3/4") { - mtry = floor((nvar)^(3/4)) - } + allvariables <- colnames(x) # extract variables names + nvar <- length(allvariables) # count number of variables + ## set global parameters + if (is.null(mtry)) { + mtry <- floor((nvar)^(3 / 4)) + } + if (mtry == "sqrt") { + mtry <- floor(sqrt(nvar)) + } + if (mtry == "0.5") { + mtry <- floor(0.5 * (nvar)) + } + if (mtry == "^3/4") { + mtry <- floor((nvar)^(3 / 4)) + } + if (is.null(s)) { + s <- ceiling(nvar * 0.01) + } - if (is.null(s)) { - s = ceiling(nvar*0.01) - } + if (s > (nvar - 2)) { + s <- nvar - 1 + warning("s was set to the maximum number that is reasonable (variables-1) ") + } - if (s > (nvar - 2)) { - s = nvar - 1 - warning("s was set to the maximum number that is reasonable (variables-1) ") - } - - if (type == "classification") { - y = as.factor(y) - if (length(levels(y)) > 15) { - stop("Too much classes defined, classification might be the wrong choice") - } - } - if (type == "regression" && inherits(y, "factor")) { - stop("use factor variable for y only for classification! ") + if (type == "classification") { + y <- as.factor(y) + if (length(levels(y)) > 15) { + stop("Too much classes defined, classification might be the wrong choice") } + } + if (type == "regression" && inherits(y, "factor")) { + stop("use factor variable for y only for classification! ") + } - data = data.frame(y, x) + data <- data.frame(y, x) - if (type == "survival") { - if (is.null(status)) { - stop("a status variable named status has to be given for survival analysis") - } - data$status = status - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, - importance ="impurity_corrected", case.weights = case.weights, respect.unordered.factors = "partition") - if (corr.rel) { - rel = var.relations.mfi(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, - candidates = allvariables, p.t = p.t.rel, method = method.rel,select.rel = select.rel) - } else { - rel = var.relations(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, - candidates = allvariables, t = t, select.rel = select.rel) - } + if (type == "survival") { + if (is.null(status)) { + stop("a status variable named status has to be given for survival analysis") } - if (type == "classification" | type == "regression") { - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - num.threads = num.threads, importance ="impurity_corrected", case.weights = case.weights, respect.unordered.factors = "partition") - - if (corr.rel) { - rel = var.relations.mfi(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, case.weights = case.weights, variables = allvariables, - candidates = allvariables, p.t = p.t.rel, method = method.rel,select.rel = select.rel) - } else { - rel = var.relations(x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, case.weights = case.weights, variables = allvariables, - candidates = allvariables, t = t,select.rel = select.rel) - } - + data$status <- status + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, + importance = "impurity_corrected", case.weights = case.weights, respect.unordered.factors = "partition" + ) + if (corr.rel) { + rel <- var.relations.mfi( + x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, + candidates = allvariables, p.t = p.t.rel, method = method.rel, select.rel = select.rel + ) + } else { + rel <- var.relations( + x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, + candidates = allvariables, t = t, select.rel = select.rel + ) } + } + if (type == "classification" | type == "regression") { + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, importance = "impurity_corrected", case.weights = case.weights, respect.unordered.factors = "partition" + ) + + if (corr.rel) { + rel <- var.relations.mfi( + x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, case.weights = case.weights, variables = allvariables, + candidates = allvariables, p.t = p.t.rel, method = method.rel, select.rel = select.rel + ) + } else { + rel <- var.relations( + x = x, y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, case.weights = case.weights, variables = allvariables, + candidates = allvariables, t = t, select.rel = select.rel + ) + } + } + adj.agree <- rel$surr.res + diag(adj.agree) <- 1 - -adj.agree = rel$surr.res -diag(adj.agree) = 1 - - mir = colSums(adj.agree * RF$variable.importance) + mir <- colSums(adj.agree * RF$variable.importance) if (select.var) { if (method.sel == "janitza") { if (corr.rel) { - ## Mirrored VIMP (# This part is taken from ranger function) - m1 = mir[mir< 0] - m2 = mir[mir == 0] - null.rel = c(m1, -m1, m2) - - pval <- 1 - ranger:::numSmaller(mir, null.rel) / length(null.rel) - names(pval) = allvariables - selected = as.numeric(pval <= p.t.sel) - names(selected) = names(pval) - - if (length(m1) == 0) { - stop("No negative importance values found for selection of important variables. Consider the 'permutation' approach.") - } - if (length(m1) < 100) { - warning("Only few negative importance values found for selection of important variables, inaccurate p-values. Consider the 'permutation' approach.") - } + ## Mirrored VIMP (# This part is taken from ranger function) + m1 <- mir[mir < 0] + m2 <- mir[mir == 0] + null.rel <- c(m1, -m1, m2) + + pval <- 1 - ranger:::numSmaller(mir, null.rel) / length(null.rel) + names(pval) <- allvariables + selected <- as.numeric(pval <= p.t.sel) + names(selected) <- names(pval) + + if (length(m1) == 0) { + stop("No negative importance values found for selection of important variables. Consider the 'permutation' approach.") + } + if (length(m1) < 100) { + warning("Only few negative importance values found for selection of important variables, inaccurate p-values. Consider the 'permutation' approach.") + } } else { stop("Janitza approach should only be conducted with corrected relations") -} + } } if (method.sel == "permutation") { - - if (corr.rel){ - adj.agree_perm = rel$surr.perm + if (corr.rel) { + adj.agree_perm <- rel$surr.perm } else { - x_perm = sapply(1:ncol(x),permute.variable,x=x) - colnames(x_perm) = paste(allvariables,"_perm", sep = "") - data_perm = data.frame(y, x_perm) - allvariables_perm = colnames(x_perm) - - if (type == "survival") { - if (is.null(status)) { - stop("a status variables has to be given for survival analysis") + x_perm <- sapply(1:ncol(x), permute.variable, x = x) + colnames(x_perm) <- paste(allvariables, "_perm", sep = "") + data_perm <- data.frame(y, x_perm) + allvariables_perm <- colnames(x_perm) + + if (type == "survival") { + if (is.null(status)) { + stop("a status variables has to be given for survival analysis") + } + + if (corr.rel) { + rel_perm <- var.relations.mfi( + x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, + candidates = allvariables, p.t = p.t.rel, method = method.rel, select.rel = select.rel + ) + } else { + rel_perm <- var.relations( + x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, + candidates = allvariables, t = t, select.rel = select.rel + ) + } } - - if (corr.rel) { - rel_perm = var.relations.mfi(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, - candidates = allvariables, p.t = p.t.rel, method = method.rel, select.rel = select.rel) - } else { - rel_perm = var.relations(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, status = status, case.weights = case.weights, variables = allvariables, - candidates = allvariables, t = t, select.rel = select.rel) + if (type == "classification" | type == "regression") { + if (corr.rel) { + rel_perm <- var.relations.mfi( + x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, case.weights = case.weights, variables = allvariables_perm, + candidates = allvariables_perm, p.t = p.t.rel, method = method.rel, select.rel = select.rel + ) + } else { + rel_perm <- var.relations( + x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, + num.threads = num.threads, case.weights = case.weights, variables = allvariables_perm, + candidates = allvariables_perm, t = t, select.rel = select.rel + ) + } } + adj.agree_perm <- rel_perm$surr.res } - if (type == "classification" | type == "regression") { - - if (corr.rel) { - rel_perm = var.relations.mfi(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, case.weights = case.weights, variables = allvariables_perm, - candidates = allvariables_perm, p.t = p.t.rel, method = method.rel,select.rel = select.rel) - } else { - rel_perm = var.relations(x = data.frame(x_perm), y = y, num.trees = num.trees, type = type, s = s, mtry = mtry, min.node.size = min.node.size, - num.threads = num.threads, case.weights = case.weights, variables = allvariables_perm, - candidates = allvariables_perm, t = t,select.rel = select.rel) - } - } - - adj.agree_perm = rel_perm$surr.res - } - - diag(adj.agree_perm) = 0 - - null.rel = unlist(lapply(1:num.permutations,calculate.mir.perm, - adj.agree_perm = adj.agree_perm, - air = RF$variable.importance, - allvariables = allvariables)) + diag(adj.agree_perm) <- 0 + null.rel <- unlist(lapply(1:num.permutations, calculate.mir.perm, + adj.agree_perm = adj.agree_perm, + air = RF$variable.importance, + allvariables = allvariables + )) pval <- 1 - ranger:::numSmaller(mir, null.rel) / length(null.rel) - names(pval) = allvariables - selected = as.numeric(pval <= p.t.sel) - names(selected) = names(pval) + names(pval) <- allvariables + selected <- as.numeric(pval <= p.t.sel) + names(selected) <- names(pval) + } - } -if(save.rel) { - info = list(MIR = mir, - pvalue = pval, - selected = selected, - relations = rel, - AIR = RF$variable.importance, - parameters = list(s = s, type = type, mtry = mtry, p.t.sel = p.t.sel, p.t.rel = p.t.rel, method.sel = method.sel)) -} else { - info = list(MIR = mir, - pvalue = pval, - selected = selected, - AIR = RF$variable.importance, - parameters = list(s = s, type = type, mtry = mtry, p.t.sel = p.t.sel, p.t.rel = p.t.rel, method.sel = method.sel)) -} + if (save.rel) { + info <- list( + MIR = mir, + pvalue = pval, + selected = selected, + relations = rel, + AIR = RF$variable.importance, + parameters = list(s = s, type = type, mtry = mtry, p.t.sel = p.t.sel, p.t.rel = p.t.rel, method.sel = method.sel) + ) + } else { + info <- list( + MIR = mir, + pvalue = pval, + selected = selected, + AIR = RF$variable.importance, + parameters = list(s = s, type = type, mtry = mtry, p.t.sel = p.t.sel, p.t.rel = p.t.rel, method.sel = method.sel) + ) + } } else { - if(save.rel) { - info = list(MIR = mir, - relations = rel, - AIR = RF$variable.importance, - parameters = list(s = s, type = type, mtry = mtry)) + if (save.rel) { + info <- list( + MIR = mir, + relations = rel, + AIR = RF$variable.importance, + parameters = list(s = s, type = type, mtry = mtry) + ) } else { - info = list(MIR = mir, - AIR = RF$variable.importance, - parameters = list(s = s, type = type, mtry = mtry)) + info <- list( + MIR = mir, + AIR = RF$variable.importance, + parameters = list(s = s, type = type, mtry = mtry) + ) } } if (save.ranger) { - results = list(info = info, - var = names(info$selected[info$selected == 1]), - ranger = RF) + results <- list( + info = info, + var = names(info$selected[info$selected == 1]), + ranger = RF + ) } else { if (select.var) { - results = list(info = info, - var = names(info$selected[info$selected == 1])) - } else { - results = list(info = info) - } + results <- list( + info = info, + var = names(info$selected[info$selected == 1]) + ) + } else { + results <- list(info = info) + } } + return(results) } @@ -285,9 +310,7 @@ if(save.rel) { #' This is an internal function #' #' @keywords internal -calculate.mir.perm = function(r=1, adj.agree_perm, air, allvariables) { -mir.perm = colSums(adj.agree_perm * sample(air,length(air))) -return(mir.perm) +calculate.mir.perm <- function(r = 1, adj.agree_perm, air, allvariables) { + mir.perm <- colSums(adj.agree_perm * sample(air, length(air))) + return(mir.perm) } - - diff --git a/R/variable_selection_smd.R b/R/variable_selection_smd.R index ef239f0..55cedb5 100644 --- a/R/variable_selection_smd.R +++ b/R/variable_selection_smd.R @@ -29,17 +29,17 @@ #' } #' \item var: vector of selected variables. #' -#'\item s: list with the results of count.surrogate function: -#'\itemize{ +#' \item s: list with the results of count.surrogate function: +#' \itemize{ #' \item s.a: total average number of surrogate variables. #' \item s.l: average number of surrogate variables in the respective layers. -#'} +#' } #' \item forest: a list containing: #' #'\itemize{ #' \item trees: list of trees that was created by getTreeranger, addLayer, and addSurrogates functions and that was used for surrogate minimal depth variable importance. #' \item allvariables: all variable names of the predictor variables that are present in x. #' } -#'\item ranger: ranger object. +#' \item ranger: ranger object. #' #' } #' @examples @@ -49,114 +49,122 @@ #' \donttest{ #' # select variables (usually more trees are needed) #' set.seed(42) -#' res = var.select.smd( -#' x = SMD_example_data[,2:ncol(SMD_example_data)], -#' y = SMD_example_data[,1],s = 10, num.trees = 10, num.threads = 1) +#' res <- var.select.smd( +#' x = SMD_example_data[, 2:ncol(SMD_example_data)], +#' y = SMD_example_data[, 1], s = 10, num.trees = 10, num.threads = 1 +#' ) #' res$var #' } -#'@references +#' @references ##' \itemize{ ##' \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013} ##' \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103} ##' \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf} -##' } +##' } #' @export - -var.select.smd = function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, - num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, - save.memory = FALSE, case.weights = NULL) { - if(!is.data.frame(x)){ +var.select.smd <- function(x = NULL, y = NULL, num.trees = 500, type = "regression", s = NULL, mtry = NULL, min.node.size = 1, + num.threads = NULL, status = NULL, save.ranger = FALSE, create.forest = TRUE, forest = NULL, + save.memory = FALSE, case.weights = NULL) { + if (!is.data.frame(x)) { stop("x has to be a data frame") } if (create.forest) { - ## check data - if (length(y) != nrow(x)) { - stop("length of y and number of rows in x are different") - } - - if (any(is.na(x))) { - stop("missing values are not allowed") - } - - variables = colnames(x) # extract variables names - nvar = length(variables) # count number of variables + ## check data + if (length(y) != nrow(x)) { + stop("length of y and number of rows in x are different") + } - ## set global parameters - if (is.null(mtry)) { - mtry = floor((nvar)^(3/4)) - } - if (mtry == "sqrt") { - mtry = floor(sqrt(nvar)) - } - if (mtry == "0.5") { - mtry = floor(0.5*nvar) - } - if (mtry == "^3/4") { - mtry = floor((nvar)^(3/4)) - } + if (any(is.na(x))) { + stop("missing values are not allowed") + } + variables <- colnames(x) # extract variables names + nvar <- length(variables) # count number of variables + ## set global parameters + if (is.null(mtry)) { + mtry <- floor((nvar)^(3 / 4)) + } + if (mtry == "sqrt") { + mtry <- floor(sqrt(nvar)) + } + if (mtry == "0.5") { + mtry <- floor(0.5 * nvar) + } + if (mtry == "^3/4") { + mtry <- floor((nvar)^(3 / 4)) + } - if (is.null(s)) { - s = ceiling(nvar*0.01) - } + if (is.null(s)) { + s <- ceiling(nvar * 0.01) + } - if (s > (nvar - 1)) { - s = nvar - 1 - warning("s was set to the maximum number that is reasonable (variables-1) ") - } + if (s > (nvar - 1)) { + s <- nvar - 1 + warning("s was set to the maximum number that is reasonable (variables-1) ") + } - if (type == "classification") { - y = as.factor(y) - if (length(levels(y)) > 15) { - stop("Too much classes defined, classification might be the wrong choice") + if (type == "classification") { + y <- as.factor(y) + if (length(levels(y)) > 15) { + stop("Too much classes defined, classification might be the wrong choice") + } } - } - if (type == "regression" && inherits(y, "factor")) { - stop("use factor variable for y only for classification! ") - } - data = data.frame(y, x) - if (type == "survival") { - if (is.null(status)) { - stop("a status variable named status has to be given for survival analysis") + if (type == "regression" && inherits(y, "factor")) { + stop("use factor variable for y only for classification! ") } - data$status = status - RF = ranger::ranger(data = data, dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, - case.weights = case.weights, respect.unordered.factors = "partition") - } - if (type == "classification" | type == "regression") { - RF = ranger::ranger(data = data,dependent.variable.name = "y",num.trees = num.trees,mtry = mtry,min.node.size = min.node.size, - keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition") - } - trees = getTreeranger(RF = RF,num.trees = num.trees) - trees.lay = addLayer(trees) - rm(trees) - ###AddSurrogates### - trees.surr = addSurrogates(RF = RF,trees = trees.lay,s = s,Xdata = x, num.threads = num.threads) - rm(trees.lay) - # count surrogates - s = count.surrogates(trees.surr) - surrminimaldepth.s = surrmindep(forest = list(trees = trees.surr, allvariables = variables), s.l = s$s.l) + data <- data.frame(y, x) + if (type == "survival") { + if (is.null(status)) { + stop("a status variable named status has to be given for survival analysis") + } + data$status <- status + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, status.variable.name = "status", save.memory = save.memory, + case.weights = case.weights, respect.unordered.factors = "partition" + ) + } + if (type == "classification" | type == "regression") { + RF <- ranger::ranger( + data = data, dependent.variable.name = "y", num.trees = num.trees, mtry = mtry, min.node.size = min.node.size, + keep.inbag = TRUE, num.threads = num.threads, case.weights = case.weights, respect.unordered.factors = "partition" + ) + } + trees <- getTreeranger(RF = RF, num.trees = num.trees) + trees.lay <- addLayer(trees) + rm(trees) + ### AddSurrogates### + trees.surr <- addSurrogates(RF = RF, trees = trees.lay, s = s, Xdata = x, num.threads = num.threads) + rm(trees.lay) + # count surrogates + s <- count.surrogates(trees.surr) + surrminimaldepth.s <- surrmindep(forest = list(trees = trees.surr, allvariables = variables), s.l = s$s.l) } + if (!create.forest) { - if (is.null(forest)) { - stop("set create.forest to TRUE or analyze an existing random forest specified by parameter forest") - } - allvariables = forest[["allvariables"]] - trees = forest[["trees"]] - s = count.surrogates(trees) - surrminimaldepth.s = surrmindep(forest, s.l = s$s.l) - trees.surr = forest[["trees"]] - variables = forest[["variables"]] + if (is.null(forest)) { + stop("set create.forest to TRUE or analyze an existing random forest specified by parameter forest") + } + allvariables <- forest[["allvariables"]] + trees <- forest[["trees"]] + s <- count.surrogates(trees) + surrminimaldepth.s <- surrmindep(forest, s.l = s$s.l) + trees.surr <- forest[["trees"]] + variables <- forest[["variables"]] } + if (save.ranger) { - results = list(info = surrminimaldepth.s,var = names(surrminimaldepth.s$selected[surrminimaldepth.s$selected == 1]),s = s, - forest = list(trees = trees.surr, allvariables = variables), ranger = RF) - } - else { - results = list(info = surrminimaldepth.s,var = names(surrminimaldepth.s$selected[surrminimaldepth.s$selected == 1]),s = s, - forest = list(trees = trees.surr, allvariables = variables)) + results <- list( + info = surrminimaldepth.s, var = names(surrminimaldepth.s$selected[surrminimaldepth.s$selected == 1]), s = s, + forest = list(trees = trees.surr, allvariables = variables), ranger = RF + ) + } else { + results <- list( + info = surrminimaldepth.s, var = names(surrminimaldepth.s$selected[surrminimaldepth.s$selected == 1]), s = s, + forest = list(trees = trees.surr, allvariables = variables) + ) } + return(results) } diff --git a/man/build.clusters.Rd b/man/build.clusters.Rd index 206ca09..5322c85 100644 --- a/man/build.clusters.Rd +++ b/man/build.clusters.Rd @@ -23,34 +23,36 @@ This function generates variables groups of relation information that was obtain data("SMD_example_data") \donttest{ - # get trees and variable names - x = SMD_example_data[,2:ncol(SMD_example_data)] - y = SMD_example_data[,1] - allvariables = colnames(x)# extract variables names - nvar = length(allvariables) # count number of variables - set.seed(42) - RF = ranger::ranger( - data = SMD_example_data, - dependent.variable.name = "y", - num.trees = 10, - keep.inbag = TRUE, - mtry = floor(nvar^(3/4)), - min.node.size = 1, - num.threads = 1) - trees = getTreeranger(RF = RF, num.trees = 10) - trees.lay = addLayer(trees) - trees.surr = addSurrogates(RF = RF, trees = trees.lay, s = 10, Xdata = x, num.threads = 1) +# get trees and variable names +x <- SMD_example_data[, 2:ncol(SMD_example_data)] +y <- SMD_example_data[, 1] +allvariables <- colnames(x) # extract variables names +nvar <- length(allvariables) # count number of variables +set.seed(42) +RF <- ranger::ranger( + data = SMD_example_data, + dependent.variable.name = "y", + num.trees = 10, + keep.inbag = TRUE, + mtry = floor(nvar^(3 / 4)), + min.node.size = 1, + num.threads = 1 +) +trees <- getTreeranger(RF = RF, num.trees = 10) +trees.lay <- addLayer(trees) +trees.surr <- addSurrogates(RF = RF, trees = trees.lay, s = 10, Xdata = x, num.threads = 1) - # investigate variable relations - rel=var.relations( - x = data.frame(), - create.forest = FALSE, - forest = list(trees = trees.surr, allvariables = allvariables), - variables = allvariables, - candidates = allvariables, - t = 10, - num.threads = 1) - groups = build.clusters(rel) +# investigate variable relations +rel <- var.relations( + x = data.frame(), + create.forest = FALSE, + forest = list(trees = trees.surr, allvariables = allvariables), + variables = allvariables, + candidates = allvariables, + t = 10, + num.threads = 1 +) +groups <- build.clusters(rel) } } diff --git a/man/reduce.surrogates.Rd b/man/reduce.surrogates.Rd index ff5f4b7..1fc1e84 100644 --- a/man/reduce.surrogates.Rd +++ b/man/reduce.surrogates.Rd @@ -25,31 +25,34 @@ data("SMD_example_data") ###### use result of SMD variable importance and reduce surrogate variables to 10 # select variables with smd variable importance (usually more trees are needed) set.seed(42) -res = var.select.smd( - x = as.data.frame(SMD_example_data[,2:ncol(SMD_example_data)]), - y = SMD_example_data[,1], +res <- var.select.smd( + x = as.data.frame(SMD_example_data[, 2:ncol(SMD_example_data)]), + y = SMD_example_data[, 1], s = 100, num.trees = 10, - num.threads = 1) -forest.new = reduce.surrogates(forest = res$forest, s = 10) + num.threads = 1 +) +forest.new <- reduce.surrogates(forest = res$forest, s = 10) # execute SMD on tree with reduced number of surrogates -res.new = var.select.smd( +res.new <- var.select.smd( x = data.frame(), create.forest = FALSE, forest = forest.new, - num.threads = 1) + num.threads = 1 +) res.new$var #' # investigate variable relations -rel = var.relations( +rel <- var.relations( x = data.frame(), create.forest = FALSE, forest = forest.new, - variables=c("X1","X7"), + variables = c("X1", "X7"), candidates = res$forest[["allvariables"]][1:100], t = 5, - num.threads = 1) + num.threads = 1 +) rel$var } } diff --git a/man/var.relations.Rd b/man/var.relations.Rd index c34efdf..ecf6537 100644 --- a/man/var.relations.Rd +++ b/man/var.relations.Rd @@ -83,20 +83,21 @@ Based on the threshold a vector is created containing the related variables. \examples{ # read data data("SMD_example_data") -x = SMD_example_data[,2:ncol(SMD_example_data)] -y = SMD_example_data[,1] +x <- SMD_example_data[, 2:ncol(SMD_example_data)] +y <- SMD_example_data[, 1] \donttest{ # calculate variable relations set.seed(42) -res = var.relations( +res <- var.relations( x = x, y = y, s = 10, num.trees = 10, - variables = c("X1","X7"), + variables = c("X1", "X7"), candidates = colnames(x)[1:100], t = 5, - num.threads = 1) + num.threads = 1 +) res$var } diff --git a/man/var.relations.mfi.Rd b/man/var.relations.mfi.Rd index 82cc354..fb0dc0f 100644 --- a/man/var.relations.mfi.Rd +++ b/man/var.relations.mfi.Rd @@ -78,8 +78,6 @@ a list containing: \item ranger: ranger objects. \item method: Method to compute p-values: "janitza" or "permutation". \item p.t: p.value threshold for selection of related variables - - } } \description{ @@ -88,19 +86,20 @@ This function corrects the mean adjusted agreement by a permutation approach and \examples{ # read data data("SMD_example_data") -x = SMD_example_data[,2:ncol(SMD_example_data)] -y = SMD_example_data[,1] +x <- SMD_example_data[, 2:ncol(SMD_example_data)] +y <- SMD_example_data[, 1] \donttest{ # calculate variable relations set.seed(42) -res = var.relations.mfi( +res <- var.relations.mfi( x = x, y = y, s = 10, num.trees = 10, - variables = c("X1","X7"), + variables = c("X1", "X7"), candidates = colnames(x)[1:100], - num.threads = 1) + num.threads = 1 +) res$var.rel[[1]] } diff --git a/man/var.select.md.Rd b/man/var.select.md.Rd index e1c3aec..b20dd26 100644 --- a/man/var.select.md.Rd +++ b/man/var.select.md.Rd @@ -80,9 +80,10 @@ data("SMD_example_data") \donttest{ # select variables (usually more trees are needed) set.seed(42) -res = var.select.md( - x = SMD_example_data[,2:ncol(SMD_example_data)], - y = SMD_example_data[,1], num.trees = 10, num.threads = 1) +res <- var.select.md( + x = SMD_example_data[, 2:ncol(SMD_example_data)], + y = SMD_example_data[, 1], num.trees = 10, num.threads = 1 +) res$var } @@ -91,5 +92,5 @@ res$var \itemize{ \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103} \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf} - } +} } diff --git a/man/var.select.mir.Rd b/man/var.select.mir.Rd index ec56728..aa42325 100644 --- a/man/var.select.mir.Rd +++ b/man/var.select.mir.Rd @@ -103,9 +103,10 @@ data("SMD_example_data") \donttest{ # select variables (usually more trees are needed) set.seed(42) -res = var.select.mir( - x = SMD_example_data[,2:ncol(SMD_example_data)], - y = SMD_example_data[,1],s = 10, num.trees = 10, num.threads = 1) +res <- var.select.mir( + x = SMD_example_data[, 2:ncol(SMD_example_data)], + y = SMD_example_data[, 1], s = 10, num.trees = 10, num.threads = 1 +) res$var } } @@ -113,5 +114,5 @@ res$var \itemize{ \item Nembrini, S. et al. (2018) The revival of the Gini importance? Bioinformatics, 34, 3711–3718. \url{https://academic.oup.com/bioinformatics/article/34/21/3711/4994791} \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013} - } +} } diff --git a/man/var.select.smd.Rd b/man/var.select.smd.Rd index 48e91b6..bba46d4 100644 --- a/man/var.select.smd.Rd +++ b/man/var.select.smd.Rd @@ -87,9 +87,10 @@ data("SMD_example_data") \donttest{ # select variables (usually more trees are needed) set.seed(42) -res = var.select.smd( - x = SMD_example_data[,2:ncol(SMD_example_data)], - y = SMD_example_data[,1],s = 10, num.trees = 10, num.threads = 1) +res <- var.select.smd( + x = SMD_example_data[, 2:ncol(SMD_example_data)], + y = SMD_example_data[, 1], s = 10, num.trees = 10, num.threads = 1 +) res$var } } @@ -98,5 +99,5 @@ res$var \item Seifert, S. et al. (2019) Surrogate minimal depth as an importance measure for variables in random forests. Bioinformatics, 35, 3663–3671. \url{https://academic.oup.com/bioinformatics/article/35/19/3663/5368013} \item Ishwaran, H. et al. (2011) Random survival forests for high-dimensional data. Stat Anal Data Min, 4, 115–132. \url{https://onlinelibrary.wiley.com/doi/abs/10.1002/sam.10103} \item Ishwaran, H. et al. (2010) High-Dimensional Variable Selection for Survival Data. J. Am. Stat. Assoc., 105, 205–217. \url{http://www.ccs.miami.edu/~hishwaran/papers/IKGML.JASA.2010.pdf} - } +} } -- GitLab