% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/unityfor.R
\encoding{UTF-8}
\name{unityfor}
\alias{unityfor}
\title{Construct a unity forest prediction rule and compute the unity VIM}
\usage{
unityfor(
  formula = NULL,
  dependent.variable.name = NULL,
  data = NULL,
  num.trees = 20000,
  num.cand.trees = 500,
  probability = TRUE,
  importance = "none",
  prop.best.splits = NULL,
  min.node.size.root = NULL,
  min.node.size = NULL,
  max.depth.root = NULL,
  max.depth = NULL,
  prop.var.root = NULL,
  mtry.sprout = NULL,
  replace = FALSE,
  sample.fraction = ifelse(replace, 1, 0.7),
  case.weights = NULL,
  class.weights = NULL,
  inbag = NULL,
  oob.error = TRUE,
  num.threads = NULL,
  write.forest = TRUE,
  verbose = TRUE
)
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables.}

\item{dependent.variable.name}{Name of the outcome variable, required if no formula is provided. For categorical outcomes, this variable must be coded as a \code{factor}.}

\item{data}{Training data of class \code{data.frame}, \code{matrix}, \code{dgCMatrix} (Matrix) or \code{gwaa.data} (GenABEL).}

\item{num.trees}{Number of trees. Default is 20000.}

\item{num.cand.trees}{Number of random candidate trees to generate for each tree root. Default is 500.}

\item{probability}{Grow a probability forest as in Malley et al. (2012). Default is \code{TRUE}. For categorical outcomes only.}

\item{importance}{Variable importance mode, either 'unity' (unity VIM) or 'none'. Default is 'none'.}

\item{prop.best.splits}{Related to the unity VIM. Default value should generally not be modified by the user. When calculating the unity VIM, only the top \code{prop.best.splits} \eqn{\times} 100\% of the splits -- those with the highest split criterion values weighted by node size -- are considered for each variable. The default value is 0.01, meaning that only the top 1\% of splits are used. While small values are recommended, \code{prop.best.splits} should not be set too low, so that each variable retains a sufficient number of splits for a reliable unity VIM computation.}

\item{min.node.size.root}{Minimal node size in the tree roots. Default is 10 irrespective of the outcome type.}

\item{min.node.size}{Minimal node size. Default is 5 for probability estimation, classification, and continuous outcomes.}

\item{max.depth.root}{Maximal depth of the tree roots. Default value is 3 and should generally not be modified by the user. Larger values can be associated with worse predictive performance for some datasets.}

\item{max.depth}{Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree). Must be at least as large as \code{max.depth.root}.}

\item{prop.var.root}{Proportion of variables randomly sampled for constructing each tree root. Default is the square root of the number of variables divided by the number of variables. Consequently, per default, for each tree root, a random subset of variables is considered, with size equal to the (rounded up) square root of the total number of variables. An exception is made for datasets with more than 100 variables, where the default for \code{prop.var.root} is set to 0.1. See the 'Details' section below for explanation.}

\item{mtry.sprout}{Number of randomly sampled variables to possibly split at in each node of the tree sprouts (i.e., the branches of the trees beyond the tree roots). Default is the (rounded down) square root of the number of variables.}

\item{replace}{Sample with replacement. Default is \code{FALSE}.}

\item{sample.fraction}{Fraction of observations to sample for each tree. Default is 1 for sampling with replacement and 0.7 for sampling without replacement.}

\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}

\item{class.weights}{Weights for the outcome classes (in order of the factor levels) in the splitting rule (cost sensitive learning). Classification and probability prediction only. For classification the weights are also applied in the majority vote in terminal nodes.}

\item{inbag}{Manually set observations per tree. List of size \code{num.trees}, containing inbag counts for each observation. Can be used for stratified sampling.}

\item{oob.error}{Compute OOB prediction error. Set to \code{FALSE} to save computation time.}

\item{num.threads}{Number of threads to use.
The default is to use at most 2 threads (and at most the number of available CPU cores).
This conservative default avoids unintentionally using many cores on shared computing resources
(e.g., CI systems, servers, or HPC login/compute nodes).

For typical use on a personal computer, setting \code{num.threads = 0} is strongly recommended,
as it uses all available CPU cores, which typically substantially reduces runtime.}

\item{write.forest}{Save \code{unityfor.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended.}

\item{verbose}{Show computation status and estimated runtime.}
}
\value{
Object of class \code{unityfor} with elements
  \item{\code{predictions}}{Predicted classes/class probabilities/values, based on out-of-bag samples.}
  \item{\code{forest}}{Saved forest (if \code{write.forest} is set to \code{TRUE}). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.}
  \item{\code{data}}{Training data.}
  \item{\code{variable.importance}}{Variable importance for each independent variable. Only available if \code{importance} is not \code{"none"}.}
  \item{\code{importance.mode}}{Importance mode used.}
  \item{\code{prediction.error}}{Overall out-of-bag prediction error. For classification this is the fraction of misclassified samples, for probability estimation the Brier score and for continuous outcomes the mean squared error.}
  \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).}
  \item{\code{r.squared}}{R squared. Also called explained variance or coefficient of determination (continuous outcomes only). Computed on out-of-bag data.}
  \item{\code{call}}{Function call.}
  \item{\code{num.trees}}{Number of trees.}
  \item{\code{num.cand.trees}}{Number of candidate trees generated for each tree root.}
  \item{\code{num.independent.variables}}{Number of independent variables.}
  \item{\code{num.samples}}{Number of samples.}
  \item{\code{prop.var.root}}{Proportion of variables randomly sampled for each tree root.}
  \item{\code{mtry}}{Value of mtry used (in the tree sprouts).}
  \item{\code{max.depth.root}}{Maximal depth of the tree roots.}
  \item{\code{min.node.size.root}}{Minimal node size in the tree roots.}
  \item{\code{min.node.size}}{Value of minimal node size used.}
  \item{\code{splitrule}}{Splitting rule (used only in the tree sprouts).}
  \item{\code{replace}}{Sample with replacement.}
  \item{\code{treetype}}{Type of forest/tree. Categorical or continuous outcome.}
}
\description{
Constructs a unity forest and computes the unity variable importance measure (VIM), as described in Hornung & Hapfelmeier (2026). Categorical and continuous outcomes are supported.\cr
The unity forest algorithm is a tree construction approach for random forests in which the first few splits are optimized jointly in order to more effectively capture interaction effects beyond marginal effects. The unity VIM quantifies the influence of each variable under the conditions in which that influence is strongest, thereby placing a stronger emphasis on interaction effects than conventional variable importance measures.\cr
To explore the nature of the effects identified by the unity VIM, it is essential to examine covariate-representative tree roots (CRTRs), which are implemented in \code{\link{reprTrees}}.
}
\details{
There are two reasons why, for datasets with more than 100 variables, the default value of \code{prop.var.root} is set to 0.1 rather than to the square root of the number of variables divided by the total number of variables.

First, as the total number of variables increases, the square-root-based proportion decreases. This makes it less likely that the same pairs of variables are selected together in multiple trees. This can be problematic for the unity VIM, particularly for variables that do not have marginal effects on their own but act only through interactions with one or a few other variables. Such variables are informative in tree roots only when they are used jointly with the covariates they interact with. Setting \code{prop.var.root = 0.1} ensures that interacting covariates are selected together sufficiently often in tree roots.

Second, this choice reflects the fact that in high-dimensional datasets, typically only a small proportion of variables are informative. Applying the square-root rule in such settings may result in too few informative variables being selected, thereby reducing the likelihood of constructing predictive tree roots.

However, note that results obtained from applications of the unity forest framework to high-dimensional datasets should be interpreted with caution. For high-dimensional data, the curse of dimensionality makes the identification of individual interaction effects challenging and increases the risk of false positives. Moreover, the split points identified in the CRTRs (\code{\link{reprTrees}}) may become less precise as the number of covariates considered per tree root increases.

NOTE: The empirical evaluation of the unity forest framework (including the unity forest algorithm, the unity VIM, and covariate-representative tree roots) in Hornung & Hapfelmeier (2026) focused on categorical outcomes. Its performance for continuous outcomes has not been systematically investigated. Results for continuous outcomes should therefore be interpreted with appropriate caution.
}
\examples{

## IMPORTANT NOTE on parallelization:
## The default uses at most 2 threads (num.threads) to avoid unintentionally
## using many cores on shared systems.
## However, for typical runs on a personal computer, set num.threads = 0 to 
## use all available CPU cores; this is strongly recommended and can 
## substantially reduce runtime.
## Note: num.threads = 1 is used in the examples to avoid parallel
## execution during package checks.


## Load package:

library("unityForest")


## Set seed to make results reproducible:

set.seed(1234)


## Load wine dataset:

data(wine)


## Construct unity forest and calculate unity VIM values:

model <- unityfor(dependent.variable.name = "C", data = wine,
                  importance = "unity", num.trees = 20, num.threads = 1)

# NOTE: num.trees = 20 (in the above) would be much too small for practical 
# purposes. This small number of trees was simply used to keep the
# runtime of the example short.
# The default number of trees is num.trees = 20000.


## Inspect the ranking of the variables with respect to the unity VIM:

sort(model$variable.importance, decreasing = TRUE)


## Prediction:

# Randomly split the 'wine' dataset into training
# and test data:
train.idx <- sample(nrow(wine), 2/3 * nrow(wine))
wine_train <- wine[train.idx, ]
wine_test <- wine[-train.idx, ]

# Construct unity forest on training data:
# NOTE again: num.trees = 20 is too small for practical purposes.
model_train <- unityfor(dependent.variable.name = "C", data = wine_train, 
                        importance = "none", num.trees = 20, 
                        probability = FALSE, num.threads = 1)
# NOTE: Because we are only interested in prediction here, we do not
# calculate unity VIM values (by setting importance = "none"), because 
# this speeds up calculations.
# Moreover, 'probability' is set to 'FALSE' because we are interested in pure
# class prediction in this example (without class probability prediction).

# Predict class values of the test data:
pred_wine <- predict(model_train, data = wine_test, num.threads = 1)

# Compare predicted and true class values of the test data:
table(wine_test$C, pred_wine$predictions)



## Prediction for dataset with continuous outcome:

# Load stock dataset:

data(stock)

# Randomly split the 'stock' dataset into training
# and test data:
train.idx <- sample(nrow(stock), 2/3 * nrow(stock))
stock_train <- stock[train.idx, ]
stock_test <- stock[-train.idx, ]

# Construct unity forest on training data:
# NOTE again: num.trees = 20 is too small for practical purposes.
model_train <- unityfor(dependent.variable.name = "company10", 
                        data = stock_train, importance = "none", 
                        num.trees = 20, num.threads = 1)
# NOTE: Because we are only interested in prediction here, we do not
# calculate unity VIM values (by setting importance = "none"), because 
# this speeds up calculations.

# Predict outcome values of the test data:
pred_stock <- predict(model_train, data = stock_test, num.threads = 1)

# Compare predicted and true outcome values of the test data:
plot(pred_stock$predictions, stock_test$company10)

}
\references{
\itemize{
  \item Hornung, R., Hapfelmeier, A. (2026). Unity Forests: Improving Interaction Modelling and Interpretability in Random Forests. arXiv:2601.07003, <\doi{10.48550/arXiv.2601.07003}>.
  \item Wright, M. N., Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. Journal of Statistical Software 77:1-17, <\doi{10.18637/jss.v077.i01}>.
  \item Breiman, L. (2001). Random forests. Machine Learning 45:5-32, <\doi{10.1023/A:1010933404324}>.
  \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods of Information in Medicine 51:74-81, <\doi{10.3414/ME00-01-0052}>.
  }
}
\seealso{
\code{\link{predict.unityfor}}
}
\author{
Roman Hornung, Marvin N. Wright
}
