% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/LEGIT.R
\name{stepwise_search}
\alias{stepwise_search}
\title{Stepwise search for the best subset of genetic variants or environments with the LEGIT model}
\usage{
stepwise_search(data, formula, interactive_mode = FALSE,
  genes_original = NULL, env_original = NULL, genes_extra = NULL,
  env_extra = NULL, search_type = "bidirectional-forward",
  search = "genes", search_criterion = "AIC",
  forward_exclude_p_bigger = 0.2, backward_exclude_p_smaller = 0.01,
  exclude_worse_AIC = TRUE, max_steps = 50, cv_iter = 5, cv_folds = 10,
  folds = NULL, classification = FALSE, start_genes = NULL,
  start_env = NULL, eps = 0.01, maxiter = 25, family = gaussian,
  seed = NULL, print = TRUE)
}
\arguments{
\item{data}{data.frame of the dataset to be used. Do not include elements that are in the datasets \code{genes} and \code{env} and do not include manually coded interactions.}

\item{formula}{Model formula. Use \emph{E} for the environmental score and \emph{G} for the genetic score. Do not manually code interactions, write them in the formula instead (ex: G*E*z or G:E:z).}

\item{interactive_mode}{If TRUE, uses interactive mode. In interactive mode, at each iteration, the user is shown the AIC, BIC, p-value and also the cross-validation \eqn{R^2} if \code{search_criterion="cv"} and the cross-validation AUC if \code{search_criterion="cv_AUC"} for the best 5 variables. The user must then enter a number between 1 and 5 to select the variable to be added, entering anything else will stop the search.}

\item{genes_original}{data.frame of the variables inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic).}

\item{env_original}{data.frame of the variables inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental).}

\item{genes_extra}{data.frame of the additional variables to try including inside the genetic score \emph{G} (can be any sort of variable, doesn't even have to be genetic). If not NULL, \code{env_extra} should be NULL.}

\item{env_extra}{data.frame of the additional variables to try including inside the environmental score \emph{E} (can be any sort of variable, doesn't even have to be environmental). If not NULL, \code{genes_extra} should be NULL.}

\item{search_type}{If \code{search_type="forward"}, uses a forward search. If \code{search_type="backward"}, uses a backward search. If \code{search_type="bidirectional-forward"}, uses a bidirectional search that starts as a forward search. If \code{search_type="bidirectional-backward"}, uses a bidirectional search that starts as a backward search (Default = "bidirectional-forward").}

\item{search}{If \code{search="genes"}, uses a stepwise search for the genetic score variables \code{genes_extra}, forcing \code{genes_original} to be included in the genetic score. If \code{search="env"}, uses a stepwise search for the environmental score variables \code{env_extra}, forcing \code{env_original} to be included in the environmental score (Default = "genes").}

\item{search_criterion}{Criterion used to determine which variable is the best to add or worst to drop. If \code{search_criterion="AIC"}, uses the AIC, if \code{search_criterion="BIC"}, uses the BIC, if \code{search_criterion="cv"}, uses the cross-validation error, if \code{search_criterion="cv_AUC"}, uses the cross-validated AUC (Default = "AIC").}

\item{forward_exclude_p_bigger}{If p-value > \code{forward_exclude_p_bigger}, we do not consider the variable for inclusion in the forward steps (Default = .20).}

\item{backward_exclude_p_smaller}{If p-value < \code{backward_exclude_p_smaller}, we do not consider the variable for removal in the backward steps (Default = .01).}

\item{exclude_worse_AIC}{If AIC with variable > AIC without variable, we ignore the variable (Default = TRUE).}

\item{max_steps}{Maximum number of steps taken (Default = 50).}

\item{cv_iter}{Number of cross-validation iterations (Default = 5).}

\item{cv_folds}{Number of cross-validation folds (Default = 10). Using \code{cv_folds=NROW(data)} will lead to leave-one-out cross-validation.}

\item{folds}{Optional list of vectors containing the fold number for each observation. Bypasses \code{cv_iter} and \code{cv_folds}. Setting your own folds could be important for certain data types like time series or longitudinal data.}

\item{classification}{Set to TRUE if you are doing classification (binary outcome).}

\item{start_genes}{Optional starting points for genetic score (must be same length as the number of columns of \code{genes}).}

\item{start_env}{Optional starting points for environmental score (must be same length as the number of columns of \code{env}).}

\item{eps}{Threshold for convergence (.01 for quick batch simulations, .0001 for accurate results).}

\item{maxiter}{Maximum number of iterations.}

\item{family}{Outcome distribution and link function (Default = gaussian).}

\item{seed}{Seed for cross-validation folds.}

\item{print}{If TRUE, print all the steps and notes/warnings. Highly recommended unless you are batch running multiple stepwise searches (Default = TRUE).}
}
\value{
Returns an object of the class "LEGIT" which is a list containing, in the following order: a glm fit of the main model, a glm fit of the genetic score, a glm fit of the environmental score, a list of the true model parameters (AIC, BIC, rank, df.residual, null.deviance) for which the individual model parts (main, genetic, environmental) don't estimate properly.
}
\description{
Adds the best variable or drops the worst variable one at a time in the genetic (if \code{search="genes"}) or environmental score (if \code{search="env"}). The search direction is chosen with \code{search_type}. You can select the desired search criterion (AIC, BIC, cross-validation error, cross-validation AUC) to determine which variable is the best/worst and should be added/dropped. If using cross-validation (\code{search_criterion="cv"} or \code{search_criterion="cv_AUC"}), to prevent cross-validating with each variable (extremely slow), we recommend setting a p-value threshold (\code{forward_exclude_p_bigger} and \code{backward_exclude_p_smaller}) and forcing the algorithm not to look at models with bigger AIC (\code{exclude_worse_AIC=TRUE}).
}
\examples{
\dontrun{
## Continuous example
train = example_3way(250, 2.5, seed=777)
# Forward search for genes based on BIC (in interactive mode)
forward_genes_BIC = stepwise_search(train$data, genes_extra=train$G, env_original=train$E,
formula=y ~ E*G*z,search_type="forward", search="genes", search_criterion="BIC",
interactive_mode=TRUE)
# Bidirectional-backward search for environments based on cross-validation error
bidir_backward_env_cv = stepwise_search(train$data, genes_original=train$G, env_original=train$E,
formula=y ~ E*G*z,search_type="bidirectional-backward", search="env", search_criterion="cv")
## Binary example
train_bin = example_2way(500, 2.5, logit=TRUE, seed=777)
# Forward search for genes based on cross-validated AUC (in interactive mode)
forward_genes_AUC = stepwise_search(train_bin$data, genes_extra=train_bin$G, 
env_original=train_bin$E, formula=y ~ E*G,search_type="forward", search="genes", 
search_criterion="cv_AUC", classification=TRUE, family=binomial, interactive_mode=TRUE)
# Bidirectional-forward search for genes based on AIC
bidir_forward_genes_AIC = stepwise_search(train_bin$data, genes_extra=train_bin$G, 
env_original=train_bin$E, formula=y ~ E*G,search_type="bidirectional-forward", search="genes", 
search_criterion="AIC", classification=TRUE, family=binomial)
}
}
