% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/prepare_trait_data.R
\name{prepare_trait_data}
\alias{prepare_trait_data}
\title{Map trait evolution on a time-calibrated phylogeny}
\usage{
prepare_trait_data(
  tip_data,
  trait_data_type,
  phylo,
  seed = NULL,
  evolutionary_models = NULL,
  Q_matrix = NULL,
  BioGeoBEARS_directory_path = NULL,
  keep_BioGeoBEARS_files = TRUE,
  prefix_for_files = NULL,
  nb_cores = 1,
  max_range_size = 2,
  split_multi_area_ranges = FALSE,
  ...,
  res = 100,
  nb_simulations = 1000,
  color_scale = NULL,
  colors_per_levels = NULL,
  plot_map = TRUE,
  plot_overlay = TRUE,
  add_ACE_pies = TRUE,
  PDF_file_path = NULL,
  return_ace = TRUE,
  return_BSM = FALSE,
  return_simmaps = FALSE,
  return_best_model_fit = FALSE,
  return_model_selection_df = FALSE,
  verbose = TRUE
)
}
\arguments{
\item{tip_data}{Named numerical or character string vector of trait values/states/ranges at tips.
Names should be ordered as the tip labels in the phylogeny found in \code{phylo$tip.label}.
For biogeographic data, ranges should follow the coding scheme of BioGeoBEARS with a unique CAPITAL letter per unique areas
(ex: A, B), combined to form multi-area ranges (Ex: AB). Alternatively, you can provide tip_data as a matrix or data.frame of
binary presence/absence in each area (coded as unique CAPITAL letter). In this case, columns are unique areas, rows are taxa,
and values are integer (0/1) signaling absence or presence of the taxa in the area.}

\item{trait_data_type}{Character string. Type of trait data. Either: "continuous", "categorical" or "biogeographic".}

\item{phylo}{Time-calibrated phylogeny. Object of class \code{"phylo"} as defined in \code{{ape}}.
Tip labels (\code{phylo$tip.label}) should match names in \code{tip_data}.}

\item{seed}{Integer. Set the seed to ensure reproducibility. Default is \code{NULL} (a random seed is used).}

\item{evolutionary_models}{(Vector of) character string(s). To provide the set of evolutionary models to fit on the data.
\itemize{
\item Models available for continuous data are detailed in \code{\link[geiger:fitContinuous]{geiger::fitContinuous()}}. Default is \code{"BM"}.
\item Models available for categorical data are detailed in \code{\link[geiger:fitDiscrete]{geiger::fitDiscrete()}}. Default is \code{"ARD"}.
\item Models for biogeographic data are fit with R package \code{BioGeoBEARS} using \code{BioGeoBEARS::bears_optim_run()}. Default is \code{"DEC"}.
\item See list in "Details" section.
}}

\item{Q_matrix}{Custom Q-matrix for categorical data representing transition classes between states.
Transitions with similar integers are estimated with a shared rate parameter.
Transitions with \code{0} represent rates that are fixed to zero (i.e., impossible transitions).
Diagonal must be populated with \code{NA}. \code{row.names(Q_matrix)} and \code{col.names(Q_matrix)} are the states.
Provide \code{"matrix"} among the model listed in 'evolutionary_models' to use the custom Q-matrix for modeling. Only for categorical data.}

\item{BioGeoBEARS_directory_path}{Character string. The path to the directory used to store input/output files generated
for/by BioGeoBEARS during biogeographic historical inferences. Use '/' to separate directory and sub-directories. It must end with '/'.
Only for biogeographic data.}

\item{keep_BioGeoBEARS_files}{Logical. Whether the \code{BioGeoBEARS_directory} and its content should be kept after the run. Default = \code{TRUE}. Only for biogeographic data.}

\item{prefix_for_files}{Character string. Prefix to add to all BioGeoBEARS files stored in the \code{BioGeoBEARS_directory_path} if \code{keep_BioGeoBEARS_files = TRUE}.
Files will be exported such as 'prefix_*' with an underscore separating the prefix and the file name.
Default is \code{NULL} (no prefix is added). Only for biogeographic data.}

\item{nb_cores}{Interger. Number of cores to use for parallel computation during BioGeoBEARS runs. Default = \code{1}. Only for biogeographic data.}

\item{max_range_size}{Integer. Maximum number of unique areas encompassed by multi-range areas. Default = \code{2}. Only for biogeographic data.}

\item{split_multi_area_ranges}{Logical. Whether to split multi-area ranges across unique areas when mapping ranges.
Ex: For range EW, posterior probabilities will be split equally between Eastern Palearctic (E) and Western Palearctic (W).
Default = \code{FALSE}. Only for biogeographic data.}

\item{...}{Additional arguments to be passed down to the functions used to fit models (See \code{evolutionary_models})
and produce simmaps with \code{\link[phytools:make.simmap]{phytools::make.simmap()}} or \code{BioGeoBEARS::runBSM()}.}

\item{res}{Integer. Define the number of time steps used to interpolate/estimate trait value/state/range in \code{contMap}/\code{densityMaps}. Default = \code{100}.}

\item{nb_simulations}{Integer. Define the number of simulations generated for stochastic mapping. Default = \code{1000}. Only for "categorical" and "biogeographic" data.}

\item{color_scale}{Vector of character string. List of colors to use to build the color scale with \code{\link[grDevices:colorRamp]{grDevices::colorRampPalette()}}
showing the evolution of a continuous trait on the \code{contMap}. From lowest values to highest values. Only for continuous data. Default = \code{NULL} will use the rainbow() color palette.}

\item{colors_per_levels}{Named character string. To set the colors to use to map each state/range posterior probabilities. Names = states/ranges; values = colors.
If \code{NULL} (default), the \code{rainbow()} color scale will be used for categorical trait; the \code{BioGeoBEARS::get_colors_for_states_list_0based()} will be used for biogeographic ranges.
If no color is provided for multi-area ranges, they will be interpolated based on the colors provided for unique areas. Only for categorical and biogeographic data.}

\item{plot_map}{Logical. Whether to plot or not the phylogeny with mapped trait evolution. Default = \code{TRUE}.}

\item{plot_overlay}{Logical. If \code{TRUE} (default), plot a unique \code{densityMap} with overlapping states/ranges using transparency.
If \code{FALSE}, plot a \code{densityMap} per state/range. Only for "categorical" and "biogeographic" data.}

\item{add_ACE_pies}{Logical. Whether to add pies of posterior probabilities of states/ranges at internal nodes on the mapped phylogeny. Default = \code{TRUE}.
Only for categorical and biogeographic data.}

\item{PDF_file_path}{Character string. If provided, the plot will be saved in a PDF file following the path provided here. The path must end with ".pdf".}

\item{return_ace}{Logical. Whether the named vector of ancestral characters estimates (ACE) at internal nodes should be returned in the output. Default = \code{TRUE}.}

\item{return_BSM}{Logical. (Only for Biogeographic data) Whether the summary tables of anagenetic and cladogenetic events generated during the Biogeographic Stochastic Mapping (BSM)
process should be returned in the output. Default = \code{FALSE}.}

\item{return_simmaps}{Logical. Whether the evolutionary histories simulated during stochastic mapping (i.e., \code{simmaps}) should be returned in the output.
Default = \code{TRUE}. Only for "categorical" and "biogeographic" data.}

\item{return_best_model_fit}{Logical. Whether to include the output of the best fitting model in the function output. Default = \code{FALSE}.}

\item{return_model_selection_df}{Logical. Whether to include the data.frame summarizing model comparisons used to select the best fitting model should be returned in the output. Default = \code{FALSE}.}

\item{verbose}{Logical. Should progression be displayed? A message will be printed for every steps in the process. Default is \code{TRUE}.}
}
\value{
The function returns a list with at least two elements.
\itemize{
\item \verb{$contMap} (For "continuous" data) Object of class \code{"contMap"}, typically generated with \code{\link[phytools:contMap]{phytools::contMap()}},
that contains a phylogenetic tree and associated continuous trait mapping.
\item \verb{$densityMaps} (For "categorical" and "biogeographic" data) List of objects of class \verb{"densityMap},
typically generated with \code{\link[phytools:densityMap]{phytools::densityMap()}}, that contains a phylogenetic tree and associated mapping of probability
to harbor a given state/range along branches. The list contains one \verb{"densityMap} per state/range found in the \code{tip_data}.
\item \verb{$densityMaps_all_ranges} (For "biogeographic" data only, if \code{split_multi_area_ranges = TRUE}) Same as \verb{$densityMaps},
but for all ranges including the multi-areas ranges (e.g., AB) while \verb{$densityMaps} will display posterior probabilities for
unique areas only (e.g., A and B), with multi-areas ranges split across the unique areas they encompass.
\item \verb{$trait_data_type} Character string. Record the type of trait data. Either: "continuous", "categorical" or "biogeographic".
}

If \code{return_ace = TRUE},
\itemize{
\item \verb{$ace} For continuous traits: Named vector that record the ancestral characters estimates (ACE) at internal nodes.
For categorical and biogeographic data: Matrix that record the posterior probabilities of ancestral states/ranges (characters) estimates (ACE) at internal nodes.
Rows are internal nodes. Columns are states/ranges. Values are posterior probabilities of each state per node.
\item \verb{$ace_all_ranges} For biogeographic data, if \code{split_multi_area_ranges = TRUE}: Named vector that record the ancestral characters estimates (ACE) at internal nodes,
but including all ranges observed in the simmaps, including the multi-areas ranges (e.g., AB), while \verb{$ace} will display posterior probabilities
for unique areas only (e.g., A and B), with multi-areas ranges split across the unique areas they encompass.
}

If \code{return_BSM = TRUE}, (Only for biogeographic data)
\itemize{
\item \verb{$BSM_output} List of two lists that contains summary information of cladogenetic (\verb{$RES_caldo_events_tables}) and anagenetic (\verb{$RES_ana_events_tables}) events
recording across the N simulations of biogeographic histories performed during Biogeographic Stochastic Mapping (BSM).
Each element of the list is a data.frame recording events occurring during one simulation.
}

If \code{return_simmaps = TRUE}, (Only for categorical and biogeographic data)
\itemize{
\item \verb{$simmaps} List that contains as many objects of class \code{"simmap"} that \code{nb_simulations} were requested.
Each simmap object is a phylogeny with one simulated geographic history (i.e., transitions in geographic ranges) mapped along branches.
}

If \code{return_best_model_fit = TRUE},
\itemize{
\item \verb{$best_model_fit} List that provides the output of the best fitting model.
}

If \code{model_selection_df = TRUE},
\itemize{
\item \verb{$model_selection_df} Data.frame that summarizes model comparisons used to select the best fitting model.
}

For biogeographic data, the function also produce input and output files associated with BioGeoBEARS and stored in the directory
specified in \code{BioGeoBEARS_directory_path}. The directory and its content are kept if \code{keep_BioGeoBEARS_files = TRUE}
}
\description{
Map trait evolution on a time-calibrated phylogeny in several steps:
\itemize{
\item Step 1: Fit evolutionary models to trait data using Maximum Likelihood.
\item Step 2: Select the best fitting model comparing AICc.
\item Step 3: Infer ancestral characters estimates (ACE) at nodes.
\item Step 4: Run stochastic mapping simulations to generate evolutionary histories
compatible with the best model and inferred ACE. (Only for categorical and biogeographic data)
\item Step 5: Infer ancestral states along branches.
\itemize{
\item For continuous traits: use interpolation to produce a \code{contMap}.
\item For categorical and biogeographic data: compute posterior frequencies of each state/range
to produce a \code{densityMap} for each state/range.
}
}
}
\details{
Map trait evolution on a time-calibrated phylogeny in several steps:

Step 1: Models are fit using Maximum Likelihood approach:
\itemize{
\item For "continuous" data models are fit with \code{\link[geiger:fitContinuous]{geiger::fitContinuous()}}: "BM", "OU", "EB", "rate_trend", "lambda", "kappa", "delta". Default is \code{"BM"}.
\item For "categorical" data models are fit with \code{\link[geiger:fitDiscrete]{geiger::fitDiscrete()}}: "ER", "SYM", "ARD", "meristic", "matrix". Default is \code{"ARD"}.
\item For "biogeographic" data models are fit with R package \code{BioGeoBEARS}: "BAYAREALIKE", "DIVALIKE", "DEC", "BAYAREALIKE+J", "DIVALIKE+J", "DEC+J". Default is \code{"DEC"}.
}

Step 2: Best model is identified among the list of \code{evolutionary_models} by comparing the corrected AIC (AICc)
and selecting the  model with lowest AICc.

Step 3: For continuous traits: Ancestral characters estimates (ACE) are inferred with \link[phytools:fastAnc]{phytools::fastAnc} on a tree
with modified branch lengths scaled to reflect the evolutionary rates estimated from the best model using \code{\link[phytools:rescale]{phytools::rescale()}}.

Step 4: Stochastic Mapping.

For categorical and biogeographic data, stochastic mapping simulations are performed to generate evolutionary histories
compatible with the best model and inferred ACE. Node states/ranges are drawn from the scaled marginal likelihoods of ACE,
and states/ranges shifts along branches are simulated according to the transition matrix Q estimated from the best fitting model.

Step 5: Infer ancestral states along branches.
\itemize{
\item For continuous traits: ancestral trait values along branches are interpolated with \code{\link[phytools:contMap]{phytools::contMap()}}.
This provides quick estimates of trait value at any point in time, but it does not provide accurate ML estimates in
case of models that are time or trait-value dependent (such as "EB" or "OU") as the interpolation used to built the contMap is assuming
a constant rate along each branch. However, ancestral trait values at nodes remain accurate
\item For categorical and biogeographic data: compute posterior frequencies of each state/range among the simulated evolutionary histories (\code{simmaps})
to produce a \code{densityMap} for each state/range that reflects the changes along branches in probability of harboring a given state/range.
}
}
\section{Note on macroevolutionary models of trait evolution}{
This function provides an easy solution to map trait evolution on a time-calibrated phylogeny
and obtain the \code{contMap}/\code{densityMaps} objects needed to run the deepSTRAPP workflow (\link{run_deepSTRAPP_for_focal_time}, \link{run_deepSTRAPP_over_time}).
However, it does not explore the most complex options for trait evolution. You may need to explore more complex models to capture the dynamics of trait evolution.
such as trait-dependent multi-rate models (\code{\link[phytools:brownie.lite]{phytools::brownie.lite()}}, \link[OUwie:OUwie]{OUwie::OUwie}), Bayesian MCMC implementations allowing a thorough exploration
of location and number of regime shifts (Ex: BayesTraits, RevBayes), or RRphylo for a penalized phylogenetic ridge regression approach that allows regime shifts across all branches.
}

\section{Note on macroevolutionary models of biogeographic history}{
This function provides an easy solution to infer ancestral geographic ranges using the BioGeoBEARS framework.
It allows to directly compare model fits across 6 models: DEC, DEC+J, DIVALIKE, DIVALIKE+J, BAYAREALIKE, BAYAREALIKE+J.
It uses a step-wise approach by using MLE estimates of previous runs as staring parameter values when increasing complexity (adding +J parameters).
However, it does not explore the most complex options for historical biogeography. You may need to explore more complex models
to capture the dynamics of range evolution such as time-stratification with adjacency matrices, dispersal multipliers (+W),
distance-based dispersal probabilities (+X), or other features.
See for instance, \href{http://phylo.wikidot.com/biogeobears}{http://phylo.wikidot.com/biogeobears}).

The R package \code{BioGeoBEARS} is needed for this function to work with biogeographic data.
Please install it manually from: \href{https://github.com/nmatzke/BioGeoBEARS}{https://github.com/nmatzke/BioGeoBEARS}.
}

\examples{
# ----- Example 1: Continuous data ----- #

## Load phylogeny and tip data
# Load eel phylogeny and tip data from the R package phytools
# Source: Collar et al., 2014; DOI: 10.1038/ncomms6505
data("eel.tree", package = "phytools")
data("eel.data", package = "phytools")

# Extract body size
eel_data <- stats::setNames(eel.data$Max_TL_cm,
                            rownames(eel.data))

\donttest{ # (May take several minutes to run)
## Map trait evolution on the phylogeny
mapped_cont_traits <- prepare_trait_data(
   tip_data = eel_data,
   trait_data_type = "continuous",
   phylo = eel.tree,
   evolutionary_models = c("BM", "OU", "lambda", "kappa"),
   # Example of an additional argument ('control') that can be provided to geiger::fitContinuous()
   control = list(niter = 200),
   color_scale = c("darkgreen", "limegreen", "orange", "red"),
   plot_map = FALSE,
   return_best_model_fit = TRUE,
   return_model_selection_df = TRUE,
   verbose = TRUE)

## Explore output
plot_contMap(mapped_cont_traits$contMap) # contMap with interpolated trait values
mapped_cont_traits$model_selection_df # Summary of model selection
# Parameter estimates and optimization summary of the best model
# (Here, the best model is Pagel's lambda)
mapped_cont_traits$best_model_fit$opt
mapped_cont_traits$ace # Ancestral character estimates at internal nodes }

# ----- Example 2: Categorical data ----- #

## Load phylogeny and tip data
# Load eel phylogeny and tip data from the R package phytools
# Source: Collar et al., 2014; DOI: 10.1038/ncomms6505
data("eel.tree", package = "phytools")
data("eel.data", package = "phytools")

# Transform feeding mode data into a 3-level factor
eel_data <- stats::setNames(eel.data$feed_mode, rownames(eel.data))
eel_data <- as.character(eel_data)
eel_data[c(1, 5, 6, 7, 10, 11, 15, 16, 17, 24, 25, 28, 30, 51, 52, 53, 55, 58, 60)] <- "kiss"
eel_data <- stats::setNames(eel_data, rownames(eel.data))
table(eel_data)

# Manually define a Q_matrix for rate classes of state transition to use in the 'matrix' model
# Does not allow transitions from state 1 ("bite") to state 2 ("kiss") or state 3 ("suction")
# Does not allow transitions from state 3 ("suction") to state 1 ("bite")
# Set symmetrical rates between state 2 ("kiss") and state 3 ("suction")
Q_matrix = rbind(c(NA, 0, 0), c(1, NA, 2), c(0, 2, NA))

# Set colors per states
colors_per_states <- c("limegreen", "orange", "dodgerblue")
names(colors_per_states) <- c("bite", "kiss", "suction")

\donttest{ # (May take several minutes to run)
## Run evolutionary models
eel_cat_3lvl_data <- prepare_trait_data(tip_data = eel_data, phylo = eel.tree,
    trait_data_type = "categorical",
    colors_per_levels = colors_per_states,
    evolutionary_models = c("ER", "SYM", "ARD", "meristic", "matrix"),
    Q_matrix = Q_matrix,
    nb_simulations = 1000,
    plot_map = TRUE,
    plot_overlay = TRUE,
    return_best_model_fit = TRUE,
    return_model_selection_df = TRUE) }

# Load directly output
data(eel_cat_3lvl_data, package = "deepSTRAPP")

## Explore output
plot(eel_cat_3lvl_data$densityMaps[[1]]) # densityMap for state n°1 ("bite")
eel_cat_3lvl_data$model_selection_df # Summary of model selection
# Parameter estimates and optimization summary of the best model
# (Here, the best model is ER)
print(eel_cat_3lvl_data$best_model_fit)$ # Summary of the best evolutionary model
eel_cat_3lvl_data$ace # Posterior probabilities of each state (= ACE) at internal nodes


# ----- Example 3: Biogeographic data ----- #

if (deepSTRAPP::is_dev_version())
{
 ## The R package 'BioGeoBEARS' is needed for this function to work with biogeographic data.
 # Please install it manually from: https://github.com/nmatzke/BioGeoBEARS.

 ## Load phylogeny and tip data
 # Load eel phylogeny and tip data from the R package phytools
 # Source: Collar et al., 2014; DOI: 10.1038/ncomms6505
 data("eel.tree", package = "phytools")
 data("eel.data", package = "phytools")

 # Transform feeding mode data into biogeographic data with ranges A, B, and AB.
 eel_data <- stats::setNames(eel.data$feed_mode, rownames(eel.data))
 eel_data <- as.character(eel_data)
 eel_data[eel_data == "bite"] <- "A"
 eel_data[eel_data == "suction"] <- "B"
 eel_data[c(5, 6, 7, 15, 25, 32, 33, 34, 50, 52, 57, 58, 59)] <- "AB"
 eel_data <- stats::setNames(eel_data, rownames(eel.data))
 table(eel_data)

 colors_per_ranges <- c("dodgerblue3", "gold")
 names(colors_per_ranges) <- c("A", "B")

 \donttest{ # (May take several minutes to run)
 ## Run evolutionary models
 eel_biogeo_data <- prepare_trait_data(
    tip_data = eel_data,
    trait_data_type = "biogeographic",
    phylo = eel.tree,
    # Default = "DEC" for biogeographic
    evolutionary_models = c("BAYAREALIKE", "DIVALIKE", "DEC",
                            "BAYAREALIKE+J", "DIVALIKE+J", "DEC+J"),
    BioGeoBEARS_directory_path = tempdir(), # Ex: "./BioGeoBEARS_directory/"
    keep_BioGeoBEARS_files = FALSE,
    prefix_for_files = "eel",
    max_range_size = 2,
    split_multi_area_ranges = TRUE, # Set to TRUE to display the two outputs
    # Reduce the number of Stochastic Mapping simulations to save time (Default = '1000')
    nb_simulations = 100,
    colors_per_levels = colors_per_ranges,
    return_simmaps = TRUE,
    return_best_model_fit = TRUE,
    return_model_selection_df = TRUE,
    verbose = TRUE) }

 # Load directly output
 data(eel_biogeo_data, package = "deepSTRAPP")

 ## Explore output
 str(eel_biogeo_data, 1)
 eel_biogeo_data$model_selection_df # Summary of model selection
 # Parameter estimates and optimization summary of the best model
 # (Here, the best model is DEC+J)
 eel_biogeo_data$best_model_fit$optim_result

 # Posterior probabilities of each state (= ACE) at internal nodes
 eel_biogeo_data$ace # Only with unique areas
 eel_biogeo_data$ace_all_ranges # Including multi-area ranges (Here, AB)

 ## Plot densityMaps
 # densityMap for range n°1 ("A")
 plot(eel_biogeo_data$densityMaps[[1]])
 # densityMaps with all unique areas overlaid
 plot_densityMaps_overlay(eel_biogeo_data$densityMaps)
 # densityMaps with all ranges (including multi-area ranges) overlaid
 plot_densityMaps_overlay(eel_biogeo_data$densityMaps_all_ranges)
}

}
\references{
For macroevolutionary models in geiger: Pennell, M. W., Eastman, J. M., Slater, G. J., Brown, J. W., Uyeda, J. C., FitzJohn, R. G., ... & Harmon, L. J. (2014).
geiger v2. 0: an expanded suite of methods for fitting macroevolutionary models to phylogenetic trees. Bioinformatics, 30(15), 2216-2218.
\doi{10.1093/bioinformatics/btu181}.

For BioGeoBEARS: Matzke, Nicholas J. (2018). BioGeoBEARS: BioGeography with Bayesian (and likelihood) Evolutionary Analysis with R Scripts.
version 1.1.1, published on GitHub on November 6, 2018. \doi{10.5281/zenodo.1478250}. Website: \url{http://phylo.wikidot.com/biogeobears}.
}
\seealso{
\code{\link[geiger:fitContinuous]{geiger::fitContinuous()}} \code{\link[geiger:fitDiscrete]{geiger::fitDiscrete()}} \code{\link[phytools:contMap]{phytools::contMap()}} \code{\link[phytools:densityMap]{phytools::densityMap()}}

For a guided tutorial, see the associated vignettes:
\itemize{
\item For continuous trait data: \code{vignette("model_continuous_trait_evolution", package = "deepSTRAPP")}
\item For categorical trait data: \code{vignette("model_categorical_trait_evolution", package = "deepSTRAPP")}
\item For biogeographic range data: \code{vignette("model_biogeographic_range_evolution", package = "deepSTRAPP")}
}
}
\author{
Maël Doré
}
