\name{bal.tab.df.formula}
\alias{bal.tab.data.frame}
\alias{bal.tab.formula}
\title{
Balance Statistics for Data Sets
}
\description{
Generates balance statistics for unadjusted, matched, weighted, or stratified data using either a data frame or formula interface.
}
\usage{
\method{bal.tab}{data.frame}(covs, treat, data = NULL, weights = NULL, 
    distance = NULL, subclass = NULL, match.strata = NULL, 
    method, int = FALSE, addl = NULL, continuous = c("std", 
    "raw"), binary = c("raw", "std"), s.d.denom, 
    m.threshold = NULL, v.threshold = NULL, 
    r.threshold = NULL, un = FALSE, disp.means = FALSE, 
    disp.v.ratio = FALSE, disp.subclass = FALSE, 
    cluster = NULL, which.cluster = NULL, 
    cluster.summary = TRUE, imp = NULL, which.imp = NA, 
    imp.summary = TRUE, estimand = NULL, quick = FALSE, ...)

\method{bal.tab}{formula}(formula, data, weights = NULL, distance = NULL, 
    subclass = NULL, match.strata = NULL, method, 
    int = FALSE, addl = NULL, continuous = c("std", "raw"), 
    binary = c("raw", "std"), s.d.denom, m.threshold = NULL, 
    v.threshold = NULL, r.threshold = NULL, un = FALSE, 
    disp.means = FALSE, disp.v.ratio = FALSE, 
    disp.subclass = FALSE, cluster = NULL, 
    which.cluster = NULL, cluster.summary = TRUE, imp = NULL, 
    which.imp = NA, imp.summary = TRUE, estimand = NULL, 
    quick = FALSE, ...)
}

\arguments{
  \item{covs}{
A data frame containing covariate values for each unit.
}
  \item{treat}{
Either a vector containing treatment status values for each unit or a string containing the name of the treatment variable in \code{data}.
}
  \item{formula}{
A \code{formula} with the treatment variable as the response and the covariates for which balance is to be assessed as the terms. All variables in the formula must be present as variable names in \code{data}.
}
  \item{data}{
For the data frame method: Optional; a data frame containing variables with the names used in \code{treat}, \code{weights}, \code{distance}, \code{addl}, \code{subclass}, \code{match.strata}, \code{cluster}, and/or \code{imp}, if any.

For the \code{formula} method: Required; a data frame containing all covariates named in \code{formula} and variables with the names used in the arguments mentioned above.
}
  \item{weights}{
Optional; either a vector containing weights for each unit or a string containing the name of the weights variable in \code{data}.  These can be weights generated by, e.g., inverse probability weighting or matching weights resulting from a matching algorithm.  The way the weights were generated (matching or weighting) must be specified in \code{method}.   If \code{weights=NULL}, \code{subclass=NULL} and \code{match.strata=NULL}, balance information will be presented only for the unadjusted sample.
}
  \item{distance}{
Optional; either a vector or data.frame containing distance values (e.g., propensity scores) for each unit or a string containing the name of the distance variable in \code{data}.
}
  \item{subclass}{
Optional; either a vector containing subclass membership for each unit or a string containing the name of the subclass variable in \code{data}.  If \code{weights=NULL}, \code{subclass=NULL} and \code{match.strata=NULL}, balance information will be presented only for the unadjusted sample.
}
\item{match.strata}{
Optional; either a vector containing matching stratum membership for each unit or a string containing the name of the matching stratum variable in \code{data}. If \code{weights=NULL}, \code{subclass=NULL} and \code{match.strata=NULL}, balance information will be presented only for the unadjusted sample.
}
  \item{method}{
A string containing the method of adjustment, if any.  If \code{weights} are specified, the user must specify either "matching" or "weighting"; "weighting" is the default.  If \code{subclass} is specified, "subclassification" is the default.  Abbreviations allowed.
}
  \item{int}{
\code{logical}; whether or not to include 2-way interactions of covariates included in \code{covs} and in \code{addl}.
}
  \item{addl}{
A vector, data frame, or the quoted names of additional covariates for which to present balance.  These may be covariates included in the original dataset but not included in \code{covs}.  In general, it makes more sense to include all desired variables in \code{covs} than in \code{addl}. If the argument is a vector, the variable will be displayed as "addl" in the output.
}
  \item{continuous}{
Whether mean differences for continuous variables should be standardized ("std") or raw ("raw").  Default "std".  Abbreviations allowed.
}
  \item{binary}{
Whether mean differences for binary variables (i.e., difference in proportion) should be standardized ("std") or raw ("raw").  Default "raw".  Abbreviations allowed.
}
  \item{s.d.denom}{
Whether the denominator for standardized differences (if any are calculated) should be the standard deviation of the treated group ("treated"), the standard deviation of the control group ("control"), or the pooled standard deviation ("pooled"), computed as the square root of the mean of the group variances.  Abbreviations allowed.  If weights are supplied, \code{bal.tab()} will try to determine whether the ATT, ATC, or ATE is being estimated based on the pattern of weights and supply \code{s.d.denom} accordingly. If matching or subclassification are used, the default is \code{"treated"}; if weighting is used, the default is \code{"pooled"}. If left blank, \code{bal.tab()} will try to use the \code{estimand} argument.
}
  \item{m.threshold}{
A numeric value for the threshold for mean differences.  .1 is recommended.  
}
  \item{v.threshold}{
A numeric value for the threshold for variance ratios.  Will automatically convert to the inverse if less than 1.
}
  \item{r.threshold}{
A numeric value for the threshold for correlations between covariates and treatment when treatment is continuous.
}
  \item{un}{
\code{logical}; whether to print statistics for the unadjusted sample as well as for the adjusted sample.  If \code{weights = NULL} and \code{subclass = NULL}, \code{un} will be set to \code{TRUE}.  
}
  \item{disp.means}{
\code{logical}; whether to print the group means in balance output.
}
  \item{disp.v.ratio}{
\code{logical}; whether to display variance ratios in balance output.
}
  \item{disp.subclass}{
\code{logical}; whether to display balance information for individual subclasses if subclassification is used in conditioning.
}
  \item{cluster}{
either a vector containing cluster membership for each unit or a string containing the name of the cluster membership variable in \code{data}. 
}
  \item{which.cluster}{
which cluster(s) to display. If \code{NULL}, all clusters in \code{cluster} will be displayed. If \code{NA}, no clusters will be displayed. Otherwise, can be a vector of cluster names or numerical indices for which to display balance. Indices correspond to the alphabetical order of cluster names. 
}
  \item{cluster.summary}{
\code{logical}; whether to display the cluster summary table if \code{cluster} is specified. If \code{which.cluster} is \code{NA}, \code{cluster.summary} will be set to \code{TRUE}.
}
  \item{imp}{
either a vector containing imputation indices for each unit or a string containing the name of the imputation index variable in \code{data}. 
}
  \item{which.imp}{
which imputation(s) to display. If \code{NULL}, all imputations in \code{imp} will be displayed. If \code{NA}, no imputations will be displayed. Otherwise, can be a vector of imputation indices for which to display balance.
}
  \item{imp.summary}{
\code{logical}; whether to display the across-imputation summary table if \code{imp} is specified. If \code{which.imp} is \code{NA}, \code{imp.summary} will be set to \code{TRUE}.
}
  \item{estimand}{
\code{character}; whether the desired estimand is the "ATT", "ATC", or "ATE". This argument can be used in place of \code{s.d.denom} to specify how standardized differences are calculated.
}
  \item{quick}{
\code{logical}; if \code{TRUE}, will not compute any values that will not be displayed. Leave \code{FALSE} if computed values not displayed will be used later.
}
  \item{...}{
further arguments passed to or from other methods. They are ignored in this function.
}
}
\details{
\code{bal.tab.data.frame()} generates a list of balance summaries for the data frame of covariates and treatment status values given. \code{bal.tab.formula()} does the same but uses a formula interface instead.  When the formula interface is used, the formula and data are reshaped into a treatment vector and data frame of covariates and then simply passed through the data frame method.  

The argument to \code{match.strata} corresponds to a factor vector containing the name or index of each pair/stratum for units conditioned through matching, for example, using the \pkg{optmatch} package. If more than one of \code{weights}, \code{subclass}, or \code{match.strata} are specified, \code{bal.tab()} will attempt to figure out which one to apply. Currently only one of these can be applied at a time. \code{bal.tab()} behaves differently depending on whether subclasses are used in conditioning or not. If they are used, \code{bal.tab()} creates balance statistics for each subclass and for the sample in aggregate.

All balance statistics are calculated whether they are displayed by print or not, unless \code{quick = TRUE}.  The threshold values (\code{m.threshold}, \code{v.threshold}, and \code{r.threshold}) control whether extra columns should be inserted into the Balance table describing whether the balance statistics in question exceeded or were within the threshold.  Including these thresholds also creates summary tables tallying the number of variables that exceeded and were within the threshold and displaying the variables with the greatest imbalance on that balance measure.  When subclassification is used, the extra threshold columns are placed within the balance tables for each subclass as well as in the aggregate balance table, and the summary tables display balance for each subclass.

The inputs (if any) to \code{covs} must be a data frame; if more than one variable is included, this is straightforward (i.e., because \code{data[,c("v1", "v2")]} is already a data frame), but if only one variable is used (e.g., \code{data[,"v1"]}), R will coerce it to a vector, thus making it unfit for input. To avoid this, simply wrap the input to \code{covs} in \code{data.frame()} or use \code{subset()} if only one variable is to be added. Again, when more than one variable is included, the input is generally already a data frame and nothing needs to be done.

Clusters and imputations can be used at the same time, but the resulting output may be quite large. Setting \code{which.cluster} or \code{which.imp} to \code{NA} can help keep the output clean.

}
\value{
If clusters are not specified, an object of class \code{"bal.tab"} containing balance summaries for the data object.  If subclassification is not used, the following are the elements of \code{bal.tab}:
\item{Balance}{A data frame containing balance information for each covariate.  Balance contains the following columns:
\itemize{
\item{\code{Type}: Whether the covariate is binary, continuous, or a measure of distance (e.g., the propensity score).}
\item{\code{M.C.Un}: The mean of the control group prior to adjusting.}
\item{\code{M.T.Un}: The mean of the treated group prior to adjusting.}
\item{\code{Diff.Un}: The (standardized) difference in means between the two groups prior to adjusting.}
\item{\code{V.Ratio.Un}: The ratio of the variances of the two groups prior to adjusting.  \code{NA} for binary variables.  If less than 1, the reciprocal is reported.}
\item{\code{M.C.Adj}: The mean of the control group after adjusting.}
\item{\code{M.T.Adj}: The mean of the treated group after adjusting.}
\item{\code{Diff.Adj}: The (standardized) difference in means between the two groups after adjusting.}
\item{\code{M.Threshold}: Whether or not the calculated mean difference after adjusting exceeds or is within the threshold given by \code{m.threshold}.  If \code{m.threshold} is \code{NULL}, this column will be \code{NA}.}
\item{\code{V.Ratio.Adj}: The ratio of the variances of the two groups after adjusting.  \code{NA} for binary variables.  If less than 1, the reciprocal is reported.}
\item{\code{V.Threshold}: Whether or not the calculated variance ratio after adjusting exceeds or is within the threshold given by \code{v.threshold} for continuous variables.  If \code{v.threshold} is \code{NULL}, this column will be \code{NA}.}
}}
\item{Balanced.Means}{If \code{m.threshold} is specified, a table tallying the number of variables that exceed or are within the threshold for mean differences.}
\item{Max.Imbalance.Means}{If \code{m.threshold} is specified, a table displaying the variable with the greatest absolute mean difference.}
\item{Balanced.Variances}{If \code{v.threshold} is specified, a table tallying the number of variables that exceed or are within the threshold for variance ratios.}
\item{Max.Imbalance.Variance}{If \code{v.threshold} is specified, a table displaying the variable with the greatest variance ratio.}
\item{Observations}{A table displaying the sample sizes before and after adjusting.}
\item{call}{\code{NULL}.}
\item{print.options}{A list of print options passed to \code{print.bal.tab}.}

If clusters are specified, an object of class \code{"bal.tab.cluster"} containing balance summaries within each cluster and a summary of balance across clusters. Each balance summary is a balance table as described in \code{Balance} above. The summary of balance across clusters displays the mean, median, and maximum mean difference and variance ratio after adjustment for each covariate across clusters. Minimum statistics are calculated as well, but not displayed. To see these, use the options in \code{\link{print.bal.tab.cluster}}.

If imputations are specified, an object of class \code{"bal.tab.imp"} containing balance summaries for each imputation and a summary of balance across imputations, just as with clusters.

If both clusters and imputations are specified, an object of class \code{"bal.tab.imp.cluster"} containing summaries between and across all clusters and imputations.

If subclassification is used, the following are the elements of \code{bal.tab}:
\item{Subclass.Balance}{A list of data frames containing balance information for each covariate in each subclass.  Each data frame contains the following columns:
\itemize{
\item{\code{Type}: Whether the covariate is binary, continuous, or a measure of distance (e.g., the propensity score).}
\item{\code{M.C.Adj}: The mean of the control group in the subclass.}
\item{\code{M.T.Adj}: The mean of the treated group in the subclass.}
\item{\code{Diff.Adj}: The (standardized) difference in means between the two groups in the subclass.}
\item{\code{M.Threshold}: Whether or not the calculated mean difference exceeds or is within the threshold given by \code{m.threshold}.  If \code{m.threshold} is \code{NULL}, this column will be \code{NA}.}
\item{\code{V.Ratio.Adj}: The ratio of the variances of the two groups in the subclass.  \code{NA} for binary variables.  If less than 1, the reciprocal is reported.}
\item{\code{V.Threshold}: Whether or not the calculated variance ratio exceeds or is within the threshold given by \code{v.threshold} for continuous variables.  If \code{v.threshold} is \code{NULL}, this column will be \code{NA}.}
}}

\item{Balance.Across.Subclass}{A data frame containing balance statistics for each covariate aggregated across subclasses and for the original sample (i.e., unadjusted).  Variance ratios are not reported here.}
\item{Balanced.Means.Subclass}{If \code{m.threshold} is specified, a table tallying the number of variables in each subclass that exceed or are within the threshold for mean differences.}
\item{Max.Imbalance.Means.Subclass}{If \code{m.threshold} is specified, a table displaying the variable in each subclass with the greatest absolute mean difference.}
\item{Balanced.Variances.Subclass}{If \code{v.threshold} is specified, a table tallying the number of variables in each subclass that exceed or are within the threshold for variance ratios.}
\item{Max.Imbalance.Variance.Subclass}{If \code{v.threshold} is specified, a table displaying the variable in each subclass with the greatest variance ratio.}
\item{Subclass.Observations}{A table displaying the sample sizes in each subclass.}
\item{call}{\code{NULL}.}
\item{print.options}{A list of print options passed to \code{print.bal.tab.subclass}.}

If treatment is continuous, means, mean differences, and variance ratios are replaced by (weighted) Pearson correlations between each covariate and treatment. The \code{r.threshold} argument works the same as \code{m.threshold} or \code{v.threshold}, adding an extra column to the balance table output and creating additional summaries for balance tallies and maximum imbalances. All arguments related to the calculation or display of mean differences or variance ratios are ignored. The \code{int}, \code{distance}, \code{addl}, \code{un}, cluster and imputation arguments are still used as described above.
}


\author{
Noah Greifer \email{noah@unc.edu}
}

\seealso{
\code{\link{bal.tab}} for details of calculations.
}
\examples{
data("lalonde", package = "cobalt")
lalonde$p.score <- glm(treat ~ age + educ + race, data = lalonde, 
            family = "binomial")$fitted.values
covariates <- subset(lalonde, 
                     select = c(age, educ, race))
                     
## Propensity score weighting using IPTW
lalonde$iptw.weights <- ifelse(lalonde$treat==1, 
                               1/lalonde$p.score, 
                               1/(1-lalonde$p.score))

# data frame interface:
bal.tab(covariates, treat = "treat", data = lalonde, 
      weights = "iptw.weights", method = "weighting", 
      s.d.denom = "pooled")

# Formula interface:
bal.tab(treat ~ age + educ + race, data = lalonde, 
      weights = "iptw.weights", method = "weighting", 
      s.d.denom = "pooled")
      
## Propensity score subclassification
lalonde$subclass <- findInterval(lalonde$p.score, 
                        quantile(lalonde$p.score[lalonde$treat==1], 
                        (0:6)/6), all.inside = TRUE)

# data frame interface:
bal.tab(covariates, treat = "treat", data = lalonde, 
      subclass = "subclass", method = "subclassification", 
      disp.subclass = TRUE)

# Formula interface:
bal.tab(treat ~ age + educ + race, data = lalonde, 
      subclass = "subclass", method = "subclassification", 
      disp.subclass = TRUE)
}
\keyword{design}
