
#' Get binary representation of categorical variables
#'
#' @param categories A vector, data.frame or matrix representing one
#'     or several categorical variables
#' @param use_combinations Logical, should the output also include columns representing
#'    the combination / interaction of the categories (defaults to \code{FALSE}).
#'
#' @return A matrix encoding the categorical variable(s) in binary form. 
#'
#' @details
#' 
#' The conversion of categorical variables to binary variables is done via
#' \code{\link[stats]{model.matrix}}. Since version 0.8.9, each category
#' of a categorical variable is coded by a separate variable ('one hot' encoding).
#' So we do not use 'dummy' coding, which is often used to encode predictors 
#' in statistical analysis. Dummy coding uses a reference category that is not
#' explicitly modelled via a separate variable. This implies that there is
#' a different distance to the reference category than among the other 
#' categories, which is unwarranted in anticlustering (thanks to Gunnar Klau
#' for noting this). See examples. 
#' 
#' This function can be used to include categorical variables as part of the 
#' optimization criterion in anticlustering, rather than including them as hard constraints as done when using the 
#' argument \code{categories} in \code{\link{anticlustering}} (or \code{\link{fast_anticlustering}}). 
#' This way, categorical variables are treated as numeric variables, 
#' which can be useful when there are several
#' categorical variables or when the group sizes are unequal (or both).
#' See examples. Please see the vignette 'Using categorical variables with anticlustering'
#' for more information on this approach.
#' 
#' Since version 0.8.12, this function treats NA in the categorical input variables
#' as a distinct category. This implies that different observations having an NA are treated
#' as having the same value on this variable. If this is not desired, you need to do some other
#' preprocessing of the NAs before calling this function.
#' 
#' @importFrom stats as.formula model.matrix contrasts
#'
#'
#' @author
#' Martin Papenberg \email{martin.papenberg@@hhu.de}
#'
#' @references
#' 
#' Papenberg, M., Wang, C., Diop, M., Bukhari, S. H., Oskotsky, B., Davidson, B. R., 
#' Vo, K. C., Liu, B., Irwin, J. C., Combes, A., Gaudilliere, B., Li, J., Stevenson, D. K., 
#' Klau, G. W., Giudice, L. C., Sirota, M., & Oskotsky, T. T. (2025). Anticlustering
#' for sample allocation to minimize batch effects. Cell Reports Methods, 5(8), 
#' 101137. https://doi.org/10.1016/j.crmeth.2025.101137
#' 
#' @export
#'
#' @examples
#' 
#' # Illustrate why dummy encoding is not appropriate for anticlustering.
#' # Use 3 elements of the iris data set, with different levels of 'Species'
#' input <- iris[c(1, 51, 101), "Species", drop = FALSE]
#' input
#' # Default dummy encoding: 
#' (dummy <- model.matrix(~ . , input))
#' dist(dummy) 
#' # The distance between versicolor and virginica is larger than the distances
#' # between setosa and versicolor and between setosa and virginica. This would
#' # bias the anticlustering computation.
#' (one_hot <- categories_to_binary(input))
#' dist(one_hot) # all the same distances 
#' dist(one_hot)^2 # or use squared Euclidean distance (corresponds to Manhattan distance in this case)
#' dist(one_hot, method = "manhattan")
#' 
#' # For a single categorical variable, one-hot encoding can also be obtained
#' # directly via model.matrix(), which categories_to_binary() uses internally:
#' model.matrix(~ . -1, input) # suppresses the 'intercept'
#' 
#' # Use Schaper data set for anticlustering example
#' data(schaper2019)
#' features <- schaper2019[, 3:6]
#' K <- 3
#' N <- nrow(features) 
#' 
#' # - Generate data input for k-means anticlustering -
#' # We conduct k-plus anticlustering by first generating k-plus variables, 
#' # and also include the categorical variable as "numeric" input for the 
#' # k-means optimization (rather than as input for the argument categories)
#' 
#' input_data <- cbind(
#'   kplus_moment_variables(features, T = 2), 
#'   categories_to_binary(schaper2019$room) 
#' )
#' 
#' kplus_groups <- anticlustering(
#'   input_data, 
#'   K = K,
#'   objective = "variance",
#'   method = "local-maximum", 
#'   repetitions = 10
#' )
#' mean_sd_tab(features, kplus_groups)
#' table(kplus_groups, schaper2019$room) # argument categories was not used!
#' 
#'  

categories_to_binary <- function(categories, use_combinations = FALSE) {
  validate_input(use_combinations, "use_combinations", objmode = "logical", len = 1,
                 input_set = c(TRUE, FALSE), not_na = TRUE, not_function = TRUE)
  # Coerce every input variable to a factor; `exclude = NULL` keeps NA as a
  # level of its own, so that missing values form a distinct category.
  categories <- data.frame(categories)
  categories <- as.data.frame(lapply(categories, factor, exclude = NULL))
  n_vars <- ncol(categories)
  if (n_vars == 0L) {
    stop("Argument 'categories' must contain at least one variable.", call. = FALSE)
  }
  # Generic column names guarantee syntactically valid terms in the formula.
  colnames(categories) <- paste0("X", seq_len(n_vars))
  # A factor with a single level cannot carry contrasts; represent such a
  # variable by a constant numeric column (all 1) instead.
  for (i in seq_len(n_vars)) {
    if (nlevels(categories[[i]]) == 1L) {
      categories[, i] <- 1
    }
  }
  # " * " adds the interaction terms, " + " yields main effects only.
  combine_by <- if (use_combinations) " * " else " + "
  formula_string <- paste(
    "~", paste(colnames(categories), collapse = combine_by), "- 1"
  )
  # Without an intercept, model.matrix() expands only the first factor into
  # one indicator column per level; every further factor would fall back to
  # the default treatment (dummy) contrasts and lose its first level.
  # Supplying identity contrast matrices forces one column per level for all
  # factors (and one column per level combination in interaction terms).
  multi_level <- vapply(
    categories,
    function(x) is.factor(x) && nlevels(x) >= 2L,
    logical(1)
  )
  full_contrasts <- NULL
  if (any(multi_level)) {
    full_contrasts <- lapply(
      categories[multi_level], contrasts, contrasts = FALSE
    )
  }
  model.matrix(
    as.formula(formula_string),
    data = categories,
    contrasts.arg = full_contrasts
  )
}
