% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/clustering.R
\name{Clustering}
\alias{Clustering}
\title{Consensus clustering}
\usage{
Clustering(
  xdata,
  nc = NULL,
  eps = NULL,
  Lambda = NULL,
  K = 100,
  tau = 0.5,
  seed = 1,
  n_cat = 3,
  implementation = HierarchicalClustering,
  scale = TRUE,
  linkage = "complete",
  row = TRUE,
  optimisation = c("grid_search", "nloptr"),
  n_cores = 1,
  output_data = FALSE,
  verbose = TRUE,
  beep = NULL,
  ...
)
}
\arguments{
\item{xdata}{data matrix with observations as rows and variables as columns.}

\item{nc}{matrix of parameters controlling the number of clusters in the
underlying algorithm specified in \code{implementation}. If \code{nc} is
not provided, it is set to \code{seq(1, tau*nrow(xdata))}.}

\item{eps}{radius in density-based clustering, see
\code{\link[dbscan]{dbscan}}. Only used if
\code{implementation=DBSCANClustering}.}

\item{Lambda}{vector of penalty parameters for weighted distance calculation.
Only used for distance-based clustering, including for example
\code{implementation=HierarchicalClustering},
\code{implementation=PAMClustering}, or
\code{implementation=DBSCANClustering}.}

\item{K}{number of resampling iterations.}

\item{tau}{subsample size, expressed as the proportion of observations drawn
at each resampling iteration.}

\item{seed}{value of the seed to initialise the random number generator and
ensure reproducibility of the results (see \code{\link[base]{set.seed}}).}

\item{n_cat}{computation options for the stability score. Possible values are
2 or 3 (the default is 3) to use the score based on the negative
log-likelihood, or \code{NULL} to use the score based on a z test.}

\item{implementation}{function to use for clustering. Possible functions
include \code{\link{HierarchicalClustering}} (hierarchical clustering),
\code{\link{PAMClustering}} (Partitioning Around Medoids),
\code{\link{KMeansClustering}} (k-means) and \code{\link{GMMClustering}}
(Gaussian Mixture Models). Alternatively, a user-defined function taking
\code{xdata} and \code{Lambda} as arguments and returning a binary and
symmetric matrix for which diagonal elements are equal to zero can be used.}

\item{scale}{logical indicating if the data should be scaled to ensure that
all variables contribute equally to the clustering of the observations.}

\item{linkage}{character string indicating the type of linkage used in
hierarchical clustering to define the stable clusters. Possible values
include \code{"complete"}, \code{"single"} and \code{"average"} (see
argument \code{"method"} in \code{\link[stats]{hclust}} for a full list).
Only used if \code{implementation=HierarchicalClustering}.}

\item{row}{logical indicating if rows (if \code{row=TRUE}) or columns (if
\code{row=FALSE}) contain the items to cluster.}

\item{optimisation}{character string indicating the type of optimisation
method to calibrate the regularisation parameter (only used if
\code{Lambda} is not \code{NULL}). With \code{optimisation="grid_search"}
(the default), all values in \code{Lambda} are visited.
Alternatively, optimisation algorithms implemented in
\code{\link[nloptr]{nloptr}} can be used with \code{optimisation="nloptr"}.
By default, we use \code{"algorithm"="NLOPT_GN_DIRECT_L"},
\code{"xtol_abs"=0.1}, \code{"ftol_abs"=0.1} and
\code{"maxeval"} defined as \code{length(Lambda)}. These values can be
changed by providing the argument \code{opts} (see
\code{\link[nloptr]{nloptr}}).}

\item{n_cores}{number of cores to use for parallel computing (see argument
\code{workers} in \code{\link[future]{multisession}}). Using
\code{n_cores>1} is only supported with \code{optimisation="grid_search"}.}

\item{output_data}{logical indicating if the input dataset \code{xdata}
should be included in the output.}

\item{verbose}{logical indicating if a loading bar and messages should be
printed.}

\item{beep}{sound indicating the end of the run. Possible values are:
\code{NULL} (no sound) or an integer between 1 and 11 (see argument
\code{sound} in \code{\link[beepr]{beep}}).}

\item{...}{additional parameters passed to the functions provided in
\code{implementation} or \code{resampling}.}
}
\value{
An object of class \code{clustering}. A list with: \item{Sc}{a matrix
  of the best stability scores for different (sets of) parameters controlling
  the number of clusters and penalisation of attribute weights.} \item{nc}{a
  matrix of numbers of clusters.} \item{Lambda}{a matrix of regularisation
  parameters for attribute weights.} \item{Q}{a matrix of the average number
  of selected attributes by the underlying algorithm with different
  regularisation parameters.} \item{coprop}{an array of consensus matrices.
  Rows and columns correspond to items. Indices along the third dimension
  correspond to different parameters controlling the number of clusters and
  penalisation of attribute weights.} \item{selprop}{an array of selection
  proportions. Columns correspond to attributes. Rows correspond to different
  parameters controlling the number of clusters and penalisation of attribute
  weights.} \item{method}{a list with \code{type="clustering"} and values
  used for arguments \code{implementation}, \code{linkage}, and
  \code{resampling}.} \item{params}{a list with values used for arguments
  \code{K}, \code{tau}, \code{pk}, \code{n} (number of observations in
  \code{xdata}), and \code{seed}.} The rows of \code{Sc}, \code{nc},
  \code{Lambda}, \code{Q}, \code{selprop} and indices along the third
  dimension of \code{coprop} are ordered in the same way and correspond to
  parameter values stored in \code{nc} and \code{Lambda}.
}
\description{
Performs consensus (weighted) clustering. The underlying algorithm (e.g.
hierarchical clustering) is run with different numbers of clusters \code{nc}.
In consensus weighted clustering, weighted distances are calculated using the
\code{\link[rCOSA]{cosa2}} algorithm with different penalty parameters
\code{Lambda}. The hyper-parameters are calibrated by maximisation of the
consensus score.
}
\details{
In consensus clustering, a clustering algorithm is applied on
  \code{K} subsamples of the observations with different numbers of clusters
  provided in \code{nc}. If \code{row=TRUE} (the default), the observations
  (rows) are the items to cluster. If \code{row=FALSE}, the variables
  (columns) are the items to cluster. For a given number of clusters, the
  consensus matrix \code{coprop} stores the proportion of iterations where
  two items were in the same estimated cluster, out of all iterations where
  both items were drawn in the subsample.

  Stable cluster membership is obtained by applying a distance-based
  clustering method using \code{(1-coprop)} as distance (see
  \code{\link{Clusters}}).

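  The hyper-parameters (the number of clusters \code{nc} and, in consensus
  weighted clustering, the penalty parameter \code{Lambda}) can be calibrated
  by maximisation of a stability score (see \code{\link{ConsensusScore}})
  calculated under the null hypothesis of equiprobability of co-membership.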

  It is strongly recommended to examine the calibration plot (see
  \code{\link{CalibrationPlot}}) to check that there is a clear maximum. The
  absence of a clear maximum suggests that the clustering is not stable;
  consensus clustering outputs should not be trusted in that case.

  To ensure reproducibility of the results, the starting number of the random
  number generator is set to \code{seed}.

  For parallelisation, consensus clustering with different sets of parameters
  can be run on \code{n_cores} cores. Using \code{n_cores > 1} creates a
  \code{\link[future]{multisession}}.
}
\examples{
\donttest{
# Consensus clustering
# Fix the random seed so that the simulated dataset is reproducible
set.seed(1)
# Simulate a dataset to cluster (n presumably gives the sizes of three groups
# of 30 observations each; see SimulateClustering for nu_xc and ev_xc)
simul <- SimulateClustering(
  n = c(30, 30, 30), nu_xc = 1, ev_xc = 0.5
)
# Run with the default settings: hierarchical clustering and no penalty
# parameters Lambda
stab <- Clustering(xdata = simul$data)
print(stab)
# Check that the calibration curve shows a clear maximum before trusting the
# consensus clustering outputs
CalibrationPlot(stab)
summary(stab)
# Stable cluster membership
Clusters(stab)
plot(stab)

# Consensus weighted clustering
# Weighted distances are calculated with the rCOSA package, so this part only
# runs if rCOSA is installed
if (requireNamespace("rCOSA", quietly = TRUE)) {
  set.seed(1)
  # Simulate a dataset with 20 variables (theta_xc presumably flags the first
  # 10 as contributing to the clustering and the last 10 as not - confirm in
  # SimulateClustering)
  simul <- SimulateClustering(
    n = c(30, 30, 30), pk = 20,
    theta_xc = c(rep(1, 10), rep(0, 10)),
    ev_xc = 0.9
  )
  # Providing Lambda switches on weighted clustering; with the default
  # optimisation = "grid_search", all values in Lambda are visited.
  # noit and niter are not arguments of Clustering: they are forwarded through
  # the dots (presumably to the weighted distance calculation - confirm)
  stab <- Clustering(
    xdata = simul$data,
    Lambda = LambdaSequence(lmin = 0.1, lmax = 10, cardinal = 10),
    noit = 20, niter = 10
  )
  print(stab)
  CalibrationPlot(stab)
  summary(stab)
  Clusters(stab)
  plot(stab)
  # Presumably displays the attribute weights from weighted clustering
  WeightBoxplot(stab)
}
}
}
\references{
\insertRef{JStatSoft}{sharp}

  \insertRef{OurConsensusClustering}{sharp}

  \insertRef{rCOSA}{sharp}

  \insertRef{COSA}{sharp}

  \insertRef{ConsensusClustering}{sharp}
}
\seealso{
\code{\link{Resample}}, \code{\link{ConsensusScore}},
  \code{\link{HierarchicalClustering}}, \code{\link{PAMClustering}},
  \code{\link{KMeansClustering}}, \code{\link{GMMClustering}}

Other stability functions: 
\code{\link{BiSelection}()},
\code{\link{GraphicalModel}()},
\code{\link{StructuralModel}()},
\code{\link{VariableSelection}()}
}
\concept{stability functions}
