% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textstat_dist.R, R/textstat_simil.R
\name{textstat_dist}
\alias{textstat_dist}
\alias{textstat_simil}
\title{Distance matrix between documents and/or features}
\usage{
textstat_dist(x, selection = character(0), n = NULL,
  margin = c("documents", "features"), method = "euclidean", upper = TRUE,
  diag = FALSE, p = 2)

textstat_simil(x, selection = character(0), n = NULL,
  margin = c("documents", "features"), method = "correlation",
  upper = FALSE, diag = FALSE)
}
\arguments{
\item{x}{a \link{dfm} object}

\item{selection}{character or character vector of document names or feature 
labels from the dfm}

\item{n}{the top \code{n} highest-ranking items will be returned.  If \code{n} is 
\code{NULL}, return all items.}

\item{margin}{identifies the margin of the dfm on which similarity or 
distance will be computed:  \code{documents} for documents or 
\code{features} for word/term features.}

\item{method}{the distance or similarity measure to be used; see Details}

\item{upper}{whether the upper triangle of the symmetric \eqn{V \times V} 
matrix is recorded}

\item{diag}{whether the diagonal of the distance matrix should be recorded}

\item{p}{The power of the Minkowski distance.}
}
\description{
These functions compute a distance or similarity matrix between documents and/or 
features from a \code{\link{dfm}} and return a standard \code{\link[stats]{dist}} object.
}
\details{
\code{textstat_dist} options are: \code{"euclidean"} (default), 
\code{"canberra"}, \code{"Chisquared"}, \code{"Chisquared2"}, \code{"hamming"}, \code{"kullback"},
\code{"manhattan"}, \code{"maximum"}, and \code{"minkowski"}.

\code{textstat_simil} options are: \code{"correlation"} (default),
\code{"cosine"}, \code{"jaccard"}, \code{"eJaccard"},
\code{"dice"}, \code{"eDice"}, \code{"simple matching"}, \code{"hamann"}, and
\code{"faith"}.
}
\note{
If you want to compute similarity on a "normalized" dfm object 
  (controlling for variable document lengths, for methods such as correlation
  for which different document lengths matter), then wrap the input dfm in 
  \code{\link{weight}(x, "relFreq")}.
}
\examples{
# create a dfm from inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(inaugCorpus, Year > 1980), 
               remove = stopwords("english"), stem = TRUE)
               
## distance

# compute some document distances
(tmp <- textstat_dist(presDfm, margin = "documents"))

# for specific comparisons
textstat_dist(presDfm, "1985-Reagan", n = 5, margin = "documents")
textstat_dist(presDfm, c("2009-Obama" , "2013-Obama"), n = 5, margin = "documents")
textstat_dist(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents")
textstat_dist(presDfm, "2005-Bush", margin = "documents", method = "eJaccard")

## similarities

# compute some document similarities
(tmp <- textstat_simil(presDfm, margin = "documents"))

# output as a list
as.list(tmp)[1:2]

# for specific comparisons
textstat_simil(presDfm, "1985-Reagan", n = 5, margin = "documents")
textstat_simil(presDfm, c("2009-Obama" , "2013-Obama"), n = 5, margin = "documents")
textstat_simil(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents")
textstat_simil(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents", method = "cosine")

# compute some term similarities
textstat_simil(presDfm, c("fair", "health", "terror"), method = "cosine", 
               margin = "features", 20)

}
\seealso{
\link{dfm}

\code{\link{textstat_dist}}, \code{\link{as.list.dist}}
}
\author{
Haiyan Wang
}
