% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_weight.R
\name{dfm_weight}
\alias{dfm_weight}
\alias{dfm_smooth}
\title{Weight the feature frequencies in a dfm}
\usage{
dfm_weight(x, type = c("frequency", "relFreq", "relMaxFreq", "logFreq",
  "tfidf"), weights = NULL)

dfm_smooth(x, smoothing = 1)
}
\arguments{
\item{x}{document-feature matrix created by \link{dfm}}

\item{type}{a label of the weight type:
\describe{
 \item{\code{"frequency"}}{integer feature count (default when a dfm is created)}
 \item{\code{"relFreq"}}{feature counts as a proportion of total feature counts (aka relative frequency)}
 \item{\code{"relMaxFreq"}}{feature counts as a proportion of the highest feature count in a document}
 \item{\code{"logFreq"}}{the base-10 logarithm of 1 + the feature count}
 \item{\code{"tfidf"}}{Term-frequency * inverse document frequency. For a
  full explanation, see, for example, 
  \url{http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html}.
   This implementation will not return negative values.  For finer-grained
  control, call \code{\link{tfidf}} directly.}
  }}

\item{weights}{if \code{type} is unused, then \code{weights} can be 
a named numeric vector of weights to be applied to the dfm, 
where the names of the vector correspond to feature labels of the dfm, and 
the weights will be applied as multipliers to the existing feature counts 
for the corresponding named features.  Any features not named will be 
assigned a weight of 1.0 (meaning they will be unchanged).}

\item{smoothing}{constant added to the dfm cells for smoothing, default is 1}
}
\value{
The dfm with weighted values.
}
\description{
Returns a document by feature matrix with the feature frequencies weighted 
according to one of several common methods.  Some shortcut functions that offer finer-grained control are:
\describe{
\item{\code{\link{tf}}}{compute term frequency weights}
\item{\code{\link{tfidf}}}{compute term frequency-inverse document frequency weights}
\item{\code{\link{docfreq}}}{compute document frequencies of features}
}
}
\details{
This converts a matrix from sparse to dense format, so it may exceed available
memory depending on the size of your input matrix.
}
\note{
For finer-grained control, consider calling the convenience functions directly.
}
\examples{
# build a document-feature matrix from the inaugural address corpus
dtm <- dfm(data_corpus_inaugural)

# manual computation over rows: each count divided by the row maximum
x <- apply(dtm, 1, function(tf) tf/max(tf))

# compare the top features under each built-in weighting type
topfeatures(dtm)
normDtm <- dfm_weight(dtm, "relFreq")
topfeatures(normDtm)
maxTfDtm <- dfm_weight(dtm, type = "relMaxFreq")
topfeatures(maxTfDtm)
logTfDtm <- dfm_weight(dtm, type = "logFreq")
topfeatures(logTfDtm)
tfidfDtm <- dfm_weight(dtm, type = "tfidf")
topfeatures(tfidfDtm)

# combine these methods for more complex dfm_weightings, e.g. as in Section 6.4
# of Introduction to Information Retrieval
head(logTfDtm <- dfm_weight(dtm, type = "logFreq"))
head(tfidf(logTfDtm, normalize = FALSE))

# apply numeric weights
str <- c("apple is better than banana", "banana banana apple much better")
(mydfm <- dfm(str, remove = stopwords("english")))
# features without a named weight (here: better) keep an implicit weight of 1.0
dfm_weight(mydfm, weights = c(apple = 5, banana = 3, much = 0.5))

\dontshow{
testdfm <- dfm(data_corpus_inaugural[1:5])
for (w in c("frequency", "relFreq", "relMaxFreq", "logFreq", "tfidf")) {
    testw <- dfm_weight(testdfm, w)
    cat("\\n\\n=== weight() TEST for:", w, "; class:", class(testw), "\\n")
    # values are not auto-printed inside a for loop, so print explicitly
    print(head(testw))
}}
# smooth the dfm
dfm_smooth(mydfm, 0.5)
}
\references{
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schutze.
  \emph{Introduction to Information Retrieval}. Vol. 1. Cambridge: Cambridge 
  University Press, 2008.
}
\seealso{
\code{\link{tf}},  \code{\link{tfidf}}, \code{\link{docfreq}}
}
\author{
Paul Nulty and Kenneth Benoit
}
\keyword{dfm}
