% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokenize.R
\name{tokenize}
\alias{tokenize}
\title{Tokenize a character vector}
\usage{
tokenize(text, removePunc = TRUE, removeNum = TRUE, toLower = TRUE,
  stemWords = TRUE, rmStopWords = TRUE)
}
\arguments{
\item{text}{The character vector to be tokenized.}

\item{removePunc}{\code{TRUE} or \code{FALSE} indicating whether or not to remove punctuation from \code{text}.  If \code{TRUE}, punctuation will be removed.  Defaults to \code{TRUE}.}

\item{removeNum}{\code{TRUE} or \code{FALSE} indicating whether or not to remove numbers from \code{text}.  If \code{TRUE}, numbers will be removed.  Defaults to \code{TRUE}.}

\item{toLower}{\code{TRUE} or \code{FALSE} indicating whether or not to coerce all of \code{text} to lowercase.  If \code{TRUE}, \code{text} will be coerced to lowercase.  Defaults to \code{TRUE}.}

\item{stemWords}{\code{TRUE} or \code{FALSE} indicating whether or not to stem resulting tokens.  If \code{TRUE}, the resulting tokens will be stemmed using \code{SnowballC::wordStem()}.  Defaults to \code{TRUE}.}

\item{rmStopWords}{\code{TRUE}, \code{FALSE}, or character vector of stopwords to remove. If \code{TRUE}, words in \code{tm::stopwords("SMART")} will be removed prior to stemming. If \code{FALSE}, no stopword removal will occur. If a character vector is passed, this vector will be used as the list of stopwords to be removed.  Defaults to \code{TRUE}.}
}
\description{
Tokenize a character vector by parsing its elements into a list of cleaned tokens.
}
\examples{
tokenize("Mr. Feeny said the test would be on Sat. At least I'm 99.9\% sure that's what he said.")
tokenize("Bill is trying to earn a Ph.D. in his field.", rmStopWords=FALSE)
}
