% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/S4classes.R, R/corpus.R
\docType{class}
\name{corpus-class}
\alias{corpus-class}
\alias{zoom}
\alias{corpus}
\alias{get_corpus}
\alias{remote_corpus}
\alias{remote_corpus-class}
\alias{corpus,character-method}
\alias{corpus,missing-method}
\title{Corpus class initialization}
\usage{
\S4method{corpus}{character}(.Object, server = NULL, restricted)

\S4method{corpus}{missing}()
}
\arguments{
\item{.Object}{The upper-case ID of a CWB corpus stated by a
length-one \code{character} vector.}

\item{server}{If \code{NULL} (default), the corpus is expected to be present
locally. If provided, the name of an OpenCPU server (can be an IP address)
that hosts a corpus, or several corpora. The \code{corpus}-method will then
instantiate a \code{remote_corpus} object.}

\item{restricted}{A \code{logical} value, whether access to a remote corpus is
restricted (\code{TRUE}) or not (\code{FALSE}).}
}
\description{
Corpora indexed using the Corpus Workbench (CWB) offer an efficient data
structure for large, linguistically annotated corpora. The
\code{corpus}-class keeps basic information on a CWB corpus. Corresponding to
the name of the class, the \code{corpus}-method is the initializer for
objects of the \code{corpus} class. A CWB corpus can also be hosted remotely
on an \href{https://www.opencpu.org}{OpenCPU} server. The \code{remote_corpus}
class (which inherits from the \code{corpus} class) keeps the information
needed to access such a remote corpus. A (limited) set of polmineR functions
and methods can be executed on the corpus on the remote machine from the
local R session by calling them on the \code{remote_corpus} object. Calling the
\code{corpus}-method without an argument will return a \code{data.frame} with
basic information on the corpora that are available.
}
\details{
Calling \code{corpus()} will return a \code{data.frame} listing the corpora
available locally and described in the active registry directory, and some
basic information on the corpora.

A \code{corpus} object is instantiated by passing a corpus ID as
  argument \code{.Object}. Following the conventions of the Corpus Workbench
  (CWB), corpus IDs are written in upper case. If \code{.Object} includes
  lower case letters, the \code{corpus} object is instantiated nevertheless,
  but a warning is issued to prevent bad practice. If \code{.Object} is not a
  known corpus, the error message will include a suggestion if there is a
  potential candidate that can be identified by \code{agrep}.

A limited set of methods of the \code{polmineR} package is exposed
  to be executed on a remote OpenCPU server. As a matter of convenience, the
  whereabouts of an OpenCPU server hosting a CWB corpus can be stated in an
  environment variable \code{OPENCPU_SERVER}. Environment variables for R sessions
  can be set easily in the \code{.Renviron} file. A convenient way to do this
  is to call \code{usethis::edit_r_environ()}.
}
\section{Slots}{

\describe{
\item{\code{corpus}}{A length-one \code{character} vector, the upper-case ID of a CWB
corpus.}

\item{\code{data_dir}}{The directory where the files for the indexed corpus are located.}

\item{\code{type}}{The type of the corpus (e.g. "plpr" for a corpus of plenary protocols).}

\item{\code{name}}{An additional name for the object that may be more telling than
the corpus ID.}

\item{\code{encoding}}{The encoding of the corpus, given as a length-one
\code{character} vector.}

\item{\code{size}}{Number of tokens (size) of the corpus, a length-one \code{integer}
vector.}

\item{\code{server}}{The URL (can be an IP address) of the OpenCPU server. The slot is
available only with the \code{remote_corpus} class inheriting from the
\code{corpus} class.}

\item{\code{user}}{If the corpus on the server requires authentication, the username.}

\item{\code{password}}{If the corpus on the server requires authentication, the password.}
}}

\examples{
use("polmineR")

# get a data.frame with the corpora present locally
y <- corpus()

# initialize corpus object
r <- corpus("REUTERS")
r <- corpus("reuters") # will work, but will result in a warning

# apply core polmineR methods
a <- size(r)
b <- s_attributes(r)
c <- count(r, query = "oil")
d <- dispersion(r, query = "oil", s_attribute = "id")
e <- kwic(r, query = "oil")
f <- cooccurrences(r, query = "oil")

# use corpus initialization in a pipe
y <- corpus("REUTERS") \%>\% s_attributes()
y <- corpus("REUTERS") \%>\% count(query = "oil")

# working with a remote corpus; the server is read from the environment
# variable OPENCPU_SERVER
\dontrun{
REUTERS <- corpus("REUTERS", server = Sys.getenv("OPENCPU_SERVER"))
count(REUTERS, query = "oil")
size(REUTERS)
kwic(REUTERS, query = "oil")

GERMAPARL <- corpus("GERMAPARL", server = Sys.getenv("OPENCPU_SERVER"))
s_attributes(GERMAPARL)
size(x = GERMAPARL)
count(GERMAPARL, query = "Integration")
kwic(GERMAPARL, query = "Islam")

p <- partition(GERMAPARL, year = 2000)
s_attributes(p, s_attribute = "year")
size(p)
kwic(p, query = "Islam", meta = "date")

GERMAPARL <- corpus("GERMAPARLMINI", server = Sys.getenv("OPENCPU_SERVER"))
s_attrs <- s_attributes(GERMAPARL, s_attribute = "date")
sc <- subset(GERMAPARL, date == "2009-11-10")
}
}
\seealso{
Methods to extract basic information from a \code{corpus} object are
  covered by the \link{corpus-methods} documentation object. Use the
  \code{\link{s_attributes}} method to get information on structural
  attributes. Analytical methods available for \code{corpus} objects are
  \code{\link{size}}, \code{\link{count}}, \code{\link{dispersion}},
  \code{\link{kwic}}, \code{\link{cooccurrences}}, and
  \code{\link{as.TermDocumentMatrix}}.

Other classes to manage corpora: 
\code{\link{phrases}},
\code{\link{regions}},
\code{\link{subcorpus}}
}
\concept{classes to manage corpora}
