% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/CorpusData.R
\name{CorpusData}
\alias{CorpusData}
\title{Manage Corpus Data and Encode CWB Corpus.}
\description{
An R6 class to manage corpus data (text chunks, token stream and metadata)
and to encode it as a CWB corpus.
}
\examples{
library(RcppCWB)
library(data.table)

# This example uses method "R" to write the corpus data to disk. There is also
# a method "CWB" that relies on the CWB tools to generate the indexed corpus.
# The CWB can be downloaded and installed within the package by calling
# cwb_install().

# Create a temporary registry file so that the data in the RcppCWB package can
# be used: parse the registry file shipped with RcppCWB, point its home
# directory to the installed REUTERS data and write it to a temporary registry.

registry_rcppcwb <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
reuters_data_dir <- system.file(
  package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"
)
registry_tmp <- fs::path(tempdir(), "registry")
if (!dir.exists(registry_tmp)) dir.create(registry_tmp)
r <- registry_file_parse("REUTERS", registry_dir = registry_rcppcwb)
r[["home"]] <- reuters_data_dir
registry_file_write(r, corpus = "REUTERS", registry_dir = registry_tmp)

# Decode structural attribute 'places', add an integer id for every region and
# rename the column "value" to "places".

s_attrs_places <- RcppCWB::s_attribute_decode(
  corpus = "REUTERS",
  data_dir = reuters_data_dir,
  s_attribute = "places", method = "R"
)
s_attrs_places[["id"]] <- seq_len(nrow(s_attrs_places))
setnames(s_attrs_places, old = "value", new = "places")

# Decode positional attribute 'word': for every region, turn the corpus
# positions into token ids and the ids into strings.
# NOTE: row[1] and row[2] are taken to be the left and right corpus position
# of a region - confirm against the column order of s_attribute_decode().

tokens <- apply(s_attrs_places, 1, function(row){
  ids <- cl_cpos2id(
    corpus = "REUTERS", cpos = row[1]:row[2],
    p_attribute = "word", registry = registry_tmp
  )
  cl_id2str(corpus = "REUTERS", id = ids, p_attribute = "word", registry = registry_tmp)
})

# Combine the tokens of all regions into one table; column "id" links every
# token to its region, column "cpos" is a zero-based running number.

tokenstream <- rbindlist(
  lapply(
    seq_along(tokens),
    function(i) data.table(id = i, word = tokens[[i]])
  )
)
tokenstream[["cpos"]] <- seq_len(nrow(tokenstream)) - 1L

# Create CorpusData object (see vignette for further explanation)

CD <- CorpusData$new()
CD$tokenstream <- as.data.table(tokenstream)
CD$metadata <- as.data.table(s_attrs_places)

# Remove temporary registry with home dir still pointing to RcppCWB data dir
# to prevent data from being deleted
file.remove(fs::path(registry_tmp, "reuters"))
file.remove(registry_tmp)

# Create temporary directories (registry directory and one for indexed corpora)

registry_tmp <- fs::path(tempdir(), "registry")
data_dir_tmp <- fs::path(tempdir(), "data_dir")
if (!dir.exists(registry_tmp)) dir.create(registry_tmp)
if (!dir.exists(data_dir_tmp)) dir.create(data_dir_tmp)

# Encode the corpus from the CorpusData object and write a registry file for it

CD$encode(
  corpus = "REUTERS", encoding = "utf8",
  p_attributes = "word", s_attributes = "places",
  registry_dir = registry_tmp, data_dir = data_dir_tmp,
  method = "R"
)
reg <- registry_data(name = "REUTERS", id = "REUTERS", home = data_dir_tmp, p_attributes = "word")
registry_file_write(data = reg, corpus = "REUTERS", registry_dir = registry_tmp)

# See whether it works: get the token ids for a range of corpus positions

cl_cpos2id(corpus = "REUTERS", p_attribute = "word", cpos = 0L:4049L, registry = registry_tmp)
}
\section{Public fields}{
\if{html}{\out{<div class="r6-fields">}}
\describe{
\item{\code{chunktable}}{A \code{data.table} with column "id" (unique values),
columns with metadata, and a column with text chunks.}

\item{\code{tokenstream}}{A \code{data.table} with a column "cpos" (corpus position), and columns
with positional attributes, such as "word", "lemma", "pos", "stem".}

\item{\code{metadata}}{A \code{data.table} with a column "id", to link data with chunks/tokenstream,
columns with document-level metadata, and columns "cpos_left" and "cpos_right", which can
be generated using method \code{$add_corpus_positions()}.}

\item{\code{sentences}}{A \code{data.table}.}

\item{\code{named_entities}}{A \code{data.table}.}
}
\if{html}{\out{</div>}}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-CorpusData-new}{\code{CorpusData$new()}}
\item \href{#method-CorpusData-print}{\code{CorpusData$print()}}
\item \href{#method-CorpusData-tokenize}{\code{CorpusData$tokenize()}}
\item \href{#method-CorpusData-import_xml}{\code{CorpusData$import_xml()}}
\item \href{#method-CorpusData-add_corpus_positions}{\code{CorpusData$add_corpus_positions()}}
\item \href{#method-CorpusData-purge}{\code{CorpusData$purge()}}
\item \href{#method-CorpusData-encode}{\code{CorpusData$encode()}}
\item \href{#method-CorpusData-clone}{\code{CorpusData$clone()}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-new"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-new}{}}}
\subsection{Method \code{new()}}{
Initialize a new instance of class \code{CorpusData}.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$new()}\if{html}{\out{</div>}}
}

\subsection{Returns}{
A class \code{CorpusData} object.
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-print"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-print}{}}}
\subsection{Method \code{print()}}{
Print summary of \code{CorpusData} object.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$print()}\if{html}{\out{</div>}}
}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-tokenize"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-tokenize}{}}}
\subsection{Method \code{tokenize()}}{
Simple tokenization of text in chunktable.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$tokenize(..., verbose = TRUE, progress = TRUE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{...}}{Arguments that are passed into \code{tokenizers::tokenize_words()}.}

\item{\code{verbose}}{Logical, whether to be verbose.}

\item{\code{progress}}{Logical, whether to show progress bar.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-import_xml"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-import_xml}{}}}
\subsection{Method \code{import_xml()}}{
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$import_xml(
  filenames,
  body = "//body",
  meta = NULL,
  mc = NULL,
  progress = TRUE
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{filenames}}{The names (paths) of the XML files to import.}

\item{\code{body}}{An xpath expression defining the body of the xml document.}

\item{\code{meta}}{A named character vector with xpath expressions.}

\item{\code{mc}}{A numeric/integer value, number of cores to use.}

\item{\code{progress}}{Logical, whether to show progress bar.}
}
\if{html}{\out{</div>}}
}
\subsection{Details}{
Import XML files.
}

\subsection{Returns}{
The \code{CorpusData} object is returned invisibly.
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-add_corpus_positions"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-add_corpus_positions}{}}}
\subsection{Method \code{add_corpus_positions()}}{
Add column \code{cpos} to tokenstream and columns \code{cpos_left} and
\code{cpos_right} to metadata.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$add_corpus_positions(verbose = TRUE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{verbose}}{Logical, whether to be verbose.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-purge"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-purge}{}}}
\subsection{Method \code{purge()}}{
Remove patterns from chunkdata that are known to cause problems. This is
done most efficiently at the chunkdata level of data preparation as the
length of the character vector to handle is much smaller than when
tokenization/annotation has been performed.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$purge(
  replacements = list(c("^\\\\s*<.*?>\\\\s*$", ""), c("’", "'"))
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
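\item{\code{replacements}}{A list of length-two character vectors, each consisting of a
pattern to be replaced and its replacement (see the default value for the expected format).}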
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-encode"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-encode}{}}}
\subsection{Method \code{encode()}}{
Encode corpus. If the corpus already exists, it will be removed.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$encode(
  corpus,
  p_attributes = "word",
  s_attributes = NULL,
  encoding,
  registry_dir = Sys.getenv("CORPUS_REGISTRY"),
  data_dir = NULL,
  method = c("R", "CWB"),
  verbose = TRUE,
  compress = FALSE
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{corpus}}{The name of the CWB corpus.}

\item{\code{p_attributes}}{Positional attributes.}

\item{\code{s_attributes}}{Columns that will be encoded as structural attributes.}

\item{\code{encoding}}{Encoding/charset of the CWB corpus.}

\item{\code{registry_dir}}{Corpus registry, the directory where registry files are stored.}

\item{\code{data_dir}}{Directory where to create directory for indexed corpus files.}

\item{\code{method}}{Either "R" or "CWB".}

\item{\code{verbose}}{Logical, whether to be verbose.}

\item{\code{compress}}{Logical, whether to compress corpus.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-CorpusData-clone"></a>}}
\if{latex}{\out{\hypertarget{method-CorpusData-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{CorpusData$clone(deep = FALSE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}
}
}
}
