% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus.R
\name{corpus_install}
\alias{corpus_install}
\alias{corpus_packages}
\alias{corpus_rename}
\alias{corpus_remove}
\alias{corpus_as_tarball}
\alias{corpus_copy}
\alias{corpus_recode}
\alias{corpus_testload}
\alias{corpus_get_version}
\title{Install and manage corpora.}
\usage{
corpus_install(
  pkg = NULL,
  repo = "https://PolMine.github.io/drat/",
  tarball = NULL,
  doi = NULL,
  checksum = NULL,
  lib = .libPaths()[1],
  registry_dir,
  corpus_dir,
  ask = interactive(),
  load = TRUE,
  verbose = TRUE,
  user = NULL,
  password = NULL,
  ...
)

corpus_packages()

corpus_rename(
  old,
  new,
  registry_dir = Sys.getenv("CORPUS_REGISTRY"),
  verbose = TRUE
)

corpus_remove(corpus, registry_dir, ask = interactive(), verbose = TRUE)

corpus_as_tarball(corpus, registry_dir, data_dir, tarfile, verbose = TRUE)

corpus_copy(
  corpus,
  registry_dir,
  data_dir = NULL,
  registry_dir_new = file.path(normalizePath(tempdir(), winslash = "/"), "cwb",
    "registry", fsep = "/"),
  data_dir_new = file.path(normalizePath(tempdir(), winslash = "/"), "cwb",
    "indexed_corpora", tolower(corpus), fsep = "/"),
  remove = FALSE,
  verbose = interactive(),
  progress = TRUE
)

corpus_recode(
  corpus,
  registry_dir = Sys.getenv("CORPUS_REGISTRY"),
  data_dir = registry_file_parse(corpus, registry_dir)[["home"]],
  skip = character(),
  to = c("latin1", "UTF-8"),
  verbose = TRUE
)

corpus_testload(
  corpus,
  registry_dir = Sys.getenv("CORPUS_REGISTRY"),
  verbose = TRUE
)

corpus_get_version(corpus, registry_dir = Sys.getenv("CORPUS_REGISTRY"))
}
\arguments{
\item{pkg}{Name of the data package.}

\item{repo}{URL of the repository.}

\item{tarball}{URL, S3-URI or local filename of a tarball with a CWB indexed
corpus. If \code{NULL} (default) and argument \code{doi} is stated, the
whereabouts of a corpus tarball will be derived from DOI.}

\item{doi}{The DOI (Digital Object Identifier) of a corpus deposited at
Zenodo (e.g. "10.5281/zenodo.3748858").}

\item{checksum}{A length-one \code{character} vector with a MD5 checksum to
check for the integrity of a downloaded tarball. If the tarball is
downloaded from Zenodo by stating a DOI (argument \code{doi}), the checksum
included in the metadata for the record is used for the check.}

\item{lib}{Directory for R packages, defaults to \code{.libPaths()[1]}.}

\item{registry_dir}{The corpus registry directory. If missing, the result of
\code{cwb_registry_dir()} will be used.}

\item{corpus_dir}{The directory that contains the data directories of indexed
corpora. If missing, the value of \code{cwb_corpus_dir()} will be used.}

\item{ask}{A \code{logical} value, whether to ask user for confirmation before
removing a corpus.}

\item{load}{A \code{logical} value, whether to load corpus after installation.}

\item{verbose}{Logical, whether to be verbose.}

\item{user}{A user name that can be specified to download a corpus from a password protected site.}

\item{password}{A password that can be specified to download a corpus from a password protected site.}

\item{...}{Further parameters that will be passed into
\code{install.packages}, if argument \code{tarball} is \code{NULL}, or into
\code{download.file}, if \code{tarball} is specified.}

\item{old}{Name of the (old) corpus.}

\item{new}{Name of the (new) corpus.}

\item{corpus}{The ID of a CWB indexed corpus (in upper case).}

\item{data_dir}{The data directory where the files of the CWB corpus live.}

\item{tarfile}{Filename of tarball.}

\item{registry_dir_new}{Target directory for (new) registry files.}

\item{data_dir_new}{Target directory for corpus files.}

\item{remove}{A \code{logical} value, whether to remove original files after
having created the copy.}

\item{progress}{Logical, whether to show a progress bar.}

\item{skip}{A character vector with s_attributes to skip.}

\item{to}{Character string describing the target encoding of the corpus.}
}
\value{
Logical value \code{TRUE} if installation has been successful, or \code{FALSE} if not.
}
\description{
Utility functions to assist the installation and management of indexed CWB
corpora.
}
\details{
A CWB corpus consists of a set of binary files with corpus data
kept together in a data directory, and a registry file, which is a
plain text file that details the corpus id, corpus properties,
structural and positional attributes. The registry file also specifies
the path to the corpus data directory. Typically, the registry directory
and a corpus directory with the data directories for individual corpora
are within one parent folder (which might be called "cwb" by default).
See the following stylized directory structure.

\preformatted{
  .
  |- registry/
  |  |- corpus1
  |  +- corpus2
  |
  + indexed_corpora/
    |- corpus1/
    |  |- file1
    |  |- file2
    |  +- file3
    |
    +- corpus2/
       |- file1
       |- file2
       +- file3
}

The \code{corpus_install} function will assist the installation of a
corpus. The following scenarios are offered:
\itemize{
\item{If argument \code{tarball} is a local tarball, the tarball will
be extracted and files will be moved.}
\item{If \code{tarball} is a URL, the tarball will be downloaded from
the online location. It is possible to state user credentials using the
arguments \code{user} and \code{password}. Then the aforementioned
installation (scenario 1) is executed. If argument \code{pkg} is the
name of an installed package, corpus files will be moved into this
package.}
\item{If argument \code{doi} is a Digital Object Identifier (DOI), the URL
from which a corpus tarball can be downloaded is derived from the
information available at that location. The tarball is downloaded and the
corpus installed. If argument \code{pkg} is defined, files will be moved
into an R package, the system registry and corpus directories are used
otherwise. Note that at this stage, it is assumed that the DOI has been
awarded by \href{https://zenodo.org/}{Zenodo}.}
\item{If argument \code{pkg} is provided and specifies an R package (and
\code{tarball} is \code{NULL}), the corpus package available at a
CRAN-style repository specified by argument \code{repo} will be installed.
Internally, the \code{install.packages} function is called and further
arguments can be passed into this function call. This can be used to pass
user credentials, e.g. by adding \code{method = "wget", extra = "--user
  donald --password duck"}.
}
}
If the corpus to be installed is already available, a dialogue will ask the
user whether an existing corpus shall be deleted and installed anew, if
argument \code{ask} is \code{TRUE}.

\code{corpus_packages} will detect the packages that include CWB
corpora. Note that the directory structure of all installed packages is
evaluated which may be slow on network-mounted file systems.

\code{corpus_rename} will rename a corpus, affecting the name of the
registry file, the corpus id, and the name of the directory where data
files reside.

\code{corpus_remove()} can be used to delete a corpus.

\code{corpus_as_tarball} will create a tarball (.tar.gz-file) with
two subdirectories. The 'registry' subdirectory will host the registry file
for the tarred corpus. The data files will be put in a subdirectory with
the corpus name in the 'indexed_corpora' subdirectory.

\code{corpus_copy} will create a copy of a corpus (useful for
experimental modifications, for instance).

\code{corpus_get_version} parses the registry file and derives the
corpus version number from the corpus properties. The return value is a
\code{numeric_version} class object. The corpus version is expected to follow
semantic versioning (three digits, e.g. '0.8.1'). If the corpus version
has another format or if it is not available, the return value is \code{NA}.
}
\examples{
# Example 1: copy a corpus using the default target directories.
# Location where the registry file of the copy is expected: the default
# registry_dir_new (see usage) plus the lower-case corpus name.
registry_file_new <- file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", "registry", "reuters", fsep = "/"
  )
# Remove a registry file at that location if one already exists.
if (file.exists(registry_file_new)) file.remove(registry_file_new)
# Copy the REUTERS corpus found in the extdata directory of the RcppCWB
# package; registry_dir_new and data_dir_new are left at their defaults.
corpus_copy(
  corpus = "REUTERS",
  registry_dir = system.file(package = "RcppCWB", "extdata", "cwb", "registry"),
  data_dir = system.file(
    package = "RcppCWB",
    "extdata", "cwb", "indexed_corpora", "reuters"
  )
)
# Clean up: delete the cwb directory below the session temp directory.
unlink(file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", fsep = "/"),
  recursive = TRUE)
# Example 2: copy a corpus to a temporary location, recode it to UTF-8 and
# check that it can still be accessed.
# Settings used throughout this example: corpus id, source package,
# structural attribute to inspect and the CQP query to run.
corpus <- "REUTERS"
pkg <- "RcppCWB"
s_attr <- "places"
Q <- '"oil"'

# Source locations: registry and data directory inside the installed package.
registry_dir_src <- system.file(package = pkg, "extdata", "cwb", "registry")
data_dir_src <- system.file(package = pkg, "extdata", "cwb", "indexed_corpora", tolower(corpus))

# Target locations below the session temp directory.
registry_dir_tmp <- file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", "registry", fsep = "/"
)
registry_file_tmp <- file.path(registry_dir_tmp, tolower(corpus), fsep = "/")
data_dir_tmp <- file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", "indexed_corpora", tolower(corpus), fsep = "/"
)

# Start from a clean slate: drop an existing registry file, and either create
# the target data directory or empty it if it already contains files.
if (file.exists(registry_file_tmp)) file.remove(registry_file_tmp)
if (!dir.exists(data_dir_tmp)){
   dir.create(data_dir_tmp, recursive = TRUE)
} else {
  if (length(list.files(data_dir_tmp)) > 0L)
    file.remove(list.files(data_dir_tmp, full.names = TRUE))
}

# Copy the corpus from the package to the temporary target directories.
corpus_copy(
  corpus = corpus,
  registry_dir = registry_dir_src,
  data_dir = data_dir_src,
  registry_dir_new = registry_dir_tmp,
  data_dir_new = data_dir_tmp
)

# Report the charset of the copy before recoding.
RcppCWB::cl_charset_name(corpus = corpus, registry = registry_dir_tmp)

# Recode the copied corpus to UTF-8.
corpus_recode(
  corpus = corpus,
  registry_dir = registry_dir_tmp,
  data_dir = data_dir_tmp,
  to = "UTF-8"
)

# Drop the corpus and initialize CQP with the temporary registry, then report
# the charset again. NOTE(review): presumably the reload is needed so that
# the recoded version is picked up - confirm against RcppCWB documentation.
RcppCWB::cl_delete_corpus(corpus = corpus, registry = registry_dir_tmp)
RcppCWB::cqp_initialize(registry_dir_tmp)
RcppCWB::cl_charset_name(corpus = corpus, registry = registry_dir_tmp)

# Decode all values of the structural attribute; strucs are numbered from 0.
n_strucs <- RcppCWB::cl_attribute_size(
  corpus = corpus, attribute = s_attr, attribute_type = "s", registry = registry_dir_tmp
)
strucs <- 0L:(n_strucs - 1L)
struc_values <- RcppCWB::cl_struc2str(
  corpus = corpus, s_attribute = s_attr, struc = strucs, registry = registry_dir_tmp
)
# Despite its name, this holds the unique values of s_attr (places).
speakers <- unique(struc_values)

# Point CQP to the temporary registry, run the query and decode the matched
# corpus positions into word forms.
Sys.setenv("CORPUS_REGISTRY" = registry_dir_tmp)
if (RcppCWB::cqp_is_initialized()) RcppCWB::cqp_reset_registry() else RcppCWB::cqp_initialize()
RcppCWB::cqp_query(corpus = corpus, query = Q)
cpos <- RcppCWB::cqp_dump_subcorpus(corpus = corpus)
ids <- RcppCWB::cl_cpos2id(
  corpus = corpus, p_attribute = "word", registry = registry_dir_tmp, cpos = cpos
)
str <- RcppCWB::cl_id2str(
  corpus = corpus, p_attribute = "word", registry = registry_dir_tmp, id = ids
)
unique(str)

# Clean up: delete the cwb directory below the session temp directory.
unlink(file.path(normalizePath(tempdir(), winslash = "/"), "cwb", fsep = "/"), recursive = TRUE)
}
\seealso{
For managing registry files (e.g. when switching to a packaged corpus), see
\code{\link{registry_file_parse}}.
}
