% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nametagger.R
\name{nametagger_options}
\alias{nametagger_options}
\title{Define text transformations serving as predictive elements in the nametagger model}
\usage{
nametagger_options(
  file = "nametagger.ner",
  type = c("generic", "english", "czech"),
  tagger = c("trivial", "external"),
  token = list(use = FALSE, window = 1),
  token_capitalised = list(use = FALSE, window = 0),
  token_normalised = list(use = FALSE, window = 0),
  token_normalisedsuffix = list(use = FALSE, window = 0, from = 1, to = 4),
  lemma = list(use = FALSE, window = 0),
  lemma_capitalised = list(use = FALSE, window = 0),
  lemma_normalised = list(use = FALSE, window = 0),
  lemma_normalisedsuffix = list(use = FALSE, window = 0, from = 1, to = 4),
  pos = list(use = tagger == "external", window = 0),
  time = list(use = FALSE, window = 0),
  url_email = list(use = FALSE, url = "URL", email = "EMAIL"),
  ner_previous = list(use = FALSE, window = 0),
  brown = list(use = FALSE, window = 0),
  gazetteers = list(use = FALSE, window = 0),
  gazetteers_enhanced = list(use = FALSE)
)
}
\arguments{
\item{file}{path to the filename where the model will be saved}

\item{type}{either one of 'generic', 'english' or 'czech'. See the documentation at \url{http://ufal.mff.cuni.cz/nametag}.}

\item{tagger}{either one of 'trivial' (no lemma used in the training data) or 'external' (you provided your own lemma in the training data)}

\item{token}{use forms as features}

\item{token_capitalised}{use capitalization of form as features}

\item{token_normalised}{use case normalized (first character as-is, others lowercased) forms as features}

\item{token_normalisedsuffix}{shortest longest – use suffixes of case normalized (first character as-is, others lowercased) forms of lengths between shortest (list element \code{from}) and longest (list element \code{to})}

\item{lemma}{use raw lemmas as features}

\item{lemma_capitalised}{use capitalization of raw lemma as features}

\item{lemma_normalised}{use case normalized (first character as-is, others lowercased) raw lemmas as features}

\item{lemma_normalisedsuffix}{shortest longest – use suffixes of case normalized (first character as-is, others lowercased) raw lemmas of lengths between shortest (list element \code{from}) and longest (list element \code{to})}

\item{pos}{use parts-of-speech tags as features}

\item{time}{recognize numbers which could represent hours, minutes, hour:minute time, days, months or years}

\item{url_email}{If a URL or an email is detected, it is immediately marked with the specified named entity type and not used in further processing. The entity labels to use can be specified with \code{url} and \code{email} (in that sequence)}

\item{ner_previous}{use named entities predicted by previous stage as features}

\item{brown}{file [prefix_lengths] – use Brown clusters found in the specified file. An optional list of lengths of cluster prefixes to be used in addition to the full Brown cluster can be specified.}

\item{gazetteers}{[files] – use given files as gazetteers. Each file is one gazetteers list independent of the others and must contain a set of lemma sequences, each on a line, represented as raw lemmas separated by spaces.}

\item{gazetteers_enhanced}{(form|rawlemma|rawlemmas) (embed_in_model|out_of_model) file_base entity [file_base entity ...] – use gazetteers from given files. Each gazetteer contains (possibly multiword) named entities per line. Matching of the named entities can be performed either using form, disambiguated rawlemma or any of rawlemmas proposed by the morphological analyzer. The gazetteers might be embedded in the model file or not; in either case, additional gazetteers are loaded during each startup. For each file_base specified in GazetteersEnhanced templates, three files are tried:
\itemize{
\item{file_base.txt: gazetteers used as features, representing each file_base with a unique feature}
\item{file_base.hard_pre.txt: matched named entities (finding non-overlapping entities, preferring the ones starting earlier and longer ones in case of ties) are forced to the specified entity type even before the NER model is executed}
\item{file_base.hard_post.txt: after running the NER model, tokens not recognized as entities are matched against the gazetteers (again finding non-overlapping entities, preferring the ones starting earlier and longer ones in case of ties) and marked as entity type if found}
}}
}
\value{
an object of class \code{nametagger_options} with transformation information to be used by \code{\link{nametagger}}
}
\description{
Define text transformations which are relevant in predicting your entity. 
Typical text transformations are the token itself, the lemma, the parts of speech tag of the token
or the tokens, lemmas and parts of speech tags in the neighbourhood of the word. \cr

Each argument should be a list with elements \code{use} and \code{window}. \cr
\itemize{
\item{\code{use} is a logical indicating if the transformation should be used in the model. }
\item{\code{window} specifies how many adjacent words can observe the feature template value of a given word. The default value of 0 denotes only the word in question, no surrounding words.}
}
If you specify an argument without specifying \code{use}, the transformation will be used by default.
For arguments brown, gazetteers and gazetteers_enhanced, see the examples and 
the documentation at \url{http://ufal.mff.cuni.cz/nametag}.
}
\examples{
opts <- nametagger_options(token = list(window = 2))
opts
opts <- nametagger_options(time = list(use = TRUE, window = 3),
                           token_capitalised = list(use = TRUE, window = 1),
                           ner_previous = list(use = TRUE, window = 5))
opts                            
opts <- nametagger_options(
  lemma_capitalised = list(window = 3),
  brown = list(window = 1, file = "path/to/brown/clusters/file.txt"),
  gazetteers = list(window = 1, 
                    file_loc = "path/to/txt/file1.txt", 
                    file_time = "path/to/txt/file2.txt"))
opts
opts <- nametagger_options(
  lemma_capitalised = list(window = 3),
  brown = list(window = 2, 
               file = "path/to/brown/clusters/file.txt"),
  gazetteers_enhanced = list(
    loc  = "LOC",  type_loc  = "form", save_loc  = "embed_in_model", file_loc  = "pathto/loc.txt",  
    time = "TIME", type_time = "form", save_time = "embed_in_model", file_time = "pathto/time.txt")
    )
opts
}
