% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.R
\name{predict.rec_lin_model}
\alias{predict.rec_lin_model}
\title{Predict Matches Based on a Given Record Linkage Model}
\usage{
\method{predict}{rec_lin_model}(
  object,
  newdata_A,
  newdata_B,
  duplicates_in_A = FALSE,
  set_construction = c("size", "flr", "mmr"),
  fixed_method = "Newton",
  target_rate = 0.03,
  tol = 0.005,
  max_iter = 50,
  data_type = c("data.frame", "data.table", "matrix"),
  true_matches = NULL,
  ...
)
}
\arguments{
\item{object}{A \code{rec_lin_model} object from the \code{train_rec_lin} or \code{custom_rec_lin_model} functions.}

\item{newdata_A}{A \code{data.frame} or \code{data.table}; it must be duplicate-free unless \code{duplicates_in_A = TRUE}.}

\item{newdata_B}{A duplicate-free \code{data.frame} or \code{data.table}.}

\item{duplicates_in_A}{Logical indicating whether to allow \code{newdata_A} to have duplicate records.}

\item{set_construction}{A method for constructing the predicted set of matches (\code{"size"}, \code{"flr"} or \code{"mmr"}).}

\item{fixed_method}{A method for solving fixed-point equations using the \code{\link[FixedPoint]{FixedPoint}} function.}

\item{target_rate}{A target false link rate (FLR) or missing match rate (MMR)
(used only if \code{set_construction == "flr"} or \code{set_construction == "mmr"}).}

\item{tol}{Error tolerance in the bisection procedure
(used only if \code{set_construction == "flr"} or \code{set_construction == "mmr"}).}

\item{max_iter}{A maximum number of iterations for the bisection procedure
(used only if \code{set_construction == "flr"} or \code{set_construction == "mmr"}).}

\item{data_type}{Data type for predictions with a custom ML model (\code{"data.frame"}, \code{"data.table"} or \code{"matrix"};
used only if \code{object} is from the \code{custom_rec_lin_model} function).}

\item{true_matches}{A \code{data.frame} or \code{data.table} indicating true matches.}

\item{...}{Additional controls passed to the \code{predict} function for a custom ML model
(used only if \code{object} is from the \code{custom_rec_lin_model} function).}
}
\value{
Returns a list containing:\cr
\itemize{
\item{\code{M_est} -- a \code{data.table} with predicted matches,}
\item{\code{set_construction} -- a method for constructing the predicted set of matches,}
\item{\code{n_M_est} -- estimated classification set size,}
\item{\code{flr_est} -- estimated false link rate (FLR),}
\item{\code{mmr_est} -- estimated missing match rate (MMR),}
\item{\code{iter} -- the number of iterations in the bisection procedure,}
\item{\code{eval_metrics} -- standard metrics for quality assessment, if \code{true_matches} is provided,}
\item{\code{confusion} -- confusion matrix, if \code{true_matches} is provided.}
}
}
\description{
Predicts matches between records in two datasets based on a given record linkage model,
using the maximum entropy classification (MEC) algorithm
(see \href{https://www150.statcan.gc.ca/n1/pub/12-001-x/2022001/article/00007-eng.htm}{Lee et al. (2022)}).
}
\details{
The \code{predict} function estimates the probability/density ratio
between matches and non-matches for pairs in given
datasets, based on a model obtained using the
\code{train_rec_lin} or \code{custom_rec_lin_model} functions.
Then, it estimates the number of matches and
returns the predicted matches, using the maximum
entropy classification (MEC) algorithm
(see \href{https://www150.statcan.gc.ca/n1/pub/12-001-x/2022001/article/00007-eng.htm}{Lee et al. (2022)}).

The \code{predict} function allows the construction of the predicted set
of matches using its estimated size or the bisection procedure,
described in \href{https://www150.statcan.gc.ca/n1/pub/12-001-x/2022001/article/00007-eng.htm}{Lee et al. (2022)},
based on a target false link rate (FLR)
or missing match rate (MMR). To use the second option, set \code{set_construction = "flr"}
or \code{set_construction = "mmr"} and
specify a target error rate using the \code{target_rate} argument.

By default, the function assumes that the datasets \code{newdata_A} and \code{newdata_B}
contain no duplicate records. This assumption
can be relaxed by allowing \code{newdata_A} to have duplicates. To do so,
set \code{duplicates_in_A = TRUE}.
}
\examples{
# Training data: two small data frames sharing the name and surname columns.
# The first four rows of df_2 are (partly misspelled) counterparts of the
# first four rows of df_1; the remaining rows have no counterpart.
df_1 <- data.frame(
  "name" = c("James", "Emma", "William", "Olivia", "Thomas",
  "Sophie", "Harry", "Amelia", "George", "Isabella"),
  "surname" = c("Smith", "Johnson", "Brown", "Taylor", "Wilson",
  "Davis", "Clark", "Harris", "Lewis", "Walker")
)
 df_2 <- data.frame(
  "name" = c("James", "Ema", "Wimliam", "Olivia", "Charlotte",
  "Henry", "Lucy", "Edward", "Alice", "Jack"),
  "surname" = c("Smith", "Johnson", "Bron", "Tailor", "Moore",
  "Evans", "Hall", "Wright", "Green", "King")
)
# One comparison function per linkage variable.
comparators <- list("name" = jarowinkler_complement(),
                    "surname" = jarowinkler_complement())
# Known matches used for training; columns a and b presumably hold row
# indices in df_1 and df_2 -- confirm against the train_rec_lin documentation.
matches <- data.frame("a" = 1:4, "b" = 1:4)
# Estimation method to use for each linkage variable.
methods <- list("name" = "continuous_nonparametric",
                "surname" = "continuous_nonparametric")
# Fit the record linkage model on the training pair of datasets.
model <- train_rec_lin(A = df_1, B = df_2, matches = matches,
                       variables = c("name", "surname"),
                       comparators = comparators,
                       methods = methods)

# New datasets to be linked with the trained model.
df_new_1 <- data.frame(
  "name" = c("John", "Emily", "Mark", "Anna", "David"),
  "surname" = c("Smith", "Johnson", "Taylor", "Williams", "Brown")
)
df_new_2 <- data.frame(
  "name" = c("John", "Emely", "Mark", "Michael"),
  "surname" = c("Smitth", "Johnson", "Tailor", "Henders")
)
# Predict matches between the new datasets, leaving every optional argument
# at the default shown in the usage section.
predict(model, df_new_1, df_new_2)
}
\references{
Lee, D., Zhang, L.-C. and Kim, J. K. (2022). Maximum entropy classification for record linkage.
Survey Methodology, Statistics Canada, Catalogue No. 12-001-X, Vol. 48, No. 1.

Vo, T. H., Chauvet, G., Happe, A., Oger, E., Paquelet, S., and Garès, V. (2023).
Extending the Fellegi-Sunter record linkage model for mixed-type data with application to the French national health data system.
Computational Statistics & Data Analysis, 179, 107656.

Sugiyama, M., Suzuki, T., Nakajima, S. et al. (2008). Direct importance estimation for covariate shift adaptation.
Annals of the Institute of Statistical Mathematics, 60, 699--746. \doi{10.1007/s10463-008-0197-x}
}
\author{
Adam Struzik
}
