% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/supervised_learning.R
\name{train_rec_lin}
\alias{train_rec_lin}
\title{Train a Record Linkage Model}
\usage{
train_rec_lin(
  A,
  B,
  matches,
  variables,
  comparators = NULL,
  methods = NULL,
  prob_ratio = NULL,
  nonpar_hurdle = TRUE,
  controls_nleqslv = list(),
  controls_kliep = control_kliep()
)
}
\arguments{
\item{A}{A duplicate-free \code{data.frame} or \code{data.table}.}

\item{B}{A duplicate-free \code{data.frame} or \code{data.table}.}

\item{matches}{A \code{data.frame} or \code{data.table} indicating known matches.}

\item{variables}{A character vector of key variables used to create comparison vectors.}

\item{comparators}{A named list of functions for comparing pairs of records.}

\item{methods}{A named list of methods used for estimation (\code{"binary"}, \code{"continuous_parametric"} or \code{"continuous_nonparametric"}).}

\item{prob_ratio}{Probability/density ratio type (\code{"1"} or \code{"2"}).}

\item{nonpar_hurdle}{Logical indicating whether to use a hurdle model or not
(used only if the \code{"continuous_nonparametric"} method has been chosen for at least one variable).}

\item{controls_nleqslv}{Controls passed to the \link[nleqslv]{nleqslv} function (used only if the \code{"continuous_parametric"} method has been chosen for at least one variable).}

\item{controls_kliep}{Controls passed to the \link[densityratio]{kliep} function (used only if the \code{"continuous_nonparametric"} method has been chosen for at least one variable).}
}
\value{
Returns a list containing:\cr
\itemize{
\item{\code{b_vars} -- a character vector of variables used for the \code{"binary"} method (with the prefix \code{"gamma_"}),}
\item{\code{cpar_vars} -- a character vector of variables used for the \code{"continuous_parametric"} method (with the prefix \code{"gamma_"}),}
\item{\code{cnonpar_vars} -- a character vector of variables used for the \code{"continuous_nonparametric"} method (with the prefix \code{"gamma_"}),}
\item{\code{b_params} -- parameters estimated using the \code{"binary"} method,}
\item{\code{cpar_params} -- parameters estimated using the \code{"continuous_parametric"} method,}
\item{\code{cnonpar_params} -- probability of exact matching estimated using the \code{"continuous_nonparametric"} method,}
\item{\code{ratio_kliep} -- a result of the \link[densityratio]{kliep} function,}
\item{\code{ratio_kliep_list} -- an object containing the results of the \link[densityratio]{kliep} function,}
\item{\code{ml_model} -- here \code{NULL},}
\item{\code{pi_est} -- a prior probability of matching,}
\item{\code{match_prop} -- proportion of matches in the smaller dataset,}
\item{\code{variables} -- a character vector of key variables used for comparison,}
\item{\code{comparators} -- a list of functions used to compare pairs of records,}
\item{\code{methods} -- a list of methods used for estimation,}
\item{\code{prob_ratio} -- probability/density ratio type.}
}
}
\description{
Trains a supervised record linkage model using probability or density ratio estimation,
based on \href{https://www150.statcan.gc.ca/n1/pub/12-001-x/2022001/article/00007-eng.htm}{Lee et al. (2022)},
with several extensions.
}
\details{
Consider two datasets: \eqn{A} and \eqn{B}.
Let the bipartite comparison space \eqn{\Omega = A \times B} consist of
matches \eqn{M} and non-matches \eqn{U} between the records in files
\eqn{A} and \eqn{B}. For any pair of records \eqn{(a,b) \in \Omega},
let \eqn{\pmb{\gamma}_{ab} = (\gamma_{ab}^1,\gamma_{ab}^2,
\ldots,\gamma_{ab}^K)'} be the comparison vector for
a set of \eqn{K} key variables. The original maximum entropy classification (MEC)
algorithm of Lee et al. (2022) uses the binary
comparison function to evaluate record pairs across two datasets.
However, this approach may be insufficient when handling datasets
with frequent errors across multiple variables.

We propose the use of continuous comparison functions to address
the limitations of binary comparison methods. We consider every
semi-metric, i.e., a function \eqn{d: A \times B \to \mathbb{R}},
satisfying the following conditions:\cr
\enumerate{
\item{\eqn{d(x,y) \geq 0},}
\item{\eqn{d(x,y) = 0} if and only if \eqn{x = y},}
\item{\eqn{d(x,y) = d(y,x)}.}
}
For example, we can use \eqn{1 - \text{Jaro-Winkler similarity}} for character variables
(which is implemented in the \code{automatedRecLin} package as the \code{jarowinkler_complement} function)
or the Euclidean distance for numerical variables. The \code{automatedRecLin} package allows the use of
a different comparison function for each key variable (which should be specified
as a list in the \code{comparators} argument). The default function
for each key variable is \link[reclin2]{cmp_identical}
(the binary comparison function).

The \code{train_rec_lin} function is used to train a record linkage model
when \eqn{M} and \eqn{U} are known; the trained model might later serve as a classifier
for pairs outside \eqn{\Omega}. It offers different approaches to estimate the
probability/density ratio between matches and non-matches, which should be
specified as a list in the \code{methods} argument. The method suitable for the binary
comparison function is \code{"binary"}, which is also the default method for each
variable.

For continuous semi-metrics, we suggest using
the \code{"continuous_parametric"} or \code{"continuous_nonparametric"}
method. The \code{"continuous_parametric"} method assumes that
\eqn{\gamma_{ab}^k|M} and \eqn{\gamma_{ab}^k|U} follow
hurdle Gamma distributions. The density function of a hurdle
Gamma distribution is characterized by three parameters
\eqn{p_0 \in (0,1)} and \eqn{\alpha, \beta > 0} as follows:
\deqn{
f(x;p_0,\alpha,\beta) = p_0^{\mathbb{I}(x = 0)}[(1 - p_0) v(x;\alpha,\beta)]^{\mathbb{I}(x > 0)},
}
where
\deqn{
v(x;\alpha,\beta) = \frac{\beta^{\alpha} x^{\alpha - 1} \exp(-\beta x)}
{\Gamma(\alpha)}
}
is the density function of a Gamma distribution
(for details see \href{https://ideas.repec.org/a/eee/csdana/v179y2023ics0167947322002365.html}{Vo et al. (2023)}).
The \code{"continuous_nonparametric"} method does not assume anything about
the distributions of the comparison vectors. It directly
estimates the density ratio between the matches and the non-matches, using
the Kullback-Leibler Importance Estimation Procedure (KLIEP).
For details see \href{https://link.springer.com/article/10.1007/s10463-008-0197-x}{Sugiyama et al. (2008)}.
}
\examples{
df_1 <- data.frame(
  "name" = c("James", "Emma", "William", "Olivia", "Thomas",
  "Sophie", "Harry", "Amelia", "George", "Isabella"),
  "surname" = c("Smith", "Johnson", "Brown", "Taylor", "Wilson",
  "Davis", "Clark", "Harris", "Lewis", "Walker")
)
 df_2 <- data.frame(
  "name" = c("James", "Ema", "Wimliam", "Olivia", "Charlotte",
  "Henry", "Lucy", "Edward", "Alice", "Jack"),
  "surname" = c("Smith", "Johnson", "Bron", "Tailor", "Moore",
  "Evans", "Hall", "Wright", "Green", "King")
)
comparators <- list("name" = jarowinkler_complement(),
                    "surname" = jarowinkler_complement())
matches <- data.frame("a" = 1:4, "b" = 1:4)
methods <- list("name" = "continuous_nonparametric",
                "surname" = "continuous_nonparametric")
model <- train_rec_lin(A = df_1, B = df_2, matches = matches,
                       variables = c("name", "surname"),
                       comparators = comparators,
                       methods = methods)
model
}
\references{
Lee, D., Zhang, L.-C. and Kim, J. K. (2022). Maximum entropy classification for record linkage.
Survey Methodology, Statistics Canada, Catalogue No. 12-001-X, Vol. 48, No. 1.

Vo, T. H., Chauvet, G., Happe, A., Oger, E., Paquelet, S., and Garès, V. (2023).
Extending the Fellegi-Sunter record linkage model for mixed-type data with application to the French national health data system.
Computational Statistics & Data Analysis, 179, 107656.

Sugiyama, M., Suzuki, T., Nakajima, S., et al. (2008). Direct importance estimation for covariate shift adaptation.
Annals of the Institute of Statistical Mathematics, 60, 699–746. \doi{10.1007/s10463-008-0197-x}
}
\author{
Adam Struzik
}
