% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/step_tomek.R
\name{step_tomek}
\alias{step_tomek}
\title{Remove Tomek’s Links}
\usage{
step_tomek(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  column = NULL,
  skip = TRUE,
  seed = sample.int(10^5, 1),
  id = rand_id("tomek")
)
}
\arguments{
\item{recipe}{A recipe object. The step will be added to the
sequence of operations for this recipe.}

\item{...}{One or more selector functions to choose which
variable is used to sample the data. See \code{\link[=selections]{selections()}}
for more details. The selection should result in a \emph{single
factor variable}. For the \code{tidy} method, these are not
currently used.}

\item{role}{Not used by this step since no new variables are
created.}

\item{trained}{A logical to indicate if the quantities for
preprocessing have been estimated.}

\item{column}{A character string of the variable name that will
be populated (eventually) by the \code{...} selectors.}

\item{skip}{A logical. Should the step be skipped when the
recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked
when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be
conducted on new data (e.g. processing the outcome variable(s)).
Care should be taken when using \code{skip = TRUE} as it may affect
the computations for subsequent operations.}

\item{seed}{An integer that will be used as the seed when
applied.}

\item{id}{A character string that is unique to this step to identify it.}
}
\value{
An updated version of \code{recipe} with the new step
added to the sequence of existing steps (if any). For the
\code{tidy} method, a tibble with a column \code{terms}, which is
the variable used to sample.
}
\description{
\code{step_tomek} creates a \emph{specification} of a recipe
step that removes majority class instances of Tomek links.
}
\details{
The factor variable used to balance around must have exactly 2 levels. All
other variables must be numeric with no missing data.

A Tomek link is defined as a pair of points from different classes that are
each other's nearest neighbors.

All columns in the data are sampled and returned by \code{\link[=juice]{juice()}}
and \code{\link[=bake]{bake()}}.

When used in modeling, users should strongly consider using the
option \code{skip = TRUE} so that the extra sampling is \emph{not}
conducted outside of the training set.
}
\section{Tidying}{
When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble with a column \code{terms}
(the selectors or variables selected) will be returned.
}

\section{Case weights}{


The underlying operation does not allow for case weights.
}

\examples{
library(recipes)
library(modeldata)
data(hpc_data)

hpc_data0 <- hpc_data \%>\%
  select(-protocol, -day)

orig <- count(hpc_data0, class, name = "orig")
orig

up_rec <- recipe(class ~ ., data = hpc_data0) \%>\%
  step_tomek(class) \%>\%
  prep()

training <- up_rec \%>\%
  bake(new_data = NULL) \%>\%
  count(class, name = "training")
training

# Since `skip` defaults to TRUE, baking the step has no effect
baked <- up_rec \%>\%
  bake(new_data = hpc_data0) \%>\%
  count(class, name = "baked")
baked

orig \%>\%
  left_join(training, by = "class") \%>\%
  left_join(baked, by = "class")

library(ggplot2)

ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +
  labs(title = "Without Tomek") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))

recipe(class ~ x + y, data = circle_example) \%>\%
  step_tomek(class) \%>\%
  prep() \%>\%
  bake(new_data = NULL) \%>\%
  ggplot(aes(x, y, color = class)) +
  geom_point() +
  labs(title = "With Tomek") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))
}
\references{
Tomek. Two modifications of CNN. IEEE Trans. Syst. Man Cybern.,
6:769-772, 1976.
}
\seealso{
\code{\link[=tomek]{tomek()}} for the direct implementation

Other Steps for under-sampling: 
\code{\link{step_downsample}()},
\code{\link{step_nearmiss}()}
}
\concept{Steps for under-sampling}
