% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/match.R
\name{vec_locate_matches}
\alias{vec_locate_matches}
\title{Locate observations matching specified conditions}
\usage{
vec_locate_matches(
  needles,
  haystack,
  ...,
  condition = "==",
  filter = "none",
  incomplete = "compare",
  no_match = NA_integer_,
  remaining = "drop",
  multiple = "all",
  nan_distinct = FALSE,
  chr_proxy_collate = NULL,
  needles_arg = "",
  haystack_arg = ""
)
}
\arguments{
\item{needles, haystack}{Vectors used for matching.
\itemize{
\item \code{needles} represents the vector to search for.
\item \code{haystack} represents the vector to search in.
}

Prior to comparison, \code{needles} and \code{haystack} are coerced to the same type.}

\item{...}{These dots are for future extensions and must be empty.}

\item{condition}{Condition controlling how \code{needles} should be compared
against \code{haystack} to identify a successful match.
\itemize{
\item One of: \code{"=="}, \code{">"}, \code{">="}, \code{"<"}, or \code{"<="}.
\item For data frames, a length \code{1} or \code{ncol(needles)} character vector
containing only the above options, specifying how matching is determined
for each column.
}}

\item{filter}{Filter to be applied to the matched results.
\itemize{
\item \code{"none"} doesn't apply any filter.
\item \code{"min"} returns only the minimum haystack value matching the current
needle.
\item \code{"max"} returns only the maximum haystack value matching the current
needle.
\item For data frames, a length \code{1} or \code{ncol(needles)} character vector
containing only the above options, specifying a filter to apply to
each column.
}

Filters don't have any effect on \code{"=="} conditions, but are useful for
computing "rolling" matches with other conditions.

A filter can return multiple haystack matches for a particular needle
if the maximum or minimum haystack value is duplicated in \code{haystack}. These
can be further controlled with \code{multiple}.}

\item{incomplete}{Handling of missing values and
\link[=vec_detect_complete]{incomplete} observations in \code{needles}.
\itemize{
\item \code{"compare"} uses \code{condition} to determine whether or not a missing value
in \code{needles} matches a missing value in \code{haystack}. If \code{condition} is
\code{==}, \code{>=}, or \code{<=}, then missing values will match.
\item \code{"match"} always allows missing values in \code{needles} to match missing
values in \code{haystack}, regardless of the \code{condition}.
\item \code{"drop"} drops incomplete observations in \code{needles} from the result.
\item \code{"error"} throws an error if any \code{needles} are incomplete.
\item If a single integer is provided, this represents the value returned
in the \code{haystack} column for observations of \code{needles} that are
incomplete. If \code{no_match = NA}, setting \code{incomplete = NA} forces
incomplete observations in \code{needles} to be treated like unmatched values.
}

\code{nan_distinct} determines whether an \code{NA} is allowed to match a \code{NaN}.}

\item{no_match}{Handling of \code{needles} without a match.
\itemize{
\item \code{"drop"} drops \code{needles} with zero matches from the result.
\item \code{"error"} throws an error if any \code{needles} have zero matches.
\item If a single integer is provided, this represents the value returned in
the \code{haystack} column for observations of \code{needles} that have zero
matches. The default represents an unmatched needle with \code{NA}.
}}

\item{remaining}{Handling of \code{haystack} values that \code{needles} never matched.
\itemize{
\item \code{"drop"} drops remaining \code{haystack} values from the result.
Typically, this is the desired behavior if you only care when \code{needles}
has a match.
\item \code{"error"} throws an error if there are any remaining \code{haystack}
values.
\item If a single integer is provided (often \code{NA}), this represents the value
returned in the \code{needles} column for the remaining \code{haystack} values
that \code{needles} never matched. Remaining \code{haystack} values are always
returned at the end of the result.
}}

\item{multiple}{Handling of \code{needles} with multiple matches. For each needle:
\itemize{
\item \code{"all"} returns all matches detected in \code{haystack}.
\item \code{"any"} returns any match detected in \code{haystack} with no guarantees on
which match will be returned. It is often faster than \code{"first"} and
\code{"last"} if you just need to detect if there is at least one match.
\item \code{"first"} returns the first match detected in \code{haystack}.
\item \code{"last"} returns the last match detected in \code{haystack}.
\item \code{"warning"} throws a warning if multiple matches are detected, but
otherwise falls back to \code{"all"}.
\item \code{"error"} throws an error if multiple matches are detected.
}}

\item{nan_distinct}{A single logical specifying whether or not \code{NaN} should
be considered distinct from \code{NA} for double and complex vectors. If \code{TRUE},
\code{NaN} will always be ordered between \code{NA} and non-missing numbers.}

\item{chr_proxy_collate}{A function generating an alternate representation
of character vectors to use for collation, often used for locale-aware
ordering.
\itemize{
\item If \code{NULL}, no transformation is done.
\item Otherwise, this must be a function of one argument. If the input contains
a character vector, it will be passed to this function after it has been
translated to UTF-8. This function should return a character vector with
the same length as the input. The result should sort as expected in the
C-locale, regardless of encoding.
}

For data frames, \code{chr_proxy_collate} will be applied to all character
columns.

Common transformation functions include: \code{tolower()} for case-insensitive
ordering and \code{stringi::stri_sort_key()} for locale-aware ordering.}

\item{needles_arg, haystack_arg}{Argument tags for \code{needles} and \code{haystack}
used in error messages.}
}
\value{
A two column data frame containing the locations of the matches.
\itemize{
\item \code{needles} is an integer vector containing the location of
the needle currently being matched.
\item \code{haystack} is an integer vector containing the location of the
corresponding match in the haystack for the current needle.
}
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}

\code{vec_locate_matches()} is a more flexible version of \code{\link[=vec_match]{vec_match()}} used to
identify locations where each observation of \code{needles} matches one or
multiple observations in \code{haystack}. Unlike \code{vec_match()},
\code{vec_locate_matches()} returns all matches by default, and can match on
binary conditions other than equality, such as \code{>}, \code{>=}, \code{<}, and \code{<=}.
}
\details{
\code{\link[=vec_match]{vec_match()}} is identical to (but often slightly faster than):

\if{html}{\out{<div class="sourceCode">}}\preformatted{vec_locate_matches(
  needles,
  haystack,
  condition = "==",
  multiple = "first",
  nan_distinct = TRUE
)
}\if{html}{\out{</div>}}

\code{vec_locate_matches()} is extremely similar to a SQL join between \code{needles}
and \code{haystack}, with the default being most similar to a left join.

Be very careful when specifying match \code{condition}s. If a condition is
mis-specified, it is very easy to accidentally generate an exponentially
large number of matches.
}
\section{Dependencies of \code{vec_locate_matches()}}{

\itemize{
\item \code{\link[=vec_order_radix]{vec_order_radix()}}
\item \code{\link[=vec_detect_complete]{vec_detect_complete()}}
}
}

\examples{
# `x` is used as `needles` and `y` as `haystack`. `y` contains duplicated
# values, and both vectors contain `NA` and `NaN`.
x <- c(1, 2, NA, 3, NaN)
y <- c(2, 1, 4, NA, 1, 2, NaN)

# By default, for each element of `x`, all matching locations in `y` are
# returned
matches <- vec_locate_matches(x, y)
matches

# The result can be used to slice the inputs to align them
data_frame(
  x = vec_slice(x, matches$needles),
  y = vec_slice(y, matches$haystack)
)

# If multiple matches are present, control which is returned with `multiple`.
# `"any"` makes no guarantee about which match is returned. `"error"` throws
# an error when a needle has multiple matches, so that call is wrapped in
# `try()`.
vec_locate_matches(x, y, multiple = "first")
vec_locate_matches(x, y, multiple = "last")
vec_locate_matches(x, y, multiple = "any")
try(vec_locate_matches(x, y, multiple = "error"))

# By default, NA is treated as being identical to NaN.
# Using `nan_distinct = TRUE` treats NA and NaN as different values, so NA
# can only match NA, and NaN can only match NaN.
vec_locate_matches(x, y, nan_distinct = TRUE)

# If you never want missing values to match, set `incomplete = NA` to return
# `NA` in the `haystack` column anytime there was an incomplete observation
# in `needles`.
vec_locate_matches(x, y, incomplete = NA)

# `no_match` allows you to specify the returned value for a needle with
# zero matches. Note that this is different from an incomplete value,
# so specifying `no_match` allows you to differentiate between incomplete
# values and unmatched values.
vec_locate_matches(x, y, incomplete = NA, no_match = 0L)

# If you want to require that every needle has at least 1 match, set
# `no_match` to `"error"`. The call is wrapped in `try()` because the
# needle `3` has no match in `y`.
try(vec_locate_matches(x, y, incomplete = NA, no_match = "error"))

# By default, `vec_locate_matches()` detects equality between `needles` and
# `haystack`. Using `condition`, you can detect where an inequality holds
# true instead. For example, to find every location where `x[[i]] >= y`:
matches <- vec_locate_matches(x, y, condition = ">=")

# Slice the inputs by the match locations to view the matched values side
# by side
data_frame(
  x = vec_slice(x, matches$needles),
  y = vec_slice(y, matches$haystack)
)

# You can limit which matches are returned with a `filter`. For example,
# with the above example you can filter the matches returned by `x[[i]] >= y`
# down to only the ones containing the maximum `y` value of those matches.
matches <- vec_locate_matches(x, y, condition = ">=", filter = "max")

# Here, the matches for the `3` needle value have been filtered down to
# only include the maximum haystack value of those matches, `2`. This is
# often referred to as a rolling join. Because `2` appears twice in `y`,
# both of its locations are kept; use `multiple` to narrow this further.
data_frame(
  x = vec_slice(x, matches$needles),
  y = vec_slice(y, matches$haystack)
)

# In the very rare case that you need to generate locations for a
# cross match, where every observation of `x` is forced to match every
# observation of `y` regardless of what the actual values are, you can
# replace `x` and `y` with integer vectors of their respective sizes that
# each contain a single repeated value and match on those instead.
x_proxy <- vec_rep(1L, vec_size(x))
y_proxy <- vec_rep(1L, vec_size(y))
# Every proxy needle matches every proxy haystack value, so the number of
# matches equals the product of the two sizes
nrow(vec_locate_matches(x_proxy, y_proxy))
vec_size(x) * vec_size(y)

# By default (`incomplete = "compare"`), missing values will match other
# missing values when using `==`, `>=`, or `<=` conditions, but not when
# using `>` or `<` conditions.
# This is similar to how `vec_compare(x, y, na_equal = TRUE)` works.
x <- c(1, NA)
y <- c(NA, 2)

# `<=` allows the `NA` needle to match the `NA` in `y`; the strict `<` does
# not
vec_locate_matches(x, y, condition = "<=")
vec_locate_matches(x, y, condition = "<")

# You can force missing values to match regardless of the `condition`
# by using `incomplete = "match"`
vec_locate_matches(x, y, condition = "<", incomplete = "match")

# You can also use data frames for `needles` and `haystack`. The
# `condition` will be recycled to the number of columns in `needles`, or
# you can specify varying conditions per column. In this example, we take
# a vector of date `values` and find all locations where each value is
# between lower and upper bounds specified by the `haystack`.
# Both `needles` columns hold the same `values`, so that each value can be
# compared against the `lower` bound and the `upper` bound separately.
values <- as.Date("2019-01-01") + 0:9
needles <- data_frame(lower = values, upper = values)

# Fix the seed so that the sampled bounds are reproducible
set.seed(123)
lower <- as.Date("2019-01-01") + sample(10, 10, replace = TRUE)
upper <- lower + sample(3, 10, replace = TRUE)
haystack <- data_frame(lower = lower, upper = upper)

# (values >= lower) & (values <= upper)
matches <- vec_locate_matches(needles, haystack, condition = c(">=", "<="))

# Align each matched value with its bounds using the `needles` and
# `haystack` location columns of the result
data_frame(
  lower = vec_slice(lower, matches$haystack),
  value = vec_slice(values, matches$needles),
  upper = vec_slice(upper, matches$haystack)
)
}
