% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_aggregation.R
\name{aggregate_trades}
\alias{aggregate_trades}
\title{Aggregation of high-frequency data}
\usage{
aggregate_trades(data, algorithm = "Tick", timelag = 0, ...,
 verbose = TRUE)
}
\arguments{
\item{data}{A dataframe with 4 variables in the following
order (\code{timestamp}, \code{price}, \code{bid}, \code{ask}).}

\item{algorithm}{A character string referring to the algorithm used
to determine the trade initiator, a buyer or a seller. It takes one of four
values (\code{"Tick"}, \code{"Quote"}, \code{"LR"}, \code{"EMO"}). The default value is
\code{"Tick"}. For more information about the different algorithms, check the
details section.}

\item{timelag}{A number referring to the time lag in milliseconds
used to calculate the lagged midquote, bid and ask for the algorithms
\code{"Quote"}, \code{"EMO"} and \code{"LR"}.}

\item{...}{Additional arguments passed on to the function
\code{aggregate_trades()}. The recognized arguments are \code{reportdays}
and \code{is_parallel}. Other arguments will be ignored.
\itemize{
\item \code{reportdays} is a binary variable that determines whether the
variable \code{day} is returned. The default value is \code{FALSE}.
\item \code{is_parallel} is a logical variable that specifies whether
the computation is performed using parallel or sequential processing.
The default value is \code{TRUE}. For more details, please refer to the
vignette 'Parallel processing' in the package, or
\href{https://pinstimation.com/articles/parallel_processing.html}{online}.
}}

\item{verbose}{A binary variable that determines whether detailed
information about the progress of the trade classification is displayed.
No output is produced when \code{verbose} is set to \code{FALSE}. The default
value is \code{TRUE}.}
}
\value{
Returns a dataframe of two (or three) variables. If \code{reportdays}
is set to \code{TRUE}, then the returned dataframe has three variables
\verb{\{day, b, s\}}. If \code{reportdays} is set to \code{FALSE}, then the
returned dataframe has two variables \verb{\{b, s\}}, and, therefore, can be
directly used for the estimation of the \code{PIN} and \code{MPIN} models.
}
\description{
Aggregates high-frequency trading data into
daily data using different trade classification algorithms.
}
\details{
The argument \code{algorithm} takes one of four values:
\itemize{
\item \code{"Tick"} refers to the tick algorithm: a trade is classified as a
buy (sell) if the price of the trade to be classified
is above (below) the closest different price of a previous trade.
\item \code{"Quote"} refers to the quote algorithm: it classifies a
trade as a buy (sell) if the price of the trade to be
classified is above (below) the mid-point of the bid-ask spread.
Trades executed at the mid-spread are not classified.
\item \code{"LR"} refers to the \code{LR} algorithm as in
\insertCite{LeeReady1991;textual}{PINstimation}. It classifies a trade
as a buy (sell) if its price is above (below) the mid-spread (quote
algorithm), and uses the tick algorithm if the trade price is at
the mid-spread.
\item \code{"EMO"} refers to the \code{EMO} algorithm as in
\insertCite{Ellis2000;textual}{PINstimation}.
It classifies trades at the bid (ask) as sells (buys) and uses the tick
algorithm to classify trades within the then prevailing bid-ask spread.
}

\code{LR} recommend the use of the mid-spread five seconds earlier ('5-second'
rule), which mitigates trade misclassifications for many of the \code{150}
NYSE stocks they analyze. On the other hand, in more recent studies such
as \insertCite{piwowar2006;textual}{PINstimation} and
\insertCite{Aktas2014;textual}{PINstimation}, the use of
1-second lagged midquotes is shown to yield lower rates of
misclassifications. The default value is set to \code{0} milliseconds (no time-lag).
Considering the ultra-fast nature of today’s financial markets, the time lag
is expressed in milliseconds. Lags shorter than one second can also be
implemented by entering values such as \code{100} or \code{500}.
}
\examples{
# There is a preloaded dataset called 'hfdata' contained in the package.
# It is an artificially created high-frequency trading dataset. The dataset
# contains 100 000 trades and five variables 'timestamp', 'price',
# 'volume', 'bid', and 'ask'. For more information, type ?hfdata.

xdata <- hfdata
xdata$volume <- NULL

# Use the LR algorithm with a timelag of 0 milliseconds

daytrades <- aggregate_trades(xdata, algorithm = "LR", verbose = FALSE)

# Since the argument 'reportdays' is set to FALSE by default, the
# output 'daytrades' can be used directly for the estimation of the PIN
# model, namely using pin_ea().

estimate <- pin_ea(daytrades, verbose = FALSE)

# Show the estimate

show(estimate)

}
\references{
\insertAllCited
}
