% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/optim-adadelta.R
\name{optim_adadelta}
\alias{optim_adadelta}
\title{Adadelta optimizer}
\usage{
optim_adadelta(params, lr = 1, rho = 0.9, eps = 1e-06, weight_decay = 0)
}
\arguments{
\item{params}{(iterable): list of parameters to optimize or list defining
parameter groups}

\item{lr}{(float, optional): learning rate (default: 1)}

\item{rho}{(float, optional): coefficient used for computing a running average
of squared gradients (default: 0.9)}

\item{eps}{(float, optional): term added to the denominator to improve
numerical stability (default: 1e-6)}

\item{weight_decay}{(float, optional): weight decay (L2 penalty) (default: 0)}
}
\description{
It has been proposed in \href{https://arxiv.org/pdf/1212.5701.pdf}{ADADELTA: An Adaptive Learning Rate Method}
}
\note{
According to the original paper, decaying average of the squared gradients
is computed as follows:
\deqn{
E[g^2]_{t} = \rho E[g^2]_{t- 1} + (1 - \rho){g_{t}}^2
}

RMS of previous squared gradients up to time t:
\deqn{
RMS[g]_{t} = \sqrt{E[g^2]_{t} + \epsilon }
}

Adadelta update rule:
\deqn{
 \begin{array}{ll}
 \Delta \theta_{t} = - \frac{RMS [\Delta \theta]_{t - 1} }{RMS[g]_{t}} g_{t} \\
 \theta_{t+1} = \theta_{t} + \Delta \theta_{t}
\end{array}
}
}
\examples{
if (torch_is_installed()) {
\dontrun{
optimizer <- optim_adadelta(model$parameters, lr = 0.1)
optimizer$zero_grad()
loss_fn(model(input), target)$backward()
optimizer$step()
}

}
}
