% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rulefit.R
\name{h2o.rulefit}
\alias{h2o.rulefit}
\title{Build a RuleFit Model}
\usage{
h2o.rulefit(
  x,
  y,
  training_frame,
  model_id = NULL,
  validation_frame = NULL,
  seed = -1,
  algorithm = c("AUTO", "DRF", "GBM"),
  min_rule_length = 3,
  max_rule_length = 3,
  max_num_rules = -1,
  model_type = c("rules_and_linear", "rules", "linear"),
  weights_column = NULL,
  distribution = c("AUTO", "bernoulli", "multinomial", "gaussian", "poisson", "gamma",
    "tweedie", "laplace", "quantile", "huber"),
  rule_generation_ntrees = 50,
  auc_type = c("AUTO", "NONE", "MACRO_OVR", "WEIGHTED_OVR", "MACRO_OVO",
    "WEIGHTED_OVO"),
  remove_duplicates = TRUE,
  lambda = NULL,
  max_categorical_levels = 10
)
}
\arguments{
\item{x}{(Optional) A vector containing the names or indices of the predictor variables to use in building the model.
If x is missing, then all columns except y are used.}

\item{y}{The name or column index of the response variable in the data. 
The response must be either a numeric or a categorical/factor variable. 
If the response is numeric, then a regression model will be trained, otherwise it will train a classification model.}

\item{training_frame}{Id of the training data frame.}

\item{model_id}{Destination id for this model; auto-generated if not specified.}

\item{validation_frame}{Id of the validation data frame.}

\item{seed}{Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
Defaults to -1 (time-based random number).}

\item{algorithm}{The algorithm to use to generate rules. Must be one of: "AUTO", "DRF", "GBM". Defaults to AUTO.}

\item{min_rule_length}{Minimum length of rules. Defaults to 3.}

\item{max_rule_length}{Maximum length of rules. Defaults to 3.}

\item{max_num_rules}{The maximum number of rules to return. defaults to -1 which means the number of rules is selected
by diminishing returns in model deviance. Defaults to -1.}

\item{model_type}{Specifies type of base learners in the ensemble. Must be one of: "rules_and_linear", "rules", "linear". Defaults to rules_and_linear.}

\item{weights_column}{Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
an accurate prediction, remove all rows with weight == 0.}

\item{distribution}{Distribution function Must be one of: "AUTO", "bernoulli", "multinomial", "gaussian", "poisson", "gamma",
"tweedie", "laplace", "quantile", "huber". Defaults to AUTO.}

\item{rule_generation_ntrees}{Specifies the number of trees to build in the tree model. Defaults to 50. Defaults to 50.}

\item{auc_type}{Set default multinomial AUC type. Must be one of: "AUTO", "NONE", "MACRO_OVR", "WEIGHTED_OVR", "MACRO_OVO",
"WEIGHTED_OVO". Defaults to AUTO.}

\item{remove_duplicates}{\code{Logical}. Whether to remove rules which are identical to an earlier rule. Defaults to true. Defaults to
TRUE.}

\item{lambda}{Lambda for LASSO regressor.}

\item{max_categorical_levels}{For every categorical feature, only use this many most frequent categorical levels for model training. Only
used for categorical_encoding == EnumLimited. Defaults to 10.}
}
\description{
Builds a Distributed RuleFit model on a parsed dataset, for regression or 
classification.
}
\examples{
\dontrun{
library(h2o)
h2o.init()

# Import the titanic dataset:
f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
coltypes <- list(by.col.name = c("pclass", "survived"), types=c("Enum", "Enum"))
df <- h2o.importFile(f, col.types = coltypes)

# Split the dataset into train and test
splits <- h2o.splitFrame(data = df, ratios = 0.8, seed = 1)
train <- splits[[1]]
test <- splits[[2]]

# Set the predictors and response; set the factors:
response <- "survived"
predictors <- c("age", "sibsp", "parch", "fare", "sex", "pclass")

# Build and train the model:
rfit <- h2o.rulefit(y = response,
                    x = predictors,
                    training_frame = train,
                    max_rule_length = 10,
                    max_num_rules = 100,
                    seed = 1)

# Retrieve the rule importance:
print(rfit@model$rule_importance)

# Predict on the test data:
h2o.predict(rfit, newdata = test)
}
}
