% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/imputer_random_forest.R
\name{impute_with_random_forest_regressor}
\alias{impute_with_random_forest_regressor}
\title{Random Forest Regression Imputation function}
\usage{
impute_with_random_forest_regressor(
  sc,
  sdf,
  target_col,
  feature_cols,
  target_col_prev,
  max_depth = 15
)
}
\arguments{
\item{sc}{A Spark connection}

\item{sdf}{A Spark DataFrame}

\item{target_col}{The column with missing values to impute}

\item{feature_cols}{The columns to use as features in the Random Forest regression model. These columns should not have missing values.}

\item{target_col_prev}{The target column at the previous iteration. Used to calculate residuals.}

\item{max_depth}{The \code{max_depth} parameter of \code{ml_random_forest}; see its documentation for more details.}
}
\value{
The Spark DataFrame with missing values imputed in the target column
}
\description{
This function imputes missing values in a Spark DataFrame using Random Forest regression.
}
\examples{
# This example is not executed since it needs additional software (Apache Spark)
\dontrun{
# Random Forest regressor imputation, end to end
library(sparklyr)
library(dplyr)

# Open a local Spark session
# (Spark must already be installed, e.g. with sparklyr::spark_install())
sc <- spark_connect(master = "local")

# Toy housing data: the continuous column 'price' has two missing entries
houses <- data.frame(
  price = c(250000, NA, 180000, NA, 320000, 195000),
  bedrooms = c(3, 2, 2, 3, 4, 2),
  bathrooms = c(2, 1, 1, 2, 3, 1),
  sqft = c(1500, 900, 800, 1200, 2000, 850),
  age = c(10, 15, 25, 8, 5, 20)
)

# Columns used as predictors
predictors <- c("bedrooms", "bathrooms", "sqft", "age")

# Send the data to Spark under the table name sample_data
houses_sdf <- copy_to(sc, houses, "sample_data")

# Stand-in for the previous iteration (needed for the residual calculation):
# missing prices are filled with 200000 and only the price column is kept
filled_sdf <- mutate(houses_sdf, price = ifelse(is.na(price), 200000, price))
prev_price_sdf <- select(filled_sdf, price)

# Fill in the missing house prices with Random Forest regression
result_sdf <- impute_with_random_forest_regressor(
  sc = sc,
  sdf = houses_sdf,
  target_col = "price",
  feature_cols = predictors,
  target_col_prev = prev_price_sdf
)

# Bring the imputed data back into R and print it
collect(result_sdf)

# Close the Spark session
spark_disconnect(sc)
}
}
