#================================================================================================
# Load libraries - suppress messages
#
suppressMessages(library(tidyverse))
suppressMessages(library(tidymodels))
#================================================================================================
# Load the Adult Census Income dataset
#
# Read the CSV via a relative path (resolved against the working directory);
# show_col_types = FALSE keeps readr from printing the column specification
adult_census <- read_csv(
  file = "adult_census.csv",
  show_col_types = FALSE
)
#================================================================================================
# Split the dataset into training and test sets, stratified by income
#
# Fix the RNG seed first so the random train/test partition is reproducible
set.seed(498798)
# Keep 80% of the rows for training, stratifying the sampling on income
adult_split <- initial_split(
  data = adult_census,
  prop = 0.8,
  strata = "income"
)
# Pull the two partitions out of the split object as separate data frames
adult_test <- testing(adult_split)
adult_train <- training(adult_split)
#================================================================================================
# Use the entire dataset to define factor levels
#
# NOTE - unique() keeps values in order of first appearance, so these levels are
#        not sorted alphabetically as factor() would do by default
#
# One level vector per categorical column, each holding the distinct values
# found in the full (unsplit) dataset
work_class_levels <- adult_census$work_class %>% unique()
education_levels <- adult_census$education %>% unique()
marital_status_levels <- adult_census$marital_status %>% unique()
occupation_levels <- adult_census$occupation %>% unique()
relationship_levels <- adult_census$relationship %>% unique()
race_levels <- adult_census$race %>% unique()
native_country_levels <- adult_census$native_country %>% unique()
#================================================================================================
# It is best practice to create character-based factors outside of a recipe
#
# Convert the categorical columns of the training set to factors. mutate()
# overwrites each existing column in place, so column order is unchanged.
adult_train <- adult_train %>%
  mutate(
    # Columns whose levels come from the precomputed *_levels vectors
    work_class = factor(work_class, levels = work_class_levels),
    education = factor(education, levels = education_levels),
    marital_status = factor(marital_status, levels = marital_status_levels),
    occupation = factor(occupation, levels = occupation_levels),
    relationship = factor(relationship, levels = relationship_levels),
    race = factor(race, levels = race_levels),
    native_country = factor(native_country, levels = native_country_levels),
    # Columns whose levels are inferred from the training values themselves
    sex = factor(sex),
    income = factor(income)
  )
# Print the structure of the converted training set
str(adult_train)
#================================================================================================
# Craft the recipe - recipes package
#
# Declare the modelling roles with a formula: income is the outcome, and the
# "." shorthand tells tidymodels to use every other column of adult_train as a
# predictor. No preprocessing steps are added to the recipe here.
adult_recipe <- recipe(income ~ ., data = adult_train)
#================================================================================================
# Specify the algorithm - parsnip package
#
# Random forest specification: 250 trees, fitted with the randomForest engine,
# used as a classifier
adult_model <- rand_forest(trees = 250) %>%
  set_engine(engine = "randomForest") %>%
  set_mode(mode = "classification")
#================================================================================================
# Set up the workflow - workflows package
#
# Bundle the recipe and the model specification into a single workflow object
adult_workflow <- workflow() %>%
  add_recipe(recipe = adult_recipe) %>%
  add_model(spec = adult_model)
#================================================================================================
# Fit the workflow (recipe + model) - workflows package
#
# Seed the RNG immediately before fitting so the fit is reproducible
set.seed(54321)
adult_fit <- adult_workflow %>%
  fit(data = adult_train)
#================================================================================================
# Display the model's summarized output
#
# Pull the fitted parsnip model out of the workflow, then auto-print it
adult_forest <- adult_fit %>%
  extract_fit_parsnip()
adult_forest