import h2o
import pandas as pd
from h2o.estimators import H2OGradientBoostingEstimator as H2OGBM
from sklearn.metrics import auc
import numpy as np
# Start (or connect to) the local H2O cluster and load the raw loans data.
h2o.init(port=8080)
filepath = "Data/Classification/"
data = h2o.import_file(filepath+'Lending_Club_Loans.csv')
# Convert the H2OFrame to a pandas DataFrame for feature engineering.
data = data.as_data_frame()


def _quartile_bin(series):
    """Bin a continuous series into labeled quartiles (Low/Moderate/Medium/High)."""
    return pd.qcut(series, q=[0, .25, .5, .75, 1.],
                   labels=['Low', 'Moderate', 'Medium', 'High'])


# Interaction feature "PaymentToIncome" based on domain knowledge:
# annualized installment payments as a fraction of annual income.
data['PaymentToIncome'] = (data['installment']*12.0)/data['annual_inc']
# New features created by transforming a few continuous variables to categorical ones.
data['dti_cat'] = _quartile_bin(data['dti'])
data['int_rate_cat'] = _quartile_bin(data['int_rate'])
# Change the existing revol_util feature from continuous to categorical in place.
data['revol_util'] = _quartile_bin(data['revol_util'])
# Move the engineered frame back into H2O for modeling.
data = h2o.H2OFrame(data)
# Target column.
y = "loan_status"
# Predictors are every column except the target. NOTE: set("loan_status")
# would build a set of individual *characters*, not the column name, and
# therefore would not remove the target from the predictor list.
X = [col for col in data.names if col != y]
# 70% train / 15% validation / 15% test (remainder) split, seeded for
# reproducibility.
splits = data.split_frame(ratios=[0.70, 0.15], seed=1)
train = splits[0]
valid = splits[1]
test = splits[2]
# H2O GBM configuration: hard runtime cap, early stopping on validation
# AUC (scored every 5 trees), and row/column subsampling for regularization.
gbm_params = dict(
    max_runtime_secs=40,
    nfolds=0,
    seed=42,
    stopping_rounds=5,
    ntrees=1000,
    max_depth=5,
    sample_rate=0.64,
    col_sample_rate=0.67,
    col_sample_rate_per_tree=0.71,
    col_sample_rate_change_per_level=1.01,
    score_tree_interval=5,
    stopping_metric='AUC',
)
gbm = H2OGBM(**gbm_params)
# Fit on the training split; the validation split drives early stopping.
gbm.train(x=X, y=y, training_frame=train, validation_frame=valid)
# Report held-out performance on the untouched test split.
print(gbm.model_performance(test_data=test))