Random forest in Python

Random forest is an extension of the decision tree algorithm that builds multiple decision trees and combines their predictions to make more accurate and robust predictions. It creates a collection of decision trees, each trained on a random subset of data and features. This randomness helps to reduce overfitting and improve generalization.

Implementation

The following steps demonstrate the process of training and visualizing a random forest model using the provided dataset.

Step 1 - Importing the libraries

In the first step, we import the necessary libraries.

Here, we use the pandas iloc indexer to assign the values of the feature variables to label1 and the values of the target variable to label2.

Step 3 - Splitting the dataset into training and test set

In this step, we split the data into training and test sets using the train_test_split function. The test set is set to be 25% of the entire dataset, and random_state is used to ensure reproducibility (obtaining the same results when a process is rerun with the same data and settings).

import os

from matplotlib.colors import ListedColormap

# Visualise the fitted model's decision regions with the training points on top.
seq1, seq2 = label1_train, label2_train

# Start a fresh figure so this plot never draws on top of an earlier one.
plt.figure()

# Dense grid spanning both feature columns, padded by 1 on every side.
grid1, grid2 = np.meshgrid(
    np.arange(start=seq1[:, 0].min() - 1, stop=seq1[:, 0].max() + 1, step=0.01),
    np.arange(start=seq1[:, 1].min() - 1, stop=seq1[:, 1].max() + 1, step=0.01),
)

# Classify every grid point, then fold the flat predictions back into the
# grid's shape so they can be drawn as filled regions.
regions = model.predict(np.array([grid1.ravel(), grid2.ravel()]).T).reshape(grid1.shape)
plt.contourf(grid1, grid2, regions,
             alpha=0.75, cmap=ListedColormap(('lightblue', 'peachpuff')))
plt.xlim(grid1.min(), grid1.max())
plt.ylim(grid2.min(), grid2.max())

# One scatter call per class so that each class gets its own legend entry.
point_colors = ListedColormap(('mediumturquoise', 'lightsalmon'))
for key, value in enumerate(np.unique(seq2)):
    # `color=` rather than `c=`: a lone RGBA tuple passed as `c` is ambiguous
    # (matplotlib may read it as four data values when a class happens to
    # contain exactly four points) and triggers a warning.
    plt.scatter(seq1[seq2 == value, 0], seq1[seq2 == value, 1],
                color=point_colors(key), label=value)
plt.title('Training set of random forest')
plt.xlabel('Age')
plt.ylabel('Estimated salary')
plt.legend()

# savefig does not create missing directories, so make sure 'output' exists.
os.makedirs('output', exist_ok=True)
plt.savefig('output/1_training.png')

import os

from matplotlib.colors import ListedColormap

# Visualise the fitted model's decision regions with the test points on top.
seq1, seq2 = label1_test, label2_test

# Start a fresh figure; without it the test plot would be drawn over whatever
# figure is still current (for example the training-set plot).
plt.figure()

# Dense grid spanning both feature columns, padded by 1 on every side.
grid1, grid2 = np.meshgrid(
    np.arange(start=seq1[:, 0].min() - 1, stop=seq1[:, 0].max() + 1, step=0.01),
    np.arange(start=seq1[:, 1].min() - 1, stop=seq1[:, 1].max() + 1, step=0.01),
)

# Classify every grid point, then fold the flat predictions back into the
# grid's shape so they can be drawn as filled regions.
regions = model.predict(np.array([grid1.ravel(), grid2.ravel()]).T).reshape(grid1.shape)
plt.contourf(grid1, grid2, regions,
             alpha=0.75, cmap=ListedColormap(('lightblue', 'peachpuff')))
plt.xlim(grid1.min(), grid1.max())
plt.ylim(grid2.min(), grid2.max())

# One scatter call per class so that each class gets its own legend entry.
point_colors = ListedColormap(('mediumturquoise', 'lightsalmon'))
for key, value in enumerate(np.unique(seq2)):
    # `color=` rather than `c=`: a lone RGBA tuple passed as `c` is ambiguous
    # (matplotlib may read it as four data values when a class happens to
    # contain exactly four points) and triggers a warning.
    plt.scatter(seq1[seq2 == value, 0], seq1[seq2 == value, 1],
                color=point_colors(key), label=value)
plt.title('Test set of random forest')
plt.xlabel('Age')
plt.ylabel('Estimated salary')
plt.legend()

# savefig does not create missing directories, so make sure 'output' exists.
os.makedirs('output', exist_ok=True)
plt.savefig('output/2_testing.png')

main.py

Data.csv

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def _plot_decision_regions(classifier, features, targets, title, out_path):
    """Draw a classifier's decision regions with the given points on top.

    The figure is saved to ``out_path`` and then shown.

    Args:
        classifier: Fitted estimator whose ``predict`` accepts an array with
            two columns.
        features: 2-D array; column 0 is drawn on the x axis and column 1 on
            the y axis.
        targets: 1-D array of class labels aligned with the rows of
            ``features``.
        title: Title of the figure.
        out_path: File the figure is written to; its parent directory is
            created when missing.
    """
    # Start a fresh figure so consecutive calls never draw over each other
    # (on non-interactive backends plt.show() does not clear the figure).
    plt.figure()

    # Dense grid spanning both feature columns, padded by 1 on every side.
    grid1, grid2 = np.meshgrid(
        np.arange(start=features[:, 0].min() - 1, stop=features[:, 0].max() + 1, step=0.01),
        np.arange(start=features[:, 1].min() - 1, stop=features[:, 1].max() + 1, step=0.01),
    )

    # Classify every grid point, then fold the flat predictions back into the
    # grid's shape so they can be drawn as filled regions.
    grid_points = np.array([grid1.ravel(), grid2.ravel()]).T
    regions = classifier.predict(grid_points).reshape(grid1.shape)
    plt.contourf(grid1, grid2, regions,
                 alpha=0.75, cmap=ListedColormap(('lightblue', 'peachpuff')))
    plt.xlim(grid1.min(), grid1.max())
    plt.ylim(grid2.min(), grid2.max())

    # One scatter call per class so that each class gets its own legend entry.
    point_colors = ListedColormap(('mediumturquoise', 'lightsalmon'))
    for key, value in enumerate(np.unique(targets)):
        # `color=` rather than `c=`: a lone RGBA tuple passed as `c` is
        # ambiguous (matplotlib may read it as four data values when a class
        # happens to contain exactly four points) and triggers a warning.
        plt.scatter(features[targets == value, 0], features[targets == value, 1],
                    color=point_colors(key), label=value)
    plt.title(title)
    plt.xlabel('Age')
    plt.ylabel('Estimated salary')
    plt.legend()

    # savefig does not create missing directories, so make sure the target
    # directory exists before writing.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path)
    plt.show()


# Load the dataset: columns 2 and 3 are the features (plotted below as age and
# estimated salary), column 4 is the target.
items = pd.read_csv('Data.csv')
label1 = items.iloc[:, [2, 3]].values
label2 = items.iloc[:, 4].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
label1_train, label1_test, label2_train, label2_test = train_test_split(
    label1, label2, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# no statistics of the test set leak into training.
scaling = StandardScaler()
label1_train = scaling.fit_transform(label1_train)
label1_test = scaling.transform(label1_test)

# Train a forest of 10 trees using the Gini criterion.
model = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=0)
model.fit(label1_train, label2_train)

# Evaluate on the held-out rows.
prediction = model.predict(label1_test)
matrix = confusion_matrix(label2_test, prediction)
print(matrix)

# Visualise the decision regions against the training and the test points.
_plot_decision_regions(model, label1_train, label2_train,
                       'Training set of random forest', 'output/1_training.png')
_plot_decision_regions(model, label1_test, label2_test,
                       'Test set of random forest', 'output/2_testing.png')

Random forest in Python

Implementation

Step 1 - Importing the libraries

Step 2 - Importing the dataset

Step 3 - Splitting the dataset into training and test set

Step 4 - Feature scaling

Step 5 - Fitting the model to the training dataset

Step 6 - Predicting the test set results

Step 7 - The visualization of training set results

Step 8 - The visualization of test set results

Code

Conclusion