Linear discriminant analysis in Python
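Linear discriminant analysis (LDA) is a supervised technique for dimensionality reduction and classification. It projects the data onto a small number of axes (the linear discriminants) chosen to separate the classes as well as possible.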
The following steps demonstrate the process of training and visualizing an LDA model using the provided dataset.
Step 1 - Importing the libraries
In the first step, we import the necessary libraries.
# NumPy for array handling, Matplotlib for plotting, pandas for reading the CSV file.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
Step 2 - Importing the dataset
After importing libraries, we load the dataset from a CSV file.
# Load the dataset; 'Data.csv' is resolved relative to the working directory.
items = pd.read_csv('Data.csv')
# The first 13 columns are the features, the 14th column is the target.
label1 = items.iloc[:, 0:13].values
label2 = items.iloc[:, 13].values
Here, we use the pandas iloc indexer to split the dataset: the first 13 columns (the feature variables) are assigned to label1, and the 14th column (the target variable) is assigned to label2.
Step 3 - Splitting the dataset into training and test set
In this step, we split the data into training and test sets using the train_test_split function.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state = 0 makes the split reproducible.
label1_train, label1_test, label2_train, label2_test = train_test_split(
    label1, label2, test_size=0.2, random_state=0
)
Step 4 - Feature scaling
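In this step, we standardize the input features label1_train and label1_test. The scaler is fitted on the training set only and then applied unchanged to the test set, so no information from the test set leaks into training.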
from sklearn.preprocessing import StandardScaler

scaling = StandardScaler()
# Fit the scaler on the training data only, then reuse it for the test data.
label1_train = scaling.fit_transform(label1_train)
label1_test = scaling.transform(label1_test)
Step 5 - Applying LDA
Here, we apply LDA with two components to transform the training and test data.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

analysis = LDA(n_components=2)
# fit_transform is given the class labels as well as the features.
label1_train = analysis.fit_transform(label1_train, label2_train)
# The test set is only transformed with the projection learned above.
label1_test = analysis.transform(label1_test)
Step 6 - Fitting the model to the training dataset
In this step, we create a logistic regression model and train it on the transformed training data.
from sklearn.linear_model import LogisticRegression

# Train the classifier on the two LDA components of the training set.
model = LogisticRegression(random_state=0)
model.fit(label1_train, label2_train)
Step 7 - Predicting the test set results
Here, we predict the outcomes of the test set.
# Predict the class of every sample in the (LDA-transformed) test set.
prediction = model.predict(label1_test)

from sklearn.metrics import confusion_matrix, accuracy_score

# Compare the predictions with the true test labels.
matrix = confusion_matrix(label2_test, prediction)
print(matrix)
score = accuracy_score(label2_test, prediction)
print(score)
We also calculate the confusion matrix and accuracy score to evaluate the model's performance.
Learn about the evaluation metrics.
Step 8 - The visualization of training set results
Then we create a scatter plot to visualize the linear discriminants of the model and observe its separation of the three different categories in the training dataset.
from matplotlib.colors import ListedColormap

seq1, seq2 = label1_train, label2_train
# Build a dense grid over both LDA axes, padded by 1 on every side.
grid1, grid2 = np.meshgrid(
    np.arange(start=seq1[:, 0].min() - 1, stop=seq1[:, 0].max() + 1, step=0.01),
    np.arange(start=seq1[:, 1].min() - 1, stop=seq1[:, 1].max() + 1, step=0.01),
)
# Classify every grid point and fill the plane with the predicted class.
plt.contourf(
    grid1,
    grid2,
    model.predict(np.array([grid1.ravel(), grid2.ravel()]).T).reshape(grid1.shape),
    alpha=0.75,
    cmap=ListedColormap(('lightblue', 'peachpuff', 'mistyrose')),
)
plt.xlim(grid1.min(), grid1.max())
plt.ylim(grid2.min(), grid2.max())
# Overlay the samples, one scatter call (and one legend entry) per class.
for key, value in enumerate(np.unique(seq2)):
    plt.scatter(
        seq1[seq2 == value, 0],
        seq1[seq2 == value, 1],
        # `color=` rather than `c=`: a single RGBA tuple passed as `c` is
        # ambiguous and is value-mapped when a class has exactly 4 points.
        color=ListedColormap(('mediumturquoise', 'lightsalmon', 'lightcoral'))(key),
        label=value,
    )
plt.title('Training set')
plt.xlabel('LDA1')
plt.ylabel('LDA2')
plt.legend()
# Save before plt.show(); the 'output' directory has to exist already.
plt.savefig('output/1_training.png')
plt.show()
Here, we save the visualization generated by the code as an image file named 1_training.png in the output folder using plt.savefig. The output folder must already exist; otherwise plt.savefig raises an error.
Step 9 - The visualization of test set results
Similarly, we create a scatter plot to visualize the linear discriminants of the model and observe its separation of the three different categories in the test dataset.
from matplotlib.colors import ListedColormap

seq1, seq2 = label1_test, label2_test
# Build a dense grid over both LDA axes, padded by 1 on every side.
grid1, grid2 = np.meshgrid(
    np.arange(start=seq1[:, 0].min() - 1, stop=seq1[:, 0].max() + 1, step=0.01),
    np.arange(start=seq1[:, 1].min() - 1, stop=seq1[:, 1].max() + 1, step=0.01),
)
# Classify every grid point and fill the plane with the predicted class.
plt.contourf(
    grid1,
    grid2,
    model.predict(np.array([grid1.ravel(), grid2.ravel()]).T).reshape(grid1.shape),
    alpha=0.75,
    cmap=ListedColormap(('lightblue', 'peachpuff', 'mistyrose')),
)
plt.xlim(grid1.min(), grid1.max())
plt.ylim(grid2.min(), grid2.max())
# Overlay the samples, one scatter call (and one legend entry) per class.
for key, value in enumerate(np.unique(seq2)):
    plt.scatter(
        seq1[seq2 == value, 0],
        seq1[seq2 == value, 1],
        # `color=` rather than `c=`: a single RGBA tuple passed as `c` is
        # ambiguous and is value-mapped when a class has exactly 4 points.
        color=ListedColormap(('mediumturquoise', 'lightsalmon', 'lightcoral'))(key),
        label=value,
    )
plt.title('Test set')
plt.xlabel('LDA1')
plt.ylabel('LDA2')
plt.legend()
# Save before plt.show(); the 'output' directory has to exist already.
plt.savefig('output/2_testing.png')
plt.show()
Here, we save the visualization generated by the code as an image file named 2_testing.png in the output folder using plt.savefig. The output folder must already exist; otherwise plt.savefig raises an error.
Code
# Complete script: load the data, reduce it to two linear discriminants,
# train a logistic regression classifier and plot its decision regions.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Step 2 - load the dataset: first 13 columns are features, 14th is the target.
items = pd.read_csv('Data.csv')
label1 = items.iloc[:, 0:13].values
label2 = items.iloc[:, 13].values

# Step 3 - hold out 20% of the rows as the test set (reproducible split).
from sklearn.model_selection import train_test_split

label1_train, label1_test, label2_train, label2_test = train_test_split(
    label1, label2, test_size=0.2, random_state=0
)

# Step 4 - fit the scaler on the training data only, then reuse it for the test data.
from sklearn.preprocessing import StandardScaler

scaling = StandardScaler()
label1_train = scaling.fit_transform(label1_train)
label1_test = scaling.transform(label1_test)

# Step 5 - project both sets onto two linear discriminants.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

analysis = LDA(n_components=2)
label1_train = analysis.fit_transform(label1_train, label2_train)
label1_test = analysis.transform(label1_test)

# Step 6 - train the classifier on the transformed training data.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0)
model.fit(label1_train, label2_train)

# Step 7 - predict the test set and evaluate the predictions.
prediction = model.predict(label1_test)

from sklearn.metrics import confusion_matrix, accuracy_score

matrix = confusion_matrix(label2_test, prediction)
print(matrix)
score = accuracy_score(label2_test, prediction)
print(score)

# Step 8 - visualize the training set.
from matplotlib.colors import ListedColormap

seq1, seq2 = label1_train, label2_train
# Dense grid over both LDA axes, padded by 1 on every side.
grid1, grid2 = np.meshgrid(
    np.arange(start=seq1[:, 0].min() - 1, stop=seq1[:, 0].max() + 1, step=0.01),
    np.arange(start=seq1[:, 1].min() - 1, stop=seq1[:, 1].max() + 1, step=0.01),
)
# Classify every grid point and fill the plane with the predicted class.
plt.contourf(
    grid1,
    grid2,
    model.predict(np.array([grid1.ravel(), grid2.ravel()]).T).reshape(grid1.shape),
    alpha=0.75,
    cmap=ListedColormap(('lightblue', 'peachpuff', 'mistyrose')),
)
plt.xlim(grid1.min(), grid1.max())
plt.ylim(grid2.min(), grid2.max())
for key, value in enumerate(np.unique(seq2)):
    plt.scatter(
        seq1[seq2 == value, 0],
        seq1[seq2 == value, 1],
        # `color=` rather than `c=`: a single RGBA tuple passed as `c` is
        # ambiguous and is value-mapped when a class has exactly 4 points.
        color=ListedColormap(('mediumturquoise', 'lightsalmon', 'lightcoral'))(key),
        label=value,
    )
plt.title('Training set')
plt.xlabel('LDA1')
plt.ylabel('LDA2')
plt.legend()
# Save before plt.show(); the 'output' directory has to exist already.
plt.savefig('output/1_training.png')
plt.show()
# Clear the current figure so the second plot does not draw over the first.
plt.clf()

# Step 9 - visualize the test set.
from matplotlib.colors import ListedColormap

seq1, seq2 = label1_test, label2_test
grid1, grid2 = np.meshgrid(
    np.arange(start=seq1[:, 0].min() - 1, stop=seq1[:, 0].max() + 1, step=0.01),
    np.arange(start=seq1[:, 1].min() - 1, stop=seq1[:, 1].max() + 1, step=0.01),
)
plt.contourf(
    grid1,
    grid2,
    model.predict(np.array([grid1.ravel(), grid2.ravel()]).T).reshape(grid1.shape),
    alpha=0.75,
    cmap=ListedColormap(('lightblue', 'peachpuff', 'mistyrose')),
)
plt.xlim(grid1.min(), grid1.max())
plt.ylim(grid2.min(), grid2.max())
for key, value in enumerate(np.unique(seq2)):
    plt.scatter(
        seq1[seq2 == value, 0],
        seq1[seq2 == value, 1],
        # Same `color=` fix as in the training-set plot.
        color=ListedColormap(('mediumturquoise', 'lightsalmon', 'lightcoral'))(key),
        label=value,
    )
plt.title('Test set')
plt.xlabel('LDA1')
plt.ylabel('LDA2')
plt.legend()
plt.savefig('output/2_testing.png')
plt.show()
Conclusion
In conclusion, linear discriminant analysis (LDA) demonstrates the effective classification of data into distinct categories. The LDA plots visualize the separation achieved by the model, indicating its potential to distinguish between different data categories accurately.
Free Resources