Solution: Predicting Diabetes Using PySpark MLlib

This lesson walks through the solution for predicting diabetes in patients using PySpark MLlib.

Press + to interact
main.py
diabetes.csv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Task 1: Load the Diabetes prediction data into a PySpark DataFrame
# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

print("Reading 'diabetes.csv' into diabetes_df dataframe:")
# The first row holds the column names; column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
diabetes_df = csv_reader.csv("diabetes.csv")

print("First 5 rows of the diabetes_df:")
diabetes_df.show(n=5)

print("Check the column types of diabetes_df:")
column_types = diabetes_df.dtypes
print(column_types)
# Task 2: Data Preprocessing and EDA
print("Converting `age` column to Integer Type:")
diabetes_df = diabetes_df.withColumn("age", col("age").cast(IntegerType()))

print("Value types in the smoking_history column")
# DataFrame transformations are lazy: without .show() the grouped counts
# are built but never computed or printed.
diabetes_df.groupBy("smoking_history").count().show()

print("Removing smoking_history column")
diabetes_df2 = diabetes_df.drop("smoking_history")

print("Calculating the average blood glucose level for diabetic patients")
print(
    diabetes_df2.filter(col("diabetes") == 1)
    .select(avg("blood_glucose_level"))
    .first()[0]
)

print("Calculating the average blood glucose level for normal patients")
print(
    diabetes_df2.filter(col("diabetes") == 0)
    .select(avg("blood_glucose_level"))
    .first()[0]
)

# The heading now describes what is actually displayed: row counts per
# gender value, sorted ascending by count.
print("Counting the number of patients for each gender")
diabetes_df2.groupBy("gender").count().orderBy("count").show()
# Task 3: Model Training and Evaluation
print("Performing string indexing on the gender column")
# handleInvalid="keep" puts labels unseen during fitting into an extra index
# instead of raising an error at transform time.
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    for column in ["gender"]
]

print("Assembling the features into a vector column")
assembler = VectorAssembler(
    inputCols=[
        "age",
        "hypertension",
        "heart_disease",
        "gender_index",
        "bmi",
        "HbA1c_level",
        "blood_glucose_level",
    ],
    outputCol="features",
)

print("Instantiate a Logistic regression model")
lr = LogisticRegression(featuresCol="features", labelCol="diabetes")

print("Create a ML Pipeline combining indexer, assembler and lr")
pipeline = Pipeline(stages=indexers + [assembler, lr])

print("Split the data into training and test sets (80:20)")
# A fixed seed makes the split, and therefore the reported metrics,
# reproducible from run to run.
(trainingData, testData) = diabetes_df2.randomSplit([0.8, 0.2], seed=42)

print("Fit the model to the training data")
pipelineModel = pipeline.fit(trainingData)

print("Use the model to make predictions on the test data")
predictions = pipelineModel.transform(testData)

print("Select the diabetes and prediction columns to see how well we have done")
# .show() is required to actually display the selected columns.
predictions.select("diabetes", "prediction").show(5)

print("Evaluate the model")
# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not accuracy, so it is named explicitly and reported as such.
evaluator = BinaryClassificationEvaluator(
    labelCol="diabetes", metricName="areaUnderROC"
)
auc = evaluator.evaluate(predictions)
print("Area under ROC:", round(auc, 2))

# Accuracy is the fraction of test rows whose predicted class equals the label.
total_predictions = predictions.count()
correct_predictions = predictions.filter(
    col("diabetes") == col("prediction")
).count()
accuracy = (
    correct_predictions / total_predictions
    if total_predictions
    else float("nan")  # empty test split: accuracy is undefined
)
print("Accuracy:", round(accuracy, 2))

Get hands-on with 1200+ tech skills courses.