# Load the labelled Titanic training set (features plus the Survived target).
TRAIN_PATH = "train_titanic_Xy.csv"
train = pd.read_csv(TRAIN_PATH)
# pre-processing
def impute_age(age_pclass):
    """Fill a missing Age using a per-class fallback value.

    Parameters
    ----------
    age_pclass : sequence of (Age, Pclass)
        Typically a row Series produced by
        ``df[["Age", "Pclass"]].apply(impute_age, axis=1)``.

    Returns
    -------
    float | int
        The original Age when present; otherwise a hard-coded fallback
        per passenger class (1st -> 38, 2nd -> 30, other -> 25 — roughly
        the class-wise typical ages in the Titanic training data).
    """
    # Unpack positionally: works for plain sequences AND pandas Series
    # rows, avoiding the deprecated integer-key Series lookup
    # (``row[0]`` on a labelled Series is label-based in modern pandas
    # and would raise KeyError here).
    age, pclass = age_pclass
    if pd.isnull(age):
        if pclass == 1:
            return 38
        elif pclass == 2:
            return 30
        else:
            return 25
    return age
# Impute missing ages class-wise, then discard the mostly-empty Cabin
# column and any remaining rows with missing values.
train["Age"] = train[["Age", "Pclass"]].apply(impute_age, axis=1)
train = train.drop(columns="Cabin").dropna()
# One-hot encode the categorical columns, dropping the first level of
# each to avoid the dummy-variable trap.
sex_emb = pd.get_dummies(train[["Sex", "Embarked"]], drop_first=True)
train = pd.concat([train, sex_emb], axis=1)
# Remove the raw categoricals plus identifier-like columns the model
# should not see.
train = train.drop(columns=["Sex", "Embarked", "Name", "Ticket", "PassengerId"])
# Separate the training frame into the target and the feature matrix.
y_train = train["Survived"]
X_train = train.drop(columns="Survived")
# Scale every feature into [0, 1]. Fit on the training data only so the
# identical min/max mapping can be reused on the test set later.
scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train)
# L2-regularised logistic regression. C=0.5 applies somewhat stronger
# regularisation than the default; max_iter is raised well past the
# default so the solver converges on the scaled features.
logR = LogisticRegression(penalty="l2", C=0.5, max_iter=10000, n_jobs=-1)
logR.fit(X_train_s, y_train)
# Load the held-out features and labels, joining them on PassengerId so
# the rows line up even if the two files are ordered differently.
X_test = pd.read_csv("test_titanic_X.csv")
y_test = pd.read_csv("test_titanic_y.csv")
test = X_test.merge(y_test, on="PassengerId", how="left")
# Apply exactly the same preprocessing as the training set: class-based
# age imputation, drop Cabin, then drop any remaining incomplete rows.
test["Age"] = test[["Age", "Pclass"]].apply(impute_age, axis=1)
test = test.drop(columns="Cabin").dropna()
# One-hot encode the test categoricals, then align the dummy columns to
# the training encoding: if a category level happens to be absent from
# the test set, get_dummies would silently emit fewer columns and break
# the scaler/model downstream. Reindexing against the train dummies
# restores any missing column as all-zeros.
sex_emb_test = pd.get_dummies(test[["Sex", "Embarked"]], drop_first=True)
sex_emb_test = sex_emb_test.reindex(columns=sex_emb.columns, fill_value=0)
test = pd.concat([test, sex_emb_test], axis=1)
test = test.drop(columns=["Sex", "Embarked", "Name", "Ticket", "PassengerId"])
# Split the test frame into features and target, forcing the feature
# columns into the exact set and order the scaler/model were fitted on.
# scikit-learn (>= 1.0) validates feature names when transforming a
# DataFrame, so any mismatch would otherwise raise at transform time;
# columns missing from the test side are filled with zeros.
y_test = test["Survived"]
X_test = test.drop(columns="Survived").reindex(columns=X_train.columns, fill_value=0)
# Scale the test features with the scaler fitted on the training data,
# wrapping the result back into a DataFrame so column names survive.
X_test_s = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# Class predictions on both splits (kept for downstream inspection).
pred_train = logR.predict(X_train_s)
pred_test = logR.predict(X_test_s)
# Comparing mean accuracy on train vs. test gives a quick check for
# over- or under-fitting.
train_acc = logR.score(X_train_s, y_train)
test_acc = logR.score(X_test_s, y_test)
print("Training score (mean accuracy): ", train_acc)
print("Test Score (mean accuracy): ", test_acc)