This is a very famous data set, and it is very often a student's first step in machine learning! We'll be trying to predict a classification — survival or deceased. Let's begin our understanding of implementing Logistic Regression in Python for classification. We'll use a "semi-cleaned" version of the Titanic data set; if you use the data set hosted directly on Kaggle, you may need to do some additional cleaning not shown in this lecture notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the semi-cleaned Titanic training data and explore it visually.
train = pd.read_csv("titanic_train.csv")
train.head()

# Missing-data overview: light cells mark NaNs (Age and Cabin are the culprits).
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap="viridis");

sns.set_style("whitegrid")
# style must be one of white, dark, whitegrid, darkgrid, ticks
sns.countplot(x="Survived", data=train);

# FIX: sns.distplot is deprecated (removed in seaborn 0.14); histplot is the
# direct replacement for a binned histogram without a KDE overlay.
sns.histplot(train["Age"].dropna(), bins=50)
train["Age"].dropna().plot(kind="hist", bins=50)
sns.countplot(x="SibSp", data=train)
train["Fare"].hist(bins=50, figsize=(10, 4))

# Interactive plotly histogram via cufflinks.
# FIX: kind="hist" is required for `bins` to take effect on Series.iplot.
import cufflinks as cf
cf.go_offline()
train["Age"].iplot(kind="hist", bins=50)

train.head()
# Age varies systematically with passenger class — used for imputation below.
plt.figure(figsize=(12, 8))
sns.boxplot(x="Pclass", y="Age", data=train)
### Calculate the mean of each class and use it to fill in the missing data
def impute_age(cols):
    """Fill a missing Age with a per-Pclass typical age.

    Intended to be applied row-wise:
    ``train[["Age", "Pclass"]].apply(impute_age, axis=1)``.

    Parameters
    ----------
    cols : pd.Series
        A row slice with labels "Age" and "Pclass".

    Returns
    -------
    float
        The original Age when present; otherwise 37, 29, or 24 for
        Pclass 1, 2, or 3 respectively (values read off the boxplot above).
    """
    # FIX: label-based access instead of cols[0]/cols[1] — positional integer
    # indexing on a labeled Series raises a FutureWarning and is removed in
    # newer pandas.
    age = cols["Age"]
    pclass = cols["Pclass"]
    if pd.isnull(age):
        if pclass == 1:
            return 37
        elif pclass == 2:
            return 29
        else:
            return 24
    return age
# Impute the missing ages from Pclass, then drop the mostly-empty Cabin
# column and any remaining NaN rows; the final heatmap should come back
# a single solid color (no missing values left).
train["Age"] = train[["Age", "Pclass"]].apply(impute_age, axis=1)
train.columns
sns.heatmap(train.isnull())

train = train.drop(columns=["Cabin"])
train.columns

train = train.dropna()
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')
# One-hot encode the categorical columns. drop_first=True drops one level of
# each categorical to avoid the dummy-variable trap (perfect collinearity).
sex = pd.get_dummies(train["Sex"], drop_first=True)
# BUG FIX: the original built dummies from the numeric "Age" column and
# assigned them to train['embark'], leaving the `embark` variable undefined
# for the concat below (NameError). It must come from "Embarked".
embark = pd.get_dummies(train["Embarked"], drop_first=True)
train = pd.concat([train, sex, embark], axis=1)
train.head()
# Drop the raw text columns (now encoded or unused) and the id column,
# which carries no predictive signal.
train.drop(["Sex", "Embarked", "Name", "Ticket"], axis=1, inplace=True)
train.drop("PassengerId", axis=1, inplace=True)
train.head()
# Split the engineered features into train/test sets, fit a logistic
# regression, and report classification metrics.
from sklearn.model_selection import train_test_split

x = train.drop('Survived', axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)

from sklearn.linear_model import LogisticRegression

# FIX: the default max_iter=100 routinely raises a ConvergenceWarning on the
# unscaled Titanic features; a higher cap lets the lbfgs solver converge.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

from sklearn.metrics import confusion_matrix
# FIX: print the matrix explicitly — a bare expression only displays inside
# a notebook cell, not when run as a script.
print(confusion_matrix(y_test, predictions))