
[2-4] Titanic Survivor Prediction

by 꿀먹은데이터 2022. 1. 5.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Titanic data: http://www.kaggle.com/titanic/data

titanic_df=pd.read_csv('C:/Users/dudtj/OneDrive - 숭실대학교 - Soongsil University/Desktop/CL/python/파이썬 머신러닝 완벽가이드/titanic_train.csv')
titanic_df.head(3)

print(titanic_df.info())

print(titanic_df.isnull().sum())

The output shows that Age, Cabin, and Embarked contain null values, so we fill them in: Age with its mean, and Cabin and Embarked with 'N'.

titanic_df['Age'].fillna(titanic_df['Age'].mean(),inplace=True)
titanic_df['Cabin'].fillna('N',inplace=True)
titanic_df['Embarked'].fillna('N',inplace=True)
print(titanic_df.isnull().sum())

print('Sex value counts')
print(titanic_df['Sex'].value_counts())
print('Cabin value counts')
print(titanic_df['Cabin'].value_counts())
print('Embarked value counts')
print(titanic_df['Embarked'].value_counts())

For Cabin, 'N' is by far the most common value (687 rows) and the raw values are messy, so only the first letter of each Cabin value is kept.

titanic_df['Cabin']=titanic_df['Cabin'].str[:1]
print(titanic_df['Cabin'].head(3))

titanic_df.groupby(['Sex','Survived'])['Survived'].count()

Visualization

Survival rate by sex

sns.barplot(x='Sex',y='Survived',data=titanic_df)

Survival rate by passenger class

sns.barplot(x='Pclass',y='Survived',hue='Sex',data=titanic_df)

Survival rate by age group and sex

#Return an age-category label for the input age
def get_category(age):
    cat=''
    if age <= -1 : cat='Unknown'      # treat missing/invalid ages as Unknown
    elif age <=5 : cat='Baby'
    elif age <=12 : cat='Child'
    elif age <=18 : cat='Teenager'
    elif age <=25 : cat='Student'
    elif age <=35 : cat='Young Adult'
    elif age <=60 : cat='Adult'
    else : cat='Elderly'

    return cat
plt.figure(figsize=(10,6))
group_names= ['Unknown','Baby','Child','Teenager','Student','Young Adult','Adult','Elderly']

#Create an Age_cat column by applying get_category() to each Age value
titanic_df['Age_cat']=titanic_df['Age'].apply(lambda x :get_category(x))
sns.barplot(x='Age_cat',y='Survived',hue='Sex',data=titanic_df,order=group_names)
titanic_df.drop('Age_cat',axis=1,inplace=True)

from sklearn import preprocessing

def encode_features(dataDF):
    features=['Cabin','Sex','Embarked']
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(dataDF[feature])
        dataDF[feature]=le.transform(dataDF[feature])
    return dataDF

titanic_df = encode_features(titanic_df)
titanic_df.head()
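
As a quick sanity check (my own addition, not in the original post), you can inspect the mapping a LabelEncoder learns for one of these columns:

from sklearn import preprocessing

# A minimal sketch: see how LabelEncoder maps the original Sex values.
le = preprocessing.LabelEncoder()
le.fit(['female','male'])          # the two Sex values in this dataset
print(le.classes_)                 # ['female' 'male'] -> encoded as 0 and 1
print(le.transform(['male']))      # [1]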

from sklearn.preprocessing import LabelEncoder
def fillna(df):
    df['Age'].fillna(df['Age'].mean(),inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N',inplace=True)
    df['Fare'].fillna(0,inplace=True)
    return df

def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'],axis=1,inplace=True)
    return df

def format_features(df):
    df['Cabin']=df['Cabin'].str[:1]
    features=['Cabin','Sex','Embarked']
    for feature in features:
        le=LabelEncoder()
        le=le.fit(df[feature])
        df[feature]=le.transform(df[feature])
    return df

def transform_features(df):
    df=fillna(df)
    df=drop_features(df)
    df=format_features(df)
    return df
titanic_df=pd.read_csv('C:/Users/dudtj/OneDrive - 숭실대학교 - Soongsil University/Desktop/CL/python/파이썬 머신러닝 완벽가이드/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)

X_titanic_df = transform_features(X_titanic_df)

Splitting into training and test sets

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_titanic_df,y_titanic_df,test_size=0.2,random_state=11)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()
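# Note (an assumption, not from the original post): depending on your scikit-learn
# version, LogisticRegression may emit a ConvergenceWarning on this data;
# passing solver='liblinear' or a larger max_iter is a common fix.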

#DecisionTreeClassifier: train / predict / evaluate
dt_clf.fit(X_train,y_train)
dt_pred = dt_clf.predict(X_test)
print("DecisionTreeClassifier accuracy:",accuracy_score(y_test,dt_pred))

#RandomForestClassifier: train / predict / evaluate
rf_clf.fit(X_train,y_train)
rf_pred = rf_clf.predict(X_test)
print("RandomForestClassifier accuracy:",accuracy_score(y_test,rf_pred))

#LogisticRegression: train / predict / evaluate
lr_clf.fit(X_train,y_train)
lr_pred = lr_clf.predict(X_test)
print("LogisticRegression accuracy:",accuracy_score(y_test,lr_pred))

LogisticRegression shows higher accuracy than the other algorithms here. However, no optimization has been done and the dataset is small, so we cannot conclude that it is the best-performing model.

Cross-validation

from sklearn.model_selection import KFold
def exec_kfold(clf,folds=5):
    kfold=KFold(n_splits=folds)
    scores=[]

    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_titanic_df)):
        X_train,X_test = X_titanic_df.values[train_index],X_titanic_df.values[test_index]
        y_train,y_test = y_titanic_df.values[train_index],y_titanic_df.values[test_index]
        #Train the classifier, predict, and compute the accuracy
        clf.fit(X_train,y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print("Cross-validation accuracy:",iter_count,accuracy)

    mean_score=np.mean(scores)
    print("Mean accuracy:",mean_score)
exec_kfold(dt_clf,folds=5)

Mean accuracy: approximately 0.78
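
Note that exec_kfold uses plain KFold. Since the Survived classes are imbalanced, a stratified split is often preferred for classification; below is a minimal sketch of the same loop with StratifiedKFold (my own variation, not in the original post):

from sklearn.model_selection import StratifiedKFold

# StratifiedKFold keeps the Survived class ratio the same in every fold.
# Unlike KFold, split() takes the labels as well as the features.
skfold = StratifiedKFold(n_splits=5)
scores = []
for train_index, test_index in skfold.split(X_titanic_df, y_titanic_df):
    X_tr, X_te = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
    y_tr, y_te = y_titanic_df.values[train_index], y_titanic_df.values[test_index]
    dt_clf.fit(X_tr, y_tr)
    scores.append(accuracy_score(y_te, dt_clf.predict(X_te)))
print("Mean accuracy (stratified):", np.mean(scores))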

from sklearn.model_selection import cross_val_score

scores=cross_val_score(dt_clf,X_titanic_df,y_titanic_df,cv=5)
for iter_count,accuracy in enumerate(scores):
    print("Cross-validation accuracy:",iter_count,accuracy)

print("Mean accuracy:",np.mean(scores))

from sklearn.model_selection import GridSearchCV
parameters= {'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5],'min_samples_leaf':[1,5,8]}

grid_dclf=GridSearchCV(dt_clf,param_grid=parameters,scoring='accuracy',cv=5) 
grid_dclf.fit(X_train,y_train)

print("GridSearchCV 최적 하이퍼 파라미터:",grid_dclf.best_params_)
print("GridSearchCV 최고 정확도:",grid_dclf.best_score_)
best_dclf = grid_dclf.best_estimator_

dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test,dpredictions)
print("테스트 세트에서의 DecisionTreeClassifier 정확도 :",accuracy)

After retraining the DecisionTreeClassifier with the optimized hyperparameters, prediction accuracy on the test set rose to 87.15%.
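
If you want to see how every parameter combination fared, not just the best one, GridSearchCV exposes the full results through cv_results_ (my own addition, not in the original post):

# cv_results_ records the mean CV score for each parameter combination tried.
cv_results = pd.DataFrame(grid_dclf.cv_results_)
print(cv_results[['params','mean_test_score','rank_test_score']]
      .sort_values('rank_test_score').head())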
