# Import pandas for DataFrame support and the Colab helper used to mount Google Drive.
import pandas as pd
from google.colab import drive

# Mount Google Drive before reading: the spreadsheet is stored under
# /content/drive, which is only available once the mount has been performed.
drive.mount('/content/drive')
df_train = pd.read_excel('/content/drive/My Drive/Colab Notebooks/data/titanic.xls')
# Fill missing ages with the mean of the recorded ages.
df_train['age'] = df_train['age'].fillna(df_train['age'].mean())

# Remove the columns that are not used as a feature or as the target below, in
# a single in-place call instead of one call per column. If any listed column
# is missing, this raises KeyError before anything has been removed.
df_train.drop(
    columns=['cabin', 'embarked', 'boat', 'name', 'ticket', 'body', 'home.dest'],
    inplace=True,
)

# Treat a missing fare as 0.
df_train['fare'] = df_train['fare'].fillna(0)

# Convert the categorical sex column to numeric codes (female -> 0, male -> 1).
# Series.map turns any value that is not a key of this mapping into NaN.
df_train['sex'] = df_train['sex'].map({'female': 0, 'male': 1})

# Print column dtypes and non-null counts to inspect the cleaned frame.
df_train.info()

# Feature matrix and target; the double brackets keep df_y a one-column DataFrame.
df_X = df_train[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
df_y = df_train[['survived']]
# Create the training data: hold out 10% of the rows as a test set.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.1, random_state=0
)
# Bare expression: displayed as the cell output in a notebook; it has no
# effect when the file is run as a plain script.
X_train