Day 10_DecisionTree_With_Preprocessing

2019. 7. 17. 16:27
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""Day 11_decision_tree.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1ybNWDQmSG1mzPaSpZOk8uTdZejQKcRAu
"""
 
# Mount Google Drive inside the Colab runtime at /gdrive so the files stored
# there can be read and written through the normal filesystem.
from google.colab import drive
drive.mount('/gdrive')
 
# Directory prefix that the read_csv / to_csv calls below prepend to the
# dataset file names.
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"
 
# %matplotlib inline
 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
 
 
from scipy.stats import norm, multivariate_normal
# installing packages for interactive graphs
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
 
##############################################
################ Helper Function##############
##############################################
 
def my_df_dropNas(df, columns):
    """Return ``df`` without the rows that are missing a value in ``columns``.

    Args:
        df: DataFrame to filter. It is not modified.
        columns: Column label, or iterable of column labels, that must be
            non-missing for a row to be kept.

    Returns:
        A DataFrame holding only the rows whose value is present in every
        listed column. With no columns listed, all rows are kept.

    Raises:
        KeyError: If a listed column does not exist in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    for col in columns:
        df = df[df[col].notna()]
    # The filtered frame only exists as a local rebinding, so it has to be
    # handed back to the caller explicitly.
    return df
        
def my_checkNas(x):
    """Count the missing values in each column of a DataFrame.

    Args:
        x: DataFrame to inspect.

    Returns:
        A Series indexed by the column labels of ``x`` whose values are the
        number of missing entries in each column.
    """
    # Vectorised count; avoids running Python's builtin ``sum`` element by
    # element over every column.
    return x.isna().sum()
 
@interact_manual(x=IntSlider(value=0, min=0, max=12))
def test_model(x):
    """Print the integer handed over by the ``interact_manual`` slider.

    Args:
        x: Value taken from the slider (range 0 to 12, initially 0).
    """
    print(x)
 
# Load the Titanic training and test sets once; nothing modifies them before
# the feature engineering starts, so a second read is unnecessary.
train = pd.read_csv(PATH + "titanic/train.csv")
test = pd.read_csv(PATH + "titanic/test.csv")

##############################################
####### Mapping the strings in a Series ######
##############################################

mapping = { "Mr":0 , "Miss":1, "Mrs":2 }
# Demonstrated on a standalone Series: at this point in the script no frame
# named `data` (nor a 'title' column) exists yet, so mapping that column here
# would raise NameError.
titles = pd.Series(list(mapping))
titles.map(mapping)

train.describe()
my_checkNas(train)

# Derived variables may also be needed at prediction time, so every new column
# is created on the test data in exactly the same way as on the training data.
datas = [train, test]
 
# Fill missing ages with the mean age, keeping the raw 'Age' column untouched.
# NOTE(review): each frame is imputed with its own mean, so the test set does
# not use the training mean - confirm this is intended.
for data in datas:
    data['Age_mean'] = data['Age'].fillna(data['Age'].mean())

train['Sex'].value_counts()

# Boolean flag that is True for female passengers.
for data in datas:
    data['Gender'] = data['Sex'] == 'female'

my_checkNas(test)

# Stand-in for one-hot encoding: one boolean column per embarkation port.
for data in datas:
    for port in ('S', 'C', 'Q'):
        data['Embarked_' + port] = data['Embarked'] == port

# Family size, counting the passenger themself.
for data in datas:
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

for data in datas:
    print(data['FamilySize'].value_counts())

# Bucket the family size: 'S' = alone (1), 'M' = 2 to 4, 'L' = 5 or more.
# Each frame is bucketed from its OWN FamilySize column; seeding the column
# from train['FamilySize'] would push index-aligned training values into the
# test frame. The bins are right-closed: (0, 1], (1, 4], (4, inf).
for data in datas:
    data['Family'] = pd.cut(
        data['FamilySize'],
        bins=[0, 1, 4, float('inf')],
        labels=['S', 'M', 'L'],
    ).astype(object)  # plain strings rather than a categorical column

# One boolean column per family-size bucket.
for data in datas:
    for bucket in ('S', 'M', 'L'):
        data['Family_' + bucket] = data['Family'] == bucket

for data in datas:
    data['Pclass'] = data['Pclass'].astype('category')
 
# Modelling (decision tree)
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Columns used as model inputs.
fn = [
    'Gender',
    'Age_mean',
    'Embarked_S',
    'Embarked_C',
    'Embarked_Q',
    'Family_S',
    'Family_M',
    'Family_L',
]

X_train, Y_train = train[fn], train['Survived']
X_test = test[fn]

# Depth-3 tree fitted on the full training set; this is the model used for
# the final predictions further down.
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, Y_train)

# random_state is the seed applied when the rows are shuffled into folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=2019)
kfold

# cv takes the cross-validation generator built above.
score = cross_val_score(
    model,
    X_train,
    Y_train,
    cv=kfold,
    scoring='accuracy',
)
score.mean()
 
# NOTE(review): defining this function rebinds the module-level name `test`,
# which until here referred to the test DataFrame loaded earlier.
@interact(x=IntSlider(value=3, min=3, max=100))
def test(x):
    """Print the mean cross-validated accuracy of a decision tree of depth ``x``.

    Args:
        x: ``max_depth`` for the tree, chosen on the slider (3 to 100). The
            slider now starts at 3; the original initial value of 0 lay below
            the slider's own minimum.
    """
    # cross_val_score fits its own copy of the estimator on every fold, so no
    # separate fit on the full training set is needed here.
    candidate = DecisionTreeClassifier(max_depth=x)

    # random_state is the seed applied when the rows are shuffled into folds.
    kfold = KFold(n_splits=100, shuffle=True, random_state=2019)
    score = cross_val_score(candidate, X_train, Y_train, cv=kfold, scoring='accuracy')
    print(score.mean())
 
# Predict with the module-level depth-3 model fitted earlier.
result = model.predict( X_test )

# The name `test` no longer refers to the test DataFrame at this point: the
# interactive function `test(x)` defined above has rebound it. The frame is
# still reachable as the second entry of `datas`.
test_df = datas[1]
test_df['Survived'] = result.copy()

# Submission file: passenger id plus predicted label, without the index column.
submit = test_df[['PassengerId','Survived']]
submit.to_csv(PATH + "titanic/submit.csv", index = False )
 
 
Colored by Color Scripter
cs
저작자표시 (새창열림)
'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

Day 12_RandomForeset_MushroomDataset (0)	2019.07.19
Day 11_Web Scraping (0)	2019.07.17
Day 09_Advanced Group_Agg_Apply (0)	2019.07.12
Day 09_TimeSeries (0)	2019.07.12
Day 09_bitly_Json_Dictionary_List_Advenced (0)	2019.07.12
Software knowledge worth spreading

Day 10_DecisionTree_With_Preprocessing

'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바