# -*- coding: utf-8 -*-
"""Day 11_decision_tree.ipynb

Automatically generated by Colaboratory.

Original file is located at
"""

from google.colab import drive

drive.mount('/gdrive')

# Base folder (on the mounted Drive) that holds the course resources.
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

# %matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
from scipy.stats import norm, multivariate_normal

# Packages for interactive graphs.
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider


##############################################
############### Helper functions #############
##############################################

def my_df_dropNas(df, columns):
    """Return ``df`` restricted to rows with no missing value in ``columns``.

    Args:
        df: DataFrame to filter. It is not modified; a filtered frame is
            returned instead.
        columns: Iterable of column labels that must all be non-null.

    Returns:
        The filtered DataFrame. (The original version built this frame but
        never returned it, so every caller received ``None``.)
    """
    for col in columns:
        df = df[df[col].notna()]
    return df


def my_checkNas(x):
    """Return the number of missing values in each column of DataFrame ``x``."""
    return x.isna().sum()


@interact_manual(x=IntSlider(value=0, min=0, max=12))
def test_model(x):
    """Widget smoke test: print the slider value when the button is pressed."""
    print(x)


train = pd.read_csv(PATH + "titanic/train.csv")
test = pd.read_csv(PATH + "titanic/test.csv")

##############################################
##### Mapping string values in a Series ######
##############################################
mapping = {"Mr": 0, "Miss": 1, "Mrs": 2}
# Usage, once a frame actually has a 'title' column:
#     frame['title'].map(mapping)
# The original executed this on an unbound name (`data`) and a column that is
# never created in this script, which raises NameError when run top to bottom.

print(train.describe())
print(my_checkNas(train))

# Derived features may also be needed at prediction time, so every derived
# column is created on the test data in exactly the same way as on train.
datas = [train, test]
# Fill missing ages with the mean age of the same frame; the raw 'Age'
# column is left untouched.
for data in datas:
    data['Age_mean'] = data['Age'].fillna(data['Age'].mean())

print(train['Sex'].value_counts())

# Boolean gender flag: True for female passengers.
for data in datas:
    data['Gender'] = data['Sex'] == 'female'

print(my_checkNas(test))

# Hand-rolled replacement for one-hot encoding of the embarkation port.
for data in datas:
    data['Embarked_S'] = data['Embarked'] == "S"
    data['Embarked_C'] = data['Embarked'] == "C"
    data['Embarked_Q'] = data['Embarked'] == "Q"

# Family size, counting the passenger themselves.
for data in datas:
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

for data in datas:
    print(data['FamilySize'].value_counts())

# Bucket family size: S = alone (1), M = 2-4 members, L = 5 or more.
# pd.cut bins are right-inclusive: (0, 1] -> S, (1, 4] -> M, (4, inf) -> L.
# The original seeded 'Family' from train['FamilySize'] (even for the test
# frame) and then overwrote it with .loc, mixing ints and strings in one
# column; each frame is now bucketed from its own FamilySize.
for data in datas:
    data['Family'] = pd.cut(
        data['FamilySize'],
        bins=[0, 1, 4, float('inf')],
        labels=['S', 'M', 'L'],
    )

for data in datas:
    data['Family_S'] = data['Family'] == 'S'
    data['Family_M'] = data['Family'] == 'M'
    data['Family_L'] = data['Family'] == 'L'

for data in datas:
    data['Pclass'] = data['Pclass'].astype('category')

# Feature columns fed to the model.
fn = [
    'Gender', 'Age_mean',
    'Embarked_S', 'Embarked_C', 'Embarked_Q',
    'Family_S', 'Family_M', 'Family_L',
]

X_train = train[fn]
Y_train = train['Survived']
X_test = test[fn]

# Modeling (decision tree).
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, Y_train)

from sklearn.model_selection import KFold, cross_val_score

# random_state is the seed used when shuffling the folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=2019)

# cv accepts a cross-validation generator such as the KFold above.
score = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
print(score.mean())


# NOTE: this callback was originally named `test`, which rebound the module
# level name of the test DataFrame to a function and made the submission code
# below fail on `test['Survived'] = ...`. It is renamed to keep `test` intact.
# The slider originally passed value=0 with min=3; the start value is now
# stated explicitly at the minimum.
@interact(x=IntSlider(value=3, min=3, max=100))
def test_depth(x):
    """Print the mean 100-fold CV accuracy of a tree with ``max_depth=x``."""
    candidate = DecisionTreeClassifier(max_depth=x)
    # cross_val_score fits its own clones of the estimator, so no explicit
    # fit is needed here.
    folds = KFold(n_splits=100, shuffle=True, random_state=2019)
    fold_scores = cross_val_score(
        candidate, X_train, Y_train, cv=folds, scoring='accuracy'
    )
    print(fold_scores.mean())


# Predict with the max_depth=3 model fitted above and write the submission.
result = model.predict(X_test)
test['Survived'] = result.copy()

submit = test[['PassengerId', 'Survived']]
submit.to_csv(PATH + "titanic/submit.csv", index=False)
'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글
Day 12_RandomForest_MushroomDataset (0) | 2019.07.19 |
---|---|
Day 11_Web Scraping (0) | 2019.07.17 |
Day 09_Advanced Group_Agg_Apply (0) | 2019.07.12 |
Day 09_TimeSeries (0) | 2019.07.12 |
Day 09_bitly_Json_Dictionary_List_Advanced (0) | 2019.07.12 |