Day 13_bikeShare_DataSet

2019. 7. 25. 12:44
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# -*- coding: utf-8 -*-
"""Day 15_bike_shareDataSet.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/138H-v8iXlgduQAZkJg0wVrcl6X6oLqi0
"""
 
# Colab-only setup: mount Google Drive so the files below PATH can be read.
from google.colab import drive
drive.mount('/gdrive')
 
# Base directory prepended to every data file path used in this script.
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"
 
# %matplotlib inline
 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
from scipy import stats
 
 
from scipy.stats import norm, multivariate_normal
# installing packages for interactive graphs
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
 
##############################################
################ Helper Function##############
##############################################
 
def my_df_dropNas(df, columns):
    """Return the rows of ``df`` that have no missing value in ``columns``.

    Args:
        df: DataFrame to filter; it is not modified.
        columns: Iterable of column labels that must all be non-null.

    Returns:
        The filtered DataFrame (``df`` itself when ``columns`` is empty).
    """
    kept = df
    for label in columns:
        # Narrow the frame one column at a time, exactly like chained filters.
        has_value = kept[label].notna()
        kept = kept[has_value]
    return kept
        
def my_checkNas(x):
    """Count the missing values in every column of a DataFrame.

    Args:
        x: DataFrame to inspect.

    Returns:
        A Series indexed by column label holding the number of NaN/None
        entries found in that column.
    """
    # Use the vectorized Series.sum() instead of the builtin sum(), which would
    # iterate over each column element by element in Python.
    return x.apply(lambda column: column.isna().sum())
 
@interact_manual(x=IntSlider(value=0, min=0, max=12))
def test_model(x):
    """Print the value chosen on the slider (smoke test for the widgets)."""
    print(x)
 
## Load the training data for the exploratory plots.
## 'datetime' has to be parsed while reading: the .dt accessor used below only
## exists on datetime64 columns and raises AttributeError on plain strings.
df = pd.read_csv(PATH + "bike-sharing-demand/train.csv", parse_dates=['datetime'])

df.columns

## Split the timestamp into calendar parts so each one can be plotted on its own.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part] = getattr(df['datetime'].dt, part)
 
## plt.subplots returns the figure plus one sequence per row, each holding one
## subplot object per column.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
fig.set_size_inches(18, 8)

## Every plot must be given its axes explicitly; without `ax` seaborn draws on
## the most recently created subplot.
barSpecs = [
    (ax1, 'year', {}),
    (ax2, 'month', {'hue': 'year'}),
    (ax3, 'day', {}),
    (ax4, 'hour', {}),
    (ax5, 'minute', {}),
    (ax6, 'second', {}),
]
for axis, column, extra in barSpecs:
    sns.barplot(data=df, x=column, y='count', ax=axis, **extra)

ax1.set(title="count by years")
 
## Box plots of the rental count against four grouping columns (2 x 2 grid).
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(12, 10)

## NOTE(review): 'season' is plotted twice (top-left and bottom-right); the
## second one may have been meant for another column - confirm with the author.
boxColumns = ("season", "hour", "workingday", "season")
for axis, column in zip(axes.flat, boxColumns):  # axes.flat walks the grid row by row
    sns.boxplot(data=df, x=column, y='count', orient='v', ax=axis)
 
## dayofweek holds the numbers 0-6: 0 = Monday, 6 = Sunday.
## It lets us look at the rentals per weekday.
df['dayofweek'] = df['datetime'].dt.dayofweek

## Hourly rental curves, one subplot per grouping:
##   1. overall, by hour
##   2. working day or not   ( 0, 1 )
##   3. day of the week      ( 0 ~ 6 )
##   4. weather
##   5. season               ( 1 ~ 4 )
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows=5)
fig.set_size_inches(18, 25)

## pointplot draws the points and connects them with lines.
hueBySubplot = (
    (ax1, None),
    (ax2, 'workingday'),
    (ax3, 'dayofweek'),
    (ax4, 'weather'),
    (ax5, 'season'),
)
for axis, grouping in hueBySubplot:
    sns.pointplot(data=df, x='hour', y='count', hue=grouping, ax=axis)
 
df.columns

## Pairwise correlations between the numeric weather / rental columns.
corrMat = df[['temp', 'atemp', 'casual', 'registered', 'humidity', 'windspeed', 'count']].corr()

## Hide the strictly-upper triangle (the matrix is symmetric) and keep the
## diagonal visible. The mask is an explicit boolean array instead of the
## correlation values themselves, so a correlation that happens to be exactly
## 0 in the upper triangle is hidden like every other cell there.
mask = np.triu(np.ones(corrMat.shape, dtype=bool), k=1)
sns.heatmap(corrMat, square=True, annot=True, mask=mask)
 
## Scatter plot with a fitted regression line of count against each weather measure.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
fig.set_size_inches(12, 5)
for axis, measure in zip((ax1, ax2, ax3), ("temp", "windspeed", "humidity")):
    sns.regplot(data=df, x=measure, y='count', ax=axis)
 
## Extract only the year and the month.
 
def cym(dtime):
    """Format a datetime-like value as ``"<year>-<month>"`` (month not zero-padded)."""
    year, month = dtime.year, dtime.month
    return "{}-{}".format(year, month)
    
 
## Label every row with its "<year>-<month>" bucket.
df['year_month'] = df['datetime'].apply(cym)
df['year_month']

## Mean count per year and per month, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches(18, 4)
for axis, period in ((ax1, 'year'), (ax2, 'month')):
    sns.barplot(data=df, x=period, y='count', ax=axis)

## Mean count per year-month bucket on a separate, wide figure.
fig, ax3 = plt.subplots()
fig.set_size_inches(18, 4)
sns.barplot(data=df, x='year_month', y='count', ax=ax3)
 
## Centering: absolute distance of every count from the mean count.
countDeviation = np.abs(df['count'] - df['count'].mean())

## Rows further away than this (three standard deviations) are assumed to be outliers.
outlierCutoff = 3 * df['count'].std()

df_noOutliers = df[countDeviation <= outlierCutoff]

df_noOutliers['count'].mean()

## Distribution of the count before and after dropping the outliers.
sns.distplot(df['count'])

sns.distplot(df_noOutliers['count'])

## It is desirable for the dependent variable to follow a normal distribution,
## so also look at the log-transformed count.
sns.distplot(np.log(df_noOutliers['count']))
 
## Random-forest based prediction.
train = pd.read_csv(PATH + "bike-sharing-demand/train.csv", parse_dates=['datetime'])
test = pd.read_csv(PATH + "bike-sharing-demand/test.csv", parse_dates=['datetime'])

train.info()

## Add the same calendar columns to both frames, in the same order.
datetimeParts = ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek')
for frame in (train, test):
    for part in datetimeParts:
        frame[part] = getattr(frame['datetime'].dt, part)
 
## How often each wind speed value occurs in train (top) and test (bottom).
fig, axes = plt.subplots(nrows=2)
fig.set_size_inches(18, 10)
for axis, frame in zip(axes, (train, test)):
    sns.countplot(data=frame, x='windspeed', ax=axis)

## A very large number of rows have a wind speed of 0.
## train.loc[ train['windspeed'] == 0 , 'windspeed'] = <replacement value>

## Split the rows into wind speed 0 and wind speed not 0.
isCalm = train['windspeed'] == 0
trainWind0 = train[isCalm]
trainwindNot0 = train[~isCalm]

print(trainWind0.shape, trainwindNot0.shape)
## Treat the zero wind speeds as missing values and fill them in by prediction.
from sklearn.ensemble import RandomForestClassifier
 
def predict_windspeed(data):
    """Replace wind speeds of 0 with values predicted by a random forest.

    Rows whose ``windspeed`` is 0 are treated as missing. A
    ``RandomForestClassifier`` is fitted on the remaining rows, using each
    distinct wind speed (as a string) as a class label, and its predictions
    overwrite the zeros.

    Args:
        data: DataFrame containing ``windspeed`` and the feature columns
            ``season``, ``weather``, ``humidity``, ``month``, ``temp``,
            ``year`` and ``atemp``. It is not modified.

    Returns:
        A new DataFrame with the non-zero rows first, followed by the imputed
        rows. ``windspeed`` is float, the index is reset to 0..n-1 and the
        previous index is kept in an ``index`` column. Because the rows are
        reordered, the result is no longer positionally aligned with ``data``.
        When there are no zero rows, or no non-zero rows to learn from, the
        wind speeds are returned unchanged.
    """
    wCol = ['season', 'weather', 'humidity', 'month', 'temp', 'year', 'atemp']

    isZero = data['windspeed'] == 0
    # Work on explicit copies: assigning into a filtered slice triggers
    # SettingWithCopyWarning and is not guaranteed to take effect.
    dataWind0 = data[isZero].copy()
    dataWindNot0 = data[~isZero].copy()

    # Only impute when there is something to predict and something to learn
    # from; fitting or predicting on an empty frame raises.
    if not dataWind0.empty and not dataWindNot0.empty:
        rfModel_wind = RandomForestClassifier()
        # The classifier needs discrete labels, so each observed wind speed is
        # turned into a string class.
        rfModel_wind.fit(dataWindNot0[wCol], dataWindNot0['windspeed'].astype('str'))
        wind0Values = rfModel_wind.predict(X=dataWind0[wCol])
        dataWind0['windspeed'] = wind0Values.astype('float')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    parts = [part for part in (dataWindNot0, dataWind0) if not part.empty]
    result = pd.concat(parts) if parts else data.copy()
    result['windspeed'] = result['windspeed'].astype('float')

    # Keeps the old index as an 'index' column.
    result.reset_index(inplace=True)

    return result
 
## Regressor => predicts continuous values.
## Fill in the zero wind speeds of both data sets.
train2 = predict_windspeed(train)
print("=" * 50)
test2 = predict_windspeed(test)

## Wind speed counts after the imputation: train on top, test below.
fig, axes = plt.subplots(nrows=2)
fig.set_size_inches(18, 10)

for axis, imputed in zip(axes, (train2, test2)):
    sns.countplot(data=imputed, x='windspeed', ax=axis)
 
## Feature selection

## Separate the signal from the noise.
## Having more features does not by itself guarantee better performance.
## Add (or change) features one at a time and drop those that hurt performance.

## categorical feature name
cfn = ['season', 'holiday', 'workingday', 'weather', 'dayofweek', 'year', 'month', 'hour']
train.columns

## The pandas dtype is spelled 'category'; the misspelled name raised TypeError.
for var in cfn:
    train[var] = train[var].astype('category')
    test[var] = test[var].astype('category')
 
from sklearn.ensemble import RandomForestClassifier

## Columns the model is trained on and predicts from.
featureCols = ['season', 'weather', 'humidity', 'month', 'temp', 'year', 'atemp']

## n_jobs => number of cores to use.
## NOTE(review): 'count' is a numeric target but a classifier is used here, so
## every distinct count becomes its own class - confirm this is intended.
## NOTE(review): train2/test2 are built before the category casts on
## train/test, so those casts do not reach this model.
forestParams = dict(random_state=42, n_jobs=-1, n_estimators=500, max_leaf_nodes=16)
model = RandomForestClassifier(**forestParams)
model

## Prediction
model.fit(train2[featureCols], train2['count'])
res = model.predict(X=test2[featureCols])
 
## Write the submission once the predictions are ready.

## predict_windspeed() returns the non-zero-wind rows first and the imputed rows
## after them, so `res` follows the row order of test2, not the order of
## sampleSubmission.csv. Assigning it by position would pair predictions with
## the wrong timestamps, so key every prediction by its timestamp instead.
## Assumes each datetime occurs once in test2 (map raises otherwise) - TODO confirm.
predictionByTime = pd.Series(res, index=pd.DatetimeIndex(test2['datetime']))

df = pd.read_csv(PATH + "bike-sharing-demand/sampleSubmission.csv", parse_dates=['datetime'])

df.shape

res.shape

df['count'] = df['datetime'].map(predictionByTime)
if df['count'].isna().any():
    # Fail loudly rather than write a file with empty predictions.
    raise ValueError("sampleSubmission.csv has datetimes with no matching prediction")

df[:10]

## date_format keeps the timestamps in the "YYYY-MM-DD HH:MM:SS" form.
df.to_csv(PATH + "bike-sharing-demand/submit.csv", index=False, date_format='%Y-%m-%d %H:%M:%S')
 
 
Colored by Color Scripter
cs
저작자표시 (새창열림)
'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

Day 15_Model_Evaluation_diabetesDataset (0)	2019.07.31
Day 14_house_price (0)	2019.07.30
Day 12_RandomForeset_MushroomDataset (0)	2019.07.19
Day 11_Web Scraping (0)	2019.07.17
Day 10_DecisionTree_With_Preprocessing (0)	2019.07.17
Software knowledge worth spreading

Day 13_bikeShare_DataSet

'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바