Day 16_PCA_HR_DataSet

2019. 8. 1. 15:40
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# -*- coding: utf-8 -*-
"""Day 22_PCA.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1e7UxuOQUHJfM4itcHDfOmyqhOFUO79B4
"""
 
# Mount Google Drive inside the Colab runtime at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
 
# Base directory for this notebook's resources; the CSV path used further
# down in the script is built by appending to it.
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"
 
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
 
# Print the TensorFlow version available in the runtime.
print(tf.__version__)
 
# Toy data set: calorie intake, eating habits and exercise habits, together
# with the resulting body shape.
#
# The frame is built from all records in a single call. Growing an empty
# frame row by row with ``df.loc[i] = [...]`` copies the data on every insert
# and can leave the numeric columns with ``object`` dtype.
df = pd.DataFrame(
    [
        [1200, 1, 0, 0, 2, 'Skinny'],
        [2800, 1, 1, 1, 1, 'Normal'],
        [3500, 2, 2, 1, 0, 'Fat'],
        [1400, 0, 1, 0, 3, 'Skinny'],
        [5000, 2, 2, 2, 0, 'Fat'],
        [1300, 0, 0, 1, 2, 'Skinny'],
        [3000, 1, 0, 1, 1, 'Normal'],
        [4000, 2, 2, 2, 0, 'Fat'],
        [2600, 0, 2, 0, 0, 'Normal'],
        [3000, 1, 2, 1, 1, 'Fat'],
    ],
    columns=['calory', 'breakfast', 'lunch', 'dinner', 'exercise', 'body_shape'],
)
 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Separate the class label from the five numeric feature columns.
Y = df['body_shape']
X = df[['calory', 'breakfast', 'lunch', 'dinner', 'exercise']]

# Standardise the features with scikit-learn's StandardScaler and show
# the scaled array.
x_std = StandardScaler().fit(X).transform(X)
print(x_std)
 
## Covariance matrix.
## The features live in the columns of x_std; transpose so that each feature
## becomes a row before handing the data to np.cov.
features = x_std.T
features

cov_matrix = np.cov(features)
cov_matrix

## Extract the eigenvectors and eigenvalues of the covariance matrix.
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)

print(f'고유벡터 {eig_vecs}')
print(f'고유 값 {eig_vals} ')

# Each eigenvalue as a fraction of the eigenvalue total (the original author
# notes that the leading component represents about 73% of the information).
eig_vals / sum(eig_vals)
 
## Project the standardised data onto the principal axis to obtain a single
## column (the original author notes it holds about 73% of the information).
# np.linalg.eig does not return eigenvalues in any guaranteed order, so pick
# the eigenvector that belongs to the largest eigenvalue explicitly instead
# of assuming it sits at index 0.
principal_axis = eig_vecs[:, np.argmax(eig_vals)]
projected_X = x_std.dot(principal_axis)

res = pd.DataFrame(projected_X, columns=['PC1'])

# Constant y coordinate so the one-dimensional projection can be drawn as a
# scatter along a single horizontal line.
res['yaxis'] = 0.0
res['label'] = Y
res

import seaborn as sns
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.lmplot(x='PC1', y='yaxis', data=res, hue='label')
 
# Load the HR data set from the resources directory on the mounted drive.
df = pd.read_csv(PATH + "data/HR_comma_sep.csv")

columns = df.columns.tolist()

df[:2]

# Correlate the numeric columns only. pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead;
# select_dtypes keeps the result identical on older versions as well.
correlation = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 10))
sns.heatmap(correlation, annot=True, cmap='cubehelix')

df['sales'].unique()
 
## Per-department total of each column.
# Stored under its own name: the original bound it to ``sales`` and then
# immediately overwrote it with the dict built below.
# numeric_only=True keeps text columns out of the aggregation.
sales_totals = df.groupby('sales').sum(numeric_only=True)

## Per-department mean of each column (numeric columns only; taking the mean
## of a text column raises on current pandas).
groupby_sales = df.groupby('sales').mean(numeric_only=True)
groupby_sales

# Map each department name (as str) to its mean satisfaction level.
# The original loop was missing the colon after the ``for`` header, which is
# a SyntaxError.
sales = {
    str(dept): level
    for dept, level in groupby_sales['satisfaction_level'].items()
}

sales
 
## Re-ordering columns.
df.head()

# Remove the 'sales' and 'salary' columns.
df_drop = df.drop(columns=['sales', 'salary'])
df_drop

## Goal: move the 'left' column to the very front.
cols = list(df_drop.columns)
cols

## Take 'left' out of its current position and put it at index 0.
## (list.remove raises ValueError if the column is absent.)
cols.remove('left')
cols.insert(0, 'left')

## Re-index the frame using the column order built above.
df_drop = df_drop.reindex(columns=cols)
df_drop
 
# Columns 1..7 are the features; column 0 is the target ('left' was moved to
# the front of df_drop earlier in the script).
x = df_drop.iloc[:, 1:8].values
y = df_drop.iloc[:, 0].values

np.shape(x), np.shape(y)

X_std = StandardScaler().fit_transform(x)

# Covariance matrix, computed by hand.
mean_vec = np.mean(X_std, axis=0)
mean_vec
# The divisor is (n - 1). Without the parentheses the expression divided by n
# and then subtracted 1 from every entry of the matrix.
centered = X_std - mean_vec
cov_mat = centered.T.dot(centered) / (X_std.shape[0] - 1)

cov_mat

## Note: np.cov is given the transpose so that each feature is a row.
cov_mat = np.cov(X_std.T)

sns.heatmap(cov_mat, cmap='cubehelix', annot=True, square=True)
 
# Eigen-decomposition of the covariance matrix.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print(f"eigen vector {eig_vecs}")
print(f"eigen values {eig_vals}")

# Pair every eigenvalue with the matching column of eig_vecs.
eigen_pairs = list(zip(eig_vals, eig_vecs.T))

eigen_pairs

## Sort the (eigenvalue, eigenvector) tuples in place.
## key = the tuple element to sort by (the eigenvalue); largest first.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs

tot = sum(eig_vals)

## Each eigenvalue divided by the eigenvalue total, in descending order.
var_exp = [value / tot for value in sorted(eig_vals, reverse=True)]
var_exp
 
# Fit a PCA with default settings on the standardised features.
pca = PCA()
pca.fit(X_std)

## Cumulative explained-variance ratio across the components; the author uses
## this curve to judge how many dimensions to reduce to.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_ratio)

## Reduce to 6 dimensions.
sk_pca = PCA(n_components=6)
y_sk = sk_pca.fit_transform(X_std)

## The 7-dimensional data has been reduced to 6 dimensions.
y_sk
 
 
Colored by Color Scripter
cs
저작자표시 (새창열림)
'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

Day 15_Diabetes_House_XGBoost (0)	2019.07.31
Day 15_Model_Evaluation_diabetesDataset (0)	2019.07.31
Day 14_house_price (0)	2019.07.30
Day 13_bikeShare_DataSet (0)	2019.07.25
Day 12_RandomForeset_MushroomDataset (0)	2019.07.19
Software knowledge worth spreading

Day 16_PCA_HR_DataSet

'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바