Day 05_Multivariate Gaussian_Winery_Classifier

2019. 7. 13. 02:02
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
"""Day 09_Multivariable Gaussian Classifier.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/17KCIHWtOSRyPwPzwNn3VgUMYH-Had7BR
"""
 
from google.colab import drive
drive.mount('/gdrive')
 
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"
 
# %matplotlib inline
 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
 
 
from scipy.stats import norm, multivariate_normal
# Import packages for interactive widgets
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
 
@interact_manual(x=IntSlider(0, 0, 12))
def test_model(x):
    """Print the value currently selected on the slider (widget smoke test)."""
    print(x)
 
# Load the comma-separated wine data. Column 0 is used below as the class
# label and columns 1-13 as the features.
data = np.loadtxt(PATH + 'winery/winery-multivariate/'+'wine.data.txt', delimiter=',')

# Human-readable names of the 13 feature columns, in column order.
featurenames = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Shuffle the 178 instances with a fixed seed (reproducible split), then use
# the first 130 shuffled rows for training and the remaining 48 for testing.
np.random.seed(0)
perm = np.random.permutation(178)
trainx, trainy = data[perm[:130], 1:14], data[perm[:130], 0]
testx, testy = data[perm[130:178], 1:14], data[perm[130:178], 0]
 
"""j = 1, 2, 3 <= 1, 2, 3 번 와인공장"""
 
## x = feature rows, y = class label of each row. Returns the parameters
## needed by the Gaussian generative model.
def fit_generative_model(x, y, k=3):
    """Fit one multivariate Gaussian per class and estimate the class priors.

    Parameters
    ----------
    x : array-like, shape (n, d)
        Feature matrix, one row per sample.
    y : array-like, length n
        Class label of each row of ``x``; labels are expected to be the
        integers ``1 .. k``.
    k : int, optional
        Number of classes. Defaults to 3 (the three wineries).

    Returns
    -------
    mu : ndarray, shape (k + 1, d)
        ``mu[j]`` is the mean feature vector of class ``j``.
    cov_mat : ndarray, shape (k + 1, d, d)
        ``cov_mat[j]`` is the covariance matrix of class ``j``, normalised
        by the class size (``bias=True``).
    pi : ndarray, shape (k + 1,)
        ``pi[j]`` is the fraction of samples that belong to class ``j``.

    Index 0 of every returned array is unused and left at zero so that the
    arrays can be indexed directly by the 1-based class label.
    """
    # Accept lists / data frames as well as ndarrays: the boolean-mask
    # indexing below needs real arrays.
    x = np.asarray(x)
    y = np.asarray(y)

    # Number of features.
    d = x.shape[1]

    # Per-class parameters; row 0 is a placeholder for the unused label 0.
    mu = np.zeros((k + 1, d))
    cov_mat = np.zeros((k + 1, d, d))
    pi = np.zeros(k + 1)

    n = len(y)
    for label in range(1, k + 1):
        # Boolean mask selecting the samples of the current class.
        indices = (y == label)
        members = x[indices, :]

        # Class mean, stored at the class's own index in mu.
        mu[label] = np.mean(members, axis=0)

        # Class covariance matrix (columns are variables), stored in cov_mat.
        cov_mat[label] = np.cov(members, rowvar=False, bias=True)

        # Class prior: share of all samples that carry this label.
        pi[label] = np.count_nonzero(indices) / float(n)
    return mu, cov_mat, pi
 
# Fit a Gaussian generative model to the training data
mu, cov_mat, pi = fit_generative_model(trainx,trainy)

## Number of target classes (wineries).
k = 3

## Number of test points.
nt = len(testy)

## score[i, j] is the log-posterior (up to a shared constant) of test point i
## under class j. It has k+1 columns so it can be indexed by the 1-based class
## label; column 0 is unused.
score = np.zeros((nt, k + 1))

## For each class j, evaluate every test point under the multivariate Gaussian
## with mean mu[j] and covariance cov_mat[j], weighted by the class prior
## pi[j]. The product pi[j] * P_j(x) is computed in log space as a sum, which
## is simpler and numerically safer. logpdf accepts the whole test matrix at
## once, so no per-point loop is needed.
for target_class in range(1, k + 1):
    score[:, target_class] = np.log(pi[target_class]) + multivariate_normal.logpdf(
        testx, mean=mu[target_class], cov=cov_mat[target_class])

# Each row of score is one test record: predict the winery whose column
# (1..k) has the highest score. argmax returns a 0-based position within the
# sliced columns, so add 1 to turn it back into the winery number.
predictions = np.argmax(score[:, 1:k + 1], axis=1) + 1

## Count the test points whose predicted label differs from the true label.
errors = np.sum(predictions != testy)

print ("Errors: " + str(errors) + "/" + str(nt))
 
Colored by Color Scripter
cs
저작자표시
'Python Library > Machine Learning' 카테고리의 다른 글

Day 06.logistic_regression_Sentiment_Analysis (0)	2019.07.14
Day 05_Multivariate Gaussian_Winery_Classifier_MNIST (0)	2019.07.13
Day 03.Probability and statistics Review (0)	2019.07.08
Day 02. KNN Practice with Spine Dataset (0)	2019.07.07
Day 02. Implementation Of K-Nearest Neighbor (0)	2019.07.07
Software knowledge worth spreading

Day 05_Multivariate Gaussian_Winery_Classifier

'Python Library > Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바