# -*- coding: utf-8 -*-
"""Day 13_SVM_Sentiment_Analysis.ipynb

Automatically generated by Colaboratory.

Original file is located at
    
"""

# %matplotlib inline
import string

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer

## Mount Google Drive once; PATH is the base directory that the data
## loading code below prepends to the dataset's relative path.
drive.mount('/gdrive')
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

## Set the tick label size on both axes for every plot.
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)

## Load the data: one stripped string per line of the file.
with open(PATH + "sentiment-logistic-regression/sentiment_labelled_sentences/full_set.txt") as f:
    content = [line.strip() for line in f]

## Split every line on tabs: field 0 is the sentence, field 1 is the label.
fields = [line.split("\t") for line in content]
sentences = [parts[0] for parts in fields]
labels = [parts[1] for parts in fields]

## Map the target classes from {0, 1} to {-1, +1}.
y = 2 * np.array(labels, dtype='int8') - 1

################################
######## Preprocessing #########
################################

## Takes a list of substrings to erase and blanks each of them out of x.
def full_remove(x, removal_list):
    """Return ``x`` with every occurrence of each item replaced by a space.

    Args:
        x: The string to clean.
        removal_list: Iterable of substrings; they are processed in order,
            each one being replaced by a single space.

    Returns:
        The cleaned string. ``x`` is returned unchanged when
        ``removal_list`` is empty.
    """
    cleaned = x
    for target in removal_list:
        cleaned = cleaned.replace(target, ' ')
    return cleaned

## Remove digits
digits = list(string.digits)
digit_less = [full_remove(sentence, digits) for sentence in sentences]

## Remove punctuation characters
punc_less = [full_remove(sentence, string.punctuation) for sentence in digit_less]

## Convert to lower case
sents_lower = [sentence.lower() for sentence in punc_less]

## Define the stop words
stop_set = {'the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from'}

## Remove the stop words
## First split every sentence on whitespace
sents_split = [sentence.split() for sentence in sents_lower]
## Then re-join, with single spaces, only the words that are not stop words
sents_processed = [
    " ".join(word for word in words if word not in stop_set)
    for words in sents_split
]

## The first sentences with the stop words removed (notebook cell output)
sents_processed[0:10]

from sklearn.feature_extraction.text import CountVectorizer

## Build the bag of words (term frequency) representation
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=4500,
)

data_features = vectorizer.fit_transform(sents_processed)

## Convert data_features to a dense array
data_mat = data_features.toarray()

## The first 250 entries are indices where y is -1, the next 250 are indices
## where y is 1; each group is sampled without replacement.
negative_inds = np.where(y == -1)[0]
negative_sample = np.random.choice(negative_inds, 250, replace=False)
positive_inds = np.where(y == 1)[0]
positive_sample = np.random.choice(positive_inds, 250, replace=False)
test_inds = np.concatenate([negative_sample, positive_sample])

## Every index other than the 500 drawn above goes to the training set.
train_inds = list(set(range(len(labels))) - set(test_inds))

train_data = data_mat[train_inds]
train_labels = y[train_inds]

test_data = data_mat[test_inds]
test_labels = y[test_inds]

print("train data: ", train_data.shape)
print("test data: ", test_data.shape)

from sklearn import svm

def fit_classifier(C_value=1.0):
    """Fit a hinge-loss ``LinearSVC`` and report train and test error rates.

    Uses the module-level ``train_data``/``train_labels`` for fitting and
    ``test_data``/``test_labels`` for evaluation.

    Args:
        C_value: Value passed as ``C`` to ``svm.LinearSVC``.

    Returns:
        A ``(train_error, test_error)`` tuple; each is the fraction of
        samples whose predicted sign differs from the sign of the label.
    """
    model = svm.LinearSVC(C=C_value, loss='hinge')
    model.fit(train_data, train_labels)

    def error_rate(features, targets):
        # Predictions and labels are compared by sign (> 0 versus not).
        predicted_positive = model.predict(features) > 0.0
        mismatches = predicted_positive != (targets > 0.0)
        return float(np.sum(mismatches)) / len(targets)

    train_error = error_rate(train_data, train_labels)
    test_error = error_rate(test_data, test_labels)
    return train_error, test_error

## The test error is strongly affected by the value of the constant C of the
## soft-margin SVM.
cvals = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
for c in cvals:
    train_error, test_error = fit_classifier(c)
    report = "Error rate for C = %0.2f: train %0.3f test %0.3f" % (c, train_error, test_error)
    print(report)

"""#K-Fold Validation"""

def cross_validation_error(x, y, C_value, k):
    """Estimate the error rate of a hinge-loss ``LinearSVC`` by k-fold CV.

    The sample indices are shuffled and split into ``k`` folds that together
    cover every sample exactly once. For each fold a fresh classifier is
    trained on the remaining samples and evaluated on the held-out fold.

    Args:
        x: Feature data, indexed along its first axis with integer index
            arrays (one entry per sample).
        y: Label array with one entry per sample; predictions and labels are
            compared by sign (> 0 versus not).
        C_value: Value passed as ``C`` to ``svm.LinearSVC``.
        k: Number of folds; must satisfy ``2 <= k <= len(y)``.

    Returns:
        The mean of the ``k`` per-fold misclassification rates.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than ``len(y)``,
            since a fold or the training set would then be empty.
    """
    n = len(y)
    if not 2 <= k <= n:
        raise ValueError(
            "k must satisfy 2 <= k <= len(y); got k=%r for %d samples" % (k, n)
        )

    ## Shuffle the indices
    indices = np.random.permutation(n)

    ## Initialise the accumulated error
    err = 0.0

    ## Split the shuffled indices into k folds. np.array_split keeps every
    ## index (fold sizes differ by at most one), so no sample is silently
    ## left out of evaluation by an off-by-one slice end.
    for test_indices in np.array_split(indices, k):
        ## The current fold is the test set; everything else is training data.
        train_indices = np.setdiff1d(indices, test_indices)

        ## Train with the constant C
        clf = svm.LinearSVC(C=C_value, loss='hinge')
        clf.fit(x[train_indices], y[train_indices])

        ## Predict the held-out fold with the trained model
        preds = clf.predict(x[test_indices])

        ## Accumulate the error rate of this fold
        err += float(np.sum((preds > 0.0) != (y[test_indices] > 0.0))) / len(test_indices)

    return err / k

def choose_parameter(x, y, k):
    """Compute the k-fold cross-validation error for a grid of C values.

    The grid is 10 linearly spaced values from 1 to 10**4.

    Args:
        x: Feature data forwarded to ``cross_validation_error``.
        y: Labels forwarded to ``cross_validation_error``.
        k: Number of folds forwarded to ``cross_validation_error``.

    Returns:
        A ``(c_values, errors)`` tuple of arrays, where ``errors[j]`` is the
        cross-validation error obtained with ``c_values[j]``.
    """
    candidates = np.linspace(1, 10 ** 4, 10)
    errors = np.array([cross_validation_error(x, y, c, k) for c in candidates])
    return candidates, errors

## Cross-validation error (10 folds) on the training split for each candidate C.
c_arr, err_arr = choose_parameter(train_data, train_labels, 10)

import seaborn as sns
## x and y must be passed by keyword: seaborn 0.12+ accepts only `data`
## positionally and raises TypeError otherwise.
sns.lineplot(x=c_arr, y=err_arr)
# Source and references: edX - Machine Learning Fundamentals, week 6, Programming Assignment 2
