```python
# -*- coding: utf-8 -*-
"""Day 13_SVM_Sentiment_Analysis.ipynb

Automatically generated by Colaboratory.

Original file is located at
"""

from google.colab import drive
drive.mount('/gdrive')
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

# %matplotlib inline
import string
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)
from sklearn.feature_extraction.text import CountVectorizer

## Load the data
with open(PATH + "sentiment-logistic-regression/sentiment_labelled_sentences/full_set.txt") as f:
    content = f.readlines()
content = [x.strip() for x in content]

## Split each line into a sentence and a label
sentences = [x.split("\t")[0] for x in content]
labels = [x.split("\t")[1] for x in content]

## Map the target classes from {0, 1} to {-1, 1}
y = np.array(labels, dtype='int8')
y = 2 * y - 1

################################
#######  Preprocessing  ########
################################

## Replace every character in removal_list with a space in x.
def full_remove(x, removal_list):
    for w in removal_list:
        x = x.replace(w, ' ')
    return x

## Remove digits
digits = [str(x) for x in range(10)]
digit_less = [full_remove(x, digits) for x in sentences]

## Remove punctuation
punc_less = [full_remove(x, list(string.punctuation)) for x in digit_less]

## Lowercase everything
sents_lower = [x.lower() for x in punc_less]

## Define the stop words
stop_set = set(['the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from'])

## Remove stop words:
## first split on whitespace ...
sents_split = [x.split() for x in sents_lower]

## ... then rejoin, keeping only the words that are not stop words
sents_processed = [" ".join(list(filter(lambda a: a not in stop_set, x))) for x in sents_split]

## The sentences with stop words removed
sents_processed[0:10]

## Build the bag-of-words (term-frequency) representation
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=4500)
data_features = vectorizer.fit_transform(sents_processed)

## Convert the sparse data_features to a dense array
data_mat = data_features.toarray()
```
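As a side note (not part of the original notebook), the tiny sketch below may help illustrate what `CountVectorizer` produces: each row of the matrix is one sentence, each column counts one vocabulary word. The two sentences are made up purely for illustration.

```python
from sklearn.feature_extraction.text import CountVectorizer

## Hypothetical two-sentence corpus, for illustration only.
toy_corpus = ["good food good service", "bad food"]
toy_vec = CountVectorizer(analyzer="word")
toy_features = toy_vec.fit_transform(toy_corpus)

## Vocabulary is sorted alphabetically: ['bad' 'food' 'good' 'service']
## (on scikit-learn < 1.0 this method is called get_feature_names())
print(toy_vec.get_feature_names_out())
print(toy_features.toarray())
## [[0 1 2 1]   <- "good" appears twice in the first sentence
##  [1 1 0 0]]
```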
```python
## Balanced test set: the first 250 entries are indices where y == -1,
## the next 250 are indices where y == 1.
test_inds = np.append(np.random.choice((np.where(y == -1))[0], 250, replace=False),
                      np.random.choice((np.where(y == 1))[0], 250, replace=False))

## The training set is every index except the 500 sampled above.
train_inds = list(set(range(len(labels))) - set(test_inds))

train_data = data_mat[train_inds]
train_labels = y[train_inds]
test_data = data_mat[test_inds]
test_labels = y[test_inds]

print("train data: ", train_data.shape)
print("test data: ", test_data.shape)

from sklearn import svm

def fit_classifier(C_value=1.0):
    clf = svm.LinearSVC(C=C_value, loss='hinge')
    clf.fit(train_data, train_labels)
    train_preds = clf.predict(train_data)
    train_error = float(np.sum((train_preds > 0.0) != (train_labels > 0.0))) / len(train_labels)
    test_preds = clf.predict(test_data)
    test_error = float(np.sum((test_preds > 0.0) != (test_labels > 0.0))) / len(test_labels)
    return train_error, test_error

## The test error of a soft-margin SVM depends heavily on the constant C.
cvals = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
for c in cvals:
    train_error, test_error = fit_classifier(c)
    print("Error rate for C = %0.2f: train %0.3f test %0.3f" % (c, train_error, test_error))
```

# K-Fold Validation

```python
def cross_validation_error(x, y, C_value, k):
    ## Shuffle the indices
    n = len(y)
    indices = np.random.permutation(n)

    ## Running sum of the per-fold error rates
    err = 0.0

    ## Split the data into k folds
    for i in range(k):
        ## Fold i becomes the test set, everything else the training set
        test_indices = indices[int(i * (n / k)): int((i + 1) * (n / k))]
        train_indices = np.setdiff1d(indices, test_indices)

        ## Train with the given constant C
        clf = svm.LinearSVC(C=C_value, loss='hinge')
        clf.fit(x[train_indices], y[train_indices])

        ## Predict the held-out fold
        preds = clf.predict(x[test_indices])

        ## Accumulate this fold's error rate
        err += float(np.sum((preds > 0.0) != (y[test_indices] > 0.0))) / len(test_indices)

    ## Average error over the k folds
    return err / k

def choose_parameter(x, y, k):
    c_arr = np.linspace(1, np.power(10, 4), 10)
    err_arr = np.array([cross_validation_error(x, y, c, k) for c in c_arr])
    return c_arr, err_arr

c_arr, err_arr = choose_parameter(train_data, train_labels, 10)

import seaborn as sns
sns.lineplot(x=c_arr, y=err_arr)
```
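For comparison, scikit-learn also ships its own cross-validation utilities, which can stand in for the hand-rolled loop above. A minimal sketch, assuming the `train_data` / `train_labels` arrays built earlier (the three C values are arbitrary):

```python
from sklearn import svm
from sklearn.model_selection import cross_val_score
import numpy as np

## cross_val_score returns one accuracy per fold for classifiers by default,
## so 1 - mean(accuracy) matches the error rate computed by hand above.
for C in [0.01, 1.0, 100.0]:
    clf = svm.LinearSVC(C=C, loss='hinge')
    scores = cross_val_score(clf, train_data, train_labels, cv=10)
    print("C = %g: mean 10-fold CV error %0.3f" % (C, 1.0 - np.mean(scores)))
```

One difference from the hand-rolled version: for classifiers, `cross_val_score` uses stratified folds without reshuffling by default, rather than a fresh random permutation of the indices.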
Source and references: edx - Machine Learning Fundamentals, Week 6, Programming Assignment 2