```python
# -*- coding: utf-8 -*-
"""Copy of 00_FileTemplate.ipynb

Automatically generated by Colaboratory.

Original file is located at
"""

from google.colab import drive
drive.mount('/gdrive')
# /gdrive/My Drive/Colab Notebooks/resources/ <== my resource path

# %matplotlib inline
import time

import matplotlib.pyplot as plt
import numpy as np

# Load the MNIST subsets saved as .npy files on Google Drive
train_data = np.load("/gdrive/My Drive/Colab Notebooks/resources/train_data.npy")
train_labels = np.load("/gdrive/My Drive/Colab Notebooks/resources/train_labels.npy")
test_data = np.load("/gdrive/My Drive/Colab Notebooks/resources/test_data.npy")
test_labels = np.load("/gdrive/My Drive/Colab Notebooks/resources/test_labels.npy")

print("Training dataset dimensions: ", np.shape(train_data))
print("Number of training labels: ", len(train_labels))
print("Testing dataset dimensions: ", np.shape(test_data))
print("Number of testing labels: ", len(test_labels))

# np.unique with return_counts gives the per-digit counts;
# zip pairs each digit with its count without an explicit for loop
train_digits, train_counts = np.unique(train_labels, return_counts=True)
print("Training set distribution:")
print(dict(zip(train_digits, train_counts)))

test_digits, test_counts = np.unique(test_labels, return_counts=True)
print("Test set distribution:")
print(dict(zip(test_digits, test_counts)))

"""## Functions"""

def show_digit(x):
    """Display a flattened 784-pixel vector as a 28x28 grayscale image."""
    plt.axis('off')
    plt.imshow(x.reshape((28, 28)), cmap=plt.cm.gray, interpolation="bilinear")
    plt.show()

def vis_image(index, dataset='train'):
    """Show the image at `index` from the chosen split and print its label."""
    if dataset == 'train':
        show_digit(train_data[index])
        label = train_labels[index]
    else:
        show_digit(test_data[index])
        label = test_labels[index]
    print("label " + str(label))

def squared_dist(x, y):
    """Squared Euclidean distance between two vectors."""
    return np.sum(np.square(x - y))

def find_NN(x):
    """Return the index of the training point nearest to x."""
    # Compute distances from x to every row in train_data
    distances = [squared_dist(x, train_data[i]) for i in range(len(train_labels))]
    # Get the index of the smallest distance
    return np.argmin(distances)

def NN_classifier(x):
    """Classify x with the label of its nearest training neighbor."""
    return train_labels[find_NN(x)]

"""## Main"""

vis_image(0, "train")
vis_image(0, "test")

# Inspect the labels at indices 4 and 5
print(train_labels[4], train_labels[5])

# Compute the distance between a seven and a one
print("Distance from 7 to 1: ", squared_dist(train_data[4], train_data[5]))
# Compute the distance between a seven and a two
print("Distance from 7 to 2: ", squared_dist(train_data[4], train_data[1]))
# Compute the distance between two sevens
print("Distance from 7 to 7: ", squared_dist(train_data[4], train_data[7]))
# Notice that the seven and the one are fairly close

print("A success case:")
print("NN classification: ", NN_classifier(test_data[2]))
print("True label: ", test_labels[2])
print("The test image:")
vis_image(2, "test")
print("The corresponding nearest neighbor image:")
vis_image(find_NN(test_data[2]), "train")

# A failure case
print("A failure case:")
print("NN classification: ", NN_classifier(test_data[39]))
print("True label: ", test_labels[39])
print("The test image:")
vis_image(39, "test")
print("The corresponding nearest neighbor image:")
vis_image(find_NN(test_data[39]), "train")
```
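The list comprehension inside `find_NN` calls `squared_dist` once per training row from Python, which is where most of the roughly 54 seconds measured below go. A minimal vectorized sketch using NumPy broadcasting, assuming the `train_data` array loaded above (the function name is my own, not part of the assignment):

```python
def find_NN_vectorized(x):
    # (train_data - x) broadcasts x across every training row; summing the
    # squared differences along axis 1 computes all distances in one call
    distances = np.sum(np.square(train_data - x), axis=1)
    return int(np.argmin(distances))

# Should agree with find_NN on any test point, e.g.:
# assert find_NN_vectorized(test_data[2]) == find_NN(test_data[2])
```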
```python
t_before = time.time()
test_predictions = [NN_classifier(test_data[i]) for i in range(len(test_labels))]
t_after = time.time()

# Error rate: fraction of test points whose prediction differs from the truth
err_positions = np.not_equal(test_predictions, test_labels)
error = float(np.sum(err_positions)) / len(test_labels)

print("Error of nearest neighbor classifier: ", error)
print("Classification time (seconds): ", t_after - t_before)

"""## Sklearn implementation with the ball tree and the k-d tree"""

from sklearn.neighbors import BallTree

# Build the ball tree once, then answer all nearest-neighbor queries from it
t_before = time.time()
ball_tree = BallTree(train_data)
t_after = time.time()
t_training = t_after - t_before
print("Time to build data structure (seconds): ", t_training)

t_before = time.time()
test_neighbors = np.squeeze(ball_tree.query(test_data, k=1, return_distance=False))
ball_tree_predictions = train_labels[test_neighbors]
t_after = time.time()
t_testing = t_after - t_before
print("Time to classify test set (seconds): ", t_testing)

print("Ball tree produces same predictions as above? ",
      np.array_equal(test_predictions, ball_tree_predictions))

from sklearn.neighbors import KDTree

t_before = time.time()
kd_tree = KDTree(train_data)
t_after = time.time()
t_training = t_after - t_before
print("Time to build data structure (seconds): ", t_training)

t_before = time.time()
test_neighbors = np.squeeze(kd_tree.query(test_data, k=1, return_distance=False))
kd_tree_predictions = train_labels[test_neighbors]
t_after = time.time()
t_testing = t_after - t_before
print("Time to classify test set (seconds): ", t_testing)

print("KD tree produces same predictions as above? ",
      np.array_equal(test_predictions, kd_tree_predictions))
```

The brute-force method took about 54 seconds to classify the test set; the BallTree cut classification time to about 11 seconds.
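For reference, scikit-learn also exposes the same nearest neighbor rule through `KNeighborsClassifier`, which can be pointed at the ball tree backend. A minimal sketch, assuming the arrays loaded above (this interface was not part of the assignment):

```python
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=1 is the plain nearest neighbor rule;
# algorithm='ball_tree' selects the same BallTree structure used above
knn = KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree')
knn.fit(train_data, train_labels)
knn_predictions = knn.predict(test_data)

print("KNeighborsClassifier matches the hand-rolled predictions? ",
      np.array_equal(test_predictions, knn_predictions))
```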
Source and references: edX - Machine Learning Fundamentals, Week 1 Programming Assignment