```python
# -*- coding: utf-8 -*-
"""Day 21_canser_mission.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Lvl2OmvVarGZ3grmeeW-xFhHE7H8KmCn
"""

from google.colab import drive
drive.mount('/gdrive')

PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf          # TensorFlow 1.x API is used below
import numpy as np
import pandas as pd
import seaborn as sns

print(tf.__version__)

## Wisconsin breast cancer (diagnostic) data set; column 1 is the diagnosis (B/M)
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
                 header=None)

from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df2 = df.copy()

## Encode the diagnosis strings as numbers: B = 0, M = 1
df2[1] = label_encoder.fit_transform(df[1])
df2

fig = plt.gcf()
fig.set_size_inches(10 * 2, 10 * 2)
sns.heatmap(df2.corr(), annot=True)

## Sort the columns by correlation with the target (column 1) in descending order and
## keep the top-10 indices. The target correlates perfectly with itself, so it sorts
## first and becomes column 0 of df3; the remaining 9 columns are the selected features.
top_10_featureIdx = np.argsort(df2.corr()[1])[::-1][:10]
# top_20_featureIdx = np.argsort(df2.corr()[1])[::-1].reset_index(drop=True)[:20]

df3 = df2.iloc[:, top_10_featureIdx]
df3.shape

## Reset the column names; column 0 is the target
df3.columns = [i for i in range(df3.shape[1])]
df3.shape

## Scale the features and transpose to (features, samples)
from sklearn.preprocessing import minmax_scale, StandardScaler, RobustScaler

# xdata = minmax_scale(df3.iloc[:, 1:])                    # result: 0.68xxx
# xdata = RobustScaler().fit_transform(df3.iloc[:, 1:])    # result: 0.63xxx
xdata = StandardScaler().fit_transform(df3.iloc[:, 1:])
ydata = df3.iloc[:, 0]

xdata = xdata.T
ydata = np.array(ydata).reshape((1, -1))
xdata.shape, ydata.shape

n_h = 1    # number of layers excluding the input layer
n_x = 9    # number of input features
n_y = 1    # number of output-layer nodes
m = 569    # number of observations (recomputed from the data below)

X = tf.placeholder(tf.float64, shape=[n_x, None])
y = tf.placeholder(tf.float64, shape=[1, None])

W1 = tf.Variable(tf.random.normal([1, n_x], dtype=tf.float64), dtype=tf.float64)
B1 = tf.Variable(tf.random.normal([1], dtype=tf.float64), dtype=tf.float64)

## Linear combination for the first (and only) layer
Z1 = W1 @ X + B1
## Activation of the first layer
A1 = tf.sigmoid(Z1)

## Mean cross-entropy loss over all samples.
## tf.losses.sigmoid_cross_entropy applies the sigmoid internally, so it takes the logits Z1.
# cost = -tf.reduce_mean(y * tf.log(A1) + (1 - y) * tf.log(1 - A1))
cost = tf.losses.sigmoid_cross_entropy(y, Z1)
train = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

predicted = tf.cast(A1 > 0.5, dtype=tf.float64)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, y), dtype=tf.float64))

m = xdata.shape[1]
indices = np.random.permutation(m)

## k-fold cross-validation with k = 7
k = 7
acc = []
for i in range(k):
    test_indices = indices[int(i * (m / k)) : int((i + 1) * (m / k))]
    train_indices = np.setdiff1d(indices, test_indices)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(10000):
            cv, _ = sess.run([cost, train],
                             feed_dict={X: xdata[:, train_indices],
                                        y: ydata[:, train_indices]})
            if step % 2000 == 0:
                print(step, cv)
        preds = sess.run(accuracy,
                         feed_dict={X: xdata[:, test_indices],
                                    y: ydata[:, test_indices]})
        acc.append(preds)

print('acc : ', np.sum(acc) / k)
## acc : 0.9341490299823632
```
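The graph above is written against the TensorFlow 1.x placeholder/Session API, which no longer exists in TensorFlow 2. For comparison, here is a minimal sketch of the same single-unit logistic regression in Keras, assuming TensorFlow 2.x and the `xdata`/`ydata` arrays prepared above; the epoch count, batch size, and fold handling are illustrative choices rather than values from the original notebook.

```python
## Minimal TF2/Keras sketch of the same logistic regression (assumes TF 2.x).
## Keras expects (samples, features), so transpose xdata/ydata back before use.
import numpy as np
import tensorflow as tf

x_keras = xdata.T                # (569, 9)
y_keras = ydata.reshape(-1)      # (569,)

def make_model():
    ## One sigmoid unit = logistic regression
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(9,))
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

## Same 7-fold idea as above, with a fresh model per fold
k = 7
m = x_keras.shape[0]
indices = np.random.permutation(m)
fold_acc = []
for i in range(k):
    test_idx = indices[int(i * (m / k)) : int((i + 1) * (m / k))]
    train_idx = np.setdiff1d(indices, test_idx)
    model = make_model()
    model.fit(x_keras[train_idx], y_keras[train_idx],
              epochs=200, batch_size=32, verbose=0)
    _, fold_accuracy = model.evaluate(x_keras[test_idx], y_keras[test_idx], verbose=0)
    fold_acc.append(fold_accuracy)

print('mean k-fold accuracy:', np.mean(fold_acc))
```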
```python
"""# Problem 2: improving performance (house price data)"""

df = pd.read_csv(PATH + "house/train.csv")
features = ['sqft_living', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'zipcode',
            'view', 'waterfront', 'grade', 'yr_built']

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

import xgboost as xgb
model = xgb.XGBRegressor(max_depth=5, n_estimators=5000, learning_rate=0.03)
model.fit(train_df[features], train_df['price'])
model.score(test_df[features], test_df['price'])
## R^2 on the held-out 20%: 0.8223037240035181
```
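The regressor above grows 5,000 trees with no validation feedback, so the tree count is fixed by guesswork. A common way to push the held-out score further is early stopping against a validation split carved out of the training data. Below is a hedged sketch, assuming the same `df`, `features`, and `price` columns and an xgboost release of 1.6 or newer, where `early_stopping_rounds` and `eval_metric` are constructor arguments (older releases pass them to `fit()` instead); the split sizes and patience are illustrative.

```python
## Hedged sketch: early stopping on a validation split (assumes xgboost >= 1.6).
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Carve a validation set out of the training portion; the test split stays untouched.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
fit_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(
    max_depth=5,
    n_estimators=5000,           # upper bound; early stopping picks the actual count
    learning_rate=0.03,
    early_stopping_rounds=50,    # stop once validation RMSE stops improving
    eval_metric='rmse',
)
model.fit(fit_df[features], fit_df['price'],
          eval_set=[(val_df[features], val_df['price'])],
          verbose=False)

print('best iteration:', model.best_iteration)
print('test R^2:', model.score(test_df[features], test_df['price']))
```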