# -*- coding: utf-8 -*-
"""Day 08_Mission_baby_Movie.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1XwL5qAxvCR403REB8dT2o2W3PCjrYL5u

The notebook has two parts:

1. Baby names: load the yearly ``yob<year>.txt`` files for 1880-2010 and
   count how often each character occurs across all name rows.
2. Movielens: load ``movies.dat``, split the release year out of the title,
   keep the first listed genre per movie and one-hot encode it.

Bare expressions such as ``df[:2]`` are notebook cell outputs; they display a
value in Colab and are no-ops when the file is run as a plain script.
"""

import operator
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm, multivariate_normal
from sklearn.preprocessing import OneHotEncoder

from google.colab import drive

# installing packages for interactive graphs
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

drive.mount('/gdrive')

# Root folder of every data file read below.
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

# %matplotlib inline


@interact_manual(x=IntSlider(0, 0, 12))
def test_model(x):
    """Print the slider value; a smoke test for the interactive widgets."""
    print(x)


# ---------------------------------------------------------------------------
# Baby names
# ---------------------------------------------------------------------------

years = range(1880, 2011)

# One DataFrame per year, each tagged with the year it came from.
pieces = []
for year in years:
    path = f"{PATH}babynames/yob{year}.txt"
    frame = pd.read_csv(path, names=['name', 'gender', 'birth'])
    frame['year'] = year
    pieces.append(frame)

len(pieces)

df = pd.concat(pieces, ignore_index=True)

df[:2]

## Frequency of each character appearing in names between 1880 and 2010

alphabet_dic = {}

# Scratch check of the insert-or-increment pattern used by acount_alpha().
if 'a' not in alphabet_dic:
    alphabet_dic['a'] = 1
else:
    alphabet_dic['a'] += 1

alphabet_dic

# Drop the scratch entry so it does not inflate the real count of 'a' by one.
alphabet_dic.clear()

df['name'].size  # 1690784


def acount_alpha(name):
    """Add every character of *name* to the module-level ``alphabet_dic``.

    Characters are counted exactly as they appear (no case folding), once
    per occurrence. The function mutates ``alphabet_dic`` and returns None.
    """
    for c in name:
        alphabet_dic[c] = alphabet_dic.get(c, 0) + 1


# A plain loop instead of Series.map(): the call is made only for its side
# effect, so there is no point building a Series of None values.
for name in df['name']:
    acount_alpha(name)

# (character, count) pairs, most frequent first.
sorted_x = sorted(alphabet_dic.items(), key=operator.itemgetter(1), reverse=True)

## Top 10
sorted_x[:10]
# Output recorded in the original notebook:
# [('a', 1425321),
#  ('e', 1193584),
#  ('n', 905036),
#  ('i', 828887),
#  ('r', 688878),
#  ('l', 669406),
#  ('o', 469078),
#  ('t', 337296),
#  ('s', 333697),
#  ('y', 321302)]

# ---------------------------------------------------------------------------
# Movielens
# ---------------------------------------------------------------------------

# "::" is a multi-character separator, which only the python parser engine
# supports; naming the engine explicitly avoids the fallback ParserWarning.
movies = pd.read_csv(
    PATH + "movielens/movies.dat",
    header=None,
    sep="::",
    engine="python",
    names=['movie_id', 'title', 'genres'],
)

# Titles look like "Some Title (1995)": copy the year into its own column,
# then strip the " (YYYY)" suffix from the title.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# literal text by default and would leave the titles unchanged.
movies['year'] = movies['title'].str.extract(r" \((\d{4})\)", expand=False)
movies['title'] = movies['title'].str.replace(r" \((\d{4})\)", "", regex=True)

## Keep only one genre per movie (the first listed), then one-hot encode it
movies['genres'] = movies['genres'].str.replace(r"\|.*", "", regex=True)

## Store the genre labels separately, as an (n_genres, 1) column for the encoder
genres_names = movies['genres'].value_counts().index
genres_names = np.array(genres_names).reshape(-1, 1)

# `n_values` was removed from OneHotEncoder in scikit-learn 0.22; the default
# `categories="auto"` already infers the categories from the fitted data.
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(genres_names)

# Replace each genre label with its one-hot vector (one array per row).
movies['genres'] = pd.Series(list(ohe.transform(movies[['genres']]).toarray()))
movies[:10]

#################################################################
# Snippet kept for reference: renaming a feature column, changing
# the index and joining two frames.
#################################################################
# NOTE(review): `tempOh` is not defined anywhere in the visible code, so this
# section raises NameError as written. Presumably it is the one-hot matrix
# for `genres_names` -- confirm before running.
# NOTE(review): this rebinds `df` (the baby-names frame above), and the join
# key movies['genres'] holds one-hot arrays at this point rather than labels;
# the snippet presumably comes from an earlier notebook state -- verify.
tempDf = pd.concat(
    [pd.Series(genres_names.flatten()), pd.DataFrame(tempOh)],
    axis=1,
    ignore_index=True,
)
# tempDf must exist before it is renamed (the original used it one line early).
df = tempDf.rename(columns={0: 'genres'})
movies.columns
movies.join(df.set_index('genres'), on='genres')[:2]
'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글
Day 08. Movielens_Analysis (0) | 2019.07.11 |
---|---|
Day 08.Baby Names Analysis_cumsum_pivot (0) | 2019.07.11 |
Day 07_String_AggregationFuc (0) | 2019.07.10 |
Day 06_Visualization (0) | 2019.07.10 |
Day 05.Melt_PivotTable (0) | 2019.07.08 |