Day 08.Baby Names Analysis_cumsum_pivot

2019. 7. 11. 18:14

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: utf-8 -*-
"""Day 08_Babynames.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1J1ZN1Relxqr11TbHLudwU7aaF_rCIGmN
"""
 
# Mount Google Drive inside the Colab runtime at /gdrive so that files on the
# drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/gdrive')

# Base directory on the mounted drive; the yob*.txt loader further down reads
# from its "babynames/" subfolder.
PATH = "/gdrive/My Drive/Colab Notebooks/resources/"
 
# %matplotlib inline
 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
 
## Load one CSV per birth year and stack them into a single DataFrame.
years = range(1880, 2011)  # 1880 .. 2010 inclusive

pieces = []
for year in years:
    # Build the file name from PATH rather than repeating the directory
    # literal, so that changing PATH relocates the whole data set.
    path = PATH + ("babynames/yob%d.txt" % year)

    # The column names are supplied here via `names`.
    frame = pd.read_csv(path, names=['name', 'gender', 'birth'])

    # Tag every row with the year that appears in the file name.
    frame['year'] = year

    pieces.append(frame)

len(pieces)  # notebook display: number of yearly frames loaded

# ignore_index=True discards the per-file row numbers and gives the combined
# frame one continuous index.
df = pd.concat(pieces, ignore_index=True)
 
df[:2]  # notebook display: first two rows

## Total births per year (rows) and gender (columns).
# aggfunc is given as the string 'sum': recent pandas releases deprecate
# passing the callable np.sum here, while the string selects the same
# group-wise sum.
total_birth = df.pivot_table(
    index=['year'],
    columns=['gender'],
    values=['birth'],
    aggfunc='sum',
)
 
## 이름별 전체 대비 비율이 얼마나 되는가
## 각 연도별, 성별 전체인원대비 각 이름이 차지하는 비율이 얼마나 되는가.
def tempfu(x):
    """Return a copy of *x* with a ``prop`` column added.

    ``prop`` is each row's ``birth`` divided by the sum of ``birth`` over all
    rows of *x*. The function is written to be used as the per-group callable
    of ``groupby(...).apply``.

    The frame passed in is not modified: pandas advises against mutating the
    object handed to a group-wise function, so the new column is attached to
    a copy instead.
    """
    return x.assign(prop=x.birth / x.birth.sum())
 
df.groupby(['year', 'gender']).birth.sum()  # notebook display: births per (year, gender)

## Share of each name within its own (year, gender) group.
# transform('sum') broadcasts every group's total back onto that group's rows,
# so df2 keeps name / gender / birth / year and gains 'prop'.
# The earlier form, pd.Series(x.birth / x.birth.sum(), ["prop"]), re-indexed
# the ratios to the single label "prop"; that produced one NaN per group and
# dropped the 'birth' and 'gender' columns that the following steps read.
group_totals = df.groupby(['year', 'gender']).birth.transform('sum')
df2 = df.assign(prop=df.birth / group_totals)
df

df[:2]
 
## Keep the 1000 names with the most births for every (year, gender) pair.
def _take_top1000(group):
    """Return the 1000 rows of *group* with the largest ``birth`` values."""
    ranked = group.sort_values(by='birth', ascending=False)
    return ranked.head(1000)


df2_group = df2.groupby(['year', 'gender'])
df3 = df2_group.apply(_take_top1000)

top1000_idx = np.arange(len(df3))

## Name trend analysis

# Throw away the index produced by the group-wise apply and renumber the rows.
df3 = df3.reset_index(drop=True)

## Separate the data by gender to look at popular names for each.
is_boy = df3['gender'] == "M"
is_girl = df3['gender'] == "F"
boys = df3.loc[is_boy]
girls = df3.loc[is_girl]
 
## Rows = year, columns = name, values = number of births.
# aggfunc is the string 'sum': recent pandas releases deprecate passing the
# builtin sum as a callable, and the string selects the same group-wise sum.
total_birth = df3.pivot_table(
    values='birth',
    index='year',
    columns='name',
    aggfunc='sum',
)

## With the year as the row index, each name column can be pulled out and
## read like a time series.
subset = total_birth[['John', 'Harry', 'Mary', 'Marilyn']]

## figsize enlarges the figure; matplotlib interprets it in inches.
# 30 tick positions from 1880 to 2011; the uint32 dtype turns them into
# whole-number years.
ticks = np.linspace(1880, 2011, 30, dtype=np.uint32)
subset.plot(subplots=True, figsize=(12, 10), xticks=ticks)
 
## For every year, add up 'prop' over the top-1000 names of each gender.
## The result shows how much of all births those top-1000 names account for.

## Rows -> year, columns -> gender, values -> summed share.
# aggfunc is the string 'sum': recent pandas releases deprecate passing the
# builtin sum as a callable, and the string selects the same group-wise sum.
table = df3.pivot_table(
    values="prop",
    index='year',
    columns='gender',
    aggfunc='sum',
)

table.shape  # notebook display

table.plot(
    xticks=range(1880, 2011, 10),        # one tick per decade
    yticks=np.linspace(0.7, 1.0, 13),    # 13 evenly spaced ticks from 0.7 to 1.0
)
 
## Diversity of boys' names in a single year: how far down the popularity
## ranking does one have to go to cover half of the births?
#
# The yearly subsets get their own names instead of being assigned to `df`.
# `df` has to stay bound to the full data set, because the last-letter
# analysis further down pivots `df` by gender and picks the years
# 1910 / 1960 / 2010.
boys_2010 = boys[boys.year == 2010]

## Cumulative share, most popular name first.
prop_cumsum = boys_2010.sort_values(by='prop', ascending=False).prop.cumsum()

## Values only, without the index.
prop_cumsum.values

## Position in the sorted cumulative share where 0.5 is reached
## (original author's note for 2010: the 117th).
prop_cumsum.values.searchsorted(0.5)

boys_1900 = boys[boys.year == 1900]

## Same computation for 1900.
prop_cumsum = boys_1900.sort_values(by='prop', ascending=False).prop.cumsum()

## Values only, without the index.
prop_cumsum.values

## Position in the sorted cumulative share where 0.5 is reached.
prop_cumsum.values.searchsorted(0.5)
 
 
## 모든 년도에서 50%가 되는 위치의 인덱스를 비교하면 각 연도별로 얼마나 이름들이 잘 퍼져 있는지 확인 할 수 있다.
 
def get_quantile_count(x, q=0.5):
    """Return the position at which the cumulative ``prop`` of *x* reaches *q*.

    The rows of *x* are ordered by ``prop`` from largest to smallest, the
    running total of ``prop`` is taken, and ``searchsorted`` locates the first
    position whose running total is at least *q*.

    Parameters:
        x: DataFrame with a ``prop`` column. It is not modified.
        q: Target cumulative share; defaults to 0.5.

    Returns:
        The zero-based position in the descending ranking. The number of rows
        needed to reach *q* is this value plus one.
    """
    # sort_values returns a new frame, so the result has to be kept; calling
    # it without using the return value leaves the rows in their old order.
    ranked = x.sort_values(by='prop', ascending=False)

    ## Position of the q-quantile within this group.
    return ranked.prop.cumsum().values.searchsorted(q)
    
    
## One quantile position per (year, gender) group.
per_group_position = df3.groupby(by=['year', 'gender']).apply(get_quantile_count)

## The grouping keys remain in the row index of the result, so it is still
## "stacked". Name the level that should become the columns and spread it
## sideways.
diversity = per_group_position.unstack(level='gender')

## By default the row index goes on the x axis, the columns become the legend
## entries and the values go on the y axis.

## Position in the ranking at which the names cover 50% of births.
diversity.plot(title="Number of popular names in top 50%")
 
## Study how the last letter of names has changed over time.

def get_last_letter(x):
    """Return the final character of *x*."""
    return x[-1]


last_letters = df.name.map(get_last_letter)

# Naming the Series gives the pivot's row index the label "last_letter".
last_letters.name = "last_letter"

## Births per last letter (rows) for every (gender, year) column.
# aggfunc is the string 'sum': recent pandas releases deprecate passing the
# builtin sum as a callable, and the string selects the same group-wise sum.
table = df.pivot_table(
    values="birth",
    index=last_letters,
    columns=['gender', 'year'],
    aggfunc='sum',
)
table[:2]

# Keep only three sample years from the 'year' level of the columns.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

# Divide each column by its own total so that the letters of one
# (gender, year) column add up to 1.
letter_prop = subtable / subtable.sum()

# Title corrected from the misspelt "Fremale".
letter_prop['F'].plot(kind="bar", title="Female")
 
## Share of births whose names end in d / n / y.
# NOTE(review): the original comment speaks of boys only, but the code below
# does not select a gender before plotting - confirm which is intended.

# aggfunc is the string 'sum': recent pandas releases deprecate passing the
# builtin sum as a callable, and the string selects the same group-wise sum.
table = df.pivot_table(
    'birth',
    index=last_letters,
    columns=['gender', 'year'],
    aggfunc='sum',
)
table

# Keep only three sample years from the 'year' level of the columns.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

# Normalise every (gender, year) column by its own total.
letter_prop = subtable / subtable.sum()
letter_prop

letter_prop['F'].plot(kind='bar', rot=0, title='Female')
letter_prop['M'].plot(kind='bar', rot=0, title='Male')

# Same normalisation over all years, then pick the rows for 'd', 'n' and 'y'.
# .loc does the label lookup; the .ix indexer used previously no longer exists
# in pandas 1.0 and later.
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['d', 'n', 'y']].T  # transpose: one row per (gender, year)
dny_ts.plot()
 
 
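### Split names into letters: change in the share of each letter (1880 ... 2010)

### Names that were unpopular in the past but have become popular recently.

## Among the currently popular names, those that were not popular in the past.

## Names whose popularity switched between boys and girls.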
 
 
Colored by Color Scripter
cs

저작자표시

'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

Day 09_bitly_Json_Dictionary_List_Advenced (0)	2019.07.12
Day 08. Movielens_Analysis (0)	2019.07.11
Day 08. MapFunc_OneHotEncode_Join (0)	2019.07.11
Day 07_String_AggregationFuc (0)	2019.07.10
Day 06_Visualization (0)	2019.07.10

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

Software knowledge worth spreading

Day 08.Baby Names Analysis_cumsum_pivot

'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역