Day 07_String_AggregationFunc

2019. 7. 10. 21:08
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# -*- coding: utf-8 -*-
"""Day 07_String.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1I2KBgurecEa6iw2DYgF4W5qSaM17BSwu
"""
 
from google.colab import drive
drive.mount('/gdrive')
 
# /gdrive/My Drive/Colab Notebooks/resources/ <==  My resource path
 
# %matplotlib inline
 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import re
import time
 
word = 'grail'
sent = 'a scratch'
 
"""#문자열 포멧팅"""
 
# A template with one positional placeholder, and the value that fills it.
s = 'hello {}'
a = 'world'

# str.format substitutes `a` into the placeholder.
s.format(a)

# A float argument is rendered with its default string form.
"Some digits of pi: {}".format(3.14)
# %-style formatting with a mapping: keys are looked up by name and
# `.2f` rounds the value to two decimals.
"Some digits of pi: %(cont)s : %(value).2f" % dict(cont='e', value=2.718)
 
###  ''' ''' (또는 """ """) 는 여러 줄에 걸친 문자열을 나타낼 때 사용한다.
 
# Sample records: a name followed by "6 digits - 7 digits".
data = """
kim 897546-5646513
lee 970102-2234567
"""

# Raw string so that `\d` reaches the regex engine untouched instead of being
# treated as an (invalid) string escape. The parentheses make the six leading
# digits capture group 1; the trailing seven digits are matched but not captured.
pat = re.compile(r"(\d{6})[-]\d{7}")

# Apply the pattern to the data: every match is replaced by group 1
# (written `\g<1>` in the replacement) followed by a masked tail.
pat.sub(r"\g<1>-*******", data)
 
"""# 정규 표현식
[ ] : 문자 클래스, 대괄호 안의 문자 중 하나와 매치가 된다.
ex ) 정규식 : [ abc ] , a: 매치 , begin : 매치 , test : 매치 안 됨 (a, b, c 가 하나도 없음)
[ ^0-9 ] : ^를 쓰게 되면 반대의미 , 0-9를 빼고 매칭한다.
\d : 숫자 [ 0-9 ]
\D: 숫자 아닌것 [ ^0-9 ]
\s: 공백문자 그룹[ \t \n \r \f ]
\S: 공백을 제외한 모든 그룹.
\w:문자 + 숫자, [ a-zA-Z0-9_ ] 언더바를 포함한다..!주의
.(점) : 모든문자 ( 줄바꿈 문자 제외 ),
ex) a.b  => a + 임의의 한 문자 + b (예: aab, a0b) , a[.]b 와는 다르다. [.] 는 진짜 .이 와야 한다.
{최소 반복 횟수 , 최대반복 횟수}
{최소 반복 횟수 , }
{ , 최대 반복 횟수 }
? : { 0, 1 } 0번 또는 1번 반복
match , search 함수로 문자열에서 정규식이 매치되는 부분이 있는지 확인.
"""
 
# The character class includes a literal space, lowercase letters and '7',
# so whitespace inside the brackets is part of what can match.
pat = re.compile('[ a-z7]+')
res = pat.match("hi d7eep")

# match() returns None when nothing matches; otherwise the match object
# exposes the matched text via group() and its position via span().
if res is None:
    print('매치안됨')
else:
    print("매치")
    # Show which text was matched.
    print(res.group())
 
# One or more lowercase ASCII letters.
pat = re.compile('[a-z]+')

# match() only tries at the start of the string; "3 deep" starts with a digit,
# so this yields None.
res = pat.match("3 deep")
res

## match() gives up if the pattern does not match at the very beginning,
## while search() scans forward through the string for the first match.
res = pat.search("3 deep")
res

## findall() returns every non-overlapping match as a list of strings.
pat.findall("3life is too short")

## search() proceeds until the first match is found, then stops.
pat.search("3life is too short")

## match() fails here because the string starts with '3'.
pat.match("3life is too short")


## finditer() yields the same matches as findall(), but lazily as match objects.
res = pat.finditer("7 life is too short")
res

for i in res :
    print(i)

# The matched text
res = pat.match("hello")
res.group()

# Where the match starts and ends: start(), end(), and both as span()
res.start()
res.end()
res.span()

pat = re.compile('[a-z]+')
res = pat.match("test")
res

## Shorthand form: skip re.compile and call the module-level function.
## func("pattern", "subject string")

re.match('[a-z]+', "test")
## Other metacharacters


## `|` is alternation: match either "abcd" or "a". match() works character by
## character from the left, not word by word, so the pattern "abcd" also
## matches the beginning of "abcd8".

p = re.compile("abcd|a")
p.match('abcd8')

## `^` anchors to the start of the string, so even search() finds nothing here:
## the subject does not begin with "Life".
p = re.compile("^Life")
p.search('My Life is too short')

## `$` anchors to the end of the string. search() normally scans the whole
## string, but this subject ends with "life", so "short$" finds nothing.
p = re.compile("short$")
p.search('Life is too short life')

## Grouping with ( ): splits one regex into sub-expressions and lets a
## whole sub-pattern be repeated.

p = re.compile("ABC")
res = p.match("ABCABCABC DEF")
res

## Repeating what is inside the parentheses

p = re.compile("(ABC)+")
res = p.search("ABCABCABC DEF")
res
 
## Goal: find "name" + " " + "phone number".

## Only a string shaped exactly like "kim 010-1234-5678" should match.

## "010-1234-5678"      => must not match (no name).
## "010-1234-5678 kim"  => must not match (name in the wrong place).

## `\s` would match a single whitespace character (space or tab);
## this pattern uses a literal space between the two parts.

# Raw string so `\w` / `\d` are passed to the regex engine as written rather
# than being treated as (invalid) string escapes.
# `^` and `$` anchor both ends so nothing may precede the name or follow the number.
p = re.compile(r"^(\w+) (\d+-(\d+)-\d+$)")
res = p.search("kim 010-1234-5678")



# Group 1 is the name, group 2 the whole phone number.
res.group(1), res.group(2)

## Groups are numbered by the position of their opening parenthesis,
## so the nested middle part of the number is group 3.
res.group(3)
 
# `*` means zero or more repetitions; here it consumes all four digits.
# (The original repeated this identical statement twice; one copy is kept.)
re.match('[0-9]*', '1234')


## A dot inside [ ] is a literal period; to mean "any character" it must be
## written outside the brackets. 'K22' has no period after K, so no match.
re.match('K[.]', 'K22')

## A group can be repeated as a unit.
re.match('(hi){3}', 'hihihi')

re.match("hello123", "hello123")

# Character ranges also work for Hangul syllables.
re.match("[가-힣]+", "딥러닝")

## Match a run of anything except uppercase letters (stops before the 'L').
re.match('[^A-Z]+', "helLo")

## Match a run of uppercase letters at the start of the string.
re.match('^[A-Z]+', "HelLo")

## Check whether the text contains one or more asterisks.
## Characters with a special regex meaning are escaped with a backslash; the
## pattern is a raw string so the backslash is not read as a string escape.
re.search(r"\*+", "100 ** 2")

## `?P<name>` inside parentheses gives the group a name.
res = re.match('(?P<ln>[0-9]+) (?P<sn>[0-9]+) (?P<tn>[0-9]+)', "011 1234 5678")

## group() returns a plain string, so single characters can be indexed.
res.group()[1]

## Refer to a group by its name.
res.group('ln')

re.findall("[0-9]+", " 1 2 abc 3 abc 4 5 abc 67")

# Letters, optionally followed by literal dots and more letters, up to the end.
re.match("[a-zA-Z]+[.]*[a-zA-Z]*$", "hello.asdasd")

re.match("[a-zA-Z]+[.]*[a-zA-Z]*$", "hello")
 
## Built-in str methods (exercise idea from the original note: re-implement .capitalize() by hand)
"asdf".capitalize()  
"asdf".count('f')    ## how many times 'f' occurs
"asdf".startswith('as') # does it start with "as"?
"asdf".endswith("df") ## does it end with "df"?
"asdf".find("s")   ## index of the first "s"
"asdf".index("df") ## index where the substring "df" begins


"asdf".isalpha()  ## letters only?
"12312".isdecimal() ## decimal digits only?
"I am 23".isalnum() ## letters and digits only? (the spaces count against it)
"asdfkeas".strip()  ## remove leading and trailing whitespace
 
"""## Apply 계열 함수"""
 
# Small two-column frame used to demonstrate DataFrame.apply.
df = pd.DataFrame(dict(a=[1, 2, 3], b=[2, 3, 4]))


def mysql(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)


def myexp(x, n):
    """Return ``x`` raised to the power ``n``."""
    return pow(x, n)


df

## apply() hands each column to the function (axis=0 is the default here).
df.apply(mysql)

# Extra keyword arguments to apply() are forwarded to the function.
df.apply(myexp, n=3)


def prn(x):
    """Print ``x``; used to show what apply() passes to its callback."""
    print(x)


## With axis=1 the callback receives one row at a time.
df.apply(prn, axis=1)
 
import seaborn as sns
# Load seaborn's "titanic" example dataset; it is used for the missing-value demos.
titanic = sns.load_dataset('titanic')
 
## 컬럼 단위로 결측값이 몇개인지 출력하는 함수를 만들고자 한다.
## 콜백 함수 안에서 넘파이 함수를 사용하면 컬럼 단위로 함수가 사용된다.
 
 
def count_missing(data):
    """Count the entries of ``data`` that pandas treats as missing.

    ``pd.isnull`` builds a boolean mask and ``np.sum`` adds up its True
    values, so the result is the number of NaN/None entries.
    """
    return np.sum(pd.isnull(data))
 
# apply() with the default axis calls count_missing once per column,
# giving the number of missing values in each column.
cmis_col = titanic.apply( count_missing )

cmis_col
 
#결측값의 비율을 측정 하고자 한다.
 
## data는 axis에 대한 시리즈가 들어가게 된다. axis = 0이 기본이라서 열단위로 계산이 된다.
def prop_missing(data):
    """Return the fraction of entries in ``data`` that are missing.

    The count of null entries is divided by ``data.size``, the total number
    of entries, so the result is a proportion rather than a count.
    """
    missing_mask = pd.isnull(data)
    return np.sum(missing_mask) / data.size
 
 
## Proportion of nulls per row (axis=1 passes each row to the callback).
cmis_col = titanic.apply( prop_missing , axis = 1)
cmis_col[:10]

# Store the per-row count of missing values as a new column.
titanic['num_missing'] = titanic.apply(count_missing, axis = 1 )

titanic['num_missing'][:10]
 
# Read the tab-separated gapminder file from the mounted Drive folder.
df = pd.read_csv("/gdrive/My Drive/Colab Notebooks/resources/data/gapminder.tsv", sep = "\t")

## Mean life expectancy per year

df.groupby('year').lifeExp.mean()

## Distinct years present in the data
df.year.unique()

## Select the rows for 1952

y1952 = df[ df[ 'year' ] == 1952 ]
 
## User-defined aggregation functions

## A function can be applied to a chosen column within each group.


## Per-year mean minus the overall mean.
## Extra keyword arguments given to agg() are forwarded to the callable,
## which is how the overall mean arrives as `y`.
df.groupby('year').lifeExp.agg( lambda x, y : (

    np.mean(x) - y  # same value as x.mean() - y

), y = df.lifeExp.mean() )

## Several aggregations can be requested at once by passing a list.
df.groupby('year').lifeExp.agg([np.std, np.mean, np.count_nonzero])

## Choosing the output column names: use named aggregation (label=function).
## Passing a {label: function} dict to a single-column groupby is rejected by
## current pandas, so keyword arguments are used instead.
## Note that both columns are computed from lifeExp; 'gdp' is only a label here.
df.groupby('year').lifeExp.agg(lifeExp='std', gdp='median')
 
## transform => 데이터를 다를형태로 변환
def zscore(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation.

    ``x`` is expected to provide ``mean()`` and ``std()`` methods and support
    element-wise arithmetic (for example a pandas Series).
    """
    center = x.mean()
    spread = x.std()
    return (x - center) / spread
 
# df.groupby('year').lifeExp.transform(zscore)
 
df[:2]
 
# The file has no header row, so column names are supplied through `names`.

names1880 = pd.read_csv("/gdrive/My Drive/Colab Notebooks/resources/babynames/yob1880.txt", names = ['name','gender','birth']) 
names1880.shape
names1880.info()
names1880[:5]

## Total births recorded for 1880, split by gender
names1880.groupby('gender').birth.sum()
 
# One file per year, 1880 through 2010 inclusive (range end is exclusive).
years = range( 1880, 2011 )

# Read each yearly file, tag its rows with the year, and collect the frames.
pieces = []
for year in years:
    
    path  = "/gdrive/My Drive/Colab Notebooks/resources/babynames/yob%d.txt" % year
    frame = pd.read_csv( path, names = ['name','gender','birth'] ) 
    frame['year'] = year  # the files carry no year column of their own
    
    pieces.append( frame )
    
len(pieces)

# Stack all years into one frame; ignore_index renumbers the rows from 0.
df = pd.concat(pieces, ignore_index = True)

df[:2]
 
# Reshape the data by year and gender:
# one row per year, one column per gender, each cell the summed birth count.

total_birth = df.pivot_table( index = ['year'], columns = ['gender'], values = ['birth'],aggfunc = np.sum )
# Because `values` is a list, the columns are two-level: ('birth', <gender>).
# Assign through the full tuple key so the new column lands on total_birth
# itself; total_birth['birth']['total'] = ... writes to an intermediate object.
# The total is girls plus boys (the original added 'F' to itself).
# NOTE(review): assumes the gender codes are 'F' and 'M' -- confirm against the data files.
total_birth[('birth', 'total')] = total_birth[('birth', 'F')] + total_birth[('birth', 'M')]

total_birth[:2]
 
# Reload seaborn's "titanic" example dataset (rebinds `df`).
df = sns.load_dataset('titanic')

# Number of missing values in each column.
df.apply(lambda x :(  sum(x.isna())  ))

# Drop the 'deck' column, then drop every row that still has a missing value.
df = df.drop('deck',axis = 1)
df = df.dropna()
df.shape

df[:2]

sns.set(style="ticks")



# Grid of count plots: one column per pclass, one row per embarked, coloured by sex;
# each panel counts the values of 'survived'.
g = sns.FacetGrid(df, col = 'pclass', row = 'embarked',hue = 'sex' ,margin_titles = True)
g.map( sns.countplot, "survived")
g.add_legend()
 
Colored by Color Scripter
cs
저작자표시
'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

Day 08.Baby Names Analysis_cumsum_pivot (0)	2019.07.11
Day 08. MapFunc_OneHotEncode_Join (0)	2019.07.11
Day 06_Visualization (0)	2019.07.10
Day 05.Melt_PivotTable (0)	2019.07.08
Day 5. Sort_String_Binomial_Distribution (0)	2019.07.08
Software knowledge worth spreading

Day 07_String_AggregationFunc

'딥러닝 모델 설계 > Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바