# 기초 문자열 처리와 정규표현식
#
# 2019. 5. 22. 14:56
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# Sentence -> words
s <- "Learning R is so interesting"
strsplit(s, split = " ")

# Word -> letters
w <- strsplit(s, split = " ")[[1]]  # character vector of the words in s
w[5]

p <- strsplit(w[5], split = "")  # splitting on "" yields single characters
p


# Letters of every word

# Earlier attempt kept from the original notes:
# l = unlist(strsplit(w[1:5] , split = ""))  # char vector
#
# paste(l, collapse = )

# strsplit() is vectorised over its input, so a single call produces one
# character vector of letters per word. The list is created here in one step;
# assigning into myletters[i] before the object exists fails with
# "object 'myletters' not found".
myletters <- strsplit(w, split = "")
myletters

paste(myletters[[1]], collapse = "")  # glue the letters of the first word back together

# Rebuild every word from its letters; the result is a list with one string
# per word.
mywords2 <- lapply(myletters, paste, collapse = "")

mywords2  # the letters are joined again

paste(mywords2, collapse = " ")  # paste() can also join a list, here with " " as separator
 
 
 
### Example: paragraphs -> sentences -> words, then word frequencies

R_wiki <- "R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis. Polls, surveys of data miners, and studies of scholarly literature databases show that R's popularity has increased substantially in recent years.
R is a GNU package. The source code for the R software environment is written primarily in C, Fortran, and R. R is freely available under the GNU General Public License, and pre-compiled binary versions are provided for various operating systems. While R has a command line interface, there are several graphical front-ends available."

# Paragraphs are separated by "\n".
para <- strsplit(R_wiki, split = "\n")[[1]]
para

# Split each paragraph into sentences. "." is a regex metacharacter, so it
# must be escaped to be matched literally.
s <- strsplit(para, split = "\\. ")
s1 <- s[[1]]  # sentences of the first paragraph
s2 <- s[[2]]  # sentences of the second paragraph

s1
s2

# Split the sentences of the first paragraph into words. strsplit() is called
# once and the per-sentence results are picked out of the returned list.
s1_words <- strsplit(s1, split = " ")
w1 <- s1_words[[1]]  # words of the first sentence
w2 <- s1_words[[2]]
w3 <- s1_words[[3]]

w1
w2
w3

# One-column data frame (column V1) holding every word of the three sentences.
ws <- cbind(c(w1, w2, w3))
ws <- as.data.frame(ws)

# Word frequencies, most frequent first. This is done in base R so the step
# does not fail when no pipe/dplyr package has been attached; the result keeps
# the column names V1 (word) and sum (count).
ws_counts <- as.data.frame(table(ws$V1), stringsAsFactors = FALSE)
names(ws_counts) <- c("V1", "sum")
ws_counts <- ws_counts[order(-ws_counts$sum), ]
rownames(ws_counts) <- NULL
ws_counts
 
 
 
# Regular expressions: locating matches with regexpr() / gregexpr()

mysentence <- "Learning R is so interesting"

# regexpr() reports the first match of the pattern in each input string.
first_hit <- regexpr("ing", mysentence)
first_hit

# Output recorded in the original notes:
# [1] 6                        start position of the match
# attr(,"match.length")
# [1] 3                        number of matched characters
# attr(,"index.type")
# [1] "chars"                  positions are counted in characters
# attr(,"useBytes")
# [1] TRUE                     (value as recorded by the author)

class(first_hit)

as.vector(first_hit)  # drop the attributes, keep only the position
first_hit[1]          # same value: start index of the first match

loc.length <- attr(first_hit, "match.length")  # length of the match
loc.begin <- first_hit[1]

# End position. The start index itself is part of the match, hence the -1.
loc.end <- loc.begin + loc.length - 1
loc.end

# gregexpr() returns a list with one element per input string; that element
# holds the start positions of ALL matches.
all_hits <- gregexpr("ing", mysentence)[[1]]
all_hits

length(all_hits)  # the pattern is found twice

loc.begin <- as.vector(all_hits)

loc.begin  # start positions of every "ing"

# The match lengths are an attribute of the gregexpr() list element. They
# cannot be read from regexpr(...)[[1]]: subsetting drops the attributes, so
# attr() would return NULL and the end positions would come out empty.
t_attr <- attr(all_hits, "match.length")
loc.length <- t_attr

loc.end <- loc.begin + loc.length - 1  # end position of every match
 
 
 
 
# regexec(): like regexpr(), but it also reports parenthesised groups.

# regexpr() returns the start index of the match.
regexpr(pattern = "interesting", text = mysentence)

# Without groups regexec() reports the same position (wrapped in a list).
regexec(pattern = "interesting", text = mysentence)

# Every (...) group gets its own start index and length in addition to the
# whole match.
regexec(pattern = "in(ter)estin(g)", text = mysentence)

# Here: 15 is where the whole match ("so ...") starts, 18 is the start of the
# first group and 24 the start of the second group; the lengths are listed in
# the same order (whole match, first group, second group).
regexec(pattern = "so (intere(s)ting)", text = mysentence)
 
 
 
# Search all sentences for the word "software"
mysentences <- c(s1, s2)  # sentences of both paragraphs in one character vector

regexpr("software", mysentences)  # position of the first match in each sentence (-1 = no match)

# Does the word occur more than once in a sentence?
gregexpr("software", mysentences)  # all match positions, reported per sentence


# Begin / end position of the first match in each sentence
myTemp <- regexpr("software", mysentences)
my.begin <- as.vector(myTemp)

my.begin  # start position per sentence
my.begin[my.begin == -1] <- NA  # -1 means "not found": treat it as missing

my.end <- my.begin + attr(myTemp, "match.length") - 1

my.end


# Matrix with one row per sentence and two columns, pre-filled with NA.
myLocs <- matrix(NA, nrow = length(my.begin), ncol = 2)

# Replace the default column labels ([,1], [,2]) with readable names.
colnames(myLocs) <- c("begin", "end")

myLocs

# Row labels "sentence.1", "sentence.2", ...; sep = "." replaces paste()'s
# default separator " ". seq_along() stays empty for empty input, unlike
# 1:length(x).
t <- paste("sentence", seq_along(my.begin), sep = ".")
rownames(myLocs) <- t

myLocs

my.begin
my.end

# Column 1 receives the start positions and column 2 the end positions. Whole
# columns are assigned at once instead of copying row by row in a loop.
myLocs[, "begin"] <- my.begin
myLocs[, "end"] <- my.end

# Filled matrix.
myLocs
 
###### grep / grepl: check whether a pattern occurs at all

mysentences

# Indices of the sentences that contain the word (per the original notes:
# sentences 1, 2 and 5 out of 7).
grep(pattern = "software", x = mysentences)

# One TRUE/FALSE per sentence telling whether the word was found in it.
grepl(pattern = "software", x = mysentences)
 
 
###### sub / gsub: string replacement

mysentence

sub("ing", "ING", mysentence)   # replaces the first match only
gsub("ing", "ING", mysentence)  # replaces every match

# Common preprocessing step: turn a proper noun that contains blanks into a
# single token.
new.sent1 <- gsub(
  "R Foundation for Statistical Computing",
  "R Foundation_for_Statistical_Computing",
  s1[1]
)

table(w1)       # frequency of each word
sum(table(w1))  # total number of words


new.sent1
new.words <- strsplit(new.sent1, split = " ")  # split once, reuse below
new.words
sum(table(new.words))

# Remove specific words. \\b anchors each alternative at a word boundary so
# that only whole words are removed, not the tail of a longer word that
# happens to end in "and ", "by ", "for " or "the ".
drop.sent1 <- gsub("\\b(and|by|for|the) ", "", new.sent1)
drop.words <- strsplit(drop.sent1, split = " ")
drop.words
table(drop.words)
sum(table(drop.words))
 
 
 
# Extracting the matched text with regmatches()
p1 <- regexpr("ing", mysentence)  # match data for the first occurrence
# The match data stored just above (p1) has to be passed here; `p` is not
# match data produced by regexpr().
regmatches(mysentence, p1)        # text of the first match

p2 <- gregexpr("ing", mysentence)  # match data for all occurrences
regmatches(mysentence, p2)         # text of every match


# invert = TRUE returns the parts of the string that were NOT matched.
p2 <- regexpr("ing", mysentence)
regmatches(mysentence, p2, invert = TRUE)
 
 
 
mysentences
# substr() extracts a substring: here characters 1 through 30 of each string.
substr(mysentences, start = 1, stop = 30)
 
# Extract every word that ends in "ing"
my2sentence <- c("Learning R is so interesting", "He is a fasinating singer")
class(my2sentence)  # character vector


regexpr("ing", my2sentence)  # first "ing" in each of the two strings
p0 <- gregexpr("ing", my2sentence)  # every "ing" in each string

regmatches(my2sentence, p0)


# Next step: only letters in front of "ing", and nothing after it.

# [[:alpha:]] is the character class that matches any alphabetic character;
# this pattern asks for exactly one such character in front of "ing".
p1 <- gregexpr("[[:alpha:]](ing)", my2sentence)
regmatches(my2sentence, p1)


# At least one alphabetic character in front of "ing".
# {a,b} means "at least a and at most b repetitions"; {1,} has no upper limit.
p2 <- gregexpr("[[:alpha:]]{1,}(ing)", my2sentence)
regmatches(my2sentence, p2)

# Match only words that END in "ing". + means "one or more"; \\b is a word
# boundary, so "ing" must be the last part of the word (unlike $, which
# anchors at the end of the whole string).
p3 <- gregexpr("[[:alpha:]]+(ing)\\b", my2sentence)
regmatches(my2sentence, p3)
 
### Example

# Extract only the words ending in "ing" from every sentence.
mysentences
p <- gregexpr(pattern = "[[:alpha:]]+(ing)\\b", text = mysentences)
ings <- regmatches(x = mysentences, m = p)

t <- unlist(ings)  # flatten; sentences without a match (length-0 vectors) drop out
table(t)           # the same word is counted separately when its case differs


# Unify upper/lower case before matching.
# \\b : word boundary


p <- gregexpr(pattern = "[[:alpha:]]+(ing)\\b", text = tolower(mysentences))
ings <- regmatches(x = tolower(mysentences), m = p)

t <- unlist(ings)  # flatten; sentences without a match (length-0 vectors) drop out
table(t)           # after tolower() the case variants are counted as one word
 
 
# Colored by Color Scripter
# cs
# 저작자표시 (새창열림)
# '딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글
#
# 문자열 처리 두번째, 정규식, StringR라이브러리, colnames, match_all, trim, allign,sub,length, (0)	2019.05.23
# 정규식, 심화 빈도그래프,타이타닉 호칭 추출 (0)	2019.05.23
# tapply를 활용한 단어 빈도 조사 (0)	2019.05.22
# 리스트 <-> 벡터 , 속성값 추가 및 접근, lapply, tapply (0)	2019.05.22
# 변수타입 복습 정리 (0)	2019.05.22
# Software knowledge worth spreading
#
# 기초 문자열 처리와 정규표현식
#
# '딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글
#
# + Recent posts
#
# 티스토리툴바