tm라이브러리를 통한 코퍼스 manipulation

2019. 5. 27. 15:19
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# One-time package installation.
install.packages("SnowballC")   # Stemming: reduces words to their root (see wordStem / stemDocument).
install.packages("tm")          # Corpus handling and term-document matrices.
install.packages("RWeka")       # n-gram tokenizers.
# Package names are case-sensitive: the package loaded below with library(KoNLP)
# is "KoNLP", not "koNLP".
# NOTE(review): KoNLP may not be available from CRAN any more; if this call
# fails, install it from an archive or its source repository instead.
install.packages("KoNLP")
 
library(ggplot2)
library(dplyr)
library(stringr)   #문자열 처리 패키지.
library(tm)
library(SnowballC)
library(RWeka)
library(rJava)
library(KoNLP)
library("wordcloud")
library(RColorBrewer)
 
 
#weka
 
# Three short English sentences used as a toy corpus.
v_t <- c(
  "The United States comprises fifty states.",
  "In the United States, each state has its own laws.",
  "However, federal law overrides state law in the United States."
)

# Build a corpus from the character vector: one document per element.
# The vector must be `v_t`; a bare `t` refers to base R's transpose function.
c_t <- VCorpus(VectorSource(v_t))
# VCorpus(DirSource("path")) would instead build a corpus from every file in a directory.

inspect(c_t)                  # Inspect the documents in the corpus.

tdm <- TermDocumentMatrix(c_t) # Term-document matrix: terms in rows, documents in columns.

# Summary recorded by the original author when printing tdm:
# <<TermDocumentMatrix (terms: 17, documents: 3)>>   # 17 distinct terms, 3 documents (one per vector element).
#   Non-/sparse entries: 23/28                       # Of the 17 * 3 = 51 cells, 23 are non-zero and 28 are zero.
# Sparsity           : 55%                           # Share of zero cells: 28 / 51.
# Maximal term length: 9                             # Number of characters in the longest term.
# Weighting          : term frequency (tf)           # Cell values are raw term counts.

inspect(tdm)

# Print the default (single-word) matrix again as the baseline for the
# n-gram version built next.
TermDocumentMatrix(c_t)
 
# Tokenizer for TermDocumentMatrix(): splits x into word n-grams.
# The Weka control asks for n-grams of length 2 up to 3 (bigrams and trigrams);
# set max = 2 as well to obtain bigrams only.
bigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = 2, max = 3)
  NGramTokenizer(x, ngram_range)
}
 
# Tokenize the corpus with a user-supplied tokenizer instead of the default
# one-word-per-token rule; `tokenize` names the function that does the splitting.
tdm <- TermDocumentMatrix(c_t, control = list(tokenize = bigramTokenizer))
str(tdm)
# With bigrams only, the original author reported 18 terms in tdm$dimnames$Terms.

# Trigrams are included because some expressions only carry their meaning as
# a three-word unit, e.g. "the united states".
tdm$dimnames$Terms  # The original author reported 35 terms with bigrams + trigrams.

tdm$dimnames$Docs   # Document ids (3 documents).
inspect(tdm)

tdm                 # Prints the matrix summary.

# sapply: takes a list, returns a vector.
# apply : takes an array (matrix), returns a vector.

# How often does each term occur overall?
# Each row holds one term's count in every document, so summing a row gives
# that term's total count across the whole corpus.
a <- rowSums(as.matrix(tdm))
str(a)

sort(a, decreasing = TRUE)  # Most frequent terms first.
 
# Exercise: repeat the steps above on your own corpus and list the ten most
# frequent terms when using bigrams (or trigrams).

# Korean text processing.

loc <- "논문/"
# Read the files in that directory into a corpus.
p <- VCorpus(DirSource(loc))

p[[1]]   # Look at a single document inside the corpus.
# NOTE(review): index 19 is hard-coded and assumes the directory holds at
# least 19 files -- confirm against the data.
v_k <- content(p[[19]])

# Strip parenthesized runs of ASCII letters, curly single quotes, and a
# middle dot surrounded by whitespace.
v_k <- str_replace_all(
  string = v_k,
  pattern = "\\([a-zA-Z]+\\)|(‘)|(’)|(\\s·\\s)",
  replacement = ""
)

# Extract nouns.
extractNoun(v_k)
 
# Collect every run of digits in each document.
# stringr is applied to the document's text via content(doc) rather than to
# the document object itself, so it always receives a character vector.
l <- lapply(p, function(doc) {
  str_extract_all(content(doc), "[0-9]+")
})

table(unlist(l))              # Frequency of each numeric expression.
p <- tm_map(p, removeNumbers) # Drop the numbers from every document.
p[[1]]

# Collect tokens that match a given pattern: letters, then one or more
# punctuation characters, then letters. This only extracts; nothing is removed here.
l_p <- lapply(p, function(doc) {
  str_extract_all(content(doc), "\\b[[:alpha:]]+[[:punct:]]+[[:alpha:]]+\\b")
})

table(unlist(l_p))
 
# Replace every match of a regular expression in all documents of a corpus.
#   copus  : the corpus to transform.
#   oldexp : pattern to look for (passed through tm_map as `pattern`).
#   newexp : replacement text.
# Returns the result of tm_map().
tempFunct <- function(copus, oldexp, newexp) {
  # gsub wrapper; `newexp` is captured from the enclosing call.
  replace_pattern <- function(x, pattern) {
    gsub(pattern, newexp, x)
  }
  tm_map(copus, content_transformer(replace_pattern), oldexp)
}
 
# Apply the user-defined replacement to the whole corpus, one pattern after
# another: lowercase ASCII letters, uppercase ASCII letters, "(" and ")"
# are each replaced with the empty string.
p <- Reduce(
  function(corpus, pattern) tempFunct(corpus, pattern, ""),
  c("[a-z]", "[A-Z]", "\\(", "\\)"),
  p
)
 
 
 
 
# Extract the nouns from `t` and join them into one space-separated string.
#   t : text to analyse.
# Returns a single character string.
f_nouns <- function(t) {
  # Use the argument `t`, not a global such as `mytext`. unlist() flattens the
  # result in case extractNoun() hands back a list.
  paste(unlist(extractNoun(t), use.names = FALSE), collapse = " ")
}
 
# Return the nouns found in `t`, exactly as extractNoun() reports them.
f_view_noun <- function(t) {
  nouns <- extractNoun(t)
  nouns
}
 
# String manipulation over the whole corpus.

# Corpus -> list: strip digit runs from each document's text.
l <- lapply(p, function(doc) {
  str_replace_all(content(doc), "[0-9]+", "")
})

# Corpus -> corpus: the same replacement, keeping a corpus as the result.
# A custom function given to tm_map() must be wrapped in content_transformer()
# so that it modifies each document's content and still returns a document.
cop <- tm_map(p, content_transformer(function(x) {
  str_replace_all(x, "[0-9]+", "")
}))

# Extract the nouns of every document in the corpus. content_transformer()
# hands extractNoun() the document text; unlist() flattens the result in case
# it is a list.
myNounCorpus <- tm_map(p, content_transformer(function(x) {
  unlist(extractNoun(x), use.names = FALSE)
}))

# Split every document into words (used to check word frequencies).
l <- lapply(p, function(doc) {
  str_extract_all(content(doc), boundary("word"))
})
 
# Keep a copy to read from while the noun corpus is rewritten in place.
imsi <- myNounCorpus

# Normalize one word: every match of the pattern below is replaced with a
# single fixed form. seq_len() keeps the loop from running when the corpus
# is empty (1:0 would iterate over 1 and 0).
for (i in seq_len(length(myNounCorpus))) {
  myNounCorpus[[i]]$content <-
    str_replace_all(imsi[[i]]$content,
                    "위키리스크[[:alpha:]]+",
                    "위키리크스")
}

# Document-term matrix: documents in rows, terms in columns.
# NOTE(review): this is built from `p`, not from `myNounCorpus` -- confirm
# which corpus the statistics below are meant to describe.
dtm.k <- DocumentTermMatrix(p)

class(myNounCorpus[[1]])
class(p[[1]][[1]])
 
 
#################### Descriptive statistics

# Total count of each term over all documents. Terms are the columns of a
# document-term matrix, so this is a column-wise sum.
word.freq <- colSums(as.matrix(dtm.k))
head(word.freq)
length(word.freq)

sort(word.freq)                          # Ascending order.
sort(word.freq, decreasing = TRUE)       # Descending order.

sort.word.freq <- sort(word.freq, decreasing = TRUE)
sort.word.freq[1:20]

# Cumulative sum of the sorted counts.
cumsum.word.freq <- cumsum(sort.word.freq)
cumsum.word.freq[1:20]   # Running total of occurrences for ranks 1 through 20.

# Cumulative share of all term occurrences:
# cumulative count / total count.
prop.word.freq <- cumsum.word.freq / sum(word.freq)

# Share of all occurrences covered by the 20 most frequent terms.
# (The original author observed roughly 40% on their data.)
prop.word.freq[1:20]

# Visualization: shows where the cumulative share starts to flatten out.
# The x axis is the frequency rank, from the most to the least frequent term.
plot(seq_along(word.freq), prop.word.freq)
plot(seq_along(word.freq), prop.word.freq, type = "l")  # Same curve as a line.
 
# names(word.freq) gives the terms (the matrix column names). The counts are
# passed as `freq` so that wordcloud() sizes and filters words by the real
# corpus frequencies instead of recounting the unique term list.
wordcloud(names(word.freq), freq = word.freq)

# Only show terms that occur at least 5 times.
wordcloud(names(word.freq), freq = word.freq, min.freq = 5)


mypal <- brewer.pal(8, "Dark2")   # Number of colors, palette name.

# Same threshold (at least 5 occurrences), now with the palette.
# random.order = FALSE plots words in decreasing frequency rather than in
# random order, which keeps the layout more stable.
wordcloud(names(word.freq), freq = word.freq, min.freq = 5,
          colors = mypal, random.order = FALSE,
          scale = c(4, 0.2))
 
Colored by Color Scripter
cs
저작자표시
'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

난수, 시간측정, 그룹화 및 요약, 피봇팅, colSums, which.max (0)	2019.05.28
k-means 클러스터링 예제 (0)	2019.05.27
텍스트 마이닝 기초 (0)	2019.05.23
문자열 처리 두번째, 정규식, StringR라이브러리, colnames, match_all, trim, allign,sub,length, (0)	2019.05.23
정규식, 심화 빈도그래프,타이타닉 호칭 추출 (0)	2019.05.23
Software knowledge worth spreading

tm라이브러리를 통한 코퍼스 manipulation

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

+ Recent posts

티스토리툴바