텍스트 마이닝 기초

2019. 5. 23. 17:13
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
install.packages("SnowballC")   #단어의 어근만을 추출 wordstem, stem document 참조.
install.packages("tm")
 
 
library(ggplot2)
library(dplyr)
library(stringr)   #문자열 처리 패키지.
library(tm)
library(SnowballC)
 
 
# n-gram: a sequence of n words that appear consecutively,
# e.g. 2-gram (bigram), 3-gram (trigram).

t <- "The United States comprises fifty states. In the United States, each state has its own laws. However, federal law overrides state law in the United States."

# Split the same text at different boundary types.
str_extract_all(string = t, pattern = boundary("character"))   # single characters
str_extract_all(string = t, pattern = boundary("line_break"))  # spaces and line breaks
str_extract_all(string = t, pattern = boundary("sentence"))    # sentences

# Word tokens, flattened into one character vector.
w <- unlist(str_extract_all(string = t, pattern = boundary("word")))
length(table(w))  # number of distinct words

# Frequency of each distinct word. "States", "states" and "state" are counted
# as three separate entries and would need to be unified.
table(w)
sum(table(w))  # total number of word tokens, duplicates included

# Replace "United States" with "United_States" before tokenizing again.
t2 <- str_replace_all(
  string = t,
  pattern = "\\bUnited States",
  replacement = "United_States"
)

w2 <- str_extract_all(string = t2, pattern = boundary("word"))

table(unlist(w2))
length(table(unlist(w2)))
sum(table(unlist(w2)))
 
 
 
#1. 2단어씩 연결하여 출력
#The United
#United States .....
 
# Directory that holds the documents of the corpus.
loc <- "papers/"

# Build a corpus (a collection of documents) from the files in `loc`.
p <- VCorpus(DirSource(loc))
p  # auto-prints the corpus object

summary(p)
class(p)

# Text of individual documents.
content(p[[1]])
content(p[[2]])
content(p[[3]])
content(p[[24]])

# Metadata: data that describes a document.
meta(p[[2]])

# Set the "author" tag of document 2, then show the metadata again.
meta(p[[2]], tag = "author") <- "koh"
meta(p[[2]])
 
 
#단어 특수문자 단어
#코퍼스는 일종의 리스트 이므로 lapply를 사용.
 
# Extract every token of the form <alphanumerics><punctuation><alphanumerics>
# (for example "co-op") from `paper`.
# Returns whatever str_extract_all() returns for that pattern.
myfunc <- function(paper) {
  # [[:punct:]] matches punctuation / special characters.
  punct_inside_word <- "\\b[[:alnum:]]{1,}[[:punct:]]{1,}[[:alnum:]]{1,}"
  str_extract_all(string = paper, pattern = punct_inside_word)
}
# Apply myfunc() to each document; the result is a list with one entry per document.
l <- lapply(X = p, FUN = myfunc)
unlist(l)

# Frequency table of the extracted tokens across the whole corpus. Review it
# to decide how these tokens should be preprocessed.
table(unlist(l))
 
 
 
 
#코퍼스로부터 수치로된 자료를 추출 
#코퍼스는 리스트에 불과하다.
# Extract every run of digits from `x`.
# Returns whatever str_extract_all() returns for that pattern.
myfunc2 <- function(x) {
  digit_run <- "[0-9]+"
  str_extract_all(string = x, pattern = digit_run)
}
 
# Bar chart of how often each digit string occurs across the corpus.
# table() counts the strings extracted by myfunc2(); as.data.frame() turns the
# table into the columns Var1 (the string) and Freq (its count) used in aes().
ggplot(
  as.data.frame(table(unlist(lapply(p, myfunc2)))),
  aes(x = Var1, y = Freq)
) +
  geom_col()  # the layer must be called; adding the bare function is an error
 
 
# Bar chart of words that start with one capital letter followed by lowercase letters.
local({
  capitalized <- lapply(p, function(doc) {
    str_extract_all(doc, "\\b[A-Z][a-z]+\\b")
  })
  # Pass an expression (not a named variable) to table() so the data frame
  # columns keep their default names Var1 / Freq.
  freq <- as.data.frame(table(unlist(capitalized)))
  ggplot(freq, aes(x = Var1, y = Freq)) + geom_col()
})
 
#라이브러리 함수.
# Remove digits from every document. The result gets its own descriptive name
# so that it does not mask base::c().
p_no_numbers <- tm_map(p, removeNumbers)
p_no_numbers[[1]]$content

removePunctuation("hello....world")

# SnowballC extracts word stems. Install it only when it is missing, so an
# already-attached package is not reinstalled on every run.
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC")
}
library(SnowballC)

wordStem(c("learn", "learns", "learning", "learned"))
 
p <- tm_map(p, stemDocument)     # reduce every word to its stem
p <- tm_map(p, stripWhitespace)  # remove unnecessary whitespace
p <- tm_map(p, removeNumbers)    # remove digits

# Lower-cased copy of the corpus.
# tolower() is wrapped in content_transformer() so tm_map() can apply it to
# corpus documents; passing tolower to tm_map() directly is discouraged.
p2 <- tm_map(p, content_transformer(tolower))
 
# Remove stop words using the SMART stop-word list.
# NOTE(review): the result is only printed, not stored, so `p` is unchanged.
tm_map(p, removeWords, words = stopwords("SMART"))

# Stem the corpus (English) and store it as `mycorpus`, the name that the
# replacement steps further down operate on.
mycorpus <- tm_map(p, stemDocument, language = "en")
 
# Replace every match of the regular expression `oldexp` with `newexp` in each
# document of the corpus `myObj`.
#
# myObj:  corpus to transform (passed to tm_map()).
# oldexp: regular expression to search for; reaches gsub() as `pattern`.
# newexp: replacement text.
# Prints the transformed corpus and returns it invisibly.
mytempfunc <- function(myObj, oldexp, newexp) {
  replace_in_text <- content_transformer(
    function(x, pattern) gsub(pattern, newexp, x)
  )
  # tm_map() forwards `oldexp` to the transformer as its `pattern` argument.
  newobj <- tm_map(myObj, replace_in_text, oldexp)
  print(newobj)
  invisible(newobj)
}
 
# Text normalisation rules as c(pattern, replacement) pairs, applied in the
# order listed. Order matters: the final "-" rule replaces every remaining
# hyphen, so the specific hyphen rules have to run before it.
cleanup_rules <- list(
  c("-collar", "collar"),
  c("\\b((c|C)o-)", "co"),
  c("\\b((c|C)ross-)", "cross"),
  c("e\\.g\\.", "for example"),
  c("i\\.e\\.", "that is"),
  c("\\'s", ""),
  c("s’", "s"),
  c("ICD-", "ICD"),
  c("\\b((i|I)nter-)", "inter"),
  c("K-pop", "Kpop"),
  c("\\b((m|M)eta-)", "meta"),
  c("\\b((o|O)pt-)", "opt"),
  c("\\b((p|P)ost-)", "post"),
  c("-end", "end"),
  c("\\b((w|W)ithin-)", "within"),
  c("=", "is equal to"),
  c("and/or", "and or"),
  c("his/her", "his her"),
  c("-", " ")
)

for (rule in cleanup_rules) {
  mycorpus <- mytempfunc(mycorpus, rule[[1]], rule[[2]])
}
 
 
######### TF/IDF
 
# Build the document-term matrix (DTM): one row per document, one column per term.
# TermDocumentMatrix() builds the TDM instead.
dtm <- DocumentTermMatrix(p)
print(dtm)

# Show documents 1-3 and terms 50-60.
inspect(dtm[1:3, 50:60])
 
 
Colored by Color Scripter
cs
저작자표시
'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

k-means 클러스터링 예제 (0)	2019.05.27
tm라이브러리를 통한 코퍼스 manipulation (0)	2019.05.27
문자열 처리 두번째, 정규식, StringR라이브러리, colnames, match_all, trim, allign,sub,length, (0)	2019.05.23
정규식, 심화 빈도그래프,타이타닉 호칭 추출 (0)	2019.05.23
기초 문자열 처리와 정규표현식 (0)	2019.05.22
Software knowledge worth spreading

텍스트 마이닝 기초

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

+ Recent posts

티스토리툴바