정규식, 심화 빈도그래프,타이타닉 호칭 추출

2019. 5. 23. 11:02
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
library(ggplot2)
library(dplyr)
library(stringr)
 
 
 
 
# Sample text used by the regex examples below: two paragraphs about R,
# separated by a single newline inside the string literal.
R_wiki <- "R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis. Polls, surveys of data miners, and studies of scholarly literature databases show that R's popularity has increased substantially in recent years.
R is a GNU package. The source code for the R software environment is written primarily in C, Fortran, and R. R is freely available under the GNU General Public License, and pre-compiled binary versions are provided for various operating systems. While R has a command line interface, there are several graphical front-ends available."

# Split the text into paragraphs at the embedded newline.
para <- strsplit(R_wiki, split = "\n")[[1]]

# Split every paragraph into sentences at ". " (a period followed by a space).
# The separator is consumed, so only the last sentence of each paragraph keeps
# its trailing period.
# NOTE(review): the sentence split is the presumed intent of `s1`; confirm the
# separator if a different tokenisation was meant.
s1 <- strsplit(para, split = "\\. ")

# Flatten the per-paragraph list into one character vector of sentences.
mysentences <- unlist(s1)
 
# Words that start with "stat", and how often each of them occurs.
# The text is lower-cased first so that "Statistical" and "statistical" are
# treated as the same word. `\\b` anchors the match at a word boundary, so a
# "stat" in the middle of a longer word is not picked up. The parentheses
# group the literal prefix that every match must contain.
p <- gregexpr("\\b(stat)[[:alpha:]]+", tolower(mysentences))

# Matches listed per element of `mysentences`.
regmatches(tolower(mysentences), p)

# Occurrence count of every distinct matched word.
table(unlist(regmatches(tolower(mysentences), p)))
 
# Case-sensitive extraction

# Every single upper-case letter; the result is a list with one character
# vector per element of `mysentences`.
p <- gregexpr(pattern = "[[:upper:]]", text = mysentences)
u <- regmatches(x = mysentences, m = p)
u

# Every single lower-case letter, in the same list layout.
p <- gregexpr(pattern = "[[:lower:]]", text = mysentences)
l <- regmatches(x = mysentences, m = p)
l
 
# Letter counts, ignoring case

# Lower-case the text before matching so that "A" and "a" are counted as the
# same letter.
p <- gregexpr(pattern = "[[:alpha:]]", text = tolower(mysentences))
a <- regmatches(x = tolower(mysentences), m = p)
a

# Flatten the per-sentence matches and tabulate them: one count per letter.
t <- table(unlist(a))

# Largest count only; the letter it belongs to is not shown.
max(t)
# Entries whose count equals the maximum, printed together with their letter.
t[t == max(t)]
# Number of distinct letters that occur.
length(t)
# Total number of letters.
sum(t)
 
 
###################################
# Print the built-in `pressure` data set.
pressure

# Layers are drawn in the order they are added: the points go down first and
# the line is drawn on top of them.
ggplot(pressure, aes(x = temperature, y = pressure)) +
  geom_point(size = 2, color = "red") +        # scatter plot
  geom_line(size = 1, color = "deepskyblue")   # connecting line
 
 
# Reverse layer order: the line is drawn first and the points sit on top of it.
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line(color = "deepskyblue", size = 1) +
  geom_point(color = "red", size = 2) +
  ggtitle("pressure data") +
  xlab("temp") +
  ylab("prs") +
  theme_dark()   # switch the plot background theme
  # Other themes to try in place of theme_dark():
  # theme_grey()
  # theme_classic()
  # theme_bw()
 
 
# How ggplot draws bar charts

# 1. Only x is mapped: each bar shows how many rows have that x value.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

# The same chart with geom_bar()'s default statistic spelled out.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(stat = "count")
 
# 2. Both x and y are mapped
str(sleep)

# ID on the x axis, extra on the y axis, bars filled by group.
# stat = "identity" uses the y values as bar heights instead of counting rows;
# position = "dodge" puts the bars of the groups next to each other rather
# than stacking them.
ggplot(data = sleep, mapping = aes(x = ID, y = extra, fill = group)) +
  geom_bar(
    stat = "identity",
    position = "dodge"
  )
                                 
 
# Count of each cut within every diamond color (stacked bars).
ggplot(data = diamonds, mapping = aes(x = color, fill = cut)) +
  geom_bar()

# Share of each cut within every color: position = "fill" rescales every bar
# to the same total height, so the segments read as proportions.
ggplot(data = diamonds, mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "fill")


# The same proportions with the axes swapped, for horizontal comparison.
ggplot(data = diamonds, mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "fill") +
  coord_flip()
 
 
# Plot the letter frequencies. ggplot works most naturally with data frames,
# so the frequency table `t` is converted first.
df <- as.data.frame(t)
str(df)
colnames(df) <- c("alpha", "freq")
str(df)

# When the counts already live in their own column, use stat = "identity".
# In other words: if x holds repeated raw values (a, a, a, b, b, c) and the
# bars should show how often each occurs, use "count"; if x holds the
# categories (a, b, c, d) and y the matching counts (3, 2, 1, 0), use
# "identity".

ggplot(df, aes(alpha, freq, fill = alpha)) +
  geom_bar(stat = "identity") +             # stat "count" cannot be combined with a y aesthetic
  guides(fill = "none") +                   # drop the legend created by the fill mapping
  geom_hline(yintercept = mean(df$freq))    # horizontal line at the mean frequency
  # geom_hline(yintercept = median(df$freq))  # alternative: line at the median frequency

############################################
# geom_col vs. geom_bar
# Keep the cty and manufacturer columns of the rows whose class is "suv".
mpg.suv <- mpg[mpg$class == "suv", c("cty", "manufacturer")]
colnames(mpg.suv) <- c("c", "m")
# Mean of c for every value of m (aggregate = group-wise summary).
res <- aggregate(c ~ m, mpg.suv, mean)
# order() returns the row indices that sort res$c; with decreasing = TRUE the
# largest mean comes first.
o <- order(res$c, decreasing = TRUE)
# Reorder the rows with those indices and keep the first six (head()'s default).
res <- head(res[o, ])
res
# geom_col() fits when both x and y are supplied: the y values are the bar
# heights.
ggplot(data = res, mapping = aes(x = m, y = c)) +
  geom_col()
# geom_bar() fits when the bars should show how often each value of a single
# column occurs.
ggplot(data = mpg, mapping = aes(x = class)) +
  geom_bar()
######################################################

# 번외: 타이타닉 호칭 추출

# Extract the title (honorific) from every passenger name.
# NOTE(review): assumes train.csv has a Name column shaped like
# "Surname, Title. Given names" — confirm against the actual file.
# stringsAsFactors = FALSE keeps Name as character on R versions before 4.0.
df <- read.csv("train.csv", stringsAsFactors = FALSE)

# Capture the text between ", " and the first following period. The lazy
# quantifier `.+?` stops at the first period, so a later period in the name
# does not extend the capture.
l <- str_match_all(df$Name, pattern = ",\\s(.+?)\\.")

# Every element of `l` is a character matrix: column 1 is the full match and
# column 2 the captured title. Take the first match per name and use NA when
# a name has no match, so `v` always has one entry per row of `df`.
v <- vapply(
  l,
  function(m) if (nrow(m) > 0L) m[1L, 2L] else NA_character_,
  character(1)
)
v

Colored by Color Scripter
cs
저작자표시 (새창열림)
'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

텍스트 마이닝 기초 (0)	2019.05.23
문자열 처리 두번째, 정규식, StringR라이브러리, colnames, match_all, trim, allign,sub,length, (0)	2019.05.23
기초 문자열 처리와 정규표현식 (0)	2019.05.22
tapply를 활용한 단어 빈도 조사 (0)	2019.05.22
리스트 <-> 벡터 , 속성값 추가 및 접근, lapply, tapply (0)	2019.05.22
Software knowledge worth spreading

정규식, 심화 빈도그래프,타이타닉 호칭 추출

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

+ Recent posts

티스토리툴바