library, wordcloud, str_split, paste, nchar, str_replace, ggplot, welfare

2019. 5. 17. 14:47

라이브러리

library(wordcloud)

library(ggplot2)

library(dplyr)

library(KoNLP)

library(stringr)

library(rJava)

library(dplyr)

library(ggplot2)

# Box plot of `cty` grouped by `drv`, using the `mpg` data set.
ggplot(mpg, aes(drv, cty)) +
  geom_boxplot()

# Install rJava and KoNLP only when they are missing, so that sourcing this
# script again does not trigger a fresh download and install on every run.
if (!requireNamespace("rJava", quietly = TRUE)) {
  install.packages("rJava")
}
if (!requireNamespace("KoNLP", quietly = TRUE)) {
  install.packages("KoNLP")
}

library(rJava)
library(KoNLP)
library(dplyr)

# Dictionary setup for KoNLP (NIADic); must run before extractNoun() is used.
useNIADic()

# Read the text file into a character vector (one element per line) and
# print it.
txt <- readLines("Data/hiphop.txt")
txt

# Built-in constants: the lower-case and upper-case alphabets.
letters
LETTERS
letters[5] # e

# Install stringr only when it is missing instead of re-installing on every
# run of the script.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

txt
txt2 <- txt # copy of the raw lines

tolower("Eye for eye") # convert everything to lower case
toupper("Eye for eye") # convert everything to upper case

nchar("korea") # count characters
nchar("대한민국")

# Demo: split a sentence into words, split each word into characters, then
# paste everything back together again.
mysentence <- "learning r is so interesting"

# strsplit() returns a list holding one character vector per input string.
mystr <- strsplit(mysentence, split = " ")

# Extracting list elements.
mystr[[1]]    # word vector of the first (and only) sentence
mystr[[1]][1] # first word of that vector

# Split one word into its characters.
strsplit(mystr[[1]][5], split = "")

# Split every word into characters and print each result. seq_along() is used
# instead of a hard-coded 1:5 so the loop follows the actual number of words.
for (i in seq_along(mystr[[1]])) {
  print(strsplit(mystr[[1]][i], split = ""))
}

# Store the characters of each word in a list.
rep(NA, 5)

# Pre-allocate one slot per word. (list(rep(NA, 5)) would create a list of
# length one that holds a single five-element vector.)
myletters <- vector("list", length(mystr[[1]]))
myletters

for (i in seq_along(mystr[[1]])) {
  # strsplit() returns a list of length one; [[1]] takes the character vector.
  myletters[[i]] <- strsplit(mystr[[1]][i], split = "")[[1]]
}

myletters # each element holds the spelling of one word

paste("a", "b") # join strings (separated by a space by default)

paste(myletters[[1]], collapse = "") # collapse = delimiter used when joining

# Rebuild each word from its characters.
myWords <- vector("list", length(myletters))
for (i in seq_along(myletters)) {
  myWords[[i]] <- paste(myletters[[i]], collapse = "")
}

myWords

paste(myWords, collapse = " ") # join the list elements with single spaces

#################################

# Sample text: two paragraphs separated by a blank line.
rwiki <- "R is a programming language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing.[6] The R language is widely used among statisticians and data miners for developing statistical software[7] and data analysis.[8] Polls, data mining surveys, and studies of scholarly literature databases show substantial increases in popularity in recent years.[9]. as of May 2019, R ranks 21st in the TIOBE index, a measure of popularity of programming languages.[10]

A GNU package,[11] source code for the R software environment is written primarily in C, Fortran and R itself,[12] and is freely available under the GNU General Public License. Pre-compiled binary versions are provided for various operating systems. Although R has a command line interface, there are several graphical user interfaces, such as RStudio, an integrated development environment."

rwiki
class(rwiki)

# Text -> paragraphs. Split on one or more newlines: the literal above has a
# blank line between the paragraphs, and splitting on a single "\n" would
# produce an extra empty element between them.
rwiki_para <- strsplit(rwiki, split = "\n+")
rwiki_para
class(rwiki_para)
str(rwiki_para) # two paragraphs

# Paragraphs -> sentences.
rwiki_para[[1]]

# "." is a regex metacharacter, so it must be escaped with a double backslash.
rwiki_sent <- strsplit(rwiki_para[[1]], split = "\\.")
str(rwiki_sent)

# Sentences -> words.
rwiki_sent
str(rwiki_sent)
rwiki_sent[[1]][1]

strsplit(rwiki_sent[[1]][1], split = " ") # extract the individual words

# Clean a noisy string: non-word characters and digits become spaces.
test <- "R?is# a100 programming한 language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing"

test <- str_replace_all(test, "\\W", " ") # non-word char to space (escaped)
test <- str_replace_all(test, "\\d", " ") # digit to space (escaped)
test

# String replacement helpers from stringr. They are described here rather than
# called bare: calling them without arguments raises an error and would stop
# the script when it is sourced.
#   str_replace(string, pattern, replacement)     - replaces the first match
#   str_replace_all(string, pattern, replacement) - replaces every match

fruits <- c("one apple", "two pears", "three bananas")

str_replace(fruits, "[aeiou]", "-")
str_replace_all(fruits, "[aeiou]", "-")

# The replacement may also be a function: here every vowel is upper-cased.
str_replace_all(fruits, "[aeiou]", toupper)

###################################

# Word-frequency analysis and word cloud for the hip-hop lyrics file.
library(KoNLP)
# The original library(KoNLP_dic) call was dropped: underscores are not
# allowed in R package names, so that call can only fail and stop the script.
# The KoNLP dictionary is set up by the useNIADic() call earlier in the file.

txt <- readLines("Data/hiphop.txt")
txt

# Replace every non-word character with a space.
txt <- str_replace_all(txt, "\\W", " ")
txt
class(txt)

# Extract nouns; the result is a list with one element per input line.
txt <- extractNoun(txt)

# Count how often each word occurs across all list elements (the author notes
# roughly 6,200 of them). unlist() flattens the list into one vector first.
wordcount <- table(unlist(txt))
class(wordcount)

# Keep the words as character strings rather than factors.
df <- as.data.frame(wordcount, stringsAsFactors = FALSE)
str(df)

library(dplyr)

df <- rename(df, word = Var1, freq = Freq)

nchar("hello") # prints the number of characters

# Drop one-character words. Inside filter() the column is referenced directly
# rather than through df$word.
df <- filter(df, nchar(word) > 1)
str(df)

# df %>% arrange(freq) %>%
#   tail(10)

# The 20 most frequent words.
df %>%
  arrange(desc(freq)) %>%
  head(20)

# Install wordcloud only when it is missing instead of on every run.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)

pal <- brewer.pal(8, "Dark2") # take 8 colours from the "Dark2" palette

# Arguments: word vector, frequency vector, minimum frequency, maximum number
# of words, colours, largest/smallest font size, and whether to place words in
# random order.
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 10,
  max.words = 100,
  colors = pal,
  scale = c(4, 1),
  random.order = FALSE
)

# Word-frequency analysis, bar chart and word cloud for the tweets CSV file.
text <- read.csv(
  "Data/twitter.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
text
str(text) # X is the name given to a column whose header is empty

# Give the Korean column names short English names.
text <- rename(text, number = 번호, id = 계정이름, date = 작성일, tw = 내용)
str(text)

# Replace every non-word character in the tweet text with a space.
text$tw <- str_replace_all(text$tw, "\\W", " ")
head(text$tw)

nouns <- extractNoun(text$tw)
nouns # returned as a list

wordCount <- table(unlist(nouns)) # list -> vector with unlist()
class(wordCount)

df <- as.data.frame(wordCount, stringsAsFactors = FALSE)
str(df)

df <- rename(df, word = Var1, freq = Freq)
str(df)

# Drop one-character words.
df <- filter(df, nchar(word) > 1)
str(df)

# The 20 most frequent words.
top20 <- df %>%
  arrange(desc(freq)) %>%
  head(20)

library(ggplot2)

top20

# Words sorted by ascending frequency, used to order the bars. Named
# word_order so it does not shadow base::order().
word_order <- arrange(top20, freq)$word

# geom_text(aes(label = freq)) prints each bar's value; hjust shifts the label.
# scale_x_discrete(limits = ...) orders the bars by frequency (the argument is
# spelled out in full instead of relying on partial matching of `limit`).
# coord_flip() turns the chart so the bars run horizontally.
ggplot(top20, aes(x = word, y = freq)) +
  ylim(0, 2500) +
  geom_col() +
  scale_x_discrete(limits = word_order) +
  coord_flip() +
  geom_text(aes(label = freq), hjust = -0.3)

# wordcloud is attached before brewer.pal() is called so the palette helper
# does not depend on an earlier library() call.
library(wordcloud)

pal <- brewer.pal(8, "Dark2")

# NOTE(review): max.words = 10 limits this cloud to ten words, whereas the
# lyrics cloud earlier in the file uses 100 - confirm this is intended.
wordcloud(
  words = df$word,
  freq = df$freq,
  colors = pal,
  min.freq = 10,
  max.words = 10,
  random.order = FALSE
)

########## Reading a .sav file

# Install foreign only when it is missing instead of on every run.
if (!requireNamespace("foreign", quietly = TRUE)) {
  install.packages("foreign")
}
library(foreign)
library(readxl)

raw_welfare <- read.spss(file = "Koweps.sav", to.data.frame = TRUE)

# Work on a copy so the raw data stays untouched.
welfare <- raw_welfare

str(welfare)
# View() opens a data viewer, so it is only called in interactive sessions.
if (interactive()) {
  View(welfare)
}
summary(welfare)

# Give the coded survey columns readable names (gender, birth year, ...).
welfare_renamed <- rename(
  welfare,
  gender = h10_g3,
  birth = h10_g4,
  marriage = h10_g10,
  religion = h10_g11,
  income = p1002_8aq1,
  code_job = h10_eco9,
  code_region = h10_reg7
)
str(welfare_renamed)

# Question: does income differ by gender?
class(welfare_renamed$gender)
table(welfare_renamed$gender)

# Treat a gender value of 0 as missing (NA).
welfare_renamed$gender <- ifelse(
  welfare_renamed$gender == 0, NA, welfare_renamed$gender
)
table(is.na(welfare_renamed$gender))

# Recode: 1 becomes "male", any other non-missing value becomes "female".
welfare_renamed$gender <- ifelse(
  welfare_renamed$gender == 1, "male", "female"
)
table(welfare_renamed$gender)
qplot(welfare_renamed$gender)

class(welfare_renamed$income)
summary(welfare_renamed$income) # author's note: 12,030 rows are NA

wr <- welfare_renamed # shorter name

# An outlier around 2400 stretches the x axis, so it is limited to 0-1000.
qplot(wr$income) +
  xlim(0, 1000)

# Treat 0 and 9999 as missing; both values must be wrapped in c() for %in%.
wr$income <- ifelse(wr$income %in% c(0, 9999), NA, wr$income)
table(is.na(wr$income)) # author's note: 14 zeros added, NA count becomes 12,044

# Mean income by gender, ignoring rows with missing income.
icByGender <- wr %>%
  filter(!is.na(income)) %>% # drop rows whose income is NA
  group_by(gender) %>%
  summarise(meanic = mean(income))

icByGender # author's note: nearly a twofold difference

ggplot(icByGender, aes(x = gender, y = meanic)) +
  geom_col()

# Question: at what age is income highest?
summary(wr$birth)
table(is.na(wr$birth))
qplot(wr$birth)

# Treat a birth year of 9999 as missing (NA).
wr$birth <- ifelse(wr$birth == 9999, NA, wr$birth)
table(is.na(wr$birth))

# Reference year for the age calculation, kept in one named place.
# NOTE(review): 2019 matches the date of this post; if the survey data were
# collected in a different year, that year should be used instead - confirm.
age_base_year <- 2019

# The birth year itself counts as age 1, hence the + 1.
wr$age <- age_base_year - wr$birth + 1
summary(wr$age)
qplot(wr$age)

# Mean income by age, ignoring rows with missing income.
age_income <- wr %>%
  filter(!is.na(income)) %>%
  group_by(age) %>%
  summarise(meanIc = mean(income))

ggplot(age_income, aes(x = age, y = meanIc)) +
  geom_line()

저작자표시 (새창열림)

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

USArrests, 행->컬럼. 지도 시각화, (0)	2019.05.20
라이브러리, ggplot x축 분할, Flip, Join, excel, discrete, fill, position (0)	2019.05.20
R <---> Mysql 연동 (0)	2019.05.16
rbind, apply, sample,split, subset, names, merge (0)	2019.05.16
head, view , summary, dim. summarize, rename, mutate, group_by (0)	2019.05.15

Software knowledge worth spreading

library, wordcloud, str_split, paste, nchar, str_replace, ggplot, welfare

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

+ Recent posts

티스토리툴바