# Packages used throughout the analysis.
library(foreign)
library(readxl)
library(ggplot2)
library(dplyr)
# Install ggiraphExtra only when it is missing, so re-running the script does
# not reinstall the package (or need network access) every time.
if (!requireNamespace("ggiraphExtra", quietly = TRUE)) {
  install.packages("ggiraphExtra")
}
library(ggiraphExtra)

# Read the SPSS file as a data frame. The raw copy is kept untouched and all
# later work is done on `welfare`.
raw_welfare <- read.spss(file = "koweps.sav", to.data.frame = TRUE)
welfare <- raw_welfare

# Quick inspection of the loaded data.
str(welfare)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(welfare)
}
summary(welfare)
# Working copy of `welfare` with readable names for the columns used below
# (rename() takes new_name = old_name).
wf <- welfare %>%
  rename(
    gender      = h10_g3,
    birth       = h10_g4,
    marriage    = h10_g10,
    religion    = h10_g11,
    income      = p1002_8aq1,
    code_job    = h10_eco9,
    code_region = h10_reg7
  )
# Age relative to the hard-coded year 2019, counting the birth year as age 1.
wf$age <- 2019 - wf$birth + 1

# Bucket ages into three groups: under 30, 30-59, and 60 or older.
# The middle bound is `age < 60` so that 59-year-olds land in "middle"
# (the previous `age < 59` pushed them into "old").
# Rows with a missing age get NA.
wf <- wf %>%
  mutate(
    ag = ifelse(age < 30, "young",
                ifelse(age < 60, "middle", "old"))
  )

# Group sizes as a table and as a bar chart.
table(wf$ag)
qplot(wf$ag)

# Five-number summary of age (the $stats component of the boxplot).
boxplot(wf$age)$stats
# Mean income per age group, leaving out rows with missing income.
wf_Ic <- wf %>%
  filter(!is.na(income)) %>%
  group_by(ag) %>%
  summarise(meanIc = mean(income))

# Recode gender: code 1 becomes "male", any other non-missing code "female".
wf$gender <- ifelse(wf$gender == 1, "male", "female")

# Bar chart of mean income per age group, ordered by age.
# Plot the summary table (wf_Ic / meanIc): plotting raw wf / income with
# geom_col() would stack every individual's income instead of showing means.
ggplot(wf_Ic, aes(x = ag, y = meanIc)) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old"))
# Does income differ by gender and age group?
# Mean income for each (age group, gender) pair, rows without income removed.
gen_Ic <- wf %>%
  filter(!is.na(income)) %>%
  group_by(ag, gender) %>%
  summarise(meanIc = mean(income))

# Side-by-side bars per gender, age groups ordered by age.
ggplot(gen_Ic, aes(ag, meanIc, fill = gender)) +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  geom_col(position = "dodge")

# Inspect the summary table.
str(gen_Ic)
head(gen_Ic)
dim(gen_Ic)
# Mean income for each (age, gender) pair, rows without income removed.
gen_age <- wf %>%
  filter(!is.na(income)) %>%
  group_by(age, gender) %>%
  summarise(meanIc = mean(income))
head(gen_age)

# One line per gender: mean income against age.
ggplot(gen_age, aes(age, meanIc, colour = gender)) +
  geom_line()
# Which job has the highest pay?
# Count missing vs. present job codes.
table(is.na(wf$code_job))

# readxl provides read_excel(); install it only when it is not available.
# (The package name was previously misspelled as "raedxl".)
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

# Read the second sheet of the codebook, using the first row as the header.
list_jobs <- read_excel(
  "Data/Koweps_Codebook.xlsx",
  col_names = TRUE,
  sheet = 2
)
dim(list_jobs)
str(list_jobs)
wf$code_job

# Join list_jobs onto wf by code_job. The key must be passed as `by`;
# `id` is not a left_join() argument.
wf <- left_join(wf, list_jobs, by = "code_job")

# Check the join: first rows that have a job code, with the matched job name.
wf %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head()
# Mean income per job.
# First with missing values left in: mean() is called without na.rm, so any
# job with a missing income comes out as NA.
tapply(X = wf$income, INDEX = wf$job, FUN = mean)

# Same statistic after dropping rows with a missing income or job.
jobIc <- wf %>%
  filter(!is.na(income), !is.na(job)) %>%
  group_by(job) %>%
  summarise(meanIc = mean(income))

# Ten jobs with the highest mean income.
top10 <- jobIc %>%
  arrange(desc(meanIc)) %>%
  head(n = 10)
top10
# Horizontal bar chart of the top-10 jobs.
ggplot(data = top10, mapping = aes(x = job, y = meanIc)) +
  geom_col() +
  coord_flip()

# Order the bars by their length. For this, job must be treated as a factor,
# not a continuous variable.
# reorder(variable to order (factor), continuous variable): here, order job
# by meanIc and then draw.
ggplot(data = top10, mapping = aes(x = reorder(job, meanIc), y = meanIc)) +
  geom_col() +
  coord_flip()

# Reverse the ordering by putting a minus sign in front of the numeric
# variable.
ggplot(data = top10, mapping = aes(x = reorder(job, -meanIc), y = meanIc)) +
  geom_col() +
  coord_flip()
# Extract the ten jobs with the lowest mean income and show job name and
# mean income.
str(wf$income)
table(is.na(wf$income))

bottom_10 <- jobIc %>%
  arrange(meanIc) %>%
  head(n = 10)
bottom_10

# Horizontal bars ordered by -meanIc, with the value axis limited to 0-150.
ggplot(data = bottom_10, mapping = aes(x = reorder(job, -meanIc), y = meanIc)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(limits = c(0, 150))
# Ten most frequent jobs for each gender.
# These pipelines must start from wf, not welfare: only wf has the renamed and
# recoded `gender` column and the `job` column added by the codebook join.
job_female <- wf %>%
  filter(!is.na(job) & gender == "female") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(10)

job_male <- wf %>%
  filter(!is.na(job) & gender == "male") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(10)
# Does the divorce rate differ by whether a person has a religion?
# Look at the raw religion codes first.
class(wf$religion)
table(wf$religion)

# Recode: code 1 becomes "yes", any other non-missing code "no".
wf <- wf %>%
  mutate(religion = ifelse(religion == 1, "yes", "no"))

# Distribution after recoding.
table(wf$religion)
qplot(wf$religion)
# Marital status.
table(wf$marriage)

# Keep only two states: code 1 -> "marriage", code 3 -> "divorce";
# everything else becomes NA.
wf <- wf %>%
  mutate(
    group_marrage = ifelse(
      marriage == 1, "marriage",
      ifelse(marriage == 3, "divorce", NA)
    )
  )

# Caution: table() does not show NA by default, so count the NAs separately.
table(wf$group_marrage)
table(is.na(wf$group_marrage))
qplot(wf$group_marrage)

# Rounding example: round to one decimal place.
round(5 / 8, digits = 1)
# Share of "marriage" vs "divorce" within each religion group.
# Rows with no marriage group are dropped before counting.
religion_marriage <- wf %>%
  filter(!is.na(group_marrage)) %>%
  group_by(religion, group_marrage) %>%
  summarise(n = n()) %>%
  mutate(
    tot_group = sum(n),                   # sum of n within each group
    pct = round(n / tot_group * 100, 1)   # percentage, one decimal place
  )
religion_marriage
# Other posts in the 'Deep Learning Model Design > R STUDIO' category
# Korea map graph, time-series graph, ggChoropleth, plotly, dodge (0) | 2019.05.22 |
# ---|---|
# USArrests, row -> column, map visualization (0) | 2019.05.20 |
# library, wordcloud, str_split, paste, nchar, str_replace, ggplot, welfare (0) | 2019.05.17 |
# R <---> MySQL integration (0) | 2019.05.16 |
# rbind, apply, sample, split, subset, names, merge (0) | 2019.05.16 |