head, view, summary, dim, summarize, rename, mutate, group_by

2019. 5. 15. 10:53

as.data.frame() -> 명시적으로 타입을 데이터 프레임으로 변경

/////////////////데이터 파악 명령어 //////////

head

view

summary

dim

str

////////////////////////////////////////////

# 이름변경

rename(<nameOfDF>, <ToName>=<fromName>)

# Add derived columns: the three-subject total and its average.
# The original chain ended in a dangling `%>%`, which made the following
# `exam` line part of the same pipeline; each pipeline now stands alone.
exam %>%
  mutate(
    tot = math + english + science,
    avg = tot / 3
  )

# Label each row "pass"/"fail" from the science score; show the first 5 rows.
exam %>%
  mutate(res = ifelse(science > 60, "pass", "fail")) %>%
  head(5)

# Collapse the table to a single row holding the mean math score.
exam %>%
  summarise(meanMath = mean(math))

///////group////////

# Mean math score per class: group rows by `class`, then summarise each group.
exam %>%
  group_by(class) %>%
  summarise(meanMath = mean(math))

///////////////////////////// 중요 /////////////////////////////

# Several per-class statistics for math computed in a single summarise() call.
exam %>%
  group_by(class) %>%
  summarise(
    meanMath = mean(math),
    medMath = median(math),
    sumMath = sum(math),
    count = n(),       # number of rows in the group
    sdMath = sd(math), # standard deviation
    minMath = min(math),
    maxMath = max(math)
  )

sd= 표준편차

# Grouping on two keys (major category, then sub-category):
# mean city mileage per manufacturer and drive type, first 20 rows.
mpg %>%
  group_by(manufacturer, drv) %>%
  summarise(meanCty = mean(cty)) %>%
  head(20)

////////////////////////

# SUVs only: average hwy and cty per car, take the mean of that per
# manufacturer, sort ascending, and keep the first five manufacturers.
mpg %>%
  group_by(manufacturer) %>%
  filter(class == "suv") %>%
  mutate(avg = (hwy + cty) / 2) %>%
  summarise(meanAvg = mean(avg)) %>%
  arrange(meanAvg) %>%
  head(5)

////////////////////////////

# Join: combine midterm and final scores on the shared student id.
# The original fused four statements onto one line (a parse error) and left a
# stray trailing comma in left_join(); the key column is now spelled out.
mid <- data.frame(
  sid = c(100, 200, 300, 400, 500),
  scoreMid = c(90, 90, 50, 70, 100)
)
final <- data.frame(
  sid = c(100, 200, 300, 400, 500),
  scoreFinal = c(70, 60, 80, 90, 40)
)
tot <- left_join(mid, final, by = "sid")
tot

# Join when the key column is named differently in each table:
# map the left name to the right name in `by`.
mid <- data.frame(
  sid = c(100, 200, 300, 400, 500),
  scoreMid = c(90, 90, 50, 70, 100)
)
final <- data.frame(
  sid2 = c(100, 200, 300, 400, 500),
  scoreFinal = c(70, 60, 80, 90, 40)
)
tot <- left_join(mid, final, by = c("sid" = "sid2"))
tot

//////////////////// 예제 //////////////////////////

# Example: attach each class's teacher to the exam table.
# The original fused four statements onto one line, which does not parse.
tName <- data.frame(
  teacher = c("aaa", "bbb", "ccc", "ddd", "eee"),
  class = c(1, 2, 3, 4, 5)
)
tName
temp <- left_join(exam, tName, by = "class")
temp

////////////////////// 세로 합치기 ///////////////

# Stack two tables vertically; columns missing from one table are filled
# with NA in the combined result.
mid <- data.frame(
  sid = c(100, 200, 300, 400, 500),
  scoreMid = c(90, 90, 50, 70, 100)
)
final <- data.frame(
  sid = c(200, 400, 600, 800, 900),
  scoreFinal = c(70, 60, 80, 90, 40)
)
exam_all <- bind_rows(mid, final)
exam_all

//////////////////////////////////////////////////////

# Missing values (NA): build a small frame with one NA in each column.
df <- data.frame(
  gender = c("f", "m", NA, "m", "f"),
  score = c(50, 40, 40, 30, NA)
)

# Cell-by-cell NA mask, then NA counts for the whole frame and per column.
is.na(df)
table(is.na(df))
table(is.na(df$gender))
table(is.na(df$score))

# `//` is not comment syntax in R and broke parsing; trailing notes now use `#`.
exam$class == 1  # logical vector: TRUE only for rows matching the condition

# Dropping missing values: filter() keeps only rows whose condition is TRUE.
df %>% filter(is.na(score))   # rows where score is NA
df %>% filter(!is.na(score))  # rows where score is not NA

# Remove every row that has an NA in either column.
dfNoMiss <- df %>% filter(!is.na(score) & !is.na(gender))
dfNoMiss

# Base R equivalent: drop rows containing any NA.
dfNoMiss2 <- na.omit(df)
dfNoMiss2

# Aggregates can skip NA internally instead of filtering first.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
mean(df$score, na.rm = TRUE)
exam %>% summarise(meanSci = mean(science, na.rm = TRUE))

# Inject NA into the hwy column at specific rows.
mpg[c(65, 124, 131, 153, 212), "hwy"] <- NA

# Extract one column for specific rows: first by column position, then by name.
exam[c(2, 5, 10), 5]
exam[c(2, 5, 10), "science"]

///////////////////////////////////////////

### Replace missing values with the column mean ###
exam

# Mean of the observed science scores (NA ignored).
mean_sc <- mean(exam$science, na.rm = TRUE)

# Fill each NA with that mean; other values are kept as they are.
exam$science <- ifelse(is.na(exam$science), mean_sc, exam$science)
exam$science

# Confirm no NA remains, so the plain mean now works.
table(is.na(exam$science))
mean(exam$science)

# A modified column only takes effect once it is assigned back into the
# original table.

/////////////////////////////////////////

# Outliers: find the value -3 in column g and replace it with NA.
data$g <- ifelse(data$g == -3, NA, data$g)
data

# Likewise, turn values of s below 3 into NA.
data$s <- ifelse(data$s < 3, NA, data$s)
data

//////////////////////////////////

# Check for extreme values visually.
boxplot(mpg$cty)

# Print the five summary statistics directly: lower whisker, lower hinge,
# median, upper hinge, upper whisker.
boxplot(mpg$hwy)$stats

# Replace hwy values outside 12..37 with NA, then count the NAs.
mpg$hwy <- ifelse(mpg$hwy < 12 | mpg$hwy > 37, NA, mpg$hwy)
table(is.na(mpg$hwy))

//////////////////////////////////////

# Compute the boxplot statistics for cty once instead of redrawing the plot
# for every lookup. Element 1 is the lower whisker, element 5 the upper whisker.
cty_stats <- boxplot(mpg$cty)$stats
cty_stats[1]

# Replace cty values outside the whiskers with NA.
# The original stored this result in mpg$hwy, overwriting hwy with cty values
# while the check below looks at cty; it now writes back to cty.
mpg$cty <- ifelse(
  mpg$cty < cty_stats[1] | mpg$cty > cty_stats[5],
  NA,
  mpg$cty
)
table(is.na(mpg$cty))

# MATRIX
# A 3 x 2 integer matrix, filled column by column.
a <- matrix(seq_len(6), nrow = 3)
str(a)
class(a)

# A one-row, five-column matrix converted to a data frame.
b <- data.frame(matrix(6:10, nrow = 1))

************************흐름제어*************************

//////////////// 변수에 자연수 집합 바로 넣기

# Assign an integer sequence straight into a variable.
x <- 1:5

# `%%` is the modulo operator: the remainder after dividing by 2, not the
# quotient. (The original note used `//`, which is not R comment syntax.)
x %% 2

# Basic for loop: print 1 through 10.
for (i in 1:10) {
  print(i)
}

/////////////////////////선형 모델 //////

# Toy data where y is exactly 2 * x.
df <- data.frame(x = 1:5, y = seq(2, 10, 2))

# Residuals: blank out one response value so the two NA-handling modes can
# be compared.
df[3, 2] <- NA

# Drop a stray V3 column if one exists. The original used select(-V3), which
# errors here because df only has x and y; assigning NULL is a no-op when the
# column is absent.
df$V3 <- NULL

# na.exclude keeps the residual vector aligned with the input rows (NA at the
# dropped row); na.omit returns residuals only for the rows actually fitted.
resid(lm(y ~ x, data = df, na.action = na.exclude))
resid(lm(y ~ x, data = df, na.action = na.omit))

# resid: the difference between the observed and the fitted values.
# lm: fits a linear model.

////////////////////사용자 정의 함수 //////////////

# User-defined function: prints its first argument, then its second.
f <- function(a, b) {
  print(a)
  print(b)
}

# Positional call: a = 1, b = 2.
f(1, 2)

# Named call: arguments are matched by name, so the order does not matter.
f(b = 1, a = 2)

///////////////////////////////////////////////////

# Browse the datasets: show the help index of the "datasets" package.

library(help="datasets")

저작자표시 (새창열림)

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

라이브러리, ggplot x축 분할, Flip, Join, excel, discrete, fill, position (0)	2019.05.20
library, wordcloud, str_split, paste, nchar, str_replace, ggplot , wefare (0)	2019.05.17
R <---> Mysql 연동 (0)	2019.05.16
rbind, apply, sample,split, subset, names, merge (0)	2019.05.16
패키지 추가, 조건 결합, 범주형, qplot, excel, filter, select, arrange (0)	2019.05.13

Software knowledge worth spreading

head, view, summary, dim, summarize, rename, mutate, group_by

'딥러닝 모델 설계 > R STUDIO' 카테고리의 다른 글

+ Recent posts

티스토리툴바