# 22-08-26 chap17_RandomForest

# 서부남93 2022. 8. 29. 09:49

# chap17_RandomForest

##################################################
#randomForest
##################################################
# 결정트리(Decision tree)에서 파생된 모델
# 랜덤포레스트는 앙상블 학습기법을 사용한 모델
# 앙상블 학습 : 새로운 데이터에 대해서 여러 개의 Tree로 학습한 다음,
# 학습 결과들을 종합해서 예측하는 모델(PPT 참고)
# DT보다 성능 향상, 과적합 문제를 해결

# Install randomForest only when it is missing, so that re-running the
# script does not reinstall the package every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest) # provides randomForest()

data(iris)

# 1. Build a random forest model: classification tree
# Usage: randomForest(y ~ x, data, ntree = 500, mtry)
?randomForest
# ntree = 500 : number of trees
# Default mtry, as listed in the randomForest() usage:
#   if (!is.null(y) && !is.factor(y))  -> y is numeric (regression)
#     max(floor(ncol(x) / 3), 1)
#   else floor(sqrt(ncol(x)))          -> y is a factor (classification)

# Choose mtry = 2 for iris
unique(iris$Species) # 3 Levels: setosa ...
floor(sqrt(ncol(iris[-5]))) # 2

model <- randomForest(Species ~ ., data = iris,
                      ntree = 500, mtry = 2, na.action = na.omit)
model
# ntree : number of trees to grow
# mtry : number of predictors sampled as split candidates
#        (classification: sqrt(n), regression: n / 3)
# na.action : how missing values are handled (na.omit drops those rows)

# OOB estimate of  error rate: 4%
# OOB error : error measured on the observations that were not drawn into
#             the bootstrap sample of a tree

# 2. Inspect the fitted model
names(model) # 19 components

# Confusion matrix
confusion <- model$confusion
confusion

# model$confusion holds the class counts plus a trailing "class.error"
# column. That column must be excluded before summing, otherwise the
# per-class error rates inflate the denominator and the accuracy is wrong.
confusion_counts <- confusion[, colnames(confusion) != "class.error",
                              drop = FALSE]
acc <- sum(diag(confusion_counts)) / sum(confusion_counts)
acc # equals 1 - OOB error rate (0.96 when the OOB error rate is 4%)

# Importance of the x variables
model$importance
# MeanDecreaseGini : total decrease in node impurity (Gini index) from
#                    splits on the variable, averaged over all trees
# Sample output from one run (values vary between runs):
#Sepal.Length         7.911342
#Sepal.Width          1.892859
#Petal.Length        44.920018
#Petal.Width         44.601208

# 3. Refit with variable-importance assessment enabled
# importance = TRUE (spelled out: T is an ordinary variable that can be
# reassigned) adds the permutation-based accuracy measure to the output.
model2 <- randomForest(Species ~ ., data = iris,
                       ntree = 500, mtry = 2,
                       importance = TRUE,
                       na.action = na.omit)
model2

importance(model2)
# MeanDecreaseAccuracy : contribution of the variable to classification
#                        accuracy
# MeanDecreaseGini : contribution of the variable to reducing Gini node
#                    impurity

varImpPlot(model2)

################################
## Regression tree (y variable: ratio scale)
################################
library(MASS)
data("Boston")
str(Boston)
#crim : per capita crime rate by town
#zn : proportion of residential land zoned for lots over 25,000 sq.ft.
#indus : proportion of non-retail business acres per town
#chas : Charles River dummy variable (1: tract bounds river, 0: otherwise)
#nox : nitrogen oxides concentration (parts per 10 million)
#rm : average number of rooms per dwelling
#age : proportion of owner-occupied units built prior to 1940
#dis : weighted mean of distances to five Boston employment centres
#rad : index of accessibility to radial highways
#tax : full-value property-tax rate per $10,000
#ptratio : pupil-teacher ratio by town
#black : 1000(Bk - 0.63)^2, where Bk is the proportion of blacks by town
#lstat : lower status of the population (percent)
#medv(y) : median value of owner-occupied homes (unit: $1,000)

# Number of predictors: every column except the response medv (13 for Boston)
x <- ncol(Boston) - 1
floor(x / 3) # 4 -> default mtry for regression

# The tree-count argument is ntree. The previous spelling "mtree" was
# swallowed by `...` and silently ignored, so the requested value never
# reached the model.
boston_model <- randomForest(medv ~ ., data = Boston,
                             ntree = 500, mtry = 4,
                             importance = TRUE,
                             na.action = na.omit)

boston_model
#Mean of squared residuals: 10.12131  -> MSE
#              % Var explained: 88.01 -> pseudo R-squared: 1 - MSE / Var(y)

names(boston_model)  # 18 components
# used below: "predicted" "mse" "y"

# R2 score: squared Pearson correlation between the observed response and
# the model's stored predictions
y_pred <- boston_model[["predicted"]]
y_true <- boston_model[["y"]]

R2 <- cor(y_true, y_pred)^2
R2 # 0.8872821

# MSE: mean of the squared prediction errors
mse <- mean((y_pred - y_true)^2)
mse # 10.12131

# Inspect the important variables
importance(boston_model)
varImpPlot(boston_model)

################################
## Classification tree (y variable: categorical)
################################
titanic <- read.csv(file.choose()) # titanic3.csv
str(titanic)
# Description of the titanic3.csv variables
#'data.frame': 1309 obs. of 14 variables:
#1.pclass : passenger class, stored as 1, 2, 3
#2.survived : survival flag. survived = 1, dead = 0
#3.name : name (dropped)
#4.sex : sex. female, male
#5.age : age
#6.sibsp : number of siblings or spouses aboard
#7.parch : number of parents or children aboard
#8.ticket : ticket number (dropped)
#9.fare : ticket fare
#10.cabin : cabin number (dropped)
#11.embarked : port of embarkation (dropped) C(Cherbourg), Q(Queenstown), S(Southampton)
#12.boat     : (dropped) Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
#13.body     : (dropped) int  NA NA NA 135 NA NA NA NA NA 22 ...
#14.home.dest: (dropped)

# Columns to drop: 3, 8, 10~14
df <- titanic[, -c(3, 8, 10:14)]
dim(df)  # 1309    7

# Convert the y variable to a factor: required for a classification forest
df$survived <- as.factor(df$survived) # grouping variable

# Since R 4.0.0 read.csv() keeps strings as character instead of factor.
# Convert sex explicitly so it enters the model as a categorical predictor
# (a no-op when the column is already a factor).
df$sex <- as.factor(df$sex)

# Default mtry for classification: sqrt of the number of predictors
# (all columns except survived -> 6)
floor(sqrt(ncol(df) - 1)) # mtry = 2

# Build the classification model
# The tree-count argument is ntree. The previous spelling "mtree" was
# swallowed by `...` and silently ignored.
# na.omit drops the rows with missing values before fitting.
titanic_model <- randomForest(survived ~ ., data = df,
                              ntree = 500, mtry = 2,
                              importance = TRUE,
                              na.action = na.omit)

titanic_model
# Sample output from one run (values vary between runs):
#OOB estimate of  error rate: 19.62%
#Confusion matrix:
#  0   1 class.error
#0 559  59  0.09546926
#1 146 281  0.34192037

# Show the important variables
importance(titanic_model)
varImpPlot(titanic_model)