Sunday, May 1, 2011

Decision Tree Mechanics with R and SAS

(click to enlarge)






(click to enlarge)


(click to enlarge)


(click to enlarge)

(click to enlarge)

(click to enlarge)
(click to enlarge)
# *------------------------------------------------------------------
# | PROGRAM NAME: R_tree_basic 
# | DATE:4/26/11   
# | CREATED BY: Matt Bogard 
# | PROJECT FILE:P:\R  Code References\Data Mining_R              
# *----------------------------------------------------------------
# | PURPOSE: demo of basic decision tree mechanics               
# |
# *------------------------------------------------------------------
 
# Session setup.
# The workspace is deliberately NOT wiped with rm(list = ls()): every object
# this script uses is (re)created below, and clearing the global environment
# would silently destroy unrelated work in an interactive session.
ls() # view open data sets

# Candidate data directories, one per machine the script is run on.
# Calling setwd() on both unconditionally always fails on one platform and
# aborts the script, so only the first directory that exists is used.
data_dirs <- c(
  "/Users/wkuuser/Desktop/R Data Sets", # mac
  "P:\\R  Code References\\R Data"      # windows
)
available_dirs <- data_dirs[dir.exists(data_dirs)]
if (length(available_dirs) > 0) {
  setwd(available_dirs[1])
} else {
  # Fall back to the current directory so the script can still run when
  # basicTree.csv has been placed there.
  warning(
    "None of the expected data directories exist; staying in ", getwd(),
    call. = FALSE
  )
}

library(rpart) # load (attach) the rpart decision tree library
 
# *------------------------------------------------------------------
# | get data            
# *-----------------------------------------------------------------
 
# Read the demo data; ".", "NA", "" and "?" are all treated as missing values.
dat1 <- read.csv(
  "basicTree.csv",
  na.strings = c(".", "NA", "", "?"),
  encoding = "UTF-8"
)

# Plot the data space (x2 on the horizontal axis, x1 on the vertical axis),
# coloured by class. The class column is mapped to integer codes 1, 2, ...
# first: since R 4.0.0 read.csv() no longer converts strings to factors, so a
# character class column passed straight to `col` would be read as colour
# names, and a numeric class of 0 would be drawn in the background colour.
plot(dat1$x2, dat1$x1, col = as.integer(factor(dat1$class)))
 
# fit decision tree: recursive partitioning of class on x1 and x2

r <- rpart(class ~ x1 + x2, data = dat1)
print(r) # explicit print so the fitted splits also show up under source()

# base-graphics drawing of the tree, then split / node labels on top of it
plot(r)
text(r)

# More detailed tree plot supported by rattle (data mining package).
# rattle is only needed for this one optional plot, so its absence should not
# abort the rest of the analysis.
if (requireNamespace("rattle", quietly = TRUE)) {
  rattle::drawTreeNodes(r)
} else {
  message("Package 'rattle' is not installed; skipping drawTreeNodes().")
}
 
# *------------------------------------------------------------------
# |  
# | 
# | chi square test - 1st split 
# |   
# |         
# *-----------------------------------------------------------------
 
# create a categorical for the first cutoff, x2 >= 6.5
# (rows with a missing x2 get NA and are left out of the table below)

dat1$cutoff <- ifelse(dat1$x2 >= 6.5, "x2 >=6.5 ", "x2 < 6.5")

# table() is part of base R, so no extra package (e.g. MASS) is required
# for the cross tabulation

tab1 <- table(dat1$cutoff, dat1$class) # cross tabulation: split side x class
print(tab1)

# chi-squared test for independence, without the continuity correction
Xsq <- chisq.test(tab1, correct = FALSE)
print(Xsq)

# print expected values; the component is spelled out in full because
# `Xsq$exp` only worked through partial matching of `$expected`
print(Xsq$expected)
 
# *------------------------------------------------------------------
# |  chi square test - 1st split - choose an arbitrarily higher and 
# |  lower split value and compare to optimal split chosen by tree            
# *-----------------------------------------------------------------
 
# --- candidate split above the tree's cutoff: x2 >= 7.5 ---

dat1[["h1"]] <- ifelse(
  test = dat1[["x2"]] >= 7.5,
  yes = "x2 >=7.5 ",
  no = "x2 < 7.5"
)
tab2 <- table(dat1[["h1"]], dat1[["class"]]) # split side by class
print(tab2)

# test of independence for this split (no continuity correction)
Xsq <- chisq.test(x = tab2, correct = FALSE)
print(Xsq) # chi square value is lower

# --- candidate split below the tree's cutoff: x2 >= 5.5 ---

dat1[["l1"]] <- ifelse(
  test = dat1[["x2"]] >= 5.5,
  yes = "x2 >=5.5 ",
  no = "x2 < 5.5"
)
tab3 <- table(dat1[["l1"]], dat1[["class"]]) # split side by class
print(tab3)

# test of independence for this split (no continuity correction)
Xsq <- chisq.test(x = tab3, correct = FALSE)
print(Xsq) # chi square value is lower

# summary of dat1 as it stands now: dimensions, then column names
# (includes the cutoff / h1 / l1 indicator columns added so far)
dim(dat1)
names(dat1)
 
# *------------------------------------------------------------------
# |  
# |
# | chi square test - 2nd split   
# |
# |          
# *-----------------------------------------------------------------
 
# *------------------------------------------------------------------
# | get data            
# *-----------------------------------------------------------------
 
# To get the data for the 2nd split we first have to subset (partition) the
# data where x2 >= 6.5 (hence the 'partitioning' in the 'recursive
# partitioning' algorithm used by decision trees).
# which() keeps only rows where the condition is TRUE; indexing with the raw
# logical vector would insert an all-NA row for every missing x2 value.

dat2 <- dat1[which(dat1$x2 >= 6.5), ]
dim(dat2) # N = 44 which is correct, recall print(tab1)

# *------------------------------------------------------------------
# | create a categorical for the second cutoff for x1 >= 4.5
# *-----------------------------------------------------------------

# overwrites the x2-based `cutoff` column that dat2 inherited from dat1
dat2$cutoff <- ifelse(dat2$x1 >= 4.5, "x1 >=4.5 ", "x1 < 4.5")

# *------------------------------------------------------------------
# | cross tab & chi square test
# *-----------------------------------------------------------------

tab4 <- table(dat2$cutoff, dat2$class) # cross tabulation: split side x class
print(tab4)

# chi-squared test for independence, without the continuity correction
Xsq <- chisq.test(tab4, correct = FALSE)
print(Xsq) # X-squared = 44, df = 1, p-value = 3.284e-11
 
# *------------------------------------------------------------------
# |  chi square test - 2nd split - choose an arbitrarily higher and 
# |  lower split value and compare to optimal split chosen by tree            
# *-----------------------------------------------------------------
 
# --- candidate split above the tree's cutoff: x1 >= 5.5 ---

dat2[["h2"]] <- ifelse(
  test = dat2[["x1"]] >= 5.5,
  yes = "x1 >=5.5 ",
  no = "x1 < 5.5"
)
tab5 <- table(dat2[["h2"]], dat2[["class"]]) # split side by class
print(tab5)

# test of independence for this split (no continuity correction)
Xsq <- chisq.test(x = tab5, correct = FALSE)
print(Xsq) # X-squared  is lower

# --- candidate split below the tree's cutoff: x1 >= 3.5 ---

dat2[["l2"]] <- ifelse(
  test = dat2[["x1"]] >= 3.5,
  yes = "x1 >=3.5 ",
  no = "x1 < 3.5"
)
tab6 <- table(dat2[["l2"]], dat2[["class"]]) # split side by class
print(tab6)

# test of independence for this split (no continuity correction)
Xsq <- chisq.test(x = tab6, correct = FALSE)
print(Xsq) # X-squared  is lower

# summary of the partitioned data: dimensions, then column names
dim(dat2)
names(dat2)
 
# note dat2 inherits the variables h1 and l1 (cutoffs for x2) from dat1, but they are not 
# relevant to our analysis in the 2nd split (which focuses on cutoffs for x1)
 
 
# *------------------------------------------------------------------
# |  
# | export data sets to SAS to repeat analysis using a SAS data set,
# | base SAS, and Enterprise Miner
# |   
# |          
# *-----------------------------------------------------------------
 
 
# export data sets to SAS to repeat analysis using a SAS data set,
# base SAS, and Enterprise Miner
 
library(foreign)

# Each call writes two files into the working directory: a plain-text data
# file and a SAS program that reads that data file back in.

# full data set (dat1)
write.foreign(
  df = dat1,
  datafile = "dat1.txt",
  codefile = "dat1_read.sas",
  package = "SAS"
)

# second-split partition (dat2)
write.foreign(
  df = dat2,
  datafile = "dat2.txt",
  codefile = "basicTreedat2_read.sas",
  package = "SAS"
)
 
 
 
 
 
 
Created by Pretty R at inside-R.org

No comments:

Post a Comment