(click to enlarge)
(click to enlarge)
(click to enlarge)
(click to enlarge)
(click to enlarge)
(click to enlarge)
# *------------------------------------------------------------------
# | PROGRAM NAME: R_tree_basic
# | DATE: 4/26/11
# | CREATED BY: Matt Bogard
# | PROJECT FILE: P:\R Code References\Data Mining_R
# *------------------------------------------------------------------
# | PURPOSE: demo of basic decision tree mechanics
# |
# | Fits an rpart tree to class ~ x1 + x2, then re-creates each split
# | by hand: the data are dichotomised at the cutoff and a chi-square
# | test of independence against class is run. The same test is
# | repeated at an arbitrarily higher and lower cutoff for comparison.
# | Finally both data sets are exported for re-analysis in SAS.
# *------------------------------------------------------------------

library(rpart)   # decision tree library
library(rattle)  # data mining package (drawTreeNodes)
library(foreign) # write.foreign for the SAS export

# *------------------------------------------------------------------
# | locate data
# *------------------------------------------------------------------

# The workspace is deliberately NOT cleared and the working directory
# is NOT changed: both would clobber the caller's session, and calling
# setwd() on a path that does not exist aborts a sourced script.
# Instead, use the first known data directory that exists on this
# machine and fall back to the current directory.
candidate_dirs <- c(
  "/Users/wkuuser/Desktop/R Data Sets", # mac
  "P:\\R Code References\\R Data"       # windows
)
existing_dirs <- candidate_dirs[dir.exists(candidate_dirs)]
data_dir <- if (length(existing_dirs) > 0) existing_dirs[[1]] else "."

# *------------------------------------------------------------------
# | helper
# *------------------------------------------------------------------

# Label each element of `x` by the side of `cutoff` it falls on.
#   x        numeric vector to dichotomise
#   var_name name of the variable, used as the label prefix
#   cutoff   numeric threshold; values >= cutoff get the ">=" label
# Returns a character vector the same length as `x` (NA where x is NA).
# Building the labels from `cutoff` keeps the threshold and its label
# in sync; the text matches the labels used previously, including the
# trailing space after the ">=" label.
split_label <- function(x, var_name, cutoff) {
  ifelse(
    x >= cutoff,
    paste0(var_name, " >=", cutoff, " "),
    paste0(var_name, " < ", cutoff)
  )
}

# *------------------------------------------------------------------
# | get data
# *------------------------------------------------------------------

dat1 <- read.csv(
  file.path(data_dir, "basicTree.csv"),
  na.strings = c(".", "NA", "", "?"),
  encoding = "UTF-8"
)

# plot data space
# NOTE(review): col = dat1$class presumably expects class to be numeric
# or a factor; a character column would be read as colour names - confirm.
plot(dat1$x2, dat1$x1, col = dat1$class)

# fit decision tree
r <- rpart(class ~ x1 + x2, data = dat1)
print(r)

plot(r)
text(r)

drawTreeNodes(r) # for more detailed tree plot supported by rattle

# *------------------------------------------------------------------
# | chi square test - 1st split
# *------------------------------------------------------------------

# create a categorical for the first cutoff, x2 >= 6.5
dat1$cutoff <- split_label(dat1$x2, "x2", 6.5)

tab1 <- table(dat1$cutoff, dat1$class) # cross tabulation
print(tab1)

Xsq <- chisq.test(tab1, correct = FALSE) # chi-squared test for independence
print(Xsq)
print(Xsq$expected) # print expected values

# *------------------------------------------------------------------
# | chi square test - 1st split - choose an arbitrarily higher and
# | lower split value and compare to optimal split chosen by tree
# *------------------------------------------------------------------

# higher x2 split value
dat1$h1 <- split_label(dat1$x2, "x2", 7.5)

tab2 <- table(dat1$h1, dat1$class) # cross tabulation
print(tab2)

Xsq <- chisq.test(tab2, correct = FALSE) # chi-squared test for independence
print(Xsq) # chi square value is lower

# lower x2 split value
dat1$l1 <- split_label(dat1$x2, "x2", 5.5)

tab3 <- table(dat1$l1, dat1$class) # cross tabulation
print(tab3)

Xsq <- chisq.test(tab3, correct = FALSE) # chi-squared test for independence
print(Xsq) # chi square value is lower

# look at current dat1 data set summary
print(dim(dat1))
print(names(dat1))

# *------------------------------------------------------------------
# | chi square test - 2nd split
# *------------------------------------------------------------------

# *------------------------------------------------------------------
# | get data
# *------------------------------------------------------------------

# to get the data in the 2nd split we have to first subset or partition
# the data where x2 >= 6.5 (hence the partition in the 'recursive
# partitioning' algorithm used by decision trees).
# which() drops rows whose x2 is NA; plain logical indexing would turn
# each of them into a row consisting entirely of NA.
dat2 <- dat1[which(dat1$x2 >= 6.5), ]
print(dim(dat2)) # N = 44 which is correct, recall print(tab1)

# *------------------------------------------------------------------
# | create a categorical for the second cutoff for x1 >= 4.5
# *------------------------------------------------------------------

dat2$cutoff <- split_label(dat2$x1, "x1", 4.5)

# *------------------------------------------------------------------
# | cross tab & chi square test
# *------------------------------------------------------------------

tab4 <- table(dat2$cutoff, dat2$class) # cross tabulation
print(tab4)

Xsq <- chisq.test(tab4, correct = FALSE) # chi-squared test for independence
print(Xsq) # X-squared = 44, df = 1, p-value = 3.284e-11

# *------------------------------------------------------------------
# | chi square test - 2nd split - choose an arbitrarily higher and
# | lower split value and compare to optimal split chosen by tree
# *------------------------------------------------------------------

# higher x1 split value
dat2$h2 <- split_label(dat2$x1, "x1", 5.5)

tab5 <- table(dat2$h2, dat2$class) # cross tabulation
print(tab5)

Xsq <- chisq.test(tab5, correct = FALSE) # chi-squared test for independence
print(Xsq) # X-squared is lower

# lower x1 split value
dat2$l2 <- split_label(dat2$x1, "x1", 3.5)

tab6 <- table(dat2$l2, dat2$class) # cross tabulation
print(tab6)

Xsq <- chisq.test(tab6, correct = FALSE) # chi-squared test for independence
print(Xsq) # X-squared is lower

# look at data in dat2
print(dim(dat2))
print(names(dat2))

# note dat2 inherits the variables h1 and l1 (cutoffs for x2) from dat1,
# but they are not relevant to our analysis in the 2nd split (which
# focuses on cutoffs for x1)

# *------------------------------------------------------------------
# | export data sets to SAS to repeat analysis using a SAS data set,
# | base SAS, and Enterprise Miner
# *------------------------------------------------------------------

# export data and SAS code for analyzing dat1
write.foreign(
  dat1,
  file.path(data_dir, "dat1.txt"),
  file.path(data_dir, "dat1_read.sas"),
  package = "SAS"
)

# export data and SAS code for analyzing dat2
write.foreign(
  dat2,
  file.path(data_dir, "dat2.txt"),
  file.path(data_dir, "basicTreedat2_read.sas"),
  package = "SAS"
)
No comments:
Post a Comment