# Rattle: Data Mining by Example
#
# Copyright (c) 2011 Graham.Williams@togaware.com
#
# Random Forests - Chapter 12 of the Rattle book.
#
# Note how these algorithms are sensitive to missing values. Thus we use the
# subset of the dataset that excludes observations with missing values.

# NOTE(review): weatherDS, doRiskChart and the names used inside the evalq()
# blocks below (form, data, vars, target, risk, train.na.omit, test.na.omit)
# are not defined in this script; presumably datasets.R provides them.
source("datasets.R")

################################################################################
# Random Forest

# Child environment of weatherDS: the model and its results are stored here
# while names not found locally are looked up in weatherDS.
weatherRF <- new.env(parent = weatherDS)

evalq({
  # library() rather than require(): fail immediately if the package is
  # missing instead of continuing and failing later at the first call.
  library(randomForest)
  set.seed(42)
  # Fit the forest and record how long the fit takes.
  build.time <- system.time(
    model <- randomForest(formula = form, data = data[train.na.omit, vars])
  )
  # Second column of the class-probability matrix, one value per test row.
  pr <- predict(model, data[test.na.omit, vars], type = "prob")[, 2]
  # Predicted class labels for the same test rows.
  cl <- predict(model, data[test.na.omit, vars])
  doRiskChart(pr, data, test.na.omit, target, risk,
              main = "Risk Chart randomForest weather [test] RISK_MM")
}, weatherRF)

weatherRF$model
weatherRF$build.time

################################################################################
# Conditional Trees Random Forest

weatherCFOREST <- new.env(parent = weatherDS)

evalq({
  library(party)
  set.seed(42)
  build.time <- system.time(
    model <- cforest(formula = form, data = data[train.na.omit, vars])
  )
  # treeresponse() yields one element per test row; keep the second entry of
  # each. vapply() guarantees a numeric vector whatever the input length.
  pr <- vapply(treeresponse(model, data[test.na.omit, vars]),
               function(x) x[2],
               numeric(1))
  # Map pr <= 0.5 to the first level of the target and pr > 0.5 to the second.
  cl <- levels(data[[target]])[1 + as.integer(pr > 0.5)]
  doRiskChart(pr, data, test.na.omit, target, risk,
              main = "Risk Chart cforest weather [test] RISK_MM")
}, weatherCFOREST)

print(weatherCFOREST$model)

# The cforest implementation takes more time.
weatherCFOREST$build.time

# List the importance of the variables. We need to ensure missing values are not
# included in the training data to be able to list variable importance.
vi <- as.data.frame(sort(varimp(weatherCFOREST$model), decreasing = TRUE))
names(vi) <- "Importance"
vi

# Print the first tree.
# Pretty-print the first tree of the cforest ensemble, labelled with the names
# of the model's input variables. `:::` reaches into party's namespace
# (prettytree is presumably not exported - TODO confirm).
with(weatherCFOREST,
     party:::prettytree(model@ensemble[[1]],
                        names(model@data@get("input"))))

################################################################################
# Experimental: Try CORElearn package random forest

weatherCORERF <- new.env(parent = weatherDS)

evalq({
  # library() rather than require(): fail immediately if the package is
  # missing instead of continuing and failing later at the first call.
  library(CORElearn)
  set.seed(42)
  # Note this section indexes with train/test rather than the .na.omit
  # variants of those index sets.
  build.time <- system.time(
    model <- CoreModel(formula = form, data = data[train, vars], model = "rf")
  )
  predicted <- predict(model, data[test, vars])
  pr <- predicted$prob[, 2]
  cl <- predicted$class
  doRiskChart(pr, data, test, target, risk,
              main = "Risk Chart CoreModel rf weather [test] RISK_MM")
  # Last expression, so its value is what evalq() returns.
  modelEval(model, data[test, target], cl)
}, weatherCORERF)
# NOTE(review): leftover argument list, presumably extra CoreModel() options
# that were being tried out - confirm before reinstating:
#selectionEstimator="MDL", minNodeWeightRF=5, rfNoTrees=500)

weatherCORERF$build.time

################################################################################
# Experiment with multiple models of the same data to illustrate the random
# element of building a random forest.

# Loading the package does not depend on the loop index, so do it once here.
library(randomForest)

# Seed once, before the loop: each iteration then continues the same random
# stream, so the five forests differ from one another.
set.seed(42)

for (i in seq_len(5)) {
  # Each iteration overwrites model, build.time, pr and cl in weatherRF; the
  # loop index i is used inside the block for the plot title.
  evalq({
    build.time <- system.time(
      model <- randomForest(formula = form, data = data[train.na.omit, vars])
    )
    pr <- predict(model, data[test.na.omit, vars], type = "prob")[, 2]
    cl <- predict(model, data[test.na.omit, vars])
    doRiskChart(pr, data, test.na.omit, target, risk,
                main = paste("randomForest weather [test] RISK_MM Plot", i))
  }, weatherRF)

  cat(paste("\n**********\nModel", i, "\n**********\n"))
  print(weatherRF$model)
  print(weatherRF$build.time)
}