#*********************************************************************************************************
# © 2013 jakemdrew.com. All rights reserved.
# This source code is licensed under The GNU General Public License (GPLv3):
# http://opensource.org/licenses/gpl-3.0.html
#*********************************************************************************************************
#*********************************************************************************************************
# R Machine Learning in Parallel with GLM, SVM, and AdaBoost
# Created By - Jake Drew
# Version - 1.0, 03/07/2014
#*********************************************************************************************************
#-------------------------------------------------------------------------------
#-----------------------Create Test and Training Data Bootstraps----------------
#-------------------------------------------------------------------------------
#number of bootstrap samples to create
sampleCount <- 20
# Thanks to the UCI repository Magic Gamma telescope data set
# http://archive.ics.uci.edu/ml/machine-learning-databases/
magicGamma = read.table(
"http://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data"
, header = F, sep=",")
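# magic04.data has 10 numeric features (V1-V10) and a class label
# (V11: g = gamma signal, h = hadron background)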
#create unique id for magicGamma file (used later)
magicGamma <- data.frame(id=1:nrow(magicGamma),magicGamma)
# Take 20% random sample of the data for testing, 80% for training
testIndexes <- sample(1:nrow(magicGamma), size=0.2*nrow(magicGamma))
testData <- magicGamma[testIndexes,]
trainData <- magicGamma[-testIndexes,]
# Create random bootstrap training samples (with replacement)
trainSamples <- lapply(1:sampleCount, function(i) {
trainData[sample(1:nrow(trainData)
, size=0.2*nrow(trainData), replace=TRUE),]
})
#-------------------------------------------------------------------------------
#-----------------------Setup Parallel Processing-------------------------------
#-------------------------------------------------------------------------------
#number of bootstrap samples to create
sampleCount <- 8
# Run in parallel on Linux/Mac using doMC (uncomment these lines to use the fork-based backend)
#library(doMC)
#registerDoMC(cores=sampleCount) #<- # of processors / hyperthreads on machine
# Run in parallel on Windows using doSNOW (socket cluster; also works on Linux/Mac)
library(doSNOW)
cluster<-makeCluster(sampleCount) #<- # of processors / hyperthreads on machine
registerDoSNOW(cluster)
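# Optionally (an alternative, not part of the original script), the cluster can
# be sized from the machine's actual core count using the parallel package:
# cluster <- makeCluster(parallel::detectCores())
# registerDoSNOW(cluster)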
#-------------------------------------------------------------------------------
#-----------------------Create Random Sample Test and Training Data-------------
#-------------------------------------------------------------------------------
# Thanks to the UCI repository Magic Gamma telescope data set
# http://archive.ics.uci.edu/ml/machine-learning-databases/
magicGamma = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data", header = F, sep=",")
#create unique id for magicGamma file
magicGamma <- data.frame(id=1:nrow(magicGamma),magicGamma)
# Take 20% random sample of the data for testing, 80% for training
testIndexes <- sample(1:nrow(magicGamma), size=0.2*nrow(magicGamma))
testData <- magicGamma[testIndexes,]
trainData <- magicGamma[-testIndexes,]
# Create random bootstrap training samples (with replacement) in parallel
trainSamples <- foreach(i = 1:sampleCount) %dopar% {
trainData[sample(1:nrow(trainData)
, size=0.2*nrow(trainData), replace=TRUE),]
}
#-------------------------------------------------------------------------------
#-----------------------Time Bootstrap Model Training Serial vs. Parallel-------
#-------------------------------------------------------------------------------
#number of bootstrap samples to create
sampleCount <- 8
#-----------------------glm serial-----------------------
timer <- proc.time()
glmSerial <- foreach(i = 1:sampleCount) %do% {
glm(V11 ~ .,trainSamples[[i]][,-1], family=binomial())
}
proc.time() - timer
#-----------------------glm parallel-----------------------
timer <- proc.time()
glmParallel <- foreach(i = 1:sampleCount) %dopar% {
glm(V11 ~ .,trainSamples[[i]][,-1], family=binomial())
}
proc.time() - timer
#-----------------------svm serial-----------------------
timer <- proc.time()
modelDataSvm <- foreach(i = 1:sampleCount) %do% {
library(e1071)
svm(V11 ~ ., trainSamples[[i]][,-1], probability=TRUE
, cost=10, gamma=0.1)
}
proc.time() - timer
#-----------------------svm parallel-----------------------
timer <- proc.time()
modelDataSvm <- foreach(i = 1:sampleCount) %dopar% {
library(e1071)
svm(V11 ~ ., trainSamples[[i]][,-1], probability=TRUE
, cost=10, gamma=0.1)
}
proc.time() - timer
#-----------------------ada serial-----------------------
timer <- proc.time()
modelDataAda <- foreach(i = 1:sampleCount) %do% {
library(ada)
ada(x = trainSamples[[i]][,c(-1,-ncol(testData))]
, y = as.numeric(trainSamples[[i]][,ncol(testData)]) - 1)
}
proc.time() - timer
#-----------------------ada parallel-----------------------
timer <- proc.time()
modelDataAda <- foreach(i = 1:sampleCount) %dopar% {
library(ada)
ada(x = trainSamples[[i]][,c(-1,-ncol(testData))]
, y = as.numeric(trainSamples[[i]][,ncol(testData)]) - 1)
}
proc.time() - timer
#-------------------------------------------------------------------------------
#-----------------------Practical Example Code Below----------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-----------------------Setup Parallel Processing-------------------------------
#-------------------------------------------------------------------------------
#number of bootstrap samples to create
sampleCount <- 8
# Run in parallel on Linux/Mac using doMC (uncomment these lines to use the fork-based backend)
#library(doMC)
#registerDoMC(cores=sampleCount) #<- # of processors / hyperthreads on machine
# Run in parallel on Windows using doSNOW (socket cluster; also works on Linux/Mac)
library(doSNOW)
cluster<-makeCluster(sampleCount) #<- # of processors / hyperthreads on machine
registerDoSNOW(cluster)
#-------------------------------------------------------------------------------
#-----------------------Create Random Sample Test and Training Data-------------
#-------------------------------------------------------------------------------
# Thanks to the UCI repository Magic Gamma telescope data set
# http://archive.ics.uci.edu/ml/machine-learning-databases/
magicGamma = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data", header = F, sep=",")
#create unique id for magicGamma file
magicGamma <- data.frame(id=1:nrow(magicGamma),magicGamma)
# Take 20% random sample of the data for testing, 80% for training
testIndexes <- sample(1:nrow(magicGamma), size=0.2*nrow(magicGamma))
testData <- magicGamma[testIndexes,]
trainData <- magicGamma[-testIndexes,]
# Create random bootstrap training samples (with replacement) in parallel
trainSamples <- foreach(i = 1:sampleCount) %dopar% {
trainData[sample(1:nrow(trainData)
, size=0.2*nrow(trainData), replace=TRUE),]
}
#-------------------------------------------------------------------------------
#-----------------------Create Function To Measure Accuracy---------------------
#-------------------------------------------------------------------------------
accuracy <- function (truth, predicted){
    tTable <- table(truth,predicted)
    print(tTable)
    #rows of tTable are the true classes, columns are the predictions;
    #the first class ("g") is treated as the positive class
    tp <- tTable[1,1]
    if(ncol(tTable)>1){ fn <- tTable[1,2] } else { fn <- 0}
    if(nrow(tTable)>1){ fp <- tTable[2,1] } else { fp <- 0}
    if(ncol(tTable)>1 & nrow(tTable)>1){ tn <- tTable[2,2] } else { tn <- 0}
    return((tp + tn) / (tp + tn + fp + fn))
}
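# quick sanity check (toy vectors, not part of the original workflow):
# 3 of the 4 predictions below match the truth, so accuracy() returns 0.75
accuracy(c("g","g","h","h"), c("g","h","h","h"))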
#-------------------------------------------------------------------------------
#-----------------------Benchmark Against All Available Training Data-----------
#-------------------------------------------------------------------------------
#-----------------------glm all-----------------------
timer <- proc.time()
glmAll <- glm(V11 ~ .,trainData[,-1], family=binomial())
proc.time() - timer
timer <- proc.time()
glmAllTest <- predict(glmAll, testData[,c(-1 ,-ncol(testData))])
proc.time() - timer
#add predicted class and actual class to test data
glmAllResults <- data.frame(id = testData$id, actualClass = testData$V11)
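# predict.glm returns values on the link (log-odds) scale; "g" is the first
# factor level, so the model estimates P(class = "h"), and a negative value
# means P("h") < 0.5, i.e. predict "g"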
glmAllResults$predictedClass <- ifelse(glmAllTest < 0,"g","h")
#calculate glm all model accuracy
accuracy(glmAllResults$actualClass,glmAllResults$predictedClass)
#-----------------------svm all-----------------------
timer <- proc.time()
library(e1071)
svmAll <- svm(V11 ~ ., trainData[,-1], probability=TRUE, cost=10, gamma=0.1)
proc.time() - timer
timer <- proc.time()
svmAllTest <- predict(svmAll, testData[,c(-1 ,-ncol(testData))], probability=TRUE)
proc.time() - timer
#add predicted class and actual class to test data
svmAllResults <- data.frame(id = testData$id, actualClass = testData$V11
,predictedClass = svmAllTest)
#calculate svm all model accuracy
accuracy(svmAllResults$actualClass,svmAllResults$predictedClass)
#-----------------------ada all-----------------------
timer <- proc.time()
library(ada)
adaAll <- ada(x = trainData[,c(-1,-ncol(trainData))]
, y = as.numeric(trainData[,ncol(trainData)]) - 1)
proc.time() - timer
timer <- proc.time()
adaAllTest <- predict(adaAll, testData[,c(-1 ,-ncol(testData))], type="probs")
proc.time() - timer
#add predicted class and actual class to test data
adaAllResults <- data.frame(id = testData$id, actualClass = testData$V11)
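# with the 0/1 coding used above (g -> 0, h -> 1), column 1 of the "probs"
# matrix is P(class 0) = P("g") and column 2 is P("h")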
adaAllResults$predictedClass <- ifelse(adaAllTest[,1] > adaAllTest[,2],"g","h")
#calculate ada all model accuracy
accuracy(adaAllResults$actualClass,adaAllResults$predictedClass)
#-------------------------------------------------------------------------------
#-----------------------Create Different Bootstrap Models in Parallel-----------
#-------------------------------------------------------------------------------
#-----------------------glm-----------------------
timer <- proc.time()
modelDataGlm <- foreach(i = 1:sampleCount) %dopar% {
glm(V11 ~ .,trainSamples[[i]][,-1], family=binomial())
}
proc.time() - timer
#-----------------------svm-----------------------
# tune.svm() could be run in parallel before this step if needed to find the
# best values for the cost and gamma parameters (very slow)
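# A rough sketch of that tune.svm() grid search (left commented out because it
# is slow; the grid values are illustrative assumptions, not from the original):
# library(e1071)
# svmTune <- tune.svm(V11 ~ ., data = trainSamples[[1]][,-1]
#                     , gamma = 10^(-2:0), cost = 10^(0:2))
# svmTune$best.parameters  #best cost / gamma found on this bootstrap sample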
timer <- proc.time()
modelDataSvm <- foreach(i = 1:sampleCount) %dopar% {
library(e1071)
svm(V11 ~ ., trainSamples[[i]][,-1], probability=TRUE
, cost=10, gamma=0.1)
}
proc.time() - timer
#-----------------------ada-----------------------
timer <- proc.time()
modelDataAda <- foreach(i = 1:sampleCount) %dopar% {
library(ada)
ada(x = trainSamples[[i]][,c(-1,-ncol(testData))]
, y = as.numeric(trainSamples[[i]][,ncol(testData)]) - 1)
}
proc.time() - timer
#-------------------------------------------------------------------------------
#-----------------------Predict the Test Data in Parallel Using Each Model------
#-------------------------------------------------------------------------------
#-----------------------glm-----------------------
timer <- proc.time()
predictDataGlm <- foreach(i = 1:sampleCount) %dopar% {
predict(modelDataGlm[[i]], testData[,c(-1 ,-ncol(testData))])
}
proc.time() - timer
#-----------------------svm-----------------------
timer <- proc.time()
predictDataSvm <- foreach(i = 1:sampleCount) %dopar% {
library(e1071)
predict(modelDataSvm[[i]], testData[,c(-1 ,-ncol(testData))]
, probability=TRUE)
}
proc.time() - timer
#-----------------------ada-----------------------
timer <- proc.time()
predictDataAda <- foreach(i = 1:sampleCount) %dopar% {
predict(modelDataAda[[i]], testData[,c(-1 ,-ncol(testData))]
, type="probs")
}
proc.time() - timer
#-------------------------------------------------------------------------------
#-----------------------Rank Each Model's Bootstrap Data------------------------
#-------------------------------------------------------------------------------
rankPredictData <- function(predictData, getPofG, rankDataObject=NULL
                            , reverseRank=FALSE, addFinalRank=FALSE){
    rankData <- rankDataObject
    #Rank the test data's probability of class g for each bootstrap model
    rankCols <- foreach(i = 1:length(predictData)) %dopar% {
        if(getPofG == "svm"){
            Pg <- attr(predictData[[i]], "probabilities")[,c("g")]
            g <- ifelse(Pg >= 0.5,1,0)
        } else if (getPofG == "ada"){
            Pg <- predictData[[i]][,1]
            g <- ifelse(predictData[[i]][,1] >= predictData[[i]][,2],1,0)
        } else {
            #glm predictions are log-odds of class h, so values <= 0 vote for g
            Pg <- predictData[[i]]
            g <- ifelse(Pg <= 0,1,0)
        }
        #negate glm's log-odds so that a higher rank always means "more like g"
        if(reverseRank){ Pg <- -Pg }
        data.frame(gVote=g, rankG=rank(Pg))
    }
    #convert the list into one data frame (gVote plus one rank column per model)
    colOffset <- ifelse(is.null(rankData),0,ncol(rankData)-1)
    newRankData <- rankCols[[1]]
    colnames(newRankData)[2] <- paste("rank",colOffset + 1,sep='')
    for(i in 2:length(rankCols)){
        newRankData <- data.frame(newRankData, rankCols[[i]]$rankG)
        newRankData$gVote <- newRankData$gVote + rankCols[[i]]$gVote
        colnames(newRankData)[i+1] <- paste("rank",i + colOffset,sep="")
    }
    #Combine with any previous rankData, if provided
    if(is.null(rankData)){
        rankData <- newRankData
    } else {
        rankData <- data.frame(rankData,newRankData[,-1])
        rankData$gVote <- rankData$gVote + newRankData$gVote
    }
    if(addFinalRank){
        #create the final ranking by summing all rank columns, then sort DESC
        rankData$finalRank <- apply(rankData[,-1],1,sum)
        #add ground truth class column to output
        rankData <- data.frame(id=testData$id,rankData
                               ,actualClass=testData[,ncol(testData)])
        #Sort by the combined rank of class g (most g-like records first)
        rankData <- rankData[with(rankData, order(-finalRank)), ]
        #add predicted class column using a vote-count threshold over the models
        rankData$predictedClass <- ifelse(rankData$gVote >= ((ncol(rankData)-1) / 2),"g","h")
    }
    return(rankData)
}
#-----------------------Create the final ranking tables-------------------
rankGlm <- rankPredictData(predictDataGlm,"glm",NULL,TRUE,TRUE)
rankSvm <- rankPredictData(predictDataSvm,"svm",NULL,FALSE,TRUE)
rankAda <- rankPredictData(predictDataAda,"ada",NULL,FALSE,TRUE)
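# each rank table now holds, per test record: id, the count of bootstrap models
# voting "g" (gVote), one rank column per bootstrap model, the summed finalRank,
# the true class (actualClass), and the vote-based predictedClass; rows are
# sorted so the records the ensemble ranks as most "g"-like come first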
#-------------------------------------------------------------------------------
#-----------------------Calculate Accuracy for each model-----------------------
#-------------------------------------------------------------------------------
#-----------------------topN Accuracy Function----------------------------------
rankTopN <- function(rankData) {
    #report accuracy within the top N highest-ranked test records
    for(topN in c(150, 175, 250, 500, 1000, 1500, 2000, 2488)) {
        print(accuracy(rankData[1:topN,]$actualClass, rankData[1:topN,]$predictedClass))
    }
}
#-----------------------Truth Table and Accuracy - Bootstrap Models-----------------
# glm accuracy
accuracy(rankGlm$actualClass,rankGlm$predictedClass)
# svm accuracy
accuracy(rankSvm$actualClass,rankSvm$predictedClass)
# ada accuracy
accuracy(rankAda$actualClass,rankAda$predictedClass)
#-----------------------TopN Accuracy - Bootstrap Models----------------------------
rankTopN(rankGlm)
rankTopN(rankSvm)
rankTopN(rankAda)
#-----------------------TopN Accuracy - Blended Bootstrap Models----------------------------
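# the blended ranking is built by chaining calls: each call appends its model's
# rank columns and g votes to the rankDataObject passed in, and the final call
# (addFinalRank=TRUE) sums every rank column (with sampleCount = 8, that is
# 24 columns in total) into one combined finalRank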
rankGlmAll <- rankPredictData(predictDataGlm,"glm",NULL,TRUE,FALSE)
rankGlmSvm <- rankPredictData(predictDataSvm,"svm",rankGlmAll,FALSE,FALSE)
rankAll <- rankPredictData(predictDataAda,"ada",rankGlmSvm,FALSE,TRUE)
rankTopN(rankAll)
#-------------------------------------------------------------------------------
#-----------------------Stacking Top-Ranked SVM, ADA, and GLM Predictions-------
#-------------------------------------------------------------------------------
rankGlm <- rankPredictData(predictDataGlm,"glm",NULL,TRUE,TRUE)
rankSvm <- rankPredictData(predictDataSvm,"svm",NULL,FALSE,TRUE)
rankAda <- rankPredictData(predictDataAda,"ada",NULL,FALSE,TRUE)
#get the top-ranked predictions from each model (2000 svm, 1500 ada, 100 glm)
rankSvm <- rankSvm[1:2000,]
rankAda <- rankAda[1:1500,]
rankGlm <- rankGlm[1:100,]
#Find Ada records missing from Svm
rankSvmAdaMatches <- which(rankAda$id %in% rankSvm$id)
rankAdaMissing <- rankAda[-rankSvmAdaMatches,]
#combine Svm and missing Ada
rankSvmAda <- rbind(rankSvm,rankAdaMissing)
#Find glm records missing from SvmAda
rankGlmSvmAdaMatches <- which(rankGlm$id %in% rankSvmAda$id)
rankGlmMissing <- rankGlm[-rankGlmSvmAdaMatches,]
#combine SvmAda and missing Glm
rankGlmSvmAda <- rbind(rankSvmAda,rankGlmMissing)
#check accuracy of the stacked rankGlmSvmAda predictions
accuracy(rankGlmSvmAda$actualClass,rankGlmSvmAda$predictedClass)
nrow(rankGlmSvmAda)