Ad-click-prediciton
output: html_document
title: "ClickThroughAnalysis"knitr::opts_chunk$set(echo = TRUE)
Import Libraries
library(caTools)
library(rpart)
library(rpart.plot)
library(caret)
library(e1071)
library(xgboost)
library(Matrix)
library(randomForest)
library(MLmetrics)
Import datasets
Import dataset, the data files are in the github repo https://github.com/dimension-less/Ad-click-prediciton/blob/master/content_train.tsv
train <- read.csv("content_train.tsv",sep = '\t')
Data Cleaning
We are using only content1
train_content1<-train[,c(1,2,11:27)]
train_content1<-na.omit(train_content1)
summary(train_content1)
Analysing the dependent variable
table(train_content1$content_1)
Looking at the summary we can say that various columns have outliers and the dataset is highly imbalanced
Removing outliers using boxplot technique
From express total spend
boxplot(train_content1$express.total.spend)
outlier <- boxplot.stats(train_content1$express.total.spend)$out
min(train_content1$express.total.spend)
train_content1$express.total.spend[train_content1$express.total.spend >= min(outlier)] <- min(outlier)
#boxplot.stats(train_content1$express.total.spend)$out
boxplot(train_content1$express.total.spend)
#train_content1$express.total.spend<-log(train_content1$express.total.spend)
From metro total spend
boxplot(train_content1$metro.total.spend)
outlier <- boxplot.stats(train_content1$metro.total.spend)$out
train_content1$metro.total.spend[train_content1$metro.total.spend >= min(outlier)] <- min(outlier)
#boxplot.stats(train_content1$metro.total.spend)$out
boxplot(train_content1$metro.total.spend)
From superstore.total.spend
boxplot(train_content1$superstore.total.spend)
outlier <- boxplot.stats(train_content1$superstore.total.spend)$out
train_content1$superstore.total.spend[train_content1$superstore.total.spend >= min(outlier)] <- min(outlier)
boxplot(train_content1$superstore.total.spend)
From extra.total.spend
boxplot(train_content1$extra.total.spend)
outlier <- boxplot.stats(train_content1$extra.total.spend)$out
train_content1$extra.total.spend[train_content1$extra.total.spend >= min(outlier)] <- min(outlier)
boxplot(train_content1$extra.total.spend)
From fandf.total.spend
boxplot(train_content1$fandf.total.spend)
outlier <- boxplot.stats(train_content1$fandf.total.spend)$out
#min(train_content1$fandf.total.spend)
train_content1$fandf.total.spend[train_content1$fandf.total.spend >= min(outlier)] <- min(outlier)
boxplot(train_content1$fandf.total.spend)
From petrol.total.spend
boxplot(train_content1$petrol.total.spend)
outlier <- boxplot.stats(train_content1$petrol.total.spend)$out
train_content1$petrol.total.spend[train_content1$petrol.total.spend >= min(outlier)] <- min(outlier)
boxplot(train_content1$petrol.total.spend)
From direct.total.spend
boxplot(train_content1$direct.total.spend)
outlier <- boxplot.stats(train_content1$direct.total.spend)$out
#min(train_content1$direct.total.spend)
train_content1$direct.total.spend[train_content1$direct.total.spend >= min(outlier)] <- min(outlier)
boxplot(train_content1$direct.total.spend)
hist(train_content1$direct.total.spend)
#train_content1$direct.total.spend<-log(train_content1$direct.total.spend)
So far we have removed the outliers from all the spend variables.
set.seed(88)
split <- sample.split(train_content1$content_1,SplitRatio = 0.7)
Train <-subset(train_content1,split==TRUE)
Test <-subset(train_content1,split==FALSE)
#summary(Train)
Model Building
Build decision tree using rpart.
Since the dataset is highly imbalanced we cannot use the rpart with default parameters. We will be doing cost-sensitive learning
We cannot use classification error
as a performance metric here.
We will be using penalty error
as our performance metric
Penalty Error = sum(Confusion Matrix * Penalty Matrix)/(No. of Observations)
We have to change the loss matrix so that while creating splits, it penalises the errors of both the classes differently.
Creating a penalty matrix that penalises FN
91 times more than FP
. We are keeping the same ratio as we have #Negatives/#Positives
PenaltyMatrix = matrix(data = c(0,1,91,0),nrow = 2,ncol = 2,byrow = T)
Applying Decision Tree
train_content1$county<- as.character(train_content1$county)
model<-rpart(content_1~express.no.transactions+express.total.spend+metro.no.transactions+metro.total.spend+superstore.no.transactions+superstore.total.spend+extra.no.transactions+extra.total.spend+fandf.no.transactions+fandf.total.spend+petrol.no.transactions+petrol.total.spend+direct.no.transactions+direct.total.spend+gender+affluency,data=Train,cp=0.001,method='class',parms=list(loss=PenaltyMatrix))
pred_train <- predict(model,type='class')
table(Train$content_1,pred_train)
The tree works well on the train data, but there is a risk of overfitting as we have used very low value of cp
complexity parameter.
We will be doing predictions on the test data to validate our model and calculating the penalty error
pred_test <- predict(model,newdata = Test,type='class')
conf <- as.matrix(table(`Actual` = Test$content_1,'Prediction'=pred_test))
conf
PenaltyError <- sum(conf*PenaltyMatrix)/nrow(Test)
PenaltyError
From the confusion matrix above it is clear that the tree is overfitting and we are not getting the same level of performance.
caret
Build decission tree using We are using caret package to apply cross-validation
numFolds = trainControl(method = "cv",number = 10)
cpGrid <- expand.grid(cp = seq(0.00001,0.001,0.00001))
model_cv<-train(as.factor(content_1)~express.no.transactions+express.total.spend+metro.no.transactions+metro.total.spend+superstore.no.transactions+superstore.total.spend+extra.no.transactions+extra.total.spend+fandf.no.transactions+fandf.total.spend+petrol.no.transactions+petrol.total.spend+direct.no.transactions+direct.total.spend+gender+affluency,data = Train,trControl=numFolds,tuneGrid=cpGrid,method="rpart",maximize=F,metric="Kappa")
pred_train <-predict(model_cv)
table(Train$content_1,pred_train)
pred_test <-predict(model_cv,newdata = Test)
table(Test$content_1,pred_test)
Build RandomForest
Here we will be downsampling the class 0
to reduce the imbalance
model_rf<-randomForest(as.factor(content_1)~express.no.transactions+express.total.spend+metro.no.transactions+metro.total.spend+superstore.no.transactions+superstore.total.spend+extra.no.transactions+extra.total.spend+fandf.no.transactions+fandf.total.spend+petrol.no.transactions+petrol.total.spend+direct.no.transactions+direct.total.spend+gender+affluency,data=Train,sampsize = c(900,351),strata=Train$content_1,cutoff=c(0.7,0.3))
# Doing prediction on the test data
pred_rf <-predict(model_rf,newdata = Test)
conf_rf <-as.matrix(table(Test$content_1,pred_rf))
conf_rf
PenaltyError_rf <- sum(conf_rf*PenaltyMatrix)/nrow(Test)
PenaltyError_rf
xgboost
Applying Gradient Boosting using We will be using scale_pos_weight
argument to balance the data.
Along with that we will be passing a custom evaluation metric in the feval
argument that calculates the penalty error
and minimizes it
sparse_matrix<-sparse.model.matrix(content_1~.-1-customer.id-county,data = Train)
#sparse_matrix@Dim
dtrain<-xgb.DMatrix(data=sparse_matrix,label=Train$content_1)
sparse_matrix_test<-sparse.model.matrix(content_1~.-1-customer.id-county,data = Test)
dtest<-xgb.DMatrix(data=sparse_matrix_test,label=Test$content_1)
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
B<-matrix(data = rep(0,4),nrow = 2,ncol = 2)
A<-as.matrix(table(labels,preds>=0.5))
B[1:nrow(A),1:ncol(A)]<-A
err<-sum(B*PenaltyMatrix)/nrow(dtrain)
return(list(metric = "penalty_error", value = err))
}
watchlist=list(train = dtrain, test=dtest)
params=list(eta=0.01,max_depth=8,objective="binary:logistic")
model_xgb<-xgb.train(params = params,data = dtrain,verbose = 0,watchlist = watchlist,nrounds=10000,nthread=8,maximize=F,early_stopping_rounds = 100,scale_pos_weight=91,feval=evalerror)
imp <-xgb.importance(feature_names = colnames(sparse_matrix),model = model_xgb)
xgb.plot.importance(imp)
pred_xgb<-predict(model_xgb,newdata = dtest)
table(Test$content_1,pred_xgb>=0.5)
We are finalizing with this xgboost
model. Now we will be doing our prediction on the content_test.tsv
data.
test <- read.csv("content_test.tsv",sep='\t')
Removing Outliers from test data
From express total spend
boxplot(test$express.total.spend)
outlier <- boxplot.stats(test$express.total.spend)$out
#min(test$express.total.spend)
test$express.total.spend[test$express.total.spend >= min(outlier)] <- min(outlier)
#boxplot.stats(test$express.total.spend)$out
boxplot(test$express.total.spend)
#test$express.total.spend<-log(test$express.total.spend)
From metro total spend
boxplot(test$metro.total.spend)
outlier <- boxplot.stats(test$metro.total.spend)$out
test$metro.total.spend[test$metro.total.spend >= min(outlier)] <- min(outlier)
#boxplot.stats(test$metro.total.spend)$out
boxplot(test$metro.total.spend)
From superstore.total.spend
boxplot(test$superstore.total.spend)
outlier <- boxplot.stats(test$superstore.total.spend)$out
test$superstore.total.spend[test$superstore.total.spend >= min(outlier)] <- min(outlier)
boxplot(test$superstore.total.spend)
From extra.total.spend
boxplot(test$extra.total.spend)
outlier <- boxplot.stats(test$extra.total.spend)$out
test$extra.total.spend[test$extra.total.spend >= min(outlier)] <- min(outlier)
boxplot(test$extra.total.spend)
From fandf.total.spend
boxplot(test$fandf.total.spend)
outlier <- boxplot.stats(test$fandf.total.spend)$out
#min(test$fandf.total.spend)
test$fandf.total.spend[test$fandf.total.spend >= min(outlier)] <- min(outlier)
boxplot(test$fandf.total.spend)
From petrol.total.spend
boxplot(test$petrol.total.spend)
outlier <- boxplot.stats(test$petrol.total.spend)$out
test$petrol.total.spend[test$petrol.total.spend >= min(outlier)] <- min(outlier)
boxplot(test$petrol.total.spend)
From direct.total.spend
boxplot(test$direct.total.spend)
outlier <- boxplot.stats(test$direct.total.spend)$out
#min(test$direct.total.spend)
test$direct.total.spend[test$direct.total.spend >= min(outlier)] <- min(outlier)
boxplot(test$direct.total.spend)
#hist(test$direct.total.spend)
#test$direct.total.spend<-log(test$direct.total.spend)
We will be using the xgboost model built for prediction
sparse_test<-sparse.model.matrix(~.-1-customer.id-county,data = test)
dtest2<-xgb.DMatrix(data=sparse_test)
pred_test_xgb <- predict(model_xgb,newdata = dtest2)
LogLoss
.
Calculating We will be calculating logloss for the prediction of clicking on content1
# Read the true values as 'labels'
# labels = read.csv(file_name)
#You will have to passthe true value as `labels`
#logloss = LogLoss(y_pred = pred_test_xgb,y_true = labels)
#logloss