In this notebook we build a model to predict house sale prices using the Ames Housing dataset from the “House Prices: Advanced Regression Techniques” Kaggle competition. The data covers 2919 houses, of which 1460 are used for training and 1459 for testing. The data needs to be cleaned, and machine learning models such as random forests and gradient boosting are explored to make the predictions.
First, we load the libraries that we will use for analysis, modelling and plotting.
library(ggplot2)
library(caret)
library(scales)
library(plyr)
library(corrplot)
We can now import the data into the train and test variables.
train <- read.csv('train.csv', stringsAsFactors = FALSE)
test <- read.csv('test.csv', stringsAsFactors = FALSE)
We assign the Id column to the test_labels variable, since we will need it at the end to build the CSV that will be uploaded to Kaggle. We assign NA to SalePrice in the test set, since there are no values for it, and then join the train and test data into the all variable.
test_labels <- test$Id
test$SalePrice <- NA
all <- rbind(train, test)
A lot of data is missing from the dataset, so the first thing we need to do is explore which variables are missing the most values. We can clearly see that PoolQC has far more missing values than any other variable; intuitively, we can infer that most houses simply don't have a pool, so they are assigned NA.
na_col <- which(colSums(is.na(all)) > 0)
sort(colSums(sapply(all[na_col], is.na)), decreasing = TRUE)
## PoolQC MiscFeature Alley Fence SalePrice
## 2909 2814 2721 2348 1459
## FireplaceQu LotFrontage GarageYrBlt GarageFinish GarageQual
## 1420 486 159 159 159
## GarageCond GarageType BsmtCond BsmtExposure BsmtQual
## 159 157 82 82 81
## BsmtFinType2 BsmtFinType1 MasVnrType MasVnrArea MSZoning
## 80 79 24 23 4
## Utilities BsmtFullBath BsmtHalfBath Functional Exterior1st
## 2 2 2 2 1
## Exterior2nd BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 1 1 1 1
## Electrical KitchenQual GarageCars GarageArea SaleType
## 1 1 1 1 1
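To make the pattern easier to see, we can also plot these counts. This is a quick sketch using the ggplot2 library loaded above; the na_counts and na_df names are ours, not part of the original pipeline.
# Sketch: bar chart of the NA counts per variable, using ggplot2.
na_counts <- sort(colSums(is.na(all[na_col])), decreasing = TRUE)
na_df <- data.frame(variable = names(na_counts), count = as.numeric(na_counts))
ggplot(na_df, aes(x = reorder(variable, count), y = count)) +
  geom_col(fill = "black") +
  coord_flip()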
We will create a function to clean the data. For categorical variables where NA simply means the feature is absent (pool, alley, fence, garage, basement, and so on) we substitute "None"; ordinal quality variables are then encoded as numbers, and the few remaining NAs are filled with the most common value.
clean <- function(df){
quality <- c("None", "Fa", "TA", "Gd", "Ex")
quality2 <- c("None","Po","Fa", "TA", "Gd", "Ex")
finish <- c("None"=0, "Unf"=1, "RFn"=2, "Fin"=3)
exp <- c("None","No", "Mn", "Av", "Gd")
fin <- c("None","Unf","LwQ","Rec","BLQ","ALQ","GLQ")
## Pool
df[["PoolQC"]][is.na(df[["PoolQC"]])] <- "None"
poolqc<-factor(df[["PoolQC"]], levels=quality)
df[["PoolQC"]] <- as.numeric(poolqc)
## MiscFeature
df[["MiscFeature"]][is.na(df[["MiscFeature"]])] <- "None"
df[["MiscFeature"]] <- as.factor(df[["MiscFeature"]])
## Alley
df[["Alley"]][is.na(df[["Alley"]])] <- "None"
df[["Alley"]] <- as.factor(df[["Alley"]])
## Fence
df[["Fence"]][is.na(df[["Fence"]])] <- "None"
df[["Fence"]] <- as.factor(df[["Fence"]])
## Fireplace
df[["FireplaceQu"]][is.na(df[["FireplaceQu"]])] <- "None"
fp<-factor(df[["FireplaceQu"]], levels=quality2)
df[["FireplaceQu"]] <- as.numeric(fp)
## Lot
df[["LotFrontage"]][is.na(df[["LotFrontage"]])] <- 69 # media
## Garage
df[["GarageYrBlt"]][is.na(df[["GarageYrBlt"]])] <- 0
df[["GarageYrBlt"]] <- factor(df[["GarageYrBlt"]])
df[["GarageFinish"]][is.na(df[["GarageFinish"]])] <- "None"
gf <- factor(df[["GarageFinish"]], levels=names(finish)) # match on the level names, not the numeric codes
df[["GarageFinish"]] <- as.integer(gf) - 1 # None=0, Unf=1, RFn=2, Fin=3
df[["GarageQual"]][is.na(df[["GarageQual"]])] <- "None"
df[["GarageQual"]] <- as.numeric(factor(df[["GarageQual"]], levels=quality))
df[["GarageCond"]][is.na(df[["GarageCond"]])] <- "None"
df[["GarageCond"]] <- as.numeric(factor(df[["GarageCond"]], levels=quality))
df[["GarageType"]][is.na(df[["GarageType"]])] <- "None"
df[["GarageCars"]][is.na(df[["GarageCars"]])] <- 0
df[["GarageArea"]][is.na(df[["GarageArea"]])] <- 0
## Basement
df[["BsmtCond"]][is.na(df[["BsmtCond"]])] <- "None"
df[["BsmtCond"]] <- as.numeric(factor(df[["BsmtCond"]], levels=quality))
df[["BsmtExposure"]][is.na(df[["BsmtExposure"]])] <- "None"
df[["BsmtExposure"]] <- as.numeric(factor(df[["BsmtExposure"]], levels=exp))
df[["BsmtQual"]][is.na(df[["BsmtQual"]])] <- "None"
df[["BsmtQual"]] <- as.numeric(factor(df[["BsmtQual"]], levels=quality))
df[["BsmtFinType1"]][is.na(df[["BsmtFinType1"]])] <- "None"
df[["BsmtFinType1"]] <- as.numeric(factor(df[["BsmtFinType1"]], levels=fin))
df[["BsmtFinType2"]][is.na(df[["BsmtFinType2"]])] <- "None"
df[["BsmtFinType2"]] <- as.numeric(factor(df[["BsmtFinType2"]], levels=fin))
df[["BsmtFullBath"]][is.na(df[["BsmtFullBath"]])] <- 0
df[["BsmtHalfBath"]][is.na(df[["BsmtHalfBath"]])] <- 0
df[["BsmtFinSF1"]][is.na(df[["BsmtFinSF1"]])] <- 0
df[["BsmtFinSF2"]][is.na(df[["BsmtFinSF2"]])] <- 0
df[["BsmtUnfSF"]][is.na(df[["BsmtUnfSF"]])] <- 0
df[["TotalBsmtSF"]][is.na(df[["TotalBsmtSF"]])] <- 0
## Masonry
df[["MasVnrType"]][is.na(df[["MasVnrType"]])] <- "None"
df[["MasVnrArea"]][is.na(df[["MasVnrArea"]])] <- 0
## MSZoning
df[["MSZoning"]][is.na(df[["MSZoning"]])] <- "RL"
## Utilities
df[["Utilities"]][is.na(df[["Utilities"]])] <- "AllPub"
## Functional
df[["Functional"]][is.na(df[["Functional"]])] <- "Typ"
## Exteriors
df[["Exterior1st"]][is.na(df[["Exterior1st"]])] <- "Other"
df[["Exterior2nd"]][is.na(df[["Exterior2nd"]])] <- "Other"
df[["ExterQual"]] <- as.numeric(factor(df[["ExterQual"]], levels=quality))
df[["ExterCond"]] <- as.numeric(factor(df[["ExterCond"]], levels=quality2))
## Electrical
df[["Electrical"]][is.na(df[["Electrical"]])] <- "SBrkr"
## Kitchen Quality
df[["KitchenQual"]][is.na(df[["KitchenQual"]])] <- "TA"
df[["KitchenQual"]] <- as.numeric(factor(df[["KitchenQual"]], levels=quality))
## Sale Type
df[["SaleType"]][is.na(df[["SaleType"]])] <- "WD"
df[["Neighborhood"]]<- as.factor(df[["Neighborhood"]])
return(df)
}
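The constant LotFrontage imputation is crude, since lot frontage varies a lot by location. A common refinement, sketched below on the combined all data, is to impute the median frontage of each Neighborhood; this is an aside, and the pipeline below keeps the constant.
# Sketch: neighborhood-median imputation for LotFrontage (not used below).
tmp <- all
nb_median <- tapply(tmp$LotFrontage, tmp$Neighborhood, median, na.rm = TRUE)
miss <- is.na(tmp$LotFrontage)
tmp$LotFrontage[miss] <- nb_median[tmp$Neighborhood[miss]]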
We will now engineer new variables to add to our dataset, combining existing ones into totals, quality scores and an age measure. This should help us get better results.
fe <- function(df){
df[["TotalSF"]] <- df[["TotalBsmtSF"]] + df[["X1stFlrSF"]] + df[["X2ndFlrSF"]]
df[["OverallGrade"]] <- df[["OverallQual"]] * df[["OverallCond"]]
df[["ExterGrade"]] <- df[["ExterQual"]] * df[["ExterCond"]]
df[["FireplaceScore"]] <- df[["Fireplaces"]] * df[["FireplaceQu"]]
df[["TotalBath"]] <- df[["BsmtFullBath"]] + (0.5 * df[["BsmtHalfBath"]]) + df[["FullBath"]] + (0.5 * df[["HalfBath"]])
df[["TotalSqFeet"]] <- df[["GrLivArea"]] + df[["TotalBsmtSF"]]
df[["TotalPorchSF"]] <- df[["OpenPorchSF"]] + df[["EnclosedPorch"]] + df[["X3SsnPorch"]] + df[["ScreenPorch"]]
df[["PoolScore"]] <- df[["PoolArea"]] * df[["PoolQC"]]
df[["KitchenScore"]] <- df[["KitchenAbvGr"]] * df[["KitchenQual"]]
df[["Remod"]] <- ifelse(df[["YearBuilt"]]==df[["YearRemodAdd"]], 0, 1)
df[["Age"]] <- as.numeric(df[["YrSold"]])-df[["YearRemodAdd"]]
return(df)
}
We apply the cleaning and feature engineering functions we just created to both sets, and rebuild the combined all data frame.
train <- fe(clean(train))
test <- fe(clean(test))
test$SalePrice <- NA
all <- rbind(train, test)
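As a quick sanity check (an addition, not in the original notebook), we can verify that the only remaining NAs are the SalePrice placeholders in the 1459 test rows.
# After cleaning, the only column that should still contain NAs is SalePrice.
colSums(is.na(all))[colSums(is.na(all)) > 0]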
We can now take a look at the distribution of SalePrice through a histogram.
ggplot(data=all[!is.na(all$SalePrice),], aes(x=SalePrice)) +
geom_histogram(fill="black", binwidth = 10000) +
scale_x_continuous(breaks= seq(0, 800000, by=100000), labels=comma)
SalePrice is right-skewed, so we use a log transform to normalize it. We must remember to revert the predicted values with the exp function before uploading the results to Kaggle.
train$SalePrice <- log(train$SalePrice)
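To quantify the effect of the transform (a sketch; the e1071 package is an extra dependency not loaded above), we can compare the skewness before and after:
library(e1071)  # extra dependency, only needed for skewness()
skewness(all$SalePrice, na.rm = TRUE)       # strongly right-skewed before
skewness(log(all$SalePrice), na.rm = TRUE)  # close to symmetric after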
We want to see which variables have the strongest correlation with SalePrice, using a correlation matrix of the numeric variables; we keep those with an absolute correlation above 0.5.
numericVars <- which(sapply(all, is.numeric))
numericVarNames <- names(numericVars)
all_numVar <- all[, numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs")
cor_sorted <- as.matrix(sort(cor_numVar[,'SalePrice'], decreasing = TRUE))
cor_high <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_high
## [1] "SalePrice" "OverallQual" "TotalSF" "TotalSqFeet"
## [5] "GrLivArea" "ExterQual" "KitchenQual" "GarageCars"
## [9] "TotalBath" "GarageArea" "BsmtQual" "ExterGrade"
## [13] "TotalBsmtSF" "X1stFlrSF" "OverallGrade" "FullBath"
## [17] "TotRmsAbvGrd" "YearBuilt" "FireplaceQu" "YearRemodAdd"
## [21] "FireplaceScore" "Age"
cor_numVar <- cor_numVar[cor_high, cor_high]
corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt", tl.cex = .6, cl.cex = .6, number.cex = .5)
We also plot SalePrice against OverallQual and GrLivArea, two of the most correlated variables, to look for outliers.
plot(all$SalePrice, all$OverallQual)
plot(all$SalePrice, all$GrLivArea)
We inspect the houses with GrLivArea above 4000 square feet and remove the two that sold for unusually low prices, as they are clear outliers.
train[which(train$GrLivArea > 4000), c("GrLivArea", "SalePrice")]
train <- train[-c(524,1299),]
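Hardcoding row numbers is brittle if the data ever changes order. An equivalent filter by condition (a sketch targeting the same two points, which sold for far less than the other large houses) would be:
# Sketch: drop the same two outliers by condition instead of row number.
# SalePrice is already on the log scale here, hence log(300000).
train <- train[!(train$GrLivArea > 4000 & train$SalePrice < log(300000)), ]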
We will try different models, but we are mainly interested in random forests and gradient boosting. We use the caret library to test some of the models, with 2-fold cross-validation repeated 2 times (4 resamples) to keep training fast.
trnctrl <- trainControl(
method = "repeatedcv",
number = 2,
repeats = 2
)
traind <- SalePrice ~ OverallQual + TotalSF + GrLivArea + ExterQual + KitchenQual + GarageCars + TotalBath + GarageArea + BsmtQual + TotalBsmtSF + X1stFlrSF + OverallGrade + FullBath + TotRmsAbvGrd + YearBuilt + YearRemodAdd + ExterGrade + FireplaceQu + FireplaceScore + TotalSqFeet + TotalPorchSF + PoolScore + KitchenScore + Age + Remod + LotArea + BsmtFinType1 + BsmtFinType2 + MSZoning
knn <- train(traind, data=train, method = "knn", trControl = trnctrl)
svm <- train(traind, data=train, method = "svmLinear", trControl = trnctrl)
#gbm <- train(traind, data=train, method = "gbm", trControl = trnctrl)
rf <- train(traind, data=train, method = "rf", trControl = trnctrl)
We will now try XGBoost, doing a hyperparameter search with caret. The commented grid below is the search space we explored; the second grid fixes the values we settled on.
#xgb_grid = expand.grid(
# nrounds = 1000,
# eta = c(0.1, 0.05, 0.01),
# max_depth = c(2, 3, 4, 5, 6),
# gamma = 0,
# colsample_bytree = 1,
# min_child_weight = c(1, 2, 3, 4 ,5),
# subsample = 1
#)
xgb_grid = expand.grid(
nrounds = 450,
eta = 0.05,
max_depth = 3,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 4,
subsample = 1
)
xgb <- train(traind, data=train, method = "xgbTree", trControl = trnctrl, tuneGrid = xgb_grid)
models = list(rf=rf, svm=svm, xgb=xgb, knn=knn)
results <- resamples(models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: rf, svm, xgb, knn
## Number of resamples: 4
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.09484232 0.09590017 0.09676344 0.09673287 0.09759614 0.09856228 0
## svm 0.08771771 0.08931307 0.09142725 0.09094203 0.09305622 0.09319593 0
## xgb 0.08868200 0.08929597 0.08990399 0.09033888 0.09094690 0.09286553 0
## knn 0.15746181 0.16129913 0.16295659 0.16201936 0.16367682 0.16470247 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.1335479 0.1372215 0.1402523 0.1401268 0.1431576 0.1464549 0
## svm 0.1207836 0.1227679 0.1276771 0.1273993 0.1323085 0.1334594 0
## xgb 0.1257747 0.1268030 0.1284852 0.1298144 0.1314966 0.1365124 0
## knn 0.2245448 0.2249277 0.2256626 0.2256609 0.2263958 0.2267737 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.8727451 0.8740390 0.8787227 0.8792916 0.8839752 0.8869757 0
## svm 0.8899539 0.8932132 0.8985088 0.8987269 0.9040225 0.9079360 0
## xgb 0.8883223 0.8916080 0.8947127 0.8946036 0.8977083 0.9006668 0
## knn 0.6763123 0.6787673 0.6820450 0.6848424 0.6881201 0.6989672 0
dotplot(results)
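We can also check which predictors drive the xgb model; a quick sketch using caret's varImp function:
# Sketch: model-specific variable importance for the xgb fit.
plot(varImp(xgb), top = 15)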
We now predict on the test data with the xgb model. We call the exp function since we used log to normalize SalePrice, and the predictions must be reverted to the original scale.
pred <- exp(predict(xgb, test))
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48985 128432 156672 179118 207846 764344
Finally, we create a CSV file to upload the results to Kaggle, using the test_labels we saved earlier and making sure the columns have the names Kaggle expects (Id and SalePrice).
out <- data.frame(Id = test_labels, SalePrice = pred)
write.csv(out, file = "out.csv", row.names = F, quote=FALSE)
With this approach we obtained an RMSE of 0.13410. Further improvements can be made to achieve a better result.
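One cheap improvement to try (an untested sketch, using the objects defined above) is to average the two strongest models on the log scale before back-transforming.
# Sketch: simple average ensemble of the xgb and svm predictions.
pred_ens <- exp((predict(xgb, test) + predict(svm, test)) / 2)
out_ens <- data.frame(Id = test_labels, SalePrice = pred_ens)
write.csv(out_ens, file = "out_ensemble.csv", row.names = FALSE, quote = FALSE)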