House Price Prediction
Predicting the prices of houses in Ames, Iowa.
!pip install xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Seaborn theme: 'ticks' style with the short colour codes enabled.
sns.set(color_codes=True, style='ticks')
# Raise both pandas display limits to 500 so wide/long frames are shown in full.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)
# Read the two input files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Stack train on top of test so that cleaning and encoding are applied to both
# at once. DataFrame.append was removed in pandas 2.0; pd.concat is the
# equivalent call (sort=True keeps the alphabetical column order).
df = pd.concat([train, test], sort=True)
print('Train:',train.shape)
print('Test:',test.shape)
print('DataFrame:',df.shape)
df.head()
# Overall picture: missing values per column and the column dtypes.
df.isna().sum()
df.dtypes

# Column names split by dtype: text columns vs. 64-bit numeric columns.
cat = df.select_dtypes('object').columns.tolist()
num = df.select_dtypes(['int64','float64']).columns.tolist()
cat
num

# Numeric columns that contain missing values, most affected first.
na = df[num].isna().sum()
na = na[na > 0].sort_values(ascending=False)
print(na)

# The same report for the text columns.
na = df[cat].isna().sum()
na = na[na > 0].sort_values(ascending=False)
print(na)
# Fill missing numeric values.
# Each filled column is assigned back explicitly: calling
# fillna(inplace=True) on `df.<column>` acts on an intermediate Series, which
# pandas 2.x warns about and which does not update `df` under Copy-on-Write.

# LotFrontage gets the median of the combined frame.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

# The remaining numeric gaps are filled with 0.
for col in ['GarageYrBlt', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath',
            'GarageArea', 'GarageCars', 'TotalBsmtSF', 'BsmtUnfSF',
            'BsmtFinSF2', 'BsmtFinSF1']:
    df[col] = df[col].fillna(0)

# Re-check the numeric columns for remaining gaps.
df[num].isnull().sum()
All missing numeric values are now treated except SalePrice. The missing SalePrice values belong to the test data, which is what we have to predict.
Categorical columns missing values
# Fill missing categorical values.
# Columns are assigned back explicitly instead of using
# `df.<column>.fillna(..., inplace=True)`, which acts on an intermediate
# Series and does not reliably update `df` (warning in pandas 2.x, no effect
# under Copy-on-Write).

# Missing entries in these columns get the explicit label 'NA'.
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
            'GarageCond', 'GarageQual', 'GarageFinish', 'GarageType',
            'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType2',
            'BsmtFinType1']:
    df[col] = df[col].fillna('NA')

# These two columns use the label 'None' instead.
for col in ['MasVnrType', 'Exterior2nd']:
    df[col] = df[col].fillna('None')

# The rest are filled with the column's most frequent value.
for col in ['Functional', 'Utilities', 'Exterior1st', 'SaleType',
            'KitchenQual', 'Electrical', 'MSZoning']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Re-check the categorical columns for remaining gaps.
df[cat].isnull().sum()
def boxplot(var):
    """Draw a box plot of SalePrice for each level of column ``var``.

    The data comes from the module-level ``train`` frame; ``var`` is the
    name of the column placed on the x axis. Nothing is returned.
    """
    sns.catplot(data=train, x=var, y='SalePrice', kind='box')
# One SalePrice box plot per categorical column, in the original order.
for _column in (
    'Alley', 'BldgType', 'BsmtCond', 'BsmtFinType1', 'BsmtExposure',
    'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2',
    'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd',
    'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond',
    'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC',
    'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig',
    'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood',
    'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition',
    'SaleType', 'Street', 'Utilities',
):
    boxplot(_column)
High Dependency:
CentralAir
BsmtQual
Alley
ExterCond
Condition2
Condition1
ExterQual
GarageQual
GarageFinish
HouseStyle
KitchenQual
MSZoning
PavedDrive
Neighborhood
MiscFeature
RoofStyle
RoofMatl
PoolQC
SaleType
Moderate Dependency:
BldgType
BsmtCond
BsmtFinType1
BsmtFinType2
BsmtExposure
Electrical
Fence
Exterior1st
Exterior2nd
FireplaceQu
Foundation
Functional
GarageCond
Heating
GarageType
LandContour
HeatingQC
SaleCondition
Street
Utilities
Low Dependency:
LotShape
LotConfig
LandSlope
MasVnrType
# Correlation matrix of the numeric columns only. The frame still holds text
# columns at this point, and DataFrame.corr() in pandas >= 2.0 raises on them
# instead of silently skipping them.
corrmat = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(24, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

# Zoom in on the k columns most correlated with SalePrice.
k = 20
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# Reuse the pandas matrix, which ignores missing values pair by pair.
# np.corrcoef on the raw values would give NaN for every SalePrice cell,
# because the test rows have no SalePrice.
cm = corrmat.loc[cols, cols].values
f, ax = plt.subplots(figsize=(24, 9))
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True,
                 fmt='.1f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.show()
Before an ML model can be run, the categorical features must be mapped to numbers or expanded into dummy variables.
# Replace ordered text categories with integer ranks.
# Scales shared by several columns are defined once.
_quality = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
_quality_or_na = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
_bsmt_finish = {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}

_ordinal_codes = {
    'Alley': {'NA':0, 'Grvl':1, 'Pave':2},
    'BsmtCond': _quality_or_na,
    'BsmtExposure': {'NA':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4},
    'BsmtFinType1': _bsmt_finish,
    'BsmtFinType2': _bsmt_finish,
    'BsmtQual': _quality_or_na,
    'ExterCond': _quality,
    'ExterQual': _quality,
    'FireplaceQu': _quality_or_na,
    'Functional': {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8},
    'GarageCond': _quality_or_na,
    'GarageQual': _quality_or_na,
    'HeatingQC': _quality,
    'KitchenQual': _quality,
    'LandSlope': {'Sev':1, 'Mod':2, 'Gtl':3},
    'PavedDrive': {'N':1, 'P':2, 'Y':3},
    # PoolQC has its own scale: there is no 'Po' level here.
    'PoolQC': {'NA':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'Street': {'Grvl':1, 'Pave':2},
    'Utilities': {'ELO':1, 'NoSeWa':2, 'NoSewr':3, 'AllPub':4},
}

# Series.map turns any value absent from its table into NaN.
for _name, _codes in _ordinal_codes.items():
    df[_name] = df[_name].map(_codes)
# Columns that have just been rank-encoded and therefore count as numeric.
new_num = [
    'Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'BsmtQual', 'ExterCond', 'ExterQual', 'FireplaceQu', 'Functional',
    'GarageCond', 'GarageQual', 'HeatingQC', 'KitchenQual', 'LandSlope',
    'PavedDrive', 'PoolQC', 'Street', 'Utilities',
]
# Move them from the categorical list to the numeric list.
num = [*num, *new_num]
for _encoded in new_num:
    cat.remove(_encoded)
num
cat
# MSSubClass codes become text labels 'class1' .. 'class16', numbered by
# their position in this list.
_subclass_codes = [20, 30, 40, 45, 50, 60, 70, 75,
                   80, 85, 90, 120, 150, 160, 180, 190]
_subclass_labels = {
    code: 'class' + str(position)
    for position, code in enumerate(_subclass_codes, start=1)
}
df['MSSubClass'] = df['MSSubClass'].map(_subclass_labels)
# The column is now text, so it moves from the numeric to the categorical list.
num.remove('MSSubClass')
cat.append('MSSubClass')
# Ages at the time of sale, derived from the year columns.
_sold = df['YrSold']
df['Age'] = _sold - df['YearBuilt']
df['AgeRemod'] = _sold - df['YearRemodAdd']
df['AgeGarage'] = _sold - df['GarageYrBlt']

# A GarageYrBlt that was filled with 0 produces an age above 1000; those rows
# are given the largest age observed below 1000.
_garage_age = df['AgeGarage']
max_AgeGarage = _garage_age[_garage_age < 1000].max()
df['AgeGarage'] = _garage_age.mask(_garage_age > 1000, max_AgeGarage)

# Negative ages are floored at 0.
for _age_col in ['Age', 'AgeRemod', 'AgeGarage']:
    df[_age_col] = df[_age_col].clip(lower=0)

# Remove the raw year columns and keep the numeric column list in sync.
_year_cols = ['YrSold','YearBuilt','YearRemodAdd','GarageYrBlt']
df = df.drop(columns=_year_cols)
for _year_col in _year_cols:
    num.remove(_year_col)
num = num + ['Age','AgeRemod','AgeGarage']
# For each categorical column, name the dummy of its last-listed unique value;
# that one indicator per column is dropped after encoding.
dummy_drop = [name + '_' + str(df[name].unique()[-1]) for name in cat]

# One-hot encode every categorical column, then remove the chosen indicators.
df = pd.get_dummies(df, columns=cat).drop(columns=dummy_drop)
df.head()
df.dtypes
df.shape
# `df` is `train` stacked on top of `test`, so the first len(train) rows are
# the labelled ones. The boundary is derived from the data rather than
# hard-coding the test-set size (1459).
n_train = len(train)
X_train = df.iloc[:n_train].drop(['SalePrice','Id'], axis=1)
y_train = df.iloc[:n_train]['SalePrice']
X_test = df.iloc[n_train:].drop(['SalePrice','Id'], axis=1)

# Baseline: XGBoost with default settings on every feature.
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# Score on the training rows themselves, so this is not a held-out estimate.
xgb.score(X_train, y_train)
# Feature importances of the fitted model, largest first.
imp = pd.DataFrame(xgb.feature_importances_, columns=['Importance'], index=X_train.columns)
imp = imp.sort_values(['Importance'], ascending=False)
imp
# Keep the names of the 56 highest-ranked features.
feat_sel = imp.iloc[:56]
# Call tolist() so feat_list holds the list itself; previously the name was
# bound to the method object and only produced a list when called.
feat_list = feat_sel.index.tolist()
feat_list
# Reduced frame: the selected feature columns plus the target.
# filter() already returns a new frame, so no copy() is needed first. It keeps
# only the listed labels that exist in `df`; absent labels are skipped silently.
df_new = df.filter(['OverallQual', 'GarageCars', 'CentralAir_Y', 'GrLivArea', 'MSZoning_RM', 'KitchenAbvGr', 'TotalBsmtSF',
                    'BsmtQual', 'GarageQual', 'KitchenQual', 'FullBath', 'RoofMatl_CompShg', 'MSZoning_RL', 'Alley',
                    'GarageType_Attchd', 'LandSlope', 'Neighborhood_Crawfor', 'Condition1_PosA', 'HouseStyle_1.5Fin',
                    'Heating_Grav', 'BsmtFinSF1', 'RoofStyle_Flat', 'ExterQual', 'OverallCond', 'Condition2_Norm',
                    'MSZoning_C (all)', 'AgeRemod', '1stFlrSF', 'Exterior1st_HdBoard', 'FireplaceQu', 'LandContour_Bnk',
                    'Neighborhood_StoneBr', 'SaleType_WD', 'GarageArea', 'BedroomAbvGr', 'Functional', 'GarageCond',
                    'Neighborhood_Somerst', 'Exterior1st_BrkFace', 'Age', '2ndFlrSF', 'MSZoning_FV', 'LotConfig_CulDSac',
                    'Neighborhood_ClearCr', 'ExterCond', 'LotArea', 'BsmtFinSF2', 'Exterior2nd_Wd Shng', 'BsmtExposure',
                    'Fence_GdPrv', 'TotRmsAbvGrd', 'BsmtFinType1', 'SaleCondition_Abnorml', 'MSSubClass_class2',
                    'PoolArea', 'OpenPorchSF','SalePrice'])

# The first len(train) rows are the labelled ones. The boundary is derived
# from the data rather than hard-coding the test-set size (1459).
n_train = len(train)
X_train = df_new.iloc[:n_train].drop(['SalePrice'], axis=1)
y_train = df_new.iloc[:n_train]['SalePrice']
X_test = df_new.iloc[n_train:].drop(['SalePrice'], axis=1)
X_train.shape,y_train.shape,X_test.shape
# Refit a default XGBoost regressor on the reduced feature set.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# Score on the training rows themselves.
xgb.score(X_train, y_train)

# Predict the test rows and write the Id / SalePrice pairs to CSV.
y_pred = xgb.predict(X_test)
testID = pd.read_csv('test.csv')
output = pd.DataFrame(
    {
        'Id': testID['Id'],
        'SalePrice': y_pred,
    }
)
output.to_csv('predictionfinal.csv', index=False)