The objective of this project was to test different ensemble models (models composed of weak learners whose combined results form a strong predictor) and find the one that best fits a house price prediction task. Algorithms based on Bagging (bootstrap aggregation) and Boosting were tested and compared against each other. To learn more about these mechanisms, see the article by Joseph Rocca; a tiny standalone sketch contrasting the two is also shown below. After that, let's explain in detail what was done:
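Before diving into the project itself, here is a small illustration (not part of the project code) that contrasts bagging and boosting using scikit-learn's generic regressors on synthetic data; the dataset size, estimator counts and metric below are arbitrary choices made only for this sketch:
#illustration only: bagging vs. boosting on a synthetic regression problem
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
X_demo, y_demo = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
#bagging: weak learners trained independently on bootstrap samples, predictions averaged
bag = BaggingRegressor(n_estimators=100, random_state=0).fit(X_tr, y_tr)
#boosting: weak learners trained sequentially, each one correcting the errors of the previous ones
boost = GradientBoostingRegressor(n_estimators=100, random_state=0).fit(X_tr, y_tr)
print('Bagging MAE :', mean_absolute_error(y_te, bag.predict(X_te)))
print('Boosting MAE:', mean_absolute_error(y_te, boost.predict(X_te)))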
The dataset used for training and testing the model is available on Kaggle and covers 79 features of houses from Ames, Iowa (USA). For a description of each feature, check the data source on the House Prices dataset page.
First we begin importing the libraries that will be used:
import os #get directories
import pandas as pd #data processing
#libraries for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#display only two decimal places for float data
pd.options.display.float_format = '{:,.2f}'.format
Next, load the data. Bear in mind that the data is saved in a local directory, so the path used to load it may need to change if the file is moved. Also, before reading the code, take note of the inline comments:
print(os.getcwd()) #check where the files are
#load data
df_test = pd.read_csv('C:/Users/Samsung/test.csv')
df_train = pd.read_csv('C:/Users/Samsung/train.csv')
#stack the test and train rows into a single DataFrame (used for exploration)
df = pd.concat([df_test, df_train])
#store only the possible feature columns in a separate object
feat = df_train.drop(labels = 'SalePrice', axis = 1)
C:\Users\Samsung
Let's start by describing the data and searching for helpful insights before building the model. First, look at the shape of the train and test DataFrames. Then, examine the statistics of the numerical columns; for that, the describe() method works perfectly.
print('train shape:{}'.format(df_train.shape), '\n test shape:{}'.format(df_test.shape))
#data statistics
df_train.describe(include = ['int64','float64'])
train shape:(1460, 81) test shape:(1459, 80)
| | Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1,460.00 | 1,460.00 | 1,201.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,452.00 | 1,460.00 | ... | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 | 1,460.00 |
mean | 730.50 | 56.90 | 70.05 | 10,516.83 | 6.10 | 5.58 | 1,971.27 | 1,984.87 | 103.69 | 443.64 | ... | 94.24 | 46.66 | 21.95 | 3.41 | 15.06 | 2.76 | 43.49 | 6.32 | 2,007.82 | 180,921.20 |
std | 421.61 | 42.30 | 24.28 | 9,981.26 | 1.38 | 1.11 | 30.20 | 20.65 | 181.07 | 456.10 | ... | 125.34 | 66.26 | 61.12 | 29.32 | 55.76 | 40.18 | 496.12 | 2.70 | 1.33 | 79,442.50 |
min | 1.00 | 20.00 | 21.00 | 1,300.00 | 1.00 | 1.00 | 1,872.00 | 1,950.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 2,006.00 | 34,900.00 |
25% | 365.75 | 20.00 | 59.00 | 7,553.50 | 5.00 | 5.00 | 1,954.00 | 1,967.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 | 2,007.00 | 129,975.00 |
50% | 730.50 | 50.00 | 69.00 | 9,478.50 | 6.00 | 5.00 | 1,973.00 | 1,994.00 | 0.00 | 383.50 | ... | 0.00 | 25.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 6.00 | 2,008.00 | 163,000.00 |
75% | 1,095.25 | 70.00 | 80.00 | 11,601.50 | 7.00 | 6.00 | 2,000.00 | 2,004.00 | 166.00 | 712.25 | ... | 168.00 | 68.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 8.00 | 2,009.00 | 214,000.00 |
max | 1,460.00 | 190.00 | 313.00 | 215,245.00 | 10.00 | 9.00 | 2,010.00 | 2,010.00 | 1,600.00 | 5,644.00 | ... | 857.00 | 547.00 | 552.00 | 508.00 | 480.00 | 738.00 | 15,500.00 | 12.00 | 2,010.00 | 755,000.00 |
8 rows × 38 columns
Now, take a look at the sale price in the training data. Plotting its smoothed density (KDE), we see that most prices fall between 100 and 200 thousand USD:
#exploring the response variable:
#defining plot:
plt.figure(figsize=(16,6))
plt.title('Histogram of House Prices (U$D)')
#plot the smoothed density (KDE) of SalePrice
sns.kdeplot(x= df['SalePrice'], shade = True)
<AxesSubplot:title={'center':'Histogram of House Prices (U$D)'}, xlabel='SalePrice', ylabel='Density'>
Check whether the data has NaN values. Next, look into the categorical variables to check their cardinality, and whether the training set contains all of their categories or we might find new ones at validation time.
#finding missing values on the training df
#SalePrice
print('How many Nan values at predicted var?\n {}'.format(df_train['SalePrice'].isnull().sum()))
#feature columns:
print('How many missing values on feature cols?\n{}'.format(feat.isna().sum().sum()))
#by cols:
Nan_col = [col for col in feat.columns if feat[col].isnull().any()]
print('\nColumns with missing values: [{}]\n'.format(len(Nan_col)))
print(feat[Nan_col].isnull().sum().sort_values(ascending= False))
How many Nan values at predicted var?
 0
How many missing values on feature cols?
6965

Columns with missing values: [19]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64
Having described the data, it is time to treat the missing information identified above. The treatment steps are applied below:
#replacing missing values with a category that represents the absence of the feature:
fill_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtFinType1', 'BsmtFinType2', 'BsmtExposure','BsmtCond', 'BsmtQual']
check_list = []
for col in fill_cols:
    #assign back instead of calling fillna(inplace=True) on a .loc slice, which may silently fail
    df_train[col] = df_train[col].fillna(value = 'None')
    df_test[col] = df_test[col].fillna(value = 'None')
    #check if the operation worked (has NaN == False)
    check_list.append(len(df_train.loc[df_train[col].isna()]) == 0)
#check if all check_list values are True (bear in mind: True == 1)
if (sum(check_list) == len(fill_cols)):
    print('All empty values have been replaced!')
else:
    print('Check again')
All empty values have been replaced!
Next, let's check which variables have a high cardinality. This will be important when we start the encoding process.
(Note: only variables with more than 6 categories are analysed here; below that, the cardinality has little influence on training speed.)
cat = [col for col in feat.columns if (feat[col].dtype == 'object')]
high_cdly = {} #high cardinality features (> 6 categories)
for i in cat:
    if df_train[i].nunique() > 6:
        high_cdly[i] = df_train[i].nunique()
        print('col_name: {} \n --------- \n'.format(i))
        print(df_train[i].value_counts().sort_values(ascending = False))
col_name: Neighborhood --------- NAmes 225 CollgCr 150 OldTown 113 Edwards 100 Somerst 86 Gilbert 79 NridgHt 77 Sawyer 74 NWAmes 73 SawyerW 59 BrkSide 58 Crawfor 51 Mitchel 49 NoRidge 41 Timber 38 IDOTRR 37 ClearCr 28 SWISU 25 StoneBr 25 MeadowV 17 Blmngtn 17 BrDale 16 Veenker 11 NPkVill 9 Blueste 2 Name: Neighborhood, dtype: int64
col_name: Condition1 --------- Norm 1260 Feedr 81 Artery 48 RRAn 26 PosN 19 RRAe 11 PosA 8 RRNn 5 RRNe 2 Name: Condition1, dtype: int64
col_name: Condition2 --------- Norm 1445 Feedr 6 Artery 2 RRNn 2 PosN 2 PosA 1 RRAe 1 RRAn 1 Name: Condition2, dtype: int64
col_name: HouseStyle --------- 1Story 726 2Story 445 1.5Fin 154 SLvl 65 SFoyer 37 1.5Unf 14 2.5Unf 11 2.5Fin 8 Name: HouseStyle, dtype: int64
col_name: RoofMatl --------- CompShg 1434 Tar&Grv 11 WdShngl 6 WdShake 5 ClyTile 1 Membran 1 Metal 1 Roll 1 Name: RoofMatl, dtype: int64
col_name: Exterior1st --------- VinylSd 515 HdBoard 222 MetalSd 220 Wd Sdng 206 Plywood 108 CemntBd 61 BrkFace 50 WdShing 26 Stucco 25 AsbShng 20 Stone 2 BrkComm 2 ImStucc 1 CBlock 1 AsphShn 1 Name: Exterior1st, dtype: int64
col_name: Exterior2nd --------- VinylSd 504 MetalSd 214 HdBoard 207 Wd Sdng 197 Plywood 142 CmentBd 60 Wd Shng 38 Stucco 26 BrkFace 25 AsbShng 20 ImStucc 10 Brk Cmn 7 Stone 5 AsphShn 3 Other 1 CBlock 1 Name: Exterior2nd, dtype: int64
col_name: BsmtFinType1 --------- Unf 430 GLQ 418 ALQ 220 BLQ 148 Rec 133 LwQ 74 None 37 Name: BsmtFinType1, dtype: int64
col_name: BsmtFinType2 --------- Unf 1256 Rec 54 LwQ 46 None 38 BLQ 33 ALQ 19 GLQ 14 Name: BsmtFinType2, dtype: int64
col_name: Functional --------- Typ 1360 Min2 34 Min1 31 Mod 15 Maj1 14 Maj2 5 Sev 1 Name: Functional, dtype: int64
col_name: GarageType --------- Attchd 870 Detchd 387 BuiltIn 88 None 81 Basment 19 CarPort 9 2Types 6 Name: GarageType, dtype: int64
col_name: SaleType --------- WD 1267 New 122 COD 43 ConLD 9 ConLw 5 ConLI 5 CWD 4 Oth 3 Con 2 Name: SaleType, dtype: int64
#Print the cat_columns that have a high cardinality:
print(high_cdly)
{'Neighborhood': 25, 'Condition1': 9, 'Condition2': 8, 'HouseStyle': 8, 'RoofMatl': 8, 'Exterior1st': 15, 'Exterior2nd': 16, 'BsmtFinType1': 7, 'BsmtFinType2': 7, 'Functional': 7, 'GarageType': 7, 'SaleType': 9}
#defining plot:
plt.figure(figsize=(16,6))
plt.title('Stripplot for hypothesis testing (influence of Neighborhood on SalePrice)')
#calling stripplot
sns.stripplot(x = df_train['Neighborhood'], y = df_train['SalePrice'])
<AxesSubplot:title={'center':'Stripplot for hypothesis testing (influence of Neighborhood on SalePrice)'}, xlabel='Neighborhood', ylabel='SalePrice'>
#dictionary to store which columns to change
ToReplace = {}
#loop to add keys (column names) and values (categories to be replaced);
#a category is flagged when it appears in less than 10% of the rows
for col_ in high_cdly.keys():
    ValueList = []
    for u in list(df_train[col_].unique()):
        if (df_train[df_train[col_] == u].shape[0] < (0.1*(df_train[col_].shape[0]))):
            ValueList.append(u)
        else:
            continue
    ToReplace[col_] = ValueList
    del ValueList
#replacing the low-frequency categories with a single 'Other' label
for col in ToReplace.keys():
    df_train[col] = df_train[col].replace(ToReplace[col], 'Other')
    df_test[col] = df_test[col].replace(ToReplace[col], 'Other')
Finally, let's look into which features truly have a considerable correlation with the output variable. For that, we define a threshold: only numerical features whose correlation with SalePrice exceeds 0.35 are kept.
Then, we will create a heatmap to assess collinearity between the selected features.
#define a list of numerical columns in the dataframe
num_cols = list(set(df_train.columns) - set(cat))
#create a list of the columns that meet the correlation criterion:
ToKeep = [col for col in num_cols if df_train[col].corr(df_train['SalePrice']) > 0.35]
ToKeep.remove('SalePrice')
#plotting the collinearity heatmap:
dataCor = df_train[ToKeep].corr()
plt.figure(figsize=(24,9))
plt.title('Correlations Matrix')
sns.heatmap(dataCor, annot = True, cmap=plt.cm.Reds)
del dataCor
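If you prefer numbers to a heatmap, an optional quick check (a small sketch, not part of the original notebook) is to print each numerical feature's correlation with SalePrice in sorted order, which also makes the 0.35 cutoff easy to verify:
#optional: sorted correlations of the numerical columns with SalePrice
corr_with_target = df_train[num_cols].corr()['SalePrice'].drop('SalePrice')
print(corr_with_target.sort_values(ascending = False))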
Although some of these features are correlated with each other, it was decided to keep them, since variables such as 'OverallQual' and 'GarageCars' are very important for the model.
#select the columns to drop:
num_cols.remove('SalePrice')
ToDrop = list(set(num_cols)-set(ToKeep))
#update Training DataFrame:
df_train.drop(ToDrop, axis = 1, inplace = True)
It is always important to preprocess the data before applying the ML model. We saw that a few columns require imputing values for completeness, and others require encoding for better results.
The preprocessing steps applied here are: splitting the data into training and validation sets, imputing missing values, and encoding the categorical variables, using the handle_unknown parameter of the encoders to deal with categories that do not appear in the training data.
from sklearn.model_selection import train_test_split
#Select response variable & features
y = df_train['SalePrice']
X = df_train.drop(['SalePrice'], axis = 1)
#define train | valid data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
train_size=0.8, test_size=0.2,
random_state = 1)
#see if there are categories that are not present in train but appear in validation:
del cat
cat = [col for col in X_train.columns if (X_train[col].dtype == 'object')]
IsInTrain = [col for col in cat if set(X_valid[col]).issubset(X_train[col])]
NotInTrain = list(set(cat) - set(IsInTrain))
print('Columns with categories not in training:\n {}'.format(NotInTrain))
Columns with categories not in training: ['HeatingQC']
To handle those unseen categories (e.g. in 'HeatingQC'), we will pass 'use_encoded_value' as the handle_unknown argument of the OrdinalEncoder.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
#(remember that the categorical columns are already stored in the 'cat' object, and the numerical ones in 'ToKeep')
num = ToKeep #renaming the object to make later steps easier to read
#break cat_cols into ones to be OH and Ordinal encoded:
OE_cat = ['LotShape', 'LandContour', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']
OH_cat = list(set(cat) - set(OE_cat))
#create pipelines for imputing and encoding the categorical variables (note: separate lists for One-Hot and Ordinal encoding)
num_transf = SimpleImputer(strategy = 'mean')
OE_transf = Pipeline(steps = [
    ('Impute', SimpleImputer(strategy = 'most_frequent')),
    ('O_Encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = 26)) #unseen categories get the code 26
])
OH_transf = Pipeline(steps = [
    ('Impute', SimpleImputer(strategy = 'most_frequent')),
    ('OH_Encoder', OneHotEncoder(sparse = False, handle_unknown = 'ignore'))
])
#make a transformer to preprocess the numerical and categorical columns
preprocessor = ColumnTransformer(transformers = [
    ('num_transf', num_transf, num),
    ('OE_transf', OE_transf, OE_cat),
    ('OH_transf', OH_transf, OH_cat)
])
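To make the handle_unknown behaviour more concrete, here is a tiny standalone illustration using the OrdinalEncoder imported above (assuming scikit-learn >= 0.24, where 'use_encoded_value' was introduced): categories never seen during fit are mapped to the chosen unknown_value instead of raising an error.
#illustration only: unseen categories are encoded as unknown_value instead of raising an error
demo_enc = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
demo_enc.fit([['Ex'], ['Gd'], ['TA']])        #only three categories seen at fit time
print(demo_enc.transform([['Gd'], ['Po']]))   #'Po' was never seen -> encoded as -1.0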
Now we implement the Random Forest regression algorithm. We begin by importing the RandomForestRegressor class from scikit-learn, then create the final pipeline that will preprocess the data and build the model. Next, we use it to predict on the validation set and evaluate the results.
Note: 850 decision trees were assigned to the RF model based on experience; this number will be tuned later.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
#create model object:
RFmodel = RandomForestRegressor(n_estimators = 850,
                                criterion = 'mse') #note: 'mse' was renamed 'squared_error' in scikit-learn >= 1.0
RF_Pipeline = Pipeline(steps= [
    ('preprocessing', preprocessor),
    ('modeling', RFmodel)
])
RF_Pipeline.fit(X_train, y_train)
RFpred = RF_Pipeline.predict(X_valid)
RFscore = mean_absolute_error(RFpred, y_valid)
print('Mean Absolute Error:\n{:.2f}'.format(RFscore))
Mean Absolute Error: 16672.46
This Mean Absolute Error is the expected difference, in US dollars, between our prediction and the real value. Considering the range of prices in the data, that is a reasonable margin, and it gives us a reference point for improving the model's accuracy.
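To make the metric concrete, the MAE is simply the average absolute difference between predicted and real prices; a minimal equivalent computation (purely illustrative) would be:
#MAE by hand: average absolute difference between predictions and real prices
import numpy as np
manual_mae = np.mean(np.abs(RFpred - y_valid))
print('manual MAE: {:.2f}'.format(manual_mae))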
We can also check the fit visually with a regression line:
plt.figure(figsize = (8,3))
plt.xlabel('SalePrice predicted')
plt.title('Regression model')
sns.regplot(x = RFpred, y= y_valid)
<AxesSubplot:title={'center':'Regression model'}, xlabel='SalePrice predicted', ylabel='SalePrice'>
Considerations:
Using the scipy stats module, we can check whether the predicted variable fits a normal distribution. If not, we can transform the data so that it does.
from scipy import stats #module with statistical tests
import numpy as np
#plotting the graph
plt.figure(figsize = (12, 4))
stats.probplot(x = y, dist = 'norm', plot = plt)
((array([-3.30513952, -3.04793228, -2.90489705, ..., 2.90489705, 3.04793228, 3.30513952]), array([ 34900, 35311, 37900, ..., 625000, 745000, 755000], dtype=int64)), (74160.16474519417, 180921.19589041095, 0.9319665641512986))
#natural log transform:
y = np.log(y)
#plotting the graph
plt.figure(figsize = (12, 4))
stats.probplot(x = y, dist = 'norm', plot = plt)
((array([-3.30513952, -3.04793228, -2.90489705, ..., 2.90489705, 3.04793228, 3.30513952]), array([10.46024211, 10.47194981, 10.54270639, ..., 13.34550693, 13.5211395 , 13.53447303])), (0.39826223081618867, 12.024050901109383, 0.9953761475636611))
#Transform y_train and y_valid:
y_train, y_valid = np.log(y_train), np.log(y_valid)
Results = {}
#define a function to compute the validation MAE for a given number of trees
def mae_scoring(n_tree):
    ppl = Pipeline(steps=[
        ('prep', preprocessor),
        ('mod', RandomForestRegressor(n_estimators = n_tree,
                                      criterion = 'mse', random_state = 1))
    ])
    ppl.fit(X_train, y_train)
    RF_preds = ppl.predict(X_valid)
    #store the predictions in a dictionary (for later)
    Results[n_tree] = RF_preds
    #get the mean absolute error:
    return mean_absolute_error(y_valid, RF_preds)
#store the scores in a dictionary
Optimal_finding = {tree:mae_scoring(tree) for tree in list(range(350, 1051, 100))}
plt.plot(list(Optimal_finding.keys()),
list(Optimal_finding.values()))
plt.show()
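Rather than reading the optimum off the plot, the best tree count can also be picked programmatically (a small optional sketch):
#pick the n_estimators value with the lowest validation MAE
best_n = min(Optimal_finding, key = Optimal_finding.get)
print('best n_estimators: {} (MAE = {:.4f})'.format(best_n, Optimal_finding[best_n]))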
Great! We got the best model at 450 trees. Let's check the results of our RF model:
plt.figure(figsize = (8,3))
plt.xlabel('SalePrice predicted')
plt.title('Regression model (log scale)')
sns.regplot(x = Results[450], y= y_valid)
cor_df = pd.DataFrame({'predicted': Results[450], 'real': y_valid})
print('loss (mae):\n {}'.format(Optimal_finding[450]))
print('CORRELATION :\n {}'.format(cor_df['predicted'].corr(cor_df['real'])))
del cor_df
loss (mae):
 0.10084218511327964
CORRELATION :
 0.9384627181000369
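A note on interpreting this number: because the targets were log-transformed, an MAE of about 0.10 in log space corresponds roughly to a 10% relative error in dollar terms, since |log(pred) - log(real)| = |log(pred/real)|, which is approximately the relative error when errors are small. A quick, purely illustrative check:
#approximate relative error implied by the log-space predictions
import numpy as np
rel_err = np.mean(np.abs(np.exp(Results[450] - y_valid) - 1))
print('approximate relative error: {:.2%}'.format(rel_err))
With this Random Forest baseline in hand, the next step is to try a boosting-based model, XGBoost, on the same data.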
from xgboost import XGBRegressor
GBmodel = XGBRegressor(n_estimators = 1000, learning_rate = 0.05, n_jobs =2)
#encoding data:
GBtransformer = preprocessor.fit(X_train)
X_train = GBtransformer.transform(X_train)
X_valid = GBtransformer.transform(X_valid)
#fitting
GBmodel.fit(X_train, y_train,
eval_set=[(X_valid, y_valid)],
early_stopping_rounds = 5,
verbose = False
)
#predict
GBpred = GBmodel.predict(X_valid)
#evaluate
GBscore = mean_absolute_error(GBpred, y_valid)
print('gradient boosting scoring: {}'.format(GBscore))
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\sklearn.py:793: UserWarning: `early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead. warnings.warn(
gradient boosting scoring: 0.09702269791527932
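As the warning above notes, newer xgboost releases prefer early_stopping_rounds in the constructor rather than in fit(); an equivalent setup might look like this (a sketch, assuming xgboost >= 1.6):
#same model, but with early stopping configured in the constructor instead of fit()
GBmodel_v2 = XGBRegressor(n_estimators = 1000, learning_rate = 0.05, n_jobs = 2,
                          early_stopping_rounds = 5)
GBmodel_v2.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = False)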
plt.figure(figsize = (8,3))
plt.xlabel('XGB SalePrice predicted')
plt.title('Regression model (log scale)')
sns.regplot(x = GBpred, y= y_valid)
<AxesSubplot:title={'center':'Regression model'}, xlabel='XGB SalePrice predicted', ylabel='SalePrice'>
#get the exponentials to return it to the real values:
#RF predictions:
model_1= np.exp(Results[450])
#XGB predictions
model_2= np.exp(GBpred)
#models combined
model_3 = (model_1 / 2) + (model_2 / 2)
#real value:
y_real = np.exp(y_valid)
#plot Random Forest Model
fig = plt.figure(figsize = (16,8))
ax1 = fig.add_axes([0,0.5, 0.45, 0.5])
plt.title('Predicted RF vs real SalePrice', size = 12)
sns.kdeplot(x = y_real, label = 'RealPrice', color = 'blue', linewidth = 4, ax = ax1)
sns.kdeplot(x = model_1, label = 'RF predictor', color = 'orange', linewidth = 3, ax = ax1)
plt.legend()
#plot XGBoost model
ax2 = fig.add_axes([0.5, 0.5, 0.45, 0.5])
plt.title('Predicted XGBoost vs real SalePrice', size = 12)
sns.kdeplot(x = y_real, label = 'RealPrice', color = 'blue', linewidth = 4, ax = ax2)
sns.kdeplot(x = model_2, label = 'XGboost Predictor', color = 'red', linewidth = 3, ax = ax2)
plt.legend()
#plot composed model
ax3 = fig.add_axes([0,0, 0.95, 0.4])
plt.title('Predicted Combined vs real SalePrice', size = 12)
sns.kdeplot(x = y_real, label = 'RealPrice', color = 'blue', linewidth = 4, ax = ax3)
sns.kdeplot(x = model_3, label = 'Combined Predictor', color = 'green', linewidth = 3, ax = ax3)
plt.legend()
<matplotlib.legend.Legend at 0x1a3ff47c190>
fig = plt.figure(figsize = (16,8))
ax1 = fig.add_axes([0,0.5, 0.45, 0.5])
plt.title('Predicted RF vs real SalePrice', size = 12)
sns.regplot(x = y_real, y = model_1, color = 'orange', ax = ax1)
#plot XGBoost model
ax2 = fig.add_axes([0.5, 0.5, 0.45, 0.5])
plt.title('Predicted XGBoost vs real SalePrice', size = 12)
sns.regplot(x = y_real, y = model_2, color = 'red', ax = ax2)
#plot composed model
ax3 = fig.add_axes([0,0, 0.95, 0.4])
plt.title('Predicted Combined vs real SalePrice', size = 12)
sns.regplot(x = y_real, y = model_3, color = 'green', ax = ax3)
<matplotlib.axes._axes.Axes at 0x1a3ffa71940>
print('expected difference of Real Price VS Model Prediction: \n--------------')
print("1. RF MAE: {:.2%}".format((mean_absolute_error(model_1, y_real)/np.mean(y_real))))
print("2. XGBoost MAE: {:.2%}".format((mean_absolute_error(model_2, y_real)/np.mean(y_real))))
print("3. Combined MAE: {:.2%}".format((mean_absolute_error(model_3, y_real)/np.mean(y_real))))
expected difference of Real Price VS Model Prediction: -------------- 1. RF MAE: 9.90% 2. XGBoost MAE: 9.86% 3. Combined MAE: 9.51%
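The combined model above uses a simple 50/50 average. As an optional final experiment (a sketch, not part of the original analysis), the blend weight could also be searched on the validation set; keep in mind that tuning weights on the same validation data risks overfitting to it:
#try different weights for the RF / XGBoost blend
import numpy as np
for w in np.arange(0.0, 1.01, 0.1):
    blend = w * model_1 + (1 - w) * model_2
    print('RF weight {:.1f} -> MAE: {:.2f}'.format(w, mean_absolute_error(blend, y_real)))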
To summarize, we built an accurate model for predicting house prices in Ames, Iowa, by using the following: