8. Tree-based Methods
Exercise 11: Predicting `Purchase` in the `Caravan` dataset with a boosted tree classifier
Preparing the data
Information on the dataset can be found here
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Read the Caravan CSV using its first column as the index, then throw that
# index away in favour of a fresh 0-based one.
caravan = pd.read_csv('../../datasets/Caravan.csv', index_col=0).reset_index(drop=True)
caravan.head()
|
MOSTYPE |
MAANTHUI |
MGEMOMV |
MGEMLEEF |
MOSHOOFD |
MGODRK |
MGODPR |
MGODOV |
MGODGE |
MRELGE |
... |
APERSONG |
AGEZONG |
AWAOREG |
ABRAND |
AZEILPL |
APLEZIER |
AFIETS |
AINBOED |
ABYSTAND |
Purchase |
0 |
33 |
1 |
3 |
2 |
8 |
0 |
5 |
1 |
3 |
7 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
No |
1 |
37 |
1 |
2 |
2 |
8 |
1 |
4 |
1 |
4 |
6 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
No |
2 |
37 |
1 |
2 |
2 |
8 |
0 |
4 |
2 |
4 |
3 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
No |
3 |
9 |
1 |
3 |
3 |
3 |
2 |
3 |
2 |
4 |
5 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
No |
4 |
40 |
1 |
4 |
2 |
10 |
1 |
4 |
1 |
4 |
7 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
No |
5 rows × 86 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5822 entries, 0 to 5821
Data columns (total 86 columns):
MOSTYPE 5822 non-null int64
MAANTHUI 5822 non-null int64
MGEMOMV 5822 non-null int64
MGEMLEEF 5822 non-null int64
MOSHOOFD 5822 non-null int64
MGODRK 5822 non-null int64
MGODPR 5822 non-null int64
MGODOV 5822 non-null int64
MGODGE 5822 non-null int64
MRELGE 5822 non-null int64
MRELSA 5822 non-null int64
MRELOV 5822 non-null int64
MFALLEEN 5822 non-null int64
MFGEKIND 5822 non-null int64
MFWEKIND 5822 non-null int64
MOPLHOOG 5822 non-null int64
MOPLMIDD 5822 non-null int64
MOPLLAAG 5822 non-null int64
MBERHOOG 5822 non-null int64
MBERZELF 5822 non-null int64
MBERBOER 5822 non-null int64
MBERMIDD 5822 non-null int64
MBERARBG 5822 non-null int64
MBERARBO 5822 non-null int64
MSKA 5822 non-null int64
MSKB1 5822 non-null int64
MSKB2 5822 non-null int64
MSKC 5822 non-null int64
MSKD 5822 non-null int64
MHHUUR 5822 non-null int64
MHKOOP 5822 non-null int64
MAUT1 5822 non-null int64
MAUT2 5822 non-null int64
MAUT0 5822 non-null int64
MZFONDS 5822 non-null int64
MZPART 5822 non-null int64
MINKM30 5822 non-null int64
MINK3045 5822 non-null int64
MINK4575 5822 non-null int64
MINK7512 5822 non-null int64
MINK123M 5822 non-null int64
MINKGEM 5822 non-null int64
MKOOPKLA 5822 non-null int64
PWAPART 5822 non-null int64
PWABEDR 5822 non-null int64
PWALAND 5822 non-null int64
PPERSAUT 5822 non-null int64
PBESAUT 5822 non-null int64
PMOTSCO 5822 non-null int64
PVRAAUT 5822 non-null int64
PAANHANG 5822 non-null int64
PTRACTOR 5822 non-null int64
PWERKT 5822 non-null int64
PBROM 5822 non-null int64
PLEVEN 5822 non-null int64
PPERSONG 5822 non-null int64
PGEZONG 5822 non-null int64
PWAOREG 5822 non-null int64
PBRAND 5822 non-null int64
PZEILPL 5822 non-null int64
PPLEZIER 5822 non-null int64
PFIETS 5822 non-null int64
PINBOED 5822 non-null int64
PBYSTAND 5822 non-null int64
AWAPART 5822 non-null int64
AWABEDR 5822 non-null int64
AWALAND 5822 non-null int64
APERSAUT 5822 non-null int64
ABESAUT 5822 non-null int64
AMOTSCO 5822 non-null int64
AVRAAUT 5822 non-null int64
AAANHANG 5822 non-null int64
ATRACTOR 5822 non-null int64
AWERKT 5822 non-null int64
ABROM 5822 non-null int64
ALEVEN 5822 non-null int64
APERSONG 5822 non-null int64
AGEZONG 5822 non-null int64
AWAOREG 5822 non-null int64
ABRAND 5822 non-null int64
AZEILPL 5822 non-null int64
APLEZIER 5822 non-null int64
AFIETS 5822 non-null int64
AINBOED 5822 non-null int64
ABYSTAND 5822 non-null int64
Purchase 5822 non-null object
dtypes: int64(85), object(1)
memory usage: 3.8+ MB
# Dummy-encode the non-numeric columns. `Purchase` is the only object-typed
# column, and drop_first=True keeps a single indicator for it, which the
# cells below refer to as `Purchase_Yes`.
caravan = pd.get_dummies(caravan, drop_first=True)
a. Train test split
from sklearn.model_selection import train_test_split
# Separate the binary response from the predictors.
y = caravan['Purchase_Yes']
X = caravan.drop(columns=['Purchase_Yes'])

# Keep 1000 rows for training and hold out the rest for testing; the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=1000,
    random_state=27,
)
X_train.shape
b. Fit boosted tree model
from sklearn.ensemble import GradientBoostingClassifier
# Boosted trees: 1000 boosting stages with a shrinkage (learning rate) of 0.01;
# every other hyperparameter is left at the library default.
boost_clf = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=1000,
)
boost_clf.fit(X_train, y_train)
GradientBoostingClassifier(criterion='friedman_mse', init=None,
learning_rate=0.01, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=1000,
n_iter_no_change=None, presort='auto', random_state=None,
subsample=1.0, tol=0.0001, validation_fraction=0.1,
verbose=0, warm_start=False)
# One row per predictor holding the fitted model's importance score,
# ordered from most to least important.
importance_table = pd.DataFrame(
    data=boost_clf.feature_importances_,
    index=X.columns,
    columns=['Feature Importance'],
)
feat_imp = importance_table.sort_values(by='Feature Importance', ascending=False)
feat_imp
|
Feature Importance |
PBRAND |
7.989783e-02 |
MOPLLAAG |
7.138105e-02 |
MKOOPKLA |
6.779442e-02 |
MBERARBG |
6.553880e-02 |
PPERSAUT |
5.104349e-02 |
MSKD |
4.747047e-02 |
MINK7512 |
4.596157e-02 |
PPLEZIER |
3.968977e-02 |
MGODOV |
3.914403e-02 |
MOPLMIDD |
3.819071e-02 |
APLEZIER |
3.810555e-02 |
APERSAUT |
3.343816e-02 |
MOSTYPE |
2.926208e-02 |
PTRACTOR |
2.795271e-02 |
PBYSTAND |
2.634395e-02 |
ALEVEN |
2.426373e-02 |
MSKC |
2.391492e-02 |
MINK4575 |
2.262974e-02 |
MBERHOOG |
2.139981e-02 |
MBERARBO |
1.534339e-02 |
MFALLEEN |
1.528145e-02 |
MINKGEM |
1.474515e-02 |
MGEMOMV |
1.381983e-02 |
MGODPR |
1.217382e-02 |
MSKB2 |
1.195765e-02 |
MINK3045 |
9.918341e-03 |
MBERMIDD |
9.052936e-03 |
PLEVEN |
8.742057e-03 |
MOPLHOOG |
8.205575e-03 |
MFWEKIND |
7.063705e-03 |
... |
... |
MHKOOP |
6.941976e-04 |
MSKB1 |
6.914181e-04 |
MRELOV |
5.480189e-04 |
ATRACTOR |
2.495046e-04 |
PMOTSCO |
2.387955e-04 |
MBERBOER |
2.193347e-04 |
AMOTSCO |
2.157301e-04 |
MGODRK |
2.125493e-04 |
PWABEDR |
1.929167e-04 |
PWERKT |
1.205401e-04 |
AWERKT |
9.615065e-05 |
AFIETS |
8.984330e-05 |
PFIETS |
8.910239e-05 |
AWABEDR |
3.331575e-05 |
APERSONG |
2.180892e-05 |
PPERSONG |
1.190193e-05 |
PWALAND |
1.866996e-07 |
AWALAND |
1.618010e-07 |
AWAOREG |
0.000000e+00 |
AGEZONG |
0.000000e+00 |
AZEILPL |
0.000000e+00 |
AVRAAUT |
0.000000e+00 |
AAANHANG |
0.000000e+00 |
PGEZONG |
0.000000e+00 |
ABESAUT |
0.000000e+00 |
PAANHANG |
0.000000e+00 |
PBESAUT |
0.000000e+00 |
PZEILPL |
0.000000e+00 |
PWAOREG |
0.000000e+00 |
PVRAAUT |
0.000000e+00 |
85 rows × 1 columns
c. Predict `Purchase` and compare with KNN and Logistic Regression
Confusion Matrix and precision for Boosted Tree model
from sklearn.metrics import confusion_matrix
# Probability assigned to the positive class (second column of predict_proba).
boost_purchase_prob = boost_clf.predict_proba(X_test)[:, 1]

# Turn the 0/1 truth and the thresholded probabilities into 'Yes'/'No' labels;
# a customer is predicted 'Yes' when the purchase probability exceeds 0.2.
y_act = pd.Series(np.where(y_test == 1, 'Yes', 'No'), name='Actual')
y_pred = pd.Series(np.where(boost_purchase_prob > 0.2, 'Yes', 'No'), name='Predicted')

# Rows are actual labels, columns are predicted labels; margins=True appends
# the 'All' row/column totals.
boost_tree_conf = pd.crosstab(y_act, y_pred, margins=True)
boost_tree_conf
Predicted |
No |
Yes |
All |
Actual |
|
|
|
No |
4328 |
204 |
4532 |
Yes |
242 |
48 |
290 |
All |
4570 |
252 |
4822 |
# Precision: the fraction of people predicted to make a purchase that actually do.
# The confusion table is indexed as [actual, predicted], so the denominator must
# run down the predicted-'Yes' column: false positives ('No', 'Yes') plus true
# positives ('Yes', 'Yes'). Summing across the actual-'Yes' row instead would
# give the recall, not the precision.
boost_true_pos = boost_tree_conf.at['Yes', 'Yes']
boost_false_pos = boost_tree_conf.at['No', 'Yes']
boost_true_pos / (boost_false_pos + boost_true_pos)
Confusion matrix and precision for KNN model
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with library-default settings, fitted on the
# same training predictors as the boosted model (no rescaling is applied here).
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# Probability assigned to the positive class (second column of predict_proba).
knn_purchase_prob = knn_clf.predict_proba(X_test)[:, 1]

# 'Yes'/'No' labels for the truth and for predictions at the 0.2 threshold.
y_act = pd.Series(np.where(y_test == 1, 'Yes', 'No'), name='Actual')
y_pred = pd.Series(np.where(knn_purchase_prob > 0.2, 'Yes', 'No'), name='Predicted')

# Rows are actual labels, columns are predicted labels, plus 'All' totals.
knn_conf = pd.crosstab(y_act, y_pred, margins=True)
knn_conf
Predicted |
No |
Yes |
All |
Actual |
|
|
|
No |
4340 |
192 |
4532 |
Yes |
259 |
31 |
290 |
All |
4599 |
223 |
4822 |
# Precision: the fraction of people predicted to make a purchase that actually do.
# The confusion table is indexed as [actual, predicted], so the denominator must
# run down the predicted-'Yes' column: false positives ('No', 'Yes') plus true
# positives ('Yes', 'Yes'). Summing across the actual-'Yes' row instead would
# give the recall, not the precision.
knn_true_pos = knn_conf.at['Yes', 'Yes']
knn_false_pos = knn_conf.at['No', 'Yes']
knn_true_pos / (knn_false_pos + knn_true_pos)
Confusion matrix and precision for Logistic Regression
from sklearn.linear_model import LogisticRegression
# Logistic regression with library-default settings, fitted on the same
# training data as the other two models.
logreg_clf = LogisticRegression()
logreg_clf.fit(X_train, y_train)

# Probability assigned to the positive class (second column of predict_proba).
logreg_purchase_prob = logreg_clf.predict_proba(X_test)[:, 1]

# 'Yes'/'No' labels for the truth and for predictions at the 0.2 threshold.
y_act = pd.Series(np.where(y_test == 1, 'Yes', 'No'), name='Actual')
y_pred = pd.Series(np.where(logreg_purchase_prob > 0.2, 'Yes', 'No'), name='Predicted')

# Rows are actual labels, columns are predicted labels, plus 'All' totals.
logreg_conf = pd.crosstab(y_act, y_pred, margins=True)
logreg_conf
Predicted |
No |
Yes |
All |
Actual |
|
|
|
No |
4275 |
257 |
4532 |
Yes |
254 |
36 |
290 |
All |
4529 |
293 |
4822 |
# Precision: the fraction of people predicted to make a purchase that actually do.
# The confusion table is indexed as [actual, predicted], so the denominator must
# run down the predicted-'Yes' column: false positives ('No', 'Yes') plus true
# positives ('Yes', 'Yes'). Summing across the actual-'Yes' row instead would
# give the recall, not the precision.
logreg_true_pos = logreg_conf.at['Yes', 'Yes']
logreg_false_pos = logreg_conf.at['No', 'Yes']
logreg_true_pos / (logreg_false_pos + logreg_true_pos)