ISLR notes and exercises from An Introduction to Statistical Learning

8. Tree-based Methods

Exercise 11: Predicting Purchase in the Caravan dataset with a boosted tree classifier

Preparing the data

Information on the dataset can be found in the ISLR package documentation for the Caravan data set; each of the 5,822 rows is a customer, and Purchase records whether they bought a caravan insurance policy.

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
caravan = pd.read_csv('../../datasets/Caravan.csv', index_col=0)
caravan.reset_index(inplace=True, drop=True)
caravan.head()
MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE ... APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND Purchase
0 33 1 3 2 8 0 5 1 3 7 ... 0 0 0 1 0 0 0 0 0 No
1 37 1 2 2 8 1 4 1 4 6 ... 0 0 0 1 0 0 0 0 0 No
2 37 1 2 2 8 0 4 2 4 3 ... 0 0 0 1 0 0 0 0 0 No
3 9 1 3 3 3 2 3 2 4 5 ... 0 0 0 1 0 0 0 0 0 No
4 40 1 4 2 10 1 4 1 4 7 ... 0 0 0 1 0 0 0 0 0 No

5 rows × 86 columns

caravan.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5822 entries, 0 to 5821
Data columns (total 86 columns):
MOSTYPE     5822 non-null int64
MAANTHUI    5822 non-null int64
MGEMOMV     5822 non-null int64
MGEMLEEF    5822 non-null int64
MOSHOOFD    5822 non-null int64
MGODRK      5822 non-null int64
MGODPR      5822 non-null int64
MGODOV      5822 non-null int64
MGODGE      5822 non-null int64
MRELGE      5822 non-null int64
MRELSA      5822 non-null int64
MRELOV      5822 non-null int64
MFALLEEN    5822 non-null int64
MFGEKIND    5822 non-null int64
MFWEKIND    5822 non-null int64
MOPLHOOG    5822 non-null int64
MOPLMIDD    5822 non-null int64
MOPLLAAG    5822 non-null int64
MBERHOOG    5822 non-null int64
MBERZELF    5822 non-null int64
MBERBOER    5822 non-null int64
MBERMIDD    5822 non-null int64
MBERARBG    5822 non-null int64
MBERARBO    5822 non-null int64
MSKA        5822 non-null int64
MSKB1       5822 non-null int64
MSKB2       5822 non-null int64
MSKC        5822 non-null int64
MSKD        5822 non-null int64
MHHUUR      5822 non-null int64
MHKOOP      5822 non-null int64
MAUT1       5822 non-null int64
MAUT2       5822 non-null int64
MAUT0       5822 non-null int64
MZFONDS     5822 non-null int64
MZPART      5822 non-null int64
MINKM30     5822 non-null int64
MINK3045    5822 non-null int64
MINK4575    5822 non-null int64
MINK7512    5822 non-null int64
MINK123M    5822 non-null int64
MINKGEM     5822 non-null int64
MKOOPKLA    5822 non-null int64
PWAPART     5822 non-null int64
PWABEDR     5822 non-null int64
PWALAND     5822 non-null int64
PPERSAUT    5822 non-null int64
PBESAUT     5822 non-null int64
PMOTSCO     5822 non-null int64
PVRAAUT     5822 non-null int64
PAANHANG    5822 non-null int64
PTRACTOR    5822 non-null int64
PWERKT      5822 non-null int64
PBROM       5822 non-null int64
PLEVEN      5822 non-null int64
PPERSONG    5822 non-null int64
PGEZONG     5822 non-null int64
PWAOREG     5822 non-null int64
PBRAND      5822 non-null int64
PZEILPL     5822 non-null int64
PPLEZIER    5822 non-null int64
PFIETS      5822 non-null int64
PINBOED     5822 non-null int64
PBYSTAND    5822 non-null int64
AWAPART     5822 non-null int64
AWABEDR     5822 non-null int64
AWALAND     5822 non-null int64
APERSAUT    5822 non-null int64
ABESAUT     5822 non-null int64
AMOTSCO     5822 non-null int64
AVRAAUT     5822 non-null int64
AAANHANG    5822 non-null int64
ATRACTOR    5822 non-null int64
AWERKT      5822 non-null int64
ABROM       5822 non-null int64
ALEVEN      5822 non-null int64
APERSONG    5822 non-null int64
AGEZONG     5822 non-null int64
AWAOREG     5822 non-null int64
ABRAND      5822 non-null int64
AZEILPL     5822 non-null int64
APLEZIER    5822 non-null int64
AFIETS      5822 non-null int64
AINBOED     5822 non-null int64
ABYSTAND    5822 non-null int64
Purchase    5822 non-null object
dtypes: int64(85), object(1)
memory usage: 3.8+ MB
# one-hot encode the lone categorical column: Purchase ('Yes'/'No') becomes a 0/1 Purchase_Yes dummy
caravan = pd.get_dummies(caravan, drop_first=True)

a. Train test split

from sklearn.model_selection import train_test_split

X, y = caravan.drop(columns=['Purchase_Yes']), caravan['Purchase_Yes']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1000, random_state=27)
X_train.shape
(1000, 85)
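
The textbook asks for a training set consisting of the first 1,000 observations rather than a random sample of 1,000. A minimal sketch of that deterministic split, assuming the same X and y defined above, in case you want to reproduce the book's exact split:

# book's split: first 1,000 rows as the training set, the remaining 4,822 as the test set
X_train, X_test = X.iloc[:1000], X.iloc[1000:]
y_train, y_test = y.iloc[:1000], y.iloc[1000:]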

b. Fit boosted tree model

from sklearn.ensemble import GradientBoostingClassifier

# 1,000 trees with a shrinkage (learning rate) of 0.01, as the exercise specifies
boost_clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01)
boost_clf.fit(X_train, y_train)
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
feat_imp = pd.DataFrame({'Feature Importance': boost_clf.feature_importances_},
                        index=X.columns).sort_values(by='Feature Importance', ascending=False)

feat_imp
Feature Importance
PBRAND 7.989783e-02
MOPLLAAG 7.138105e-02
MKOOPKLA 6.779442e-02
MBERARBG 6.553880e-02
PPERSAUT 5.104349e-02
MSKD 4.747047e-02
MINK7512 4.596157e-02
PPLEZIER 3.968977e-02
MGODOV 3.914403e-02
MOPLMIDD 3.819071e-02
APLEZIER 3.810555e-02
APERSAUT 3.343816e-02
MOSTYPE 2.926208e-02
PTRACTOR 2.795271e-02
PBYSTAND 2.634395e-02
ALEVEN 2.426373e-02
MSKC 2.391492e-02
MINK4575 2.262974e-02
MBERHOOG 2.139981e-02
MBERARBO 1.534339e-02
MFALLEEN 1.528145e-02
MINKGEM 1.474515e-02
MGEMOMV 1.381983e-02
MGODPR 1.217382e-02
MSKB2 1.195765e-02
MINK3045 9.918341e-03
MBERMIDD 9.052936e-03
PLEVEN 8.742057e-03
MOPLHOOG 8.205575e-03
MFWEKIND 7.063705e-03
... ...
MHKOOP 6.941976e-04
MSKB1 6.914181e-04
MRELOV 5.480189e-04
ATRACTOR 2.495046e-04
PMOTSCO 2.387955e-04
MBERBOER 2.193347e-04
AMOTSCO 2.157301e-04
MGODRK 2.125493e-04
PWABEDR 1.929167e-04
PWERKT 1.205401e-04
AWERKT 9.615065e-05
AFIETS 8.984330e-05
PFIETS 8.910239e-05
AWABEDR 3.331575e-05
APERSONG 2.180892e-05
PPERSONG 1.190193e-05
PWALAND 1.866996e-07
AWALAND 1.618010e-07
AWAOREG 0.000000e+00
AGEZONG 0.000000e+00
AZEILPL 0.000000e+00
AVRAAUT 0.000000e+00
AAANHANG 0.000000e+00
PGEZONG 0.000000e+00
ABESAUT 0.000000e+00
PAANHANG 0.000000e+00
PBESAUT 0.000000e+00
PZEILPL 0.000000e+00
PWAOREG 0.000000e+00
PVRAAUT 0.000000e+00

85 rows × 1 columns
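
The top of this table is easier to digest as a plot. A minimal sketch using the matplotlib/seaborn setup imported above (the cutoff at ten features is an arbitrary choice):

# horizontal bar chart of the ten largest feature importances
feat_imp.head(10).sort_values(by='Feature Importance').plot.barh(legend=False)
plt.xlabel('Feature Importance')
plt.title('Top 10 predictors in the boosted tree model');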

c. Predict Purchase and compare with KNN and logistic regression

Confusion matrix and precision for boosted tree model

from sklearn.metrics import confusion_matrix

y_act = pd.Series(['Yes' if entry == 1 else 'No' for entry in y_test],
                  name='Actual')
y_pred = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in boost_clf.predict_proba(X_test)],
                        name='Predicted')
boost_tree_conf = pd.crosstab(y_act, y_pred, margins=True)
boost_tree_conf
Predicted No Yes All
Actual
No 4328 204 4532
Yes 242 48 290
All 4570 252 4822
# fraction of people predicted to make a purchase that actually do - this is the "precision",
# i.e. TP / (TP + FP), read down the Predicted 'Yes' column
boost_tree_conf.at['Yes', 'Yes']/(boost_tree_conf.at['No', 'Yes'] + boost_tree_conf.at['Yes', 'Yes'])
0.19047619047619047
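
The same number can be cross-checked with scikit-learn's built-in metric, treating 'Yes' as the positive class. A minimal sketch, assuming the y_act and y_pred series defined above:

from sklearn.metrics import precision_score

# precision = TP / (TP + FP); should agree with the confusion-matrix calculation above
precision_score(y_act, y_pred, pos_label='Yes')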

Confusion matrix and precision for KNN model

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()  # defaults: n_neighbors=5, Euclidean distance on the unscaled features
knn_clf.fit(X_train, y_train)

y_act = pd.Series(['Yes' if entry == 1 else 'No' for entry in y_test],
                  name='Actual')
y_pred = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in knn_clf.predict_proba(X_test)],
                        name='Predicted')
knn_conf = pd.crosstab(y_act, y_pred, margins=True)
knn_conf
Predicted No Yes All
Actual
No 4340 192 4532
Yes 259 31 290
All 4599 223 4822
# fraction of people predicted to make a purchase that actually do - this is the "precision"
knn_conf.at['Yes', 'Yes']/(knn_conf.at['No', 'Yes'] + knn_conf.at['Yes', 'Yes'])
0.13901345291479822
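
KNN is distance-based, so predictors on larger scales can dominate the distance computation; the book's KNN lab on this data standardizes the predictors first. A minimal sketch of rerunning the comparison with scaling, where the use of StandardScaler and the default n_neighbors=5 are assumptions rather than anything prescribed by the exercise:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# standardize the predictors before computing neighbor distances
knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_scaled.fit(X_train, y_train)
y_pred_scaled = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in knn_scaled.predict_proba(X_test)],
                          name='Predicted')
pd.crosstab(y_act, y_pred_scaled, margins=True)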

Confusion matrix and precision for Logistic Regression

from sklearn.linear_model import LogisticRegression

logreg_clf = LogisticRegression()
logreg_clf.fit(X_train, y_train)

y_act = pd.Series(['Yes' if entry == 1 else 'No' for entry in y_test],
                  name='Actual')
y_pred = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in logreg_clf.predict_proba(X_test)],
                        name='Predicted')
logreg_conf = pd.crosstab(y_act, y_pred, margins=True)
logreg_conf
Predicted No Yes All
Actual
No 4275 257 4532
Yes 254 36 290
All 4529 293 4822
# fraction of people predicted to make a purchase that actually do - this is the "precision"
logreg_conf.at['Yes', 'Yes']/(logreg_conf.at['No', 'Yes'] + logreg_conf.at['Yes', 'Yes'])
0.12286689419795221
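
To answer the comparison part of the exercise directly, the three precisions can be collected side by side from the confusion matrices computed above; a minimal sketch:

# precision for each model, taken from the Predicted 'Yes' column of its confusion matrix
pd.Series({'Boosted tree': boost_tree_conf.at['Yes', 'Yes'] / boost_tree_conf.at['All', 'Yes'],
           'KNN': knn_conf.at['Yes', 'Yes'] / knn_conf.at['All', 'Yes'],
           'Logistic regression': logreg_conf.at['Yes', 'Yes'] / logreg_conf.at['All', 'Yes']},
          name='Precision')

At the 20% probability cutoff, the boosted tree comes out ahead with a precision of about 0.19, versus roughly 0.14 for KNN and 0.12 for logistic regression.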