ISLR notes and exercises from An Introduction to Statistical Learning

9. Support Vector Machines

Exercise 8: Using SVMs to classify Purchase in OJ dataset

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Preparing the data

Information on the OJ dataset can be found in the documentation for the ISLR R package.

oj = pd.read_csv('../../datasets/OJ.csv', index_col=0)
oj.reset_index(inplace=True, drop=True)
oj.head()
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
0 CH 237 1 1.75 1.99 0.00 0.0 0 0 0.500000 1.99 1.75 0.24 No 0.000000 0.000000 0.24 1
1 CH 239 1 1.75 1.99 0.00 0.3 0 1 0.600000 1.69 1.75 -0.06 No 0.150754 0.000000 0.24 1
2 CH 245 1 1.86 2.09 0.17 0.0 0 0 0.680000 2.09 1.69 0.40 No 0.000000 0.091398 0.23 1
3 MM 227 1 1.69 1.69 0.00 0.0 0 0 0.400000 1.69 1.69 0.00 No 0.000000 0.000000 0.00 1
4 CH 228 7 1.69 1.69 0.00 0.0 0 0 0.956535 1.69 1.69 0.00 Yes 0.000000 0.000000 0.00 0
oj.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 18 columns):
Purchase          1070 non-null object
WeekofPurchase    1070 non-null int64
StoreID           1070 non-null int64
PriceCH           1070 non-null float64
PriceMM           1070 non-null float64
DiscCH            1070 non-null float64
DiscMM            1070 non-null float64
SpecialCH         1070 non-null int64
SpecialMM         1070 non-null int64
LoyalCH           1070 non-null float64
SalePriceMM       1070 non-null float64
SalePriceCH       1070 non-null float64
PriceDiff         1070 non-null float64
Store7            1070 non-null object
PctDiscMM         1070 non-null float64
PctDiscCH         1070 non-null float64
ListPriceDiff     1070 non-null float64
STORE             1070 non-null int64
dtypes: float64(11), int64(5), object(2)
memory usage: 150.5+ KB
# drop redundant store indicators (STORE and Store7 duplicate
# information already in StoreID)
oj = oj.drop(columns=['STORE', 'Store7'])
oj.columns
Index(['Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
       'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
       'SalePriceCH', 'PriceDiff', 'PctDiscMM', 'PctDiscCH', 'ListPriceDiff'],
      dtype='object')
from sklearn.preprocessing import LabelEncoder

# label encode the Purchase response (CH -> 0, MM -> 1)
purchase_le = LabelEncoder()
purchase_le.fit(oj['Purchase'].values)
oj.loc[ : , 'Purchase'] = purchase_le.transform(oj['Purchase'])
purchase_le.classes_
array(['CH', 'MM'], dtype=object)
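The encoding can be inverted when the original labels are needed (a minimal sketch):

# map integer codes back to the original string labels
purchase_le.inverse_transform([0, 1])  # expected: array(['CH', 'MM'], dtype=object)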
from sklearn.preprocessing import MinMaxScaler

# scale all columns to the interval [0, 1] (manual min-max scaling;
# note that the models below are fit on the unscaled data)
oj_std = (oj - oj.min()) / (oj.max() - oj.min())
oj_std.describe()
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff PctDiscMM PctDiscCH ListPriceDiff
count 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000 1070.000000
mean 0.389720 0.536888 0.493302 0.443551 0.659019 0.103720 0.154206 0.147664 0.161682 0.565808 0.701861 0.607944 0.623272 0.147505 0.108093 0.495433
std 0.487915 0.305064 0.384831 0.254924 0.223976 0.234948 0.267292 0.354932 0.368331 0.307862 0.229725 0.204834 0.207300 0.253128 0.246282 0.244399
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.254902 0.166667 0.250000 0.500000 0.000000 0.000000 0.000000 0.000000 0.325267 0.454545 0.514286 0.511450 0.000000 0.000000 0.318182
50% 0.000000 0.588235 0.333333 0.425000 0.666667 0.000000 0.000000 0.000000 0.000000 0.600027 0.818182 0.671429 0.687023 0.000000 0.000000 0.545455
75% 1.000000 0.803922 1.000000 0.750000 0.816667 0.000000 0.287500 0.000000 0.000000 0.850916 0.854545 0.714286 0.755725 0.280282 0.000000 0.681818
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
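
This manual scaling is equivalent to the MinMaxScaler imported above; a minimal sketch of the sklearn version (not used in the rest of this exercise):

scaler = MinMaxScaler()
oj_scaled = pd.DataFrame(scaler.fit_transform(oj), columns=oj.columns)  # same values as oj_std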

a. Train test split

from sklearn.model_selection import train_test_split

X, Y = oj.drop(columns=['Purchase']), oj['Purchase']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=800)
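
No random_state is passed above, so the split, and every accuracy reported below, varies from run to run. A reproducible variant would pass a seed (the value here is an arbitrary illustration):

# hypothetical fixed seed for a reproducible split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=800, random_state=0)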

b. Linear SVC with C = 0.01

from sklearn.svm import SVC

linear_svc = SVC(kernel='linear', C=0.01)
linear_svc.fit(X_train, Y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

c. Training and test error rates for linear SVC

Note that accuracy_score returns accuracy rather than error, so the values below are accuracies; the corresponding error rates are 1 − accuracy.

from sklearn.metrics import accuracy_score

linear_svc_train_acc = accuracy_score(Y_train, linear_svc.predict(X_train))
linear_svc_test_acc = accuracy_score(Y_test, linear_svc.predict(X_test))

f'The linear SVC train accuracy is {linear_svc_train_acc}'
'The linear SVC train accuracy is 0.75125'
f'The linear SVC test accuracy is {linear_svc_test_acc}'
'The linear SVC test accuracy is 0.7333333333333333'
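
Accuracy alone hides which class is being misclassified; a confusion matrix gives the per-class breakdown (a minimal sketch; the counts depend on the random split above):

from sklearn.metrics import confusion_matrix

# rows are true classes (CH, MM), columns are predicted classes
confusion_matrix(Y_test, linear_svc.predict(X_test))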

d. Tuning cost parameter for linear SVC

from sklearn.model_selection import GridSearchCV

param = {'C': [0.01, 0.1, 1, 10]}
linear_svc = SVC(kernel='linear')
linear_svc_search = GridSearchCV(estimator=linear_svc,
                                 param_grid=param,
                                 cv=8,
                                 scoring='accuracy')
%timeit -n1 -r1 linear_svc_search.fit(X_train, Y_train)
3.33 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


linear_svc_search.best_params_
{'C': 1}
linear_svc_search.best_score_
0.82625
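
The individual cross-validation scores behind the chosen C can be inspected via cv_results_ (a sketch; the columns named here are part of the GridSearchCV API):

pd.DataFrame(linear_svc_search.cv_results_)[['param_C', 'mean_test_score', 'std_test_score']]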

e. Training and test error rates for linear SVC with optimized cost

linear_svc = SVC(kernel='linear', C=linear_svc_search.best_params_['C'])
linear_svc.fit(X_train, Y_train)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
from sklearn.metrics import accuracy_score

linear_svc_train_acc = accuracy_score(Y_train, linear_svc.predict(X_train))
linear_svc_test_acc = accuracy_score(Y_test, linear_svc.predict(X_test))

f'The linear SVC train accuracy is {linear_svc_train_acc}'
'The linear SVC train accuracy is 0.83125'
f'The linear SVC test accuracy is {linear_svc_test_acc}'
'The linear SVC test accuracy is 0.8555555555555555'
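
The fitted model also reports how many training points ended up as support vectors, one count per class (sketch):

# number of support vectors for each class (CH, MM)
linear_svc.n_support_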

f. Repeat b. - e. for radial SVC

Radial SVC with C = 0.01

from sklearn.svm import SVC

radial_svc = SVC(kernel='rbf', C=0.01)
radial_svc.fit(X_train, Y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

Training and test error rates for radial SVC

from sklearn.metrics import accuracy_score

radial_svc_train_acc = accuracy_score(Y_train, radial_svc.predict(X_train))
radial_svc_test_acc = accuracy_score(Y_test, radial_svc.predict(X_test))

f'The radial SVC train accuracy is {radial_svc_train_acc}'
'The radial SVC train accuracy is 0.6175'
f'The radial SVC test accuracy is {radial_svc_test_acc}'
'The radial SVC test accuracy is 0.5888888888888889'
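
A train accuracy of 0.6175 is roughly the share of the majority class (CH), which suggests that with C this small the radial SVC is effectively predicting CH for every observation. The class balance can be checked directly (sketch):

# proportion of each class in the training labels (0 = CH, 1 = MM)
Y_train.value_counts(normalize=True)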

Tuning cost parameter for radial SVC

from sklearn.model_selection import GridSearchCV

param = {'C': [0.01, 0.1, 1, 10]}
radial_svc = SVC(kernel='rbf')
radial_svc_search = GridSearchCV(estimator=radial_svc,
                                 param_grid=param,
                                 cv=8,
                                 scoring='accuracy')
%timeit -n1 -r1 radial_svc_search.fit(X_train, Y_train)
1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


radial_svc_search.best_params_
{'C': 10}
radial_svc_search.best_score_
0.795
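
Note that gamma is left at its sklearn default throughout; a fuller search would tune it jointly with C, along these lines (a hypothetical grid, not run here):

# hypothetical joint grid over C and gamma
param_full = {'C': [0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
radial_svc_search_full = GridSearchCV(estimator=SVC(kernel='rbf'),
                                      param_grid=param_full,
                                      cv=8,
                                      scoring='accuracy')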

Training and test error rates for radial SVC with optimized cost

radial_svc = SVC(kernel='rbf', C=radial_svc_search.best_params_['C'])
radial_svc.fit(X_train, Y_train)
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
radial_svc_train_acc = accuracy_score(Y_train, radial_svc.predict(X_train))
radial_svc_test_acc = accuracy_score(Y_test, radial_svc.predict(X_test))

f'The radial SVC train accuracy is {radial_svc_train_acc}'
'The radial SVC train accuracy is 0.85375'
f'The radial SVC test accuracy is {radial_svc_test_acc}'
'The radial SVC test accuracy is 0.8074074074074075'

g. Repeat b. - e. for quadratic SVC

Quadratic SVC with C = 0.01

from sklearn.svm import SVC

quad_svc = SVC(kernel='poly', degree=2, C=0.01)
quad_svc.fit(X_train, Y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

Training and test error rates for quadratic SVC

from sklearn.metrics import accuracy_score

quad_svc_train_acc = accuracy_score(Y_train, quad_svc.predict(X_train))
quad_svc_test_acc = accuracy_score(Y_test, quad_svc.predict(X_test))

f'The quadratic SVC train accuracy is {quad_svc_train_acc}'
'The quadratic SVC train accuracy is 0.83625'
f'The quadratic SVC test accuracy is {quad_svc_test_acc}'
'The quadratic SVC test accuracy is 0.8555555555555555'

Tuning cost parameter for quadratic SVC

from sklearn.model_selection import GridSearchCV

param = {'C': [0.01, 0.1, 1, 10]}
quad_svc = SVC(kernel='poly', degree=2)
quad_svc_search = GridSearchCV(estimator=quad_svc,
                               param_grid=param,
                               cv=8,
                               scoring='accuracy')
%timeit -n1 -r1 quad_svc_search.fit(X_train, Y_train)
quad_svc_search.best_params_
quad_svc_search.best_score_

Training and test error rates for quadratic SVC with optimized cost

quad_svc = SVC(kernel='poly', degree=2, C=quad_svc_search.best_params_['C'])
quad_svc.fit(X_train, Y_train)
quad_svc_train_acc = accuracy_score(Y_train, quad_svc.predict(X_train))
quad_svc_test_acc = accuracy_score(Y_test, quad_svc.predict(X_test))

f'The quadratic SVC train accuracy is {quad_svc_train_acc}'
f'The quadratic SVC test accuracy is {quad_svc_test_acc}'
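
Finally, the three tuned models can be compared side by side (a sketch assuming all cells above have been run; the exact numbers depend on the random split):

pd.DataFrame({'train accuracy': [linear_svc_train_acc, radial_svc_train_acc, quad_svc_train_acc],
              'test accuracy': [linear_svc_test_acc, radial_svc_test_acc, quad_svc_test_acc]},
             index=['linear', 'radial', 'quadratic'])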