9. Support Vector Machines
Exercise 8: Using SVMs to classify Purchase
in OJ
dataset
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
Preparing the data
Information on the dataset can be found here
oj = pd.read_csv('../../datasets/OJ.csv', index_col=0)
oj.reset_index(inplace=True, drop=True)
oj.head()
|
Purchase |
WeekofPurchase |
StoreID |
PriceCH |
PriceMM |
DiscCH |
DiscMM |
SpecialCH |
SpecialMM |
LoyalCH |
SalePriceMM |
SalePriceCH |
PriceDiff |
Store7 |
PctDiscMM |
PctDiscCH |
ListPriceDiff |
STORE |
0 |
CH |
237 |
1 |
1.75 |
1.99 |
0.00 |
0.0 |
0 |
0 |
0.500000 |
1.99 |
1.75 |
0.24 |
No |
0.000000 |
0.000000 |
0.24 |
1 |
1 |
CH |
239 |
1 |
1.75 |
1.99 |
0.00 |
0.3 |
0 |
1 |
0.600000 |
1.69 |
1.75 |
-0.06 |
No |
0.150754 |
0.000000 |
0.24 |
1 |
2 |
CH |
245 |
1 |
1.86 |
2.09 |
0.17 |
0.0 |
0 |
0 |
0.680000 |
2.09 |
1.69 |
0.40 |
No |
0.000000 |
0.091398 |
0.23 |
1 |
3 |
MM |
227 |
1 |
1.69 |
1.69 |
0.00 |
0.0 |
0 |
0 |
0.400000 |
1.69 |
1.69 |
0.00 |
No |
0.000000 |
0.000000 |
0.00 |
1 |
4 |
CH |
228 |
7 |
1.69 |
1.69 |
0.00 |
0.0 |
0 |
0 |
0.956535 |
1.69 |
1.69 |
0.00 |
Yes |
0.000000 |
0.000000 |
0.00 |
0 |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 18 columns):
Purchase 1070 non-null object
WeekofPurchase 1070 non-null int64
StoreID 1070 non-null int64
PriceCH 1070 non-null float64
PriceMM 1070 non-null float64
DiscCH 1070 non-null float64
DiscMM 1070 non-null float64
SpecialCH 1070 non-null int64
SpecialMM 1070 non-null int64
LoyalCH 1070 non-null float64
SalePriceMM 1070 non-null float64
SalePriceCH 1070 non-null float64
PriceDiff 1070 non-null float64
Store7 1070 non-null object
PctDiscMM 1070 non-null float64
PctDiscCH 1070 non-null float64
ListPriceDiff 1070 non-null float64
STORE 1070 non-null int64
dtypes: float64(11), int64(5), object(2)
memory usage: 150.5+ KB
# drop superfluous variables
oj = oj.drop(columns=['STORE', 'Store7'])
oj.columns
Index(['Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
'SalePriceCH', 'PriceDiff', 'PctDiscMM', 'PctDiscCH', 'ListPriceDiff'],
dtype='object')
from sklearn.preprocessing import LabelEncoder
# label encode string variable
purchase_le = LabelEncoder()
purchase_le.fit(oj['Purchase'].values)
oj.loc[ : , 'Purchase'] = purchase_le.transform(oj['Purchase'])
purchase_le.classes_
array(['CH', 'MM'], dtype=object)
from sklearn.preprocessing import MinMaxScaler
# scale all columns to interval [0, 1]
oj_std = (oj - oj.min()) / (oj.max() - oj.min())
oj_std.describe()
|
Purchase |
WeekofPurchase |
StoreID |
PriceCH |
PriceMM |
DiscCH |
DiscMM |
SpecialCH |
SpecialMM |
LoyalCH |
SalePriceMM |
SalePriceCH |
PriceDiff |
PctDiscMM |
PctDiscCH |
ListPriceDiff |
count |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
1070.000000 |
mean |
0.389720 |
0.536888 |
0.493302 |
0.443551 |
0.659019 |
0.103720 |
0.154206 |
0.147664 |
0.161682 |
0.565808 |
0.701861 |
0.607944 |
0.623272 |
0.147505 |
0.108093 |
0.495433 |
std |
0.487915 |
0.305064 |
0.384831 |
0.254924 |
0.223976 |
0.234948 |
0.267292 |
0.354932 |
0.368331 |
0.307862 |
0.229725 |
0.204834 |
0.207300 |
0.253128 |
0.246282 |
0.244399 |
min |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
25% |
0.000000 |
0.254902 |
0.166667 |
0.250000 |
0.500000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.325267 |
0.454545 |
0.514286 |
0.511450 |
0.000000 |
0.000000 |
0.318182 |
50% |
0.000000 |
0.588235 |
0.333333 |
0.425000 |
0.666667 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.600027 |
0.818182 |
0.671429 |
0.687023 |
0.000000 |
0.000000 |
0.545455 |
75% |
1.000000 |
0.803922 |
1.000000 |
0.750000 |
0.816667 |
0.000000 |
0.287500 |
0.000000 |
0.000000 |
0.850916 |
0.854545 |
0.714286 |
0.755725 |
0.280282 |
0.000000 |
0.681818 |
max |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
a. Train test split
from sklearn.model_selection import train_test_split
X, Y = oj.drop(columns=['Purchase']), oj['Purchase']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=800)
b. Linear SVC with C = 0.01
from sklearn.svm import SVC
linear_svc = SVC(kernel='linear', C=0.01)
linear_svc.fit(X_train, Y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='linear', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
c. Training and test error rates for linear SVC
from sklearn.metrics import accuracy_score
linear_svc_train_error = accuracy_score(linear_svc.predict(X_train), Y_train)
linear_svc_test_error = accuracy_score(linear_svc.predict(X_test), Y_test)
f'The linear SVC train error is {linear_svc_train_error}'
'The linear SVC train error is 0.75125'
f'The linear SVC test error is {linear_svc_test_error}'
'The linear SVC test error is 0.7333333333333333'
d. Tuning cost parameter for linear SVC
from sklearn.model_selection import GridSearchCV
param = {'C': [0.01, 0.1, 1, 10]}
linear_svc = SVC(kernel='linear')
linear_svc_search = GridSearchCV(estimator=linear_svc,
param_grid=param,
cv=8,
scoring='accuracy')
%timeit -n1 -r1 linear_svc_search.fit(X_train, Y_train)
3.33 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
/anaconda3/envs/islr/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
linear_svc_search.best_params_
linear_svc_search.best_score_
e. Training and test error rates for radial SVC with optimized cost
linear_svc = SVC(kernel='linear', C=linear_svc_search.best_params_['C'])
linear_svc.fit(X_train, Y_train)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='linear', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
from sklearn.metrics import accuracy_score
linear_svc_train_error = accuracy_score(linear_svc.predict(X_train), Y_train)
linear_svc_test_error = accuracy_score(linear_svc.predict(X_test), Y_test)
f'The linear SVC train error is {linear_svc_train_error}'
'The linear SVC train error is 0.83125'
f'The linear SVC test error is {linear_svc_test_error}'
'The linear SVC test error is 0.8555555555555555'
f. Repeat b. - e. for radial SVC
Radial SVC with C = 0.01
from sklearn.svm import SVC
radial_svc = SVC(kernel='rbf', C=0.01)
radial_svc.fit(X_train, Y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
Training and test error rates for radial SVC
from sklearn.metrics import accuracy_score
radial_svc_train_error = accuracy_score(radial_svc.predict(X_train), Y_train)
radial_svc_test_error = accuracy_score(radial_svc.predict(X_test), Y_test)
f'The radial SVC train error is {radial_svc_train_error}'
'The radial SVC train error is 0.6175'
f'The radial SVC test error is {radial_svc_test_error}'
'The radial SVC test error is 0.5888888888888889'
Tuning cost parameter for radial SVC
from sklearn.model_selection import GridSearchCV
param = {'C': [0.01, 0.1, 1, 10]}
radial_svc = SVC(kernel='rbf')
radial_svc_search = GridSearchCV(estimator=radial_svc,
param_grid=param,
cv=8,
scoring='accuracy')
%timeit -n1 -r1 radial_svc_search.fit(X_train, Y_train)
1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
/anaconda3/envs/islr/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
radial_svc_search.best_params_
radial_svc_search.best_score_
Training and test error rates for SVC with optimized cost
radial_svc = SVC(kernel='rbf', C=radial_svc_search.best_params_['C'])
radial_svc.fit(X_train, Y_train)
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
radial_svc_train_error = accuracy_score(radial_svc.predict(X_train), Y_train)
radial_svc_test_error = accuracy_score(radial_svc.predict(X_test), Y_test)
f'The radial SVC train error is {radial_svc_train_error}'
'The radial SVC train error is 0.85375'
f'The radial SVC test error is {radial_svc_test_error}'
'The radial SVC test error is 0.8074074074074075'
g. Repeat b. - e. for quadratic SVC
Quadratic SVC with C = 0.01
from sklearn.svm import SVC
quad_svc = SVC(kernel='poly', degree=2, C=0.01)
quad_svc.fit(X_train, Y_train)
SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=2, gamma='auto_deprecated',
kernel='poly', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
Training and test error rates for radial SVC
from sklearn.metrics import accuracy_score
quad_svc_train_error = accuracy_score(quad_svc.predict(X_train), Y_train)
quad_svc_test_error = accuracy_score(quad_svc.predict(X_test), Y_test)
f'The quadratic SVC train error is {quad_svc_train_error}'
'The quadratic SVC train error is 0.83625'
f'The quadratic SVC test error is {quad_svc_test_error}'
'The quadratic SVC test error is 0.8555555555555555'
Tuning cost parameter for radial SVC
from sklearn.model_selection import GridSearchCV
param = {'C': [0.01, 0.1, 1, 10]}
quad_svc = SVC(kernel='poly', degree=2)
quad_svc_search = GridSearchCV(estimator=quad_svc,
param_grid=param,
cv=8,
scoring='accuracy')
%timeit -n1 -r1 quad_svc_search.fit(X_train, Y_train)
quad_svc_search.best_params_
quad_svc_search.best_score_
Training and test error rates for SVC with optimized cost
quad_svc = SVC(kernel='', C=quad_svc_search.best_params_['C'])
quad_svc.fit(X_train, Y_train)
quad_svc_train_error = accuracy_score(quad_svc.predict(X_train), Y_train)
quad_svc_test_error = accuracy_score(quad_svc.predict(X_test), Y_test)
f'The quadratic SVC train error is {quad_svc_train_error}'
f'The quadratic SVC test error is {quad_svc_test_error}'