ISLR notes and exercises from An Introduction to Statistical Learning

9. Support Vector Machines

Exercise 7: Using SVMs to classify mileage in the Auto dataset

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Preparing the data

auto = pd.read_csv('../../datasets/Auto.csv', index_col=0)
auto.reset_index(inplace=True, drop=True)
auto.head()
mpg cylinders displacement horsepower weight acceleration year origin name
0 18.0 8 307.0 130 3504 12.0 70 1 chevrolet chevelle malibu
1 15.0 8 350.0 165 3693 11.5 70 1 buick skylark 320
2 18.0 8 318.0 150 3436 11.0 70 1 plymouth satellite
3 16.0 8 304.0 150 3433 12.0 70 1 amc rebel sst
4 17.0 8 302.0 140 3449 10.5 70 1 ford torino
auto.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null int64
weight          392 non-null int64
acceleration    392 non-null float64
year            392 non-null int64
origin          392 non-null int64
name            392 non-null object
dtypes: float64(3), int64(5), object(1)
memory usage: 27.6+ KB

a. Create high/low mileage variable and preprocess

# add mileage binary variable
auto['high_mpg'] = (auto['mpg'] > auto['mpg'].median()).astype(int)

auto.head()
mpg cylinders displacement horsepower weight acceleration year origin name high_mpg
0 18.0 8 307.0 130 3504 12.0 70 1 chevrolet chevelle malibu 0
1 15.0 8 350.0 165 3693 11.5 70 1 buick skylark 320 0
2 18.0 8 318.0 150 3436 11.0 70 1 plymouth satellite 0
3 16.0 8 304.0 150 3433 12.0 70 1 amc rebel sst 0
4 17.0 8 302.0 140 3449 10.5 70 1 ford torino 0
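
Since high_mpg is defined by a median split, the two classes should be close to balanced; a quick check:

# the median split should give roughly equal class counts
auto['high_mpg'].value_counts()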

Following the usual advice for SVMs, we'll min-max scale the data to the interval [0, 1] so that no single predictor dominates the fit.

# drop the name column and min-max scale every remaining column to [0, 1]
df = auto.drop(columns=['name'])
df = (df - df.min())/(df.max() - df.min())
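
An equivalent route, sketched here with scikit-learn's MinMaxScaler (assuming the same df as above), keeps a fitted scaler around in case we want to apply the same transformation to new data:

from sklearn.preprocessing import MinMaxScaler

# fit the scaler on all columns of df and return a scaled copy
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)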

b. Linear SVC

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# rough param tuning
param = {'C': np.logspace(0, 9, 10)}
linear_svc = SVC(kernel='linear')
linear_svc_search = GridSearchCV(estimator=linear_svc,
                                 param_grid=param,
                                 cv=7,
                                 scoring='accuracy')
# predictors are the scaled columns (note that mpg itself is still among them);
# the target is the unscaled binary label
X, Y = df.drop(columns=['high_mpg']), auto['high_mpg']
%timeit -n1 -r1 linear_svc_search.fit(X, Y)
804 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
linear_svc_search_df = pd.DataFrame(linear_svc_search.cv_results_)
linear_svc_search_df[['param_C', 'mean_test_score']]
param_C mean_test_score
0 1 0.908163
1 10 0.969388
2 100 0.984694
3 1000 0.997449
4 10000 0.997449
5 100000 0.997449
6 1e+06 0.997449
7 1e+07 0.997449
8 1e+08 0.997449
9 1e+09 0.997449
linear_svc_search.best_params_
{'C': 1000.0}
linear_svc_search.best_score_
0.9974489795918368
# fine param tuning
param = {'C': np.linspace(500, 1500, 1000)}
linear_svc = SVC(kernel='linear')
linear_svc_search = GridSearchCV(estimator=linear_svc,
                                 param_grid=param,
                                 cv=7,
                                 scoring='accuracy')
%timeit -n1 -r1 linear_svc_search.fit(X, Y)
54.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
linear_svc_search.best_params_
{'C': 696.1961961961962}
linear_svc_search.best_score_
0.9974489795918368
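
These accuracies are cross-validation scores computed on the full dataset. As a sanity check we could also score the tuned model on a held-out split; a minimal sketch (the 0.3 test fraction and random_state are arbitrary choices):

from sklearn.model_selection import train_test_split

# hold out a stratified test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=0, stratify=Y)
# refit the tuned linear SVC on the training split and score it on the held-out split
best_linear = SVC(kernel='linear', C=linear_svc_search.best_params_['C'])
best_linear.fit(X_train, Y_train)
best_linear.score(X_test, Y_test)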

c. Nonlinear SVCs

Polynomial SVC

# rough param tuning
params = {'C': np.logspace(-4, 4, 9),
          'gamma': np.logspace(-4, 4, 9),
          'degree': [2, 3]}
poly_svc = SVC(kernel='poly')
poly_svc_search = GridSearchCV(estimator=poly_svc,
                                 param_grid=params,
                                 cv=7,
                                 scoring='accuracy')
%timeit -n1 -r1 poly_svc_search.fit(X, Y)
8.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
poly_svc_search.best_params_
{'C': 0.001, 'degree': 2, 'gamma': 100.0}
poly_svc_search.best_score_
0.9617346938775511
# fine param tuning around the rough optimum (degree fixed at 2)
params = {'C': np.linspace(0.0001, 0.01, 20),
          'gamma': np.linspace(50, 150, 20)}
poly_svc = SVC(kernel='poly', degree=2)
poly_svc_search = GridSearchCV(estimator=poly_svc,
                                 param_grid=params,
                                 cv=7,
                                 scoring='accuracy')
%timeit -n1 -r1 poly_svc_search.fit(X, Y)
19.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
poly_svc_search.best_params_
{'C': 0.0068736842105263166, 'gamma': 50.0}
poly_svc_search.best_score_
0.9668367346938775
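
To gauge how complex the selected quadratic boundary is, we can refit the winning parameters and count support vectors; a minimal sketch (best_estimator_ is available because GridSearchCV refits on the full data by default):

# support vectors per class for the tuned quadratic SVC
best_poly = poly_svc_search.best_estimator_
best_poly.n_support_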

Radial SVC

# rough param tuning
params = {'C': np.logspace(-4, 4, 9),
          'gamma': np.logspace(-4, 4, 9)}
radial_svc = SVC(kernel='rbf')
radial_svc_search = GridSearchCV(estimator=radial_svc,
                                 param_grid=params,
                                 cv=7,
                                 scoring='accuracy')
%timeit -n1 -r1 radial_svc_search.fit(X, Y)
5.34 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
radial_svc_search.best_params_
{'C': 1000.0, 'gamma': 0.01}
radial_svc_search.best_score_
0.9795918367346939
# fine param tuning around the rough optimum
params = {'C': np.logspace(4, 9, 5),
          'gamma': np.linspace(0.001, 0.1, 100)}
radial_svc = SVC(kernel='rbf')
radial_svc_search = GridSearchCV(estimator=radial_svc,
                                 param_grid=params,
                                 cv=7,
                                 scoring='accuracy')
%timeit -n1 -r1 radial_svc_search.fit(X, Y)
20.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
radial_svc_search.best_params_
{'C': 177827.94100389228, 'gamma': 0.002}
radial_svc_search.best_score_
0.9974489795918368
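
The fine grid here evaluates 500 (C, gamma) combinations with 7-fold CV. A cheaper alternative would be a randomized search over the same region; a sketch (not run here, with arbitrary n_iter and random_state):

from sklearn.model_selection import RandomizedSearchCV

# sample 50 (C, gamma) pairs from the same region instead of the full grid
param_dist = {'C': np.logspace(4, 9, 100),
              'gamma': np.linspace(0.001, 0.1, 100)}
radial_svc_rand = RandomizedSearchCV(estimator=SVC(kernel='rbf'),
                                     param_distributions=param_dist,
                                     n_iter=50, cv=7, scoring='accuracy',
                                     random_state=0)
# radial_svc_rand.fit(X, Y)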

d. CV error plots

Linear SVC

linear_svc_df = pd.DataFrame(linear_svc_search.cv_results_)
sns.lineplot(x=linear_svc_df['param_C'], y=linear_svc_df['mean_test_score'])
[Figure: mean CV accuracy against cost for the fine-tuned linear SVC]

Quadratic SVC

from mpl_toolkits import mplot3d

poly_svc_df = pd.DataFrame(poly_svc_search.cv_results_)

cost, gamma = poly_svc_df['param_C'].unique(), poly_svc_df['param_gamma'].unique()

# cv_results_ rows vary gamma fastest, so reshape by (len(cost), len(gamma)) and
# transpose to match the (gamma, cost)-shaped meshgrid; use new names so the
# training X, Y defined earlier are not overwritten
CC, GG = np.meshgrid(cost, gamma)
Z = poly_svc_df['mean_test_score'].values.reshape(len(cost), len(gamma)).T

fig = plt.figure(figsize=(10, 8))
ax = plt.axes(projection='3d')
ax.plot_surface(CC, GG, Z, rstride=1, cstride=1,
                cmap='Greys', edgecolor='none')
ax.set_xlabel('cost')
ax.set_ylabel('gamma')
ax.set_zlabel('cv_accuracy')
ax.view_init(20, 135)

[Figure: CV accuracy surface over cost and gamma for the quadratic SVC]

radial_svc_df = pd.DataFrame(radial_svc_search.cv_results_)

cost, gamma = radial_svc_df['param_C'].unique(), radial_svc_df['param_gamma'].unique()

# same layout as above: reshape scores by (len(cost), len(gamma)), then transpose
CC, GG = np.meshgrid(cost, gamma)
Z = radial_svc_df['mean_test_score'].values.reshape(len(cost), len(gamma)).T

fig = plt.figure(figsize=(10, 8))
ax = plt.axes(projection='3d')
ax.plot_surface(CC, GG, Z, rstride=1, cstride=1,
                cmap='Greys', edgecolor='none')
ax.set_xlabel('cost')
ax.set_ylabel('gamma')
ax.set_zlabel('cv_accuracy')
ax.view_init(20, 15)

[Figure: CV accuracy surface over cost and gamma for the radial SVC]
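
A 2-D heatmap of the same cv_results_ can be easier to read than the surface; a minimal sketch for the radial search, pivoting the parameter columns against mean_test_score:

# pivot mean CV accuracy into a gamma-by-cost table and plot it as a heatmap
radial_scores = radial_svc_df.pivot_table(index='param_gamma', columns='param_C',
                                          values='mean_test_score')
fig, ax = plt.subplots(figsize=(6, 10))
sns.heatmap(radial_scores, cmap='Greys', ax=ax)
ax.set_xlabel('cost')
ax.set_ylabel('gamma');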