islr notes and exercises from An Introduction to Statistical Learning

9. Support Vector Machines

Exercise 6: SVCs for barely linearly separable data

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

a. Generate data and scatterplot

# Simulate 1000 points drawn uniformly from the unit square.  Points clearly
# above the line X_1 + X_2 = 1 are labelled +1, points clearly below it are
# labelled -1, and points inside the narrow band 0.95 <= X_1 + X_2 <= 1.05 get
# a random label, so the two classes are only barely linearly separable.
n_obs = 1000
data = pd.DataFrame({'X_1': np.random.uniform(size=n_obs),
                     'X_2': np.random.uniform(size=n_obs)})

coord_sum = data['X_1'] + data['X_2']
# One random label is drawn per row; only rows inside the band use theirs.
band_labels = np.random.choice([-1, 1], size=n_obs)
labels = np.where(coord_sum > 1.05, 1,
                  np.where(coord_sum < 0.95, -1, band_labels))
# Plain column assignment (rather than .loc[:, 'Y'] = ...) so the column
# really takes the downcast integer dtype.
data['Y'] = pd.to_numeric(labels, downcast='integer')
data.head()
X_1 X_2 Y
0 0.349514 0.564869 -1
1 0.336660 0.669171 1
2 0.452217 0.179149 -1
3 0.724898 0.141218 -1
4 0.509156 0.747757 1
# Scatter the simulated points in the (X_1, X_2) plane, coloured by class label.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=data, x='X_1', y='X_2', hue='Y')

b. Cross-validation error for SVC as a function of cost parameter

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Candidate values of the cost parameter C.
params = {'C': np.linspace(1, 50, 1000)}

# Cross-validated grid search over C for a linear-kernel support vector
# classifier, scored by accuracy.  No `cv` argument is passed, so
# scikit-learn's default splitting strategy is used.
svc = SVC(kernel='linear')
svc_search = GridSearchCV(svc,
                          param_grid=params,
                          scoring='accuracy').fit(data[['X_1', 'X_2']], data['Y'])
svc_search_df = pd.DataFrame(svc_search.cv_results_)

# Mean cross-validation accuracy as a function of C; the x-values reuse the
# grid that was searched so the two can never drift apart.
sns.lineplot(x=params['C'], y=svc_search_df['mean_test_score'])

# Value of C with the best mean cross-validation accuracy.
svc_search.best_params_
{'C': 3.305305305305305}

c. Test error for SVC as a function of cost parameter

# Generate an independent test set with the same scheme as the training data:
# uniform points on the unit square, labelled +1 when X_1 + X_2 > 1.05, -1 when
# X_1 + X_2 < 0.95, and randomly (+1 or -1) inside the band in between.
n_test = 1000
test_data = pd.DataFrame({'X_1': np.random.uniform(size=n_test),
                          'X_2': np.random.uniform(size=n_test)})

test_coord_sum = test_data['X_1'] + test_data['X_2']
# One random label is drawn per row; only rows inside the band use theirs.
test_band_labels = np.random.choice([-1, 1], size=n_test)
test_labels = np.where(test_coord_sum > 1.05, 1,
                       np.where(test_coord_sum < 0.95, -1, test_band_labels))
# Plain column assignment (rather than .loc[:, 'Y'] = ...) so the column
# really takes the downcast integer dtype.
test_data['Y'] = pd.to_numeric(test_labels, downcast='integer')
from sklearn.metrics import accuracy_score

# Train and test design matrices / label vectors.
X_train, Y_train = data[['X_1', 'X_2']], data['Y']
X_test, Y_test = test_data[['X_1', 'X_2']], test_data['Y']

# One linear SVC per value of C, keyed by that value.  The grid matches the
# one used for the cross-validation search so rows line up by position.
C_grid = np.linspace(1, 50, 1000)
svcs = {C: SVC(kernel='linear', C=C).fit(X_train, Y_train) for C in C_grid}

# NOTE: despite the `*_error` names, these hold accuracy scores (fraction of
# correct predictions), as does `mean_test_score` from the grid search.  The
# names are kept because the plotting code below refers to these columns.
svcs_train_errors = np.array([accuracy_score(Y_train, model.predict(X_train))
                              for model in svcs.values()])
svcs_test_errors = np.array([accuracy_score(Y_test, model.predict(X_test))
                             for model in svcs.values()])
svcs_errors_df = pd.DataFrame({'C': C_grid,
                               'train_error': svcs_train_errors,
                               'cv_error': svc_search_df['mean_test_score'],
                               'test_error': svcs_test_errors})
svcs_errors_df.head()
C train_error cv_error test_error
0 1.000000 0.953 0.952 0.954
1 1.049049 0.954 0.954 0.954
2 1.098098 0.954 0.953 0.954
3 1.147147 0.954 0.953 0.953
4 1.196196 0.954 0.954 0.954
# Overlay the train, cross-validation and test curves against C on one figure,
# each labelled with its column name.
plt.figure(figsize=(10, 8))
for curve in ('train_error', 'cv_error', 'test_error'):
    sns.lineplot(x='C', y=curve, data=svcs_errors_df, label=curve)
