using dis
in Boston
dataset
Information on the dataset can be found here.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style(style='whitegrid')
# Load the Boston data. The CSV's first column is read as the index and then
# discarded in favour of a fresh default integer index.
boston = pd.read_csv('../../datasets/Boston.csv', index_col=0).reset_index(drop=True)
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
crim 506 non-null float64
zn 506 non-null float64
indus 506 non-null float64
chas 506 non-null int64
nox 506 non-null float64
rm 506 non-null float64
age 506 non-null float64
dis 506 non-null float64
rad 506 non-null int64
tax 506 non-null int64
ptratio 506 non-null float64
black 506 non-null float64
lstat 506 non-null float64
medv 506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.4 KB
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Cubic polynomial regression of `nox` (response) on `dis` (predictor).
poly = PolynomialFeatures(degree=3)
linreg = LinearRegression()
X, y = boston['dis'].values, boston['nox'].values
# Standardize both variables to zero mean and unit standard deviation.
X, y = (X - X.mean())/X.std(), (y - y.mean())/y.std()
# X is 1-D; reshape it to a single column before expanding it into
# polynomial features and fitting.
linreg.fit(poly.fit_transform(X.reshape(-1, 1)), y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)
fig = plt.figure(figsize=(10, 7))
# Red curve: predictions of the fitted polynomial model at the observed X.
sns.lineplot(
    x=X,
    y=linreg.predict(poly.fit_transform(X.reshape(-1, 1))),
    color='red',
)
# Grey points: the (standardized) observations.
sns.scatterplot(
    x=X,
    y=y,
    color='grey',
    alpha=0.5,
)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2329f320>
# Fit polynomial regressions of degree 1 through 10 and draw each fit in its
# own panel of a 5x2 grid.
regs = {d: None for d in range(1, 11)}
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(12, 20))
for (i, d) in enumerate(regs):
    poly = PolynomialFeatures(degree=d)
    # Expand X once and reuse the design matrix for fitting and prediction.
    X_poly = poly.fit_transform(X.reshape(-1, 1))
    linreg = LinearRegression().fit(X_poly, y)
    # Draw on the i-th axes created by plt.subplots (row-major order).
    panel = axs.flat[i]
    sns.lineplot(x=X, y=linreg.predict(X_poly), color='red', ax=panel)
    sns.scatterplot(x=X, y=y, color='grey', alpha=0.5, ax=panel)
    panel.set_xlabel('degree ' + str(d))
We’ll estimate the root mean squared error using 10-fold cross validation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Choose the polynomial degree by 10-fold cross-validation. The pipeline
# expands X into polynomial features and then fits a linear regression.
poly_reg_pipe = Pipeline([('poly', PolynomialFeatures()), ('linreg', LinearRegression())])
# Candidate degrees 1..9 (np.arange excludes the upper bound).
poly_reg_params = {'poly__degree': np.arange(1, 10)}
# scikit-learn maximizes scores, so MSE is supplied negated.
poly_reg_search = GridSearchCV(estimator=poly_reg_pipe, param_grid=poly_reg_params, cv=10,
                               scoring='neg_mean_squared_error')
poly_reg_search.fit(X.reshape(-1, 1), y)
/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/ DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
GridSearchCV(cv=10, error_score='raise-deprecating',
steps=[('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('linreg', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
fit_params=None, iid='warn', n_jobs=None,
param_grid={'poly__degree': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring='neg_mean_squared_error', verbose=0)
{'poly__degree': 3}
For this we'll use the `patsy` Python module. This blog post was helpful.
We're using the default choice for the knots (equally spaced quantiles) and the default degree (3) of patsy's `bs`.
from patsy import dmatrix
# Build a B-spline basis of X with 4 degrees of freedom via patsy's bs(),
# then regress y on that basis.
X_tr = dmatrix("bs(x, df=4)", {'x': X})
spline_reg = LinearRegression()
spline_reg.fit(X_tr, y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
fig = plt.figure(figsize=(10, 7))
# Red curve: spline regression predictions on the basis matrix X_tr.
sns.lineplot(
    x=X,
    y=spline_reg.predict(X_tr),
    color='red',
)
# Grey points: the observations.
sns.scatterplot(
    x=X,
    y=y,
    color='grey',
    alpha=0.5,
)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2338a240>
from sklearn.metrics import mean_squared_error
# Root mean squared error of the spline fit on the data it was fitted to.
# Arguments are given as (y_true, y_pred); MSE is symmetric in the two.
np.sqrt(mean_squared_error(y, spline_reg.predict(X_tr)))
# Fit spline regressions with 4 through 11 degrees of freedom and draw each
# fit, annotated with its RMSE on the fitted data, in a 2x4 grid of panels.
fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(15, 10))
for (i, d) in enumerate(range(4, 12)):
    X_tr = dmatrix("bs(x, df=" + str(d) + ")", {'x': X})
    spline_reg = LinearRegression()
    spline_reg.fit(X_tr, y)
    # Predict once and reuse for both the RMSE and the plotted curve.
    y_hat = spline_reg.predict(X_tr)
    rmse = round(np.sqrt(mean_squared_error(y_hat, y)), 4)
    # Draw on the i-th axes created by plt.subplots (row-major order).
    panel = ax.flat[i]
    sns.lineplot(x=X, y=y_hat, color='red', ax=panel)
    # Empty artist whose only purpose is to carry the RMSE into the legend.
    panel.plot([], [], label='rmse = ' + str(rmse))
    sns.scatterplot(x=X, y=y, color='grey', alpha=0.5, ax=panel)
    panel.set_xlabel(str(d) + ' degree of freedom')
    # Without a legend the RMSE label above would never be displayed.
    panel.legend()
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated RMSE of the spline regression for 4 through 11
# degrees of freedom, collected into a DataFrame.
spline_cv_rmses = {d: None for d in range(4, 12)}
for d in spline_cv_rmses:
    X_tr = dmatrix("bs(x, df=" + str(d) + ")", {'x': X})
    linreg = LinearRegression()
    # Scores are negated MSEs: average them, flip the sign, take the root.
    cv_rmse = np.sqrt(-np.mean(cross_val_score(linreg,
                                               X_tr, y, cv=10,
                                               scoring='neg_mean_squared_error')))
    spline_cv_rmses[d] = cv_rmse
spline_cvs = pd.DataFrame({'dofs': list(spline_cv_rmses.keys()),
                           'cv_rmses': list(spline_cv_rmses.values())})
dofs | cv_rmses | |
0 | 4 | 0.632125 |
1 | 5 | 0.589646 |
2 | 6 | 0.592387 |
3 | 7 | 0.608314 |
4 | 8 | 0.629096 |
5 | 9 | 0.621942 |
6 | 10 | 0.640462 |
7 | 11 | 0.661658 |
fig = plt.figure(figsize=(10, 7))
# Cross-validated RMSE as a function of the spline degrees of freedom.
sns.lineplot(x='dofs', y='cv_rmses', data=spline_cvs, color='red')
<matplotlib.axes._subplots.AxesSubplot at 0x1a23c056a0>
By cross-validation (for this choice of knots), 5 degrees of freedom gives the lowest RMSE (about 0.590), so it is the best choice.