import pandas as pd

# Load the Default credit-card dataset; the first CSV column is the row index.
default = pd.read_csv("../../datasets/Default.csv", index_col=0)
# Peek at the first rows: default / student are Yes-No strings, balance / income floats.
default.head()
default | student | balance | income | |
---|---|---|---|---|
1 | No | No | 729.526495 | 44361.625074 |
2 | No | Yes | 817.180407 | 12106.134700 |
3 | No | No | 1073.549164 | 31767.138947 |
4 | No | No | 529.250605 | 35704.493935 |
5 | No | No | 785.655883 | 38463.495879 |
# Inspect dtypes and null counts: `default` and `student` load as object (strings).
default.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 4 columns):
default 10000 non-null object
student 10000 non-null object
balance 10000 non-null float64
income 10000 non-null float64
dtypes: float64(2), object(2)
memory usage: 390.6+ KB
# Prepend an intercept column of ones, placed first in the column order.
# insert() does the add-and-position in a single in-place step.
default.insert(0, 'const', 1)
default.head()
const | default | student | balance | income | |
---|---|---|---|---|---|
1 | 1 | No | No | 729.526495 | 44361.625074 |
2 | 1 | No | Yes | 817.180407 | 12106.134700 |
3 | 1 | No | No | 1073.549164 | 31767.138947 |
4 | 1 | No | No | 529.250605 | 35704.493935 |
5 | 1 | No | No | 785.655883 | 38463.495879 |
Convert the `default` and `student` columns to numerical values: let Yes = 1, No = 0.
# Encode the Yes/No categorical columns as 1/0 integers (Yes=1, No=0)
# via a vectorized comparison instead of per-element list comprehensions.
default['default'] = (default['default'] == 'Yes').astype(int)
default['student'] = (default['student'] == 'Yes').astype(int)
default.head()
const | default | student | balance | income | |
---|---|---|---|---|---|
1 | 1 | 0 | 0 | 729.526495 | 44361.625074 |
2 | 1 | 0 | 1 | 817.180407 | 12106.134700 |
3 | 1 | 0 | 0 | 1073.549164 | 31767.138947 |
4 | 1 | 0 | 0 | 529.250605 | 35704.493935 |
5 | 1 | 0 | 0 | 785.655883 | 38463.495879 |
# Confirm the encoding: every column is now numeric (int64 / float64).
default.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 5 columns):
const 10000 non-null int64
default 10000 non-null int64
student 10000 non-null int64
balance 10000 non-null float64
income 10000 non-null float64
dtypes: float64(2), int64(3)
memory usage: 468.8 KB
from sklearn.linear_model import LogisticRegression

# Design matrix (with intercept column) and binary response.
X = default[['balance', 'income', 'const']]
y = default['default']

# Fit a logistic regression on the full dataset; fit() returns the estimator.
logit = LogisticRegression(solver='lbfgs')
logit.fit(X, y)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# First random split (seed 0): train on 75%, evaluate on the held-out 25%.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logit_train_0 = LogisticRegression(solver='lbfgs')
logit_train_0.fit(X_train, y_train)
y_pred_0 = logit_train_0.predict(X_test)

# Validation-set error rate = 1 - accuracy.
errs_0 = {'err0': 1 - accuracy_score(y_test, y_pred_0)}
errs_0['err0']
0.038799999999999946
# Repeat with a different random split (seed 1): train and predict.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logit_train_1 = LogisticRegression(solver='lbfgs')
logit_train_1.fit(X_train, y_train)
y_pred_1 = logit_train_1.predict(X_test)

# Record this split's validation error alongside the first one.
errs_0['err1'] = 1 - accuracy_score(y_test, y_pred_1)
errs_0['err1']
0.03159999999999996
# Third random split (seed 2): train and predict.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
logit_train_2 = LogisticRegression(solver='lbfgs')
logit_train_2.fit(X_train, y_train)
y_pred_2 = logit_train_2.predict(X_test)

# Record the third validation error.
errs_0['err2'] = 1 - accuracy_score(y_test, y_pred_2)
errs_0['err2']
These results are close to each other. Their average is:
# Mean validation error over the three random splits (balance + income model).
sum(errs_0.values())/len(errs_0)
0.0325333333333333
# Repeat the validation-set experiment with `student` added to the predictors.
X, y = default[['balance', 'income', 'student', 'const']], default['default']
errs_1 = {}
for i in range(3):
    # Same three seeds as before so errors are directly comparable to errs_0.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=i)
    fitted = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
    # BUG FIX: original key was `'err' + stri.` — a SyntaxError; use str(i).
    errs_1['err' + str(i)] = 1 - accuracy_score(y_test, fitted.predict(X_test))
errs_1
{'err0': 0.038799999999999946,
'err1': 0.03159999999999996,
'err2': 0.027200000000000002}
# Mean validation error over the three splits with `student` included.
sum(errs_1.values())/len(errs_1)
0.0325333333333333
The average test error hasn't changed after adding the `student` predictor.