Author: Leonardo Espin
Date: 1/8/2019
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression,Ridge #regularized linear regression
from sklearn.model_selection import learning_curve
#main (training) data, cross-validation data (Xtest/ytest) and validation data (Xval/yval):
X=pd.read_csv('ex5data1-X.csv',header=None)
y=pd.read_csv('ex5data1-y.csv',header=None)
Xtest=pd.read_csv('ex5data1-Xt.csv',header=None)
ytest=pd.read_csv('ex5data1-yt.csv',header=None)
Xval=pd.read_csv('ex5data1-Xv.csv',header=None)
yval=pd.read_csv('ex5data1-yv.csv',header=None)
A single-variable linear regression of the main data:
reg=LinearRegression().fit(X,y)
pred_y = reg.predict(X)
print(reg.coef_)
print(reg.intercept_ )
plt.figure(figsize=(8,5))
plt.plot(X,y,'.');
plt.plot(X,pred_y)
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
Clearly a straight line cannot fit this data as it is. Regularization will not help either:
regReg=Ridge().fit(X,y)#default lambda (called alpha in sklearn) =1
rPred_y = regReg.predict(X)
print(regReg.coef_)
print(regReg.intercept_ )
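The coefficients barely change, so the regularized line is essentially the same as the unregularized one. A quick sketch to see this, overlaying both fits (assuming the two cells above have been run; the two lines should be nearly indistinguishable):
plt.figure(figsize=(8,5))
plt.plot(X,y,'.')
plt.plot(X,pred_y,label='LinearRegression')
plt.plot(X,rPred_y,'--',label='Ridge, alpha=1')
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)')
plt.legend(loc='best');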
The learning curves help to characterize the problem:
train_sizes,train_scores,test_scores = learning_curve(Ridge(),X,y,
scoring='neg_mean_squared_error')
plt.figure(figsize=(8,5))
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score")
plt.legend(loc="best");
Clearly this is a high-bias model: both curves converge to a large error value.
Below I merge X and Xtest because sklearn's learning_curve takes care of doing cross-validation.
Xfull=pd.concat([X,Xtest],axis=0,ignore_index=True)
yfull=pd.concat([y,ytest],axis=0,ignore_index=True)
plt.plot(Xfull,yfull,'.');
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
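By default learning_curve performs its own k-fold cross-validation on the data it is given; the cv argument makes the splitting explicit. A minimal sketch using a shuffled KFold (shuffling is worth considering here because the merged data keeps the original ordering):
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=0)
train_sizes,train_scores,test_scores = learning_curve(Ridge(),Xfull,yfull,cv=cv,
                                                      scoring='neg_mean_squared_error')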
To solve the high-bias problem, let's add polynomial features to the data:
#adding polynomial features
for i in range(7):
    Xfull[i+2]=Xfull[0].apply(lambda x: np.power(x,i+2))
Xfull.head()
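An equivalent way to build these columns is sklearn's PolynomialFeatures; a sketch (the column names differ from the loop above, but the values are the same powers $x, x^2,\dots,x^8$):
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=8, include_bias=False)
Xpoly = pd.DataFrame(poly.fit_transform(Xfull[[0]])) #columns hold x, x^2, ..., x^8
Xpoly.head()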
Repeating linear regression without regularization:
#default lambda (called alpha in sklearn) is 1. We remove regularization by making it zero
regReg=Ridge(alpha=0,normalize=True).fit(Xfull.iloc[0:12,:],
yfull.iloc[0:12])
rPred_y = regReg.predict(Xfull.iloc[0:12,:])
print(regReg.coef_)
print(regReg.intercept_ )
plt.figure(figsize=(8,5))
plt.plot(Xfull.iloc[0:12,0],yfull.iloc[0:12],'+');
plt.plot(Xfull.iloc[0:12,0],rPred_y,'x')
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
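The 'x' markers only show the fit at the training points. To see the fitted degree-8 polynomial as a smooth curve, one can evaluate it on a dense grid of water levels; a sketch that rebuilds the same feature columns used above:
xs = np.linspace(Xfull.iloc[0:12,0].min(), Xfull.iloc[0:12,0].max(), 200)
grid = pd.DataFrame({0: xs})
for i in range(7):
    grid[i+2] = grid[0]**(i+2)

plt.figure(figsize=(8,5))
plt.plot(Xfull.iloc[0:12,0],yfull.iloc[0:12],'+')
plt.plot(xs,regReg.predict(grid))
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');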
So now the model fits the data well. But how will it generalize to new data?
Here is where learning curves become really useful:
#notice that I'm using the full dataset, since learning_curve takes care of
#doing cross validation
#import warnings #these two lines are just to make the output pretty here
#warnings.filterwarnings("ignore")
train_sizes,train_scores,test_scores = learning_curve(Ridge(alpha=0,normalize=True),
Xfull,yfull,
train_sizes=np.linspace(.1, 1.0, 10),
scoring='neg_mean_squared_error')
plt.figure(figsize=(8,5))
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
plt.ylim(top=100)
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score")
plt.legend(loc="best");
This example shows high variance: as examples are added, the training error stays low while the cross-validation error remains high, so the model does not generalize well.
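To choose a regularization strength, the next cell computes the training and cross-validation errors over a range of values of $\lambda$. For reference, these are the usual mean-squared-error costs,

$$J_{\text{train}}(\theta)=\frac{1}{2m_{\text{train}}}\sum_{i=1}^{m_{\text{train}}}\bigl(h_\theta(x^{(i)})-y^{(i)}\bigr)^2,\qquad J_{\text{CV}}(\theta)=\frac{1}{2m_{\text{CV}}}\sum_{i=1}^{m_{\text{CV}}}\bigl(h_\theta(x^{(i)})-y^{(i)}\bigr)^2,$$

and $\lambda$ is mapped to sklearn's regularization strength as $\alpha=\lambda/(2m_{\text{train}})$, which is the conversion noted in the code comment: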
lamb=[0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
errT=[]
errCV=[]
mTr=len(y)
mTe=len(ytest)
for l in lamb:
    #notice the conversion from lambda to alpha below
    regReg=Ridge(alpha=l/(2*mTr),normalize=True).fit(Xfull.iloc[0:12,:],y)
    train = regReg.predict(Xfull.iloc[0:12,:])
    test=regReg.predict(Xfull.iloc[12:,:])
    errT.append(((y.values- train) ** 2).sum()/(2*mTr) )
    errCV.append(((ytest.values - test) ** 2).sum()/(2*mTe) )
plt.figure(figsize=(8,5))
plt.xlabel("Lambda")
plt.ylabel("Error")
plt.grid()
plt.ylim(top=20)
plt.xlim(left=0,right=10)
plt.plot(lamb, errT,label="Train")
plt.plot(lamb, errCV,label="Cross-validation")
plt.legend(loc="best");
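The cross-validation curve reaches its minimum around $\lambda=3$. A one-line sketch to read the best value off programmatically (assuming the cell above has been run):
print(lamb[int(np.argmin(errCV))]) #should print 3, the value used in the next cell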
Recalculating the learning curves with the optimal value of $\lambda$ ($\lambda=3$):
train_sizes,train_scores,test_scores = learning_curve(Ridge(alpha=3/(2*mTr),normalize=True),
Xfull,yfull,
train_sizes=np.linspace(.1, 1.0, 10),
scoring='neg_mean_squared_error')
plt.figure(figsize=(8,5))
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
plt.ylim(top=100)
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score")
plt.legend(loc="best");
This model generalizes well to new data. Let's test it with the validation data:
#generate polynomial features for the validation data:
for i in range(7):
    Xval[i+2]=Xval[0].apply(lambda x: np.power(x,i+2))
#calculate regressor
regReg=Ridge(alpha=3/(2*mTr),normalize=True).fit(Xfull.iloc[0:12,:],y)
train = regReg.predict(Xfull.iloc[0:12,:])
validation=regReg.predict(Xval) #predict works on the raw features; Ridge(normalize=True) handles the scaling internally
errT=(((y.values- train) ** 2).sum()/(2*mTr))
errVal=(((yval.values - validation) ** 2).sum()/(2*len(yval)))
print('Train error: {}'.format(errT))
print('Validation error: {}'.format(errVal))
plt.figure(figsize=(8,5))
plt.plot(Xval[0],yval,'+');
plt.plot(Xval[0],validation,'x')
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
#let's see the regularized regression coefficients:
print(regReg.coef_)
print(regReg.intercept_ )
Notice that $\theta_0 =6.68$ and $\theta_1=0.27$ are comparable in magnitude to the same coefficients in the first regression we did, which didn't use regularization.
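A quick sketch to put the two fits side by side (reg is the unregularized single-variable fit from the first cell; the regularized "slope" printed here is the coefficient of the linear term):
print('unregularized: intercept {}, slope {}'.format(reg.intercept_[0], reg.coef_.ravel()[0]))
print('regularized:   intercept {}, slope {}'.format(regReg.intercept_[0], regReg.coef_.ravel()[0]))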