Author: Leonardo Espin
Date: 1/8/2019
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression,Ridge #regularized linear regression
from sklearn.model_selection import learning_curve
#main (training) data, cross-validation data (Xtest/ytest) and validation data (Xval/yval):
X=pd.read_csv('ex5data1-X.csv',header=None)
y=pd.read_csv('ex5data1-y.csv',header=None)
Xtest=pd.read_csv('ex5data1-Xt.csv',header=None)
ytest=pd.read_csv('ex5data1-yt.csv',header=None)
Xval=pd.read_csv('ex5data1-Xv.csv',header=None)
yval=pd.read_csv('ex5data1-yv.csv',header=None)
A single-variable linear regression of the main data:
reg=LinearRegression().fit(X,y)
pred_y = reg.predict(X)
print(reg.coef_)
print(reg.intercept_ )
plt.figure(figsize=(8,5))
plt.plot(X,y,'.');
plt.plot(X,pred_y)
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
Clearly a straight line cannot fit this data as it is. Regularization will not help either:
regReg=Ridge().fit(X,y)#default lambda (called alpha in sklearn) =1
rPred_y = regReg.predict(X)
print(regReg.coef_)
print(regReg.intercept_ )
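The coefficients barely change, so the regularized line is essentially the same as the unregularized one. A quick sketch to see this, overlaying both fits (assuming the two cells above have been run; the two lines should be nearly indistinguishable):
plt.figure(figsize=(8,5))
plt.plot(X,y,'.')
plt.plot(X,pred_y,label='LinearRegression')
plt.plot(X,rPred_y,'--',label='Ridge, alpha=1')
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)')
plt.legend(loc='best');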
The learning curves help to characterize the problem:
train_sizes,train_scores,test_scores = learning_curve(Ridge(),X,y,
scoring='neg_mean_squared_error')
plt.figure(figsize=(8,5))
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score")
plt.legend(loc="best");
Clearly this is a high-bias model: both curves converge to a large error value.
Below I merge X and Xtest because sklearn's learning_curve takes care of doing cross-validation.
Xfull=pd.concat([X,Xtest],axis=0,ignore_index=True)
yfull=pd.concat([y,ytest],axis=0,ignore_index=True)
plt.plot(Xfull,yfull,'.');
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
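By default learning_curve performs its own k-fold cross-validation on the data it is given; the cv argument makes the splitting explicit. A minimal sketch using a shuffled KFold (shuffling is worth considering here because the merged data keeps the original ordering):
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=0)
train_sizes,train_scores,test_scores = learning_curve(Ridge(),Xfull,yfull,cv=cv,
                                                      scoring='neg_mean_squared_error')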
To solve the high-bias problem, let's add polynomial features to the data:
#adding polynomial features
for i in range(7):
    Xfull[i+2]=Xfull[0].apply(lambda x: np.power(x,i+2))
Xfull.head()
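An equivalent way to build these columns is sklearn's PolynomialFeatures; a sketch (the column names differ from the loop above, but the values are the same powers $x, x^2,\dots,x^8$):
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=8, include_bias=False)
Xpoly = pd.DataFrame(poly.fit_transform(Xfull[[0]])) #columns hold x, x^2, ..., x^8
Xpoly.head()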
Repeating linear regression without regularization:
#default lambda (called alpha in sklearn) is 1. We remove regularization by making it zero
regReg=Ridge(alpha=0,normalize=True).fit(Xfull.iloc[0:12,:],
yfull.iloc[0:12])
rPred_y = regReg.predict(Xfull.iloc[0:12,:])
print(regReg.coef_)
print(regReg.intercept_ )
plt.figure(figsize=(8,5))
plt.plot(Xfull.iloc[0:12,0],yfull.iloc[0:12],'+');
plt.plot(Xfull.iloc[0:12,0],rPred_y,'x')
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
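The 'x' markers only show the fit at the training points. To see the fitted degree-8 polynomial as a smooth curve, one can evaluate it on a dense grid of water levels; a sketch that rebuilds the same feature columns used above:
xs = np.linspace(Xfull.iloc[0:12,0].min(), Xfull.iloc[0:12,0].max(), 200)
grid = pd.DataFrame({0: xs})
for i in range(7):
    grid[i+2] = grid[0]**(i+2)

plt.figure(figsize=(8,5))
plt.plot(Xfull.iloc[0:12,0],yfull.iloc[0:12],'+')
plt.plot(xs,regReg.predict(grid))
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');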
So now the model fits the data well. But how will it generalize to new data?
Here is where learning curves become really useful:
#notice that I'm using the full dataset, since learning_curve takes care of
#doing cross validation
#import warnings #these two lines are just to make the output pretty here
#warnings.filterwarnings("ignore")
train_sizes,train_scores,test_scores = learning_curve(Ridge(alpha=0,normalize=True),
Xfull,yfull,
train_sizes=np.linspace(.1, 1.0, 10),
scoring='neg_mean_squared_error')
plt.figure(figsize=(8,5))
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
plt.ylim(top=100)
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score")
plt.legend(loc="best");
This example shows high variance: as examples are added, the training error stays low while the cross-validation error remains high, so the model does not generalize well.
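To choose a regularization strength, the next cell computes the training and cross-validation errors over a range of values of $\lambda$. For reference, these are the usual mean-squared-error costs,

$$J_{\text{train}}(\theta)=\frac{1}{2m_{\text{train}}}\sum_{i=1}^{m_{\text{train}}}\bigl(h_\theta(x^{(i)})-y^{(i)}\bigr)^2,\qquad J_{\text{CV}}(\theta)=\frac{1}{2m_{\text{CV}}}\sum_{i=1}^{m_{\text{CV}}}\bigl(h_\theta(x^{(i)})-y^{(i)}\bigr)^2,$$

and $\lambda$ is mapped to sklearn's regularization strength as $\alpha=\lambda/(2m_{\text{train}})$, which is the conversion noted in the code comment: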
lamb=[0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
errT=[]
errCV=[]
mTr=len(y)
mTe=len(ytest)
for l in lamb:
    #notice the conversion from lambda to alpha below
    regReg=Ridge(alpha=l/(2*mTr),normalize=True).fit(Xfull.iloc[0:12,:],y)
    train = regReg.predict(Xfull.iloc[0:12,:])
    test=regReg.predict(Xfull.iloc[12:,:])
    errT.append(((y.values- train) ** 2).sum()/(2*mTr) )
    errCV.append(((ytest.values - test) ** 2).sum()/(2*mTe) )
plt.figure(figsize=(8,5))
plt.xlabel("Lambda")
plt.ylabel("Error")
plt.grid()
plt.ylim(top=20)
plt.xlim(left=0,right=10)
plt.plot(lamb, errT,label="Train")
plt.plot(lamb, errCV,label="Cross-validation")
plt.legend(loc="best");
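The cross-validation curve reaches its minimum around $\lambda=3$. A one-line sketch to read the best value off programmatically (assuming the cell above has been run):
print(lamb[int(np.argmin(errCV))]) #should print 3, the value used in the next cell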
Recalculating the learning curves with the optimal value of $\lambda$ ($\lambda=3$):
train_sizes,train_scores,test_scores = learning_curve(Ridge(alpha=3/(2*mTr),normalize=True),
Xfull,yfull,
train_sizes=np.linspace(.1, 1.0, 10),
scoring='neg_mean_squared_error')
plt.figure(figsize=(8,5))
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
plt.ylim(top=100)
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",label="Cross-validation score")
plt.legend(loc="best");
This model generalizes well to new data. Let's test it with the validation data:
#generate polynomial features for the validation data:
for i in range(7):
    Xval[i+2]=Xval[0].apply(lambda x: np.power(x,i+2))
#calculate regressor
regReg=Ridge(alpha=3/(2*mTr),normalize=True).fit(Xfull.iloc[0:12,:],y)
train = regReg.predict(Xfull.iloc[0:12,:])
validation=regReg.predict(Xval) #predict works on the raw features; Ridge(normalize=True) handles the scaling internally
errT=(((y.values- train) ** 2).sum()/(2*mTr))
errVal=(((yval.values - validation) ** 2).sum()/(2*len(yval)))
print('Train error: {}'.format(errT))
print('Validation error: {}'.format(errVal))
plt.figure(figsize=(8,5))
plt.plot(Xval[0],yval,'+');
plt.plot(Xval[0],validation,'x')
plt.ylabel('Water flowing out of the dam (y)')
plt.xlabel('Change in water level (x)');
#let's see the regularized regression coefficients:
print(regReg.coef_)
print(regReg.intercept_ )
Notice that $\theta_0 =6.68$ and $\theta_1=0.27$ are comparable in magnitude to the same coefficients in the first regression we did, which didn't use regularization.
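A quick sketch to put the two fits side by side (reg is the unregularized single-variable fit from the first cell; the regularized "slope" printed here is the coefficient of the linear term):
print('unregularized: intercept {}, slope {}'.format(reg.intercept_[0], reg.coef_.ravel()[0]))
print('regularized:   intercept {}, slope {}'.format(regReg.intercept_[0], regReg.coef_.ravel()[0]))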