Week 3 - Linear Regression
(OLS = ordinary least squares regression)
(e.g. linear regression & basis function regression)
0. Initial set-up
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
plt.rcParams['figure.dpi'] = 108
1. The Boston Housing dataset
input = data about towns (506 instances, 13 features)
predicted output = median house value ('MEDV', in $1000s of USD)

# load data, convert to a pandas dataframe
# (note: load_boston was removed in scikit-learn 1.2, so this requires an older version)
from sklearn.datasets import load_boston
boston = load_boston()
ds = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.Series(boston.target, name='MEDV')
ds.head()

# focus on a single feature, LSTAT, as the input
features = ['LSTAT']
# response variable (output) = 'MEDV'
#ds['LSTAT'] = ds['LSTAT'].apply(lambda x: x/100.)
for f in features:
    plt.figure()
    plt.scatter(ds[f], y, marker='.')
    plt.xlabel(f)
    plt.ylabel(r'Median House Value ($\times 10^3$ USD)')

# randomly split data into a training set & a test set
# (to assess generalisation error)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(ds, y, test_size=0.2, random_state=90051)
print("Training set has {} instances. Test set has {} instances.".format(X_train.shape[0], X_test.shape[0]))
# Training set has 404 instances. Test set has 102 instances.

# select the subset of features used for the single-feature model
X_train_s = X_train[features].values
X_test_s = X_test[features].values

2. Linear algebra solutions
Fit a linear regression to the single-feature data.
(How? solve for the weights in closed form with linear algebra, i.e. the normal equations; see the sketch below.)
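A quick sketch (added for reference; standard OLS, where X denotes the design matrix with the prepended column of ones constructed in the code below) of where the closed-form solution comes from:

\min_{\mathbf{w}} \; \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2
\;\Rightarrow\; \nabla_{\mathbf{w}} = -2\,\mathbf{X}^\top(\mathbf{y} - \mathbf{X}\mathbf{w}) = \mathbf{0}
\;\Rightarrow\; \mathbf{X}^\top\mathbf{X}\,\hat{\mathbf{w}} = \mathbf{X}^\top\mathbf{y}
\;\Rightarrow\; \hat{\mathbf{w}} = (\mathbf{X}^\top\mathbf{X})^{-1}\mathbf{X}^\top\mathbf{y}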
# Prepend a column of 1's to the design matrices
# (this folds the bias term into the weights vector)
X_train_b = np.column_stack((np.ones_like(X_train_s), X_train_s))
X_test_b = np.column_stack((np.ones_like(X_test_s), X_test_s))
print('Design matrix shape:', X_train_s.shape)
# Design matrix shape: (404, 1)

# solve the normal equations (X^T X) w = X^T y for the weights
w = np.linalg.solve((X_train_b.T @ X_train_b), (X_train_b.T @ y_train))
print('Weights:', w)
# Weights: [34.51530004 -0.95801769]

# note: '@' is Python's matrix-multiplication operator, equivalent to np.dot()
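As an aside (not in the original workshop code), np.linalg.lstsq solves the same least-squares problem without forming X^T X explicitly, which is more robust when the design matrix is ill-conditioned; a minimal equivalent call:

# least-squares solve directly on the design matrix; should agree with w above
w_lstsq, residuals, rank, sv = np.linalg.lstsq(X_train_b, y_train, rcond=None)
print('Weights (lstsq):', w_lstsq)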
# plot the fitted line alongside the training data
# X = design matrix, w = weights vector
def predict(X, w):
    return np.dot(X, w)

X_grid = np.linspace(X_train_s.min(), X_train_s.max(), num=1001)
x = np.column_stack((np.ones_like(X_grid), X_grid))
y = predict(x, w)
plt.plot(X_grid, y, 'k-', label='Prediction')
plt.scatter(X_train_s, y_train, color='b', marker='.', label='Train')
#plt.scatter(X_test_s, y_test, color='r', marker='.', label='Test')
plt.legend()
plt.ylabel("$y$ (Median House Value)")
plt.xlabel("$x$ (LSTAT)")
plt.show()

# calculate mean squared error over the training set & the test set
def mean_squared_error(y_true, y_pred):
    return np.mean((y_pred - y_true)**2)

y_pred_train = predict(X_train_b, w)
y_pred_test = predict(X_test_b, w)

print('Train MSE:', mean_squared_error(y_pred_train, y_train))
print('Test MSE:', mean_squared_error(y_pred_test, y_test))
# Train MSE: 38.6322164416081
# Test MSE: 38.00420488101306

4. Linear regression using sklearn
# use sklearn to fit the linear regression directly
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train_s, y_train)

# lr stores the weight for the bias term separately
lr.intercept_
# 34.5153000408642
# ...and the weights for the non-bias terms
lr.coef_
# array([-0.95801769])
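These match the normal-equations solution from Part 2, which can be confirmed programmatically (a small added sanity check, assuming w from Part 2 is still in scope):

# sklearn packs the fit as (intercept_, coef_); compare against w = [bias, slope]
print(np.allclose(np.r_[lr.intercept_, lr.coef_], w))
# expected: True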
# predict with sklearn using ALL 13 features
lr_full = LinearRegression().fit(X_train, y_train)
y_pred_train = lr_full.predict(X_train)
y_pred_test = lr_full.predict(X_test)

print('Train MSE:', mean_squared_error(y_pred_train, y_train))
print('Test MSE:', mean_squared_error(y_pred_test, y_test))
# Train MSE: 20.059284291202285
# Test MSE: 30.72694987338853

# calculate mean squared error with sklearn's implementation instead
from sklearn.metrics import mean_squared_error as sk_mse
print('Train MSE:', sk_mse(y_pred_train, y_train))
print('Test MSE:', sk_mse(y_pred_test, y_test))
# Train MSE: 20.059284291202285
# Test MSE: 30.72694987338853

5. Basis expansion
basis expansion = extend linear regression by mapping the original features to new (transformed) features
(purpose: model a non-linear relationship in the original features with a model that is still linear in the new features)
(How? perform linear regression on the new features, i.e. fit $y \approx \mathbf{w}^\top \boldsymbol{\varphi}(\mathbf{x})$.)
Then the weights for the transformed features should be
$\hat{\mathbf{w}} = (\boldsymbol{\Phi}^\top \boldsymbol{\Phi})^{-1} \boldsymbol{\Phi}^\top \mathbf{y}$
(by applying the normal equation), where $\boldsymbol{\Phi}$ is the transformed design matrix.
One option for the mapping is the "polynomial basis function" of a single feature, e.g.
$\boldsymbol{\varphi}(x) = [1, x, x^2, \ldots, x^d]^\top$,
which includes the bias term as the constant 1.
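As a tiny added illustration of the transform used on the real data below, a degree-2 polynomial basis maps a single value x = 3 to [1, 3, 9]:

from sklearn.preprocessing import PolynomialFeatures
# transform one single-feature instance with a degree-2 polynomial basis
print(PolynomialFeatures(degree=2).fit_transform([[3.0]]))
# expected: [[1. 3. 9.]]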
# compute the transformed design matrix
# (purpose: get the new features by transformation)
from sklearn.preprocessing import PolynomialFeatures
degree = 2
poly = PolynomialFeatures(degree=degree)
Phi_train = poly.fit_transform(X_train_s)
Phi_test = poly.fit_transform(X_test_s)
print("Original design matrix (first 5 rows):\n", X_train_s[0:5], "\n")
print("Transformed design matrix (first 5 rows):\n", Phi_train[0:5])
# output: original vs transformed design matrix (each row gains the columns 1, x, x^2)

# linear regression on the transformed features
# (fit_intercept=False because the bias column of 1's is already included in Phi)
lr_poly = LinearRegression(fit_intercept=False).fit(Phi_train, y_train)
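To connect this back to the closed-form expression above, the same weights can be recovered by solving the normal equations on the transformed design matrix (a small added check, not part of the original workshop code):

# normal equations on Phi: (Phi^T Phi) w = Phi^T y
w_phi = np.linalg.solve(Phi_train.T @ Phi_train, Phi_train.T @ y_train)
print(w_phi)
print(lr_poly.coef_)  # should be approximately the same values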
# plot the prediction from lr_poly
X_grid = np.linspace(X_train_s.min(), X_train_s.max(), num=1001)
Phi_grid = poly.fit_transform(X_grid[:,np.newaxis])
y = lr_poly.predict(Phi_grid)
plt.plot(X_grid, y, 'k-', label='Prediction')
plt.scatter(X_train_s, y_train, color='b', marker='.', label='Train')
plt.scatter(X_test_s, y_test, color='r', marker='.', label='Test')
plt.legend()
plt.ylabel("$y$ (Median House Value)")
plt.xlabel("$x$ (LSTAT)")
plt.show()
# The graph shows the polynomial regression is a better fit than the plain linear model.

# compute mean squared error on the train set / test set
y_pred_train_poly = lr_poly.predict(Phi_train)
y_pred_test_poly = lr_poly.predict(Phi_test)
print('Train MSE for polynomial features of degree {}: {:.3f}'.format(degree, mean_squared_error(y_pred_train_poly, y_train)))
print('Test MSE for polynomial features of degree {}: {:.3f}'.format(degree, mean_squared_error(y_pred_test_poly, y_test)))
print('Train MSE using linear features only: {:.3f}'.format(mean_squared_error(lr.predict(X_train_s), y_train)))
print('Test MSE using linear features only: {:.3f}'.format(mean_squared_error(lr.predict(X_test_s), y_test)))
# output: a large decrease in train error, but a smaller decrease in test error
# adjust the degree of the polynomial used for basis expansion
# degree = 0 ~ 11
degrees = list(range(12))
models = list()
train_mses = list()  # mean squared errors
test_mses = list()

X_grid = np.linspace(min(X_train_s.min(), X_test_s.min()),
                     max(X_train_s.max(), X_test_s.max()),
                     num=1001)

# set up a blank figure to hold the grid of subplots
plt.figure(figsize=(20,16))
for i, degree in enumerate(degrees):
    plt.subplot(len(degrees)//2, 2, i+1)

    # Transform features
    poly = PolynomialFeatures(degree=degree)
    Phi_train, Phi_test = poly.fit_transform(X_train_s), poly.fit_transform(X_test_s)
    Phi_grid = poly.fit_transform(X_grid[:,np.newaxis])

    # Fit model
    lr_poly = LinearRegression().fit(Phi_train, y_train)
    models.append(lr_poly)

    # Evaluate
    train_mse = mean_squared_error(lr_poly.predict(Phi_train), y_train)
    train_mses.append(train_mse)
    test_mse = mean_squared_error(lr_poly.predict(Phi_test), y_test)
    test_mses.append(test_mse)

    # Plot the fitted curve against the train/test points
    plt.plot(X_grid, lr_poly.predict(Phi_grid), 'k', label='Prediction')
    plt.scatter(X_train_s, y_train, color='b', marker='.', label='Train')
    plt.scatter(X_test_s, y_test, color='r', marker='.', label='Test')
    plt.title('Degree {} | Train MSE {:.3f}, Test MSE {:.3f}'.format(degree, train_mse, test_mse))
    plt.legend()

plt.suptitle('Polynomial regression for different polynomial degrees', y=1.05, fontsize=32)
plt.tight_layout()
# plot "mean squared error vs polynomial degree"
plt.plot(degrees, train_mses, color='b', label='Train')
plt.plot(degrees, test_mses, color='r', label='Test')
plt.title('MSE vs. polynomial degree')
plt.ylabel('MSE')
plt.xlabel('Polynomial degree')
plt.legend()
plt.show()
6. Bonus: ridge regression
ridge regression = linear regression regularised with an L2 penalty
(purpose: manage the bias-variance trade-off, i.e. control overfitting)
(How? add a penalty term $\alpha \|\mathbf{w}\|_2^2$ to the least-squares cost, which encourages small weights; see the sketch below)
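A minimal sketch of the closed-form ridge solution (an addition to these notes; here Phi is assumed to be a design matrix that already contains the bias column, and alpha is the penalty strength):

# ridge objective: ||y - Phi w||^2 + alpha * ||w||^2
# closed-form solution: w = (Phi^T Phi + alpha * I)^{-1} Phi^T y
def ridge_weights(Phi, y, alpha):
    d = Phi.shape[1]
    return np.linalg.solve(Phi.T @ Phi + alpha * np.eye(d), Phi.T @ y)

Note that sklearn's Ridge (used below) does not penalise the intercept by default, so its coefficients will not match this formula exactly unless fit_intercept=False.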
# rescale the LSTAT feature
from sklearn.linear_model import Ridge
X_train_s = X_train_s / 100.0
X_test_s = X_test_s / 100.0

# replace "LinearRegression()" with "Ridge(alpha=0.002)" (all other code is the same as Part 5)
degrees = list(range(12))
models = list()
train_mses = list()
test_mses = list()
X_grid = np.linspace(min(X_train_s.min(), X_test_s.min()),
                     max(X_train_s.max(), X_test_s.max()),
                     num=1001)
plt.figure(figsize=(20,16))
for i, degree in enumerate(degrees):
    plt.subplot(len(degrees)//2, 2, i+1)

    # Transform features
    poly = PolynomialFeatures(degree=degree)
    Phi_train, Phi_test = poly.fit_transform(X_train_s), poly.fit_transform(X_test_s)
    Phi_grid = poly.fit_transform(X_grid[:,np.newaxis])

    # Fit model
    lr_poly = Ridge(alpha=0.002).fit(Phi_train, y_train)
    models.append(lr_poly)

    # Evaluate
    train_mse = mean_squared_error(lr_poly.predict(Phi_train), y_train)
    train_mses.append(train_mse)
    test_mse = mean_squared_error(lr_poly.predict(Phi_test), y_test)
    test_mses.append(test_mse)

    plt.plot(X_grid, lr_poly.predict(Phi_grid), 'k', label='Prediction')
    plt.scatter(X_train_s, y_train, color='b', marker='.', label='Train')
    #plt.scatter(X_test_s, y_test, color='r', marker='.', label='Test')
    plt.title('Degree {} | Train MSE {:.3f}'.format(degree, train_mse))
    plt.legend()

plt.suptitle('Polynomial ridge regression for different polynomial degrees', y=1.05, fontsize=32)
plt.tight_layout()
# after applying ridge regression, the model no longer overfits for large polynomial degrees

# plot "mean squared error vs degree" for ridge
plt.plot(degrees, train_mses, color='b', label='Train')
plt.plot(degrees, test_mses, color='r', label='Test')
plt.title('MSE vs. polynomial degree')
plt.ylabel('MSE')
plt.xlabel('Polynomial degree')
plt.legend()
plt.show()

# plot "L2 norm of weights vs degree"
w_L2 = [np.sum(m.coef_**2) for m in models]
plt.plot(degrees, w_L2)
plt.xlabel('Polynomial degree')
plt.ylabel(r'$\| \mathbf{w} \|_2^2$')
plt.show()
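The penalty strength alpha = 0.002 is simply given above; a common way to choose it (a sketch beyond this workshop's code, reusing the Phi_train left over from the last loop iteration) is cross-validation on the training set with sklearn's RidgeCV:

from sklearn.linear_model import RidgeCV
# search a log-spaced grid of alphas using RidgeCV's built-in leave-one-out style CV
ridge_cv = RidgeCV(alphas=np.logspace(-4, 2, 25)).fit(Phi_train, y_train)
print('Selected alpha:', ridge_cv.alpha_)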