**Part 1 - Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
np.random.seed(0)  # fixed seed so the noisy sample is reproducible
n = 15
# n noisy samples of the underlying curve y = sin(x) + x/6 over [0, 10]
x = np.linspace(0,10,n) + np.random.randn(n)/5
y = np.sin(x)+x/6 + np.random.randn(n)/10
# Train/test split with a fixed seed; these module-level arrays are used
# by answer_one .. answer_four below.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)
def intro():
    """Scatter-plot the training and test points for the regression task.

    Reads the module-level X_train/X_test/y_train/y_test splits; shows the
    two point sets in different colors with a legend in the lower right.
    """
    # NOTE: the original notebook cell had '%matplotlib notebook' here.
    # IPython magics are not valid Python syntax inside a function in a
    # plain .py file; run the magic at the notebook top level instead.
    plt.figure()
    plt.scatter(X_train, y_train, label='training data')
    plt.scatter(X_test, y_test, label='test data')
    plt.legend(loc=4)


intro()
**Question 1
def answer_one():
    """Fit polynomial regressions of degree 1, 3, 6 and 9 on the training
    data and predict over a fixed evaluation grid.

    Returns
    -------
    numpy.ndarray of shape (4, 100)
        Row i holds the predictions of the degree-orders[i] model over
        np.linspace(0, 10, 100).
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    X_input = np.linspace(0, 10, 100)  # required evaluation grid
    orders = [1, 3, 6, 9]              # required polynomial degrees
    preds = np.zeros((len(orders), len(X_input)))

    for i, degree in enumerate(orders):
        # Adding polynomial features captures interactions/powers of the
        # original feature while the model stays LINEAR in the expanded
        # features, so ordinary least squares still estimates w and b.
        poly = PolynomialFeatures(degree)
        # reshape(-1, 1): sklearn expects a 2-D (n_samples, n_features) array.
        # Fit the expansion on training data; only transform the grid.
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_input_poly = poly.transform(X_input.reshape(-1, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        preds[i, :] = linreg.predict(X_input_poly)
    return preds


answer_one()
**Question 2
def answer_two():
    """Compute train and test R^2 for polynomial degrees 0 through 9.

    Returns
    -------
    tuple of (list, list)
        (r2_train, r2_test), each of length 10; index d is the score of
        the degree-d polynomial regression.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    r2_train, r2_test = [], []
    for degree in range(10):
        poly = PolynomialFeatures(degree)
        # Fit the feature expansion on the training data only; the test
        # data must be transformed with the same already-fitted object
        # (consistent with answer_four).
        X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))
        X_test_poly = poly.transform(X_test.reshape(-1, 1))
        linreg = LinearRegression().fit(X_train_poly, y_train)
        r2_train.append(linreg.score(X_train_poly, y_train))
        r2_test.append(linreg.score(X_test_poly, y_test))
    return (r2_train, r2_test)


answer_two()
**Question 3
def answer_three():
    """Degrees from answer_two showing underfitting, overfitting, and
    good generalization, in that order."""
    underfitting = 0
    overfitting = 9
    good_generalization = 6
    return underfitting, overfitting, good_generalization


answer_three()
**Question 4
def answer_four():
    """Test-set R^2 of degree-12 polynomial regression, plain vs Lasso.

    Returns
    -------
    tuple of (float, float)
        (LinearRegression test R^2, Lasso test R^2).
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.metrics import r2_score

    # Degree-12 expansion, fitted on the training data only.
    expander = PolynomialFeatures(12)
    train_feats = expander.fit_transform(X_train.reshape(-1, 1))
    test_feats = expander.transform(X_test.reshape(-1, 1))

    ols = LinearRegression().fit(train_feats, y_train)
    lasso = Lasso(alpha=0.01, max_iter=10000, tol=0.1).fit(train_feats, y_train)

    # The question asks for scores on the TEST set.
    return (ols.score(test_feats, y_test), lasso.score(test_feats, y_test))


answer_four()
**Part 2 - Classification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Mushroom dataset: every column is categorical, so one-hot encode all of it.
mush_df = pd.read_csv('assets/mushrooms.csv')
mush_df2 = pd.get_dummies(mush_df)
# Column 1 of the encoded frame is taken as the target; columns 2+ are the
# features. (Column 0 is presumably the complementary class dummy from the
# edible/poisonous column -- TODO confirm against the encoded column order.)
X_mush = mush_df2.iloc[:,2:]
y_mush = mush_df2.iloc[:,1]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_mush, y_mush,
random_state=0)
**Question 5
def answer_five():
    """Return the 5 most important features of a decision tree trained on
    the mushroom training split.

    Returns
    -------
    list of str
        Feature (dummy-column) names, most important first.
    """
    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(random_state=0).fit(X_train2, y_train2)
    df = pd.DataFrame({'feature': X_train2.columns.values,
                       'feature importance': clf.feature_importances_})
    # ascending=False (boolean, not the legacy 0/1 int). The selection
    # chain must stay attached to sort_values: the original code had the
    # ['feature'].head(5).tolist() part split onto its own line, so the
    # full sorted DataFrame was returned instead of the top-5 name list.
    top_features = (df.sort_values(by=['feature importance'],
                                   ascending=False)
                    ['feature'].head(5).tolist())
    return top_features
**Question 6
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve, train_test_split
from sklearn.datasets import load_iris
data = load_iris()  # scikit-learn's built-in iris dataset
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
# NOTE(review): iris rows are ordered by class, so the first 50 rows all
# share one label -- a single-class subset would make SVC/validation_curve
# in answer_six fail to fit. Confirm the intended subset (the original
# assignment subsampled a two-class dataset). TODO confirm.
X_subset, y_subset = X.iloc[:50], y[:50]
def answer_six():
    """Mean 3-fold train/test scores of an RBF SVC as gamma varies.

    Sweeps gamma over np.logspace(-4, 1, 6) on (X_subset, y_subset).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Length-6 arrays: (mean training scores, mean validation scores),
        one entry per gamma value.
    """
    from sklearn.svm import SVC
    from sklearn.model_selection import validation_curve
    import numpy as np

    gammas = np.logspace(-4, 1, 6)  # 0.0001, 0.001, 0.01, 0.1, 1, 10
    estimator = SVC(kernel='rbf', C=1, random_state=0)
    train_scores, test_scores = validation_curve(
        estimator,
        X_subset, y_subset,
        param_name='gamma',
        param_range=gammas,
        cv=3,
        n_jobs=2,
    )
    # Average across the 3 CV folds for each gamma.
    return (train_scores.mean(axis=1), test_scores.mean(axis=1))


print(answer_six())
**Question 7
def answer_seven():
    """Gamma values from answer_six giving underfitting, overfitting,
    and good generalization, in that order."""
    underfit_gamma = 0.0001
    overfit_gamma = 10
    best_gamma = 0.1
    return (underfit_gamma, overfit_gamma, best_gamma)


answer_seven()