import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Let's simulate some data
np.random.seed(0)
# Generate two clouds of points from normal distributions
n_samples = 1000
# Generate points for the first group
mean1 = [1, -1]
cov1 = [[1, 0], [0, 1]]
cloud1 = np.random.multivariate_normal(mean1, cov1, n_samples)
# Generate points for the second group
mean2 = [-1, 1]
cov2 = [[1, 0], [0, 1]]
cloud2 = np.random.multivariate_normal(mean2, cov2, n_samples)
# Combine the two groups to create the feature matrix X
X = np.vstack((cloud1, cloud2))
# Generate the target variable y
y = np.concatenate((np.zeros(n_samples), np.ones(n_samples)))
# TODO: Split the data into training and testing sets
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
random_state=0)
# Reshape the target variables into column vectors. What is the -1 used for?
# (It tells NumPy to infer that dimension from the length of the array.)
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))
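# A quick illustration of the -1 (a side note, not part of the exercise):
# any 1-D array of length k becomes a (k, 1) column vector, because NumPy
# fills in the -1 from the total number of elements.
demo = np.arange(6).reshape((-1, 1))
print("demo shape:", demo.shape)  # (6, 1); the 6 was inferred automatically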
# Print the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print()
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
# Visualize the dataset
plt.figure(figsize=(8, 4))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis')
plt.title('Dataset Visualization')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(ticks=[0, 1], label='Class')
plt.grid()
# TODO: Plot train data
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train.ravel(), cmap='viridis',
            edgecolor='black', linewidth=1, marker='s', label='Train Data')
# TODO: Plot test data
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test.ravel(), cmap='viridis',
            edgecolor='black', linewidth=1, marker='^', label='Test Data')
# Set legend
plt.legend()
# Show the plot
plt.show()
# TODO: Implement the predict function for logistic regression
def predict(X, theta):
    """
    Predict the target variable using the logistic regression model.
    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n, p), where n is the number of
        samples and p is the number of features.
    theta (numpy.ndarray): Model parameters of shape (p, 1).
    Returns:
    probabilities (numpy.ndarray): Predicted probabilities of shape (n, 1).
    """
    # TODO: Define the probabilities of the logistic regression model
    # (the sigmoid applied to the linear scores z = X @ theta)
    z = X @ theta
    probabilities = 1 / (1 + np.exp(-z))
    return probabilities
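# A minimal sanity check of predict (illustrative): with theta = 0 the score
# z is 0 for every sample, so the sigmoid returns exactly 0.5 everywhere.
theta_zero = np.zeros((X_train.shape[1], 1))
print(predict(X_train[:3], theta_zero))  # expect [[0.5], [0.5], [0.5]]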
# TODO: Implement the cost function for logistic regression
def cost_function(X, y, theta):
    """
    Compute the cost function for logistic regression.
    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n, p), where n is the number of
        samples and p is the number of features.
    y (numpy.ndarray): Target values of shape (n, 1).
    theta (numpy.ndarray): Model parameters of shape (p, 1).
    Returns:
    cost (float): Cost value corresponding to the logistic loss.
    """
    n = len(y)
    # TODO: Calculate probabilities
    probabilities = predict(X, theta)
    # TODO: Compute the cost function (the average negative log-likelihood)
    cost = -np.sum(y * np.log(probabilities)
                   + (1 - y) * np.log(1 - probabilities)) / n
    return cost
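# Illustrative check of cost_function: with theta = 0 every probability is
# 0.5, so the average negative log-likelihood should equal ln(2) ~ 0.6931.
print(cost_function(X_train, y_train, np.zeros((X_train.shape[1], 1))))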
# TODO: Implement the gradient function for logistic regression
def gradient_function(X, y, theta):
    """
    Compute the gradient of the cost function for logistic regression.
    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n, p), where n is the number of
        samples and p is the number of features.
    y (numpy.ndarray): Target values of shape (n, 1).
    theta (numpy.ndarray): Model parameters of shape (p, 1).
    Returns:
    gradient (numpy.ndarray): Gradient vector of shape (p, 1).
    """
    n = len(y)
    # TODO: Calculate probabilities
    probabilities = predict(X, theta)
    # TODO: Compute the gradient of the cost function
    gradient = (X.T @ (probabilities - y)) / n
    return gradient
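# Optional finite-difference check of the analytic gradient (a sketch, not
# required by the exercise): perturb each parameter by +-eps and compare the
# resulting slope of the cost against gradient_function.
def numerical_gradient(X, y, theta, eps=1e-6):
    grad = np.zeros_like(theta)
    for j in range(theta.shape[0]):
        step = np.zeros_like(theta)
        step[j] = eps
        grad[j] = (cost_function(X, y, theta + step)
                   - cost_function(X, y, theta - step)) / (2 * eps)
    return grad

theta_check = np.zeros((X_train.shape[1], 1))
print(np.allclose(gradient_function(X_train, y_train, theta_check),
                  numerical_gradient(X_train, y_train, theta_check)))  # True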
# TODO: Implement the train function to learn the weights of the logistic
# regression model using gradient descent
def train_model(X_train, y_train, learning_rate, num_iterations):
    """
    Train the logistic regression model using gradient descent optimization.
    Parameters:
    X_train (numpy.ndarray): Feature matrix of shape (n, p) for training data.
    y_train (numpy.ndarray): Target values of shape (n, 1) for training data.
    learning_rate (float): Learning rate for gradient descent.
    num_iterations (int): Number of iterations for training.
    Returns:
    theta (numpy.ndarray): Model parameters of shape (p, 1).
    costs_train (list): List of training costs at each iteration.
    """
    n, p = X_train.shape
    theta = np.zeros((p, 1))
    costs_train = []
    # TODO: Implement the optimization part
    for _ in range(num_iterations):
        # Take one gradient descent step, then record the updated cost
        gradient = gradient_function(X_train, y_train, theta)
        theta -= learning_rate * gradient
        cost_train = cost_function(X_train, y_train, theta)
        costs_train.append(cost_train)
    return theta, costs_train
# Reset the random seed so this section is reproducible on its own
np.random.seed(0)
# TODO: Train the logistic regression model
learning_rate = 0.001
num_iterations = 10
theta_hat, costs_train = train_model(X_train, y_train, learning_rate,
                                     num_iterations)
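# Visual convergence check (illustrative): for a small enough learning rate
# the training cost should decrease from one iteration to the next.
plt.figure(figsize=(8, 4))
plt.plot(range(1, num_iterations + 1), costs_train)
plt.title('Training Cost per Iteration')
plt.xlabel('Iteration')
plt.ylabel('Cost')
plt.grid()
plt.show()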
# TODO: Make predictions on the training and test data
probability_threshold = 1/2
probabilities_train = predict(X_train, theta_hat)
probabilities_test = predict(X_test, theta_hat)
y_train_pred = (probabilities_train >= probability_threshold).astype(int)
y_test_pred = (probabilities_test >= probability_threshold).astype(int)
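# Illustrative evaluation of the predictions above: the fraction of correctly
# classified samples on each split.
train_accuracy = np.mean(y_train_pred == y_train)
test_accuracy = np.mean(y_test_pred == y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)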