# 7/30/24, 3:55 PM Untitled4.ipynb - Colab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Load the Diabetes dataset
def load_data() -> pd.DataFrame:
    """Load the scikit-learn Diabetes dataset as a single DataFrame.

    Returns:
        The dataset's ``frame`` attribute: the feature columns together
        with a ``target`` column.
    """
    diabetes = load_diabetes(as_frame=True)
    return diabetes.frame
# Prepare the data
def prepare_data(data, test_size=0.2, random_state=42):
    """Separate features from the target and split into train/test sets.

    Args:
        data: DataFrame holding the feature columns and a ``target`` column.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible; defaults to the previously hard-coded 42.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.drop('target', axis=1).values  # Features
    y = data['target'].values  # Target variable
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
# Train the model
def train_linear_regression(X_train, y_train):
    """Fit a ``LinearRegression`` model on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model
# Predict with the model
def predict(model, X_test):
    """Return the model's predictions for ``X_test``.

    Args:
        model: Any object exposing a ``predict(X)`` method.
        X_test: Feature data handed to ``model.predict`` unchanged.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    return model.predict(X_test)
# Plot feature distributions
def plot_feature_distributions(data, n_cols=3):
    """Draw a histogram with a KDE overlay for every feature column.

    Every column except the last is plotted; the last column is assumed
    to be the target and is skipped.

    Args:
        data: DataFrame whose last column is the target.
        n_cols: Number of subplot columns in the grid. The row count is
            derived from the number of features, so the grid always has
            room for every feature (10 features -> the original 4x3 grid).
    """
    features = data.columns[:-1]  # Exclude the target column
    # Ceiling division without importing math; at least one row so that
    # plt.subplot never receives a zero-sized grid.
    n_rows = max(1, -(-len(features) // n_cols))

    plt.figure(figsize=(12, 8))
    for position, feature in enumerate(features, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.histplot(data[feature], kde=True)
        plt.title(feature)
    plt.tight_layout()
    plt.show()
# Plot actual vs predicted values
def plot_actual_vs_predicted(y_test, y_pred):
    """Scatter predicted values against actual values.

    A dashed red diagonal spanning the range of ``y_test`` is drawn as
    the "perfect prediction" reference line.

    Args:
        y_test: Actual target values.
        y_pred: Predicted target values, aligned with ``y_test``.
    """
    # Compute the range once with NumPy reductions instead of calling the
    # Python builtins min()/max() twice each over the array.
    lowest, highest = np.min(y_test), np.max(y_test)

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted Values')
    plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    plt.show()
# Plot residuals
def plot_residuals(y_test, y_pred):
    """Plot the distribution of residuals (actual minus predicted).

    Args:
        y_test: Actual target values.
        y_pred: Predicted target values, aligned with ``y_test``.
    """
    # Coerce to arrays so plain Python sequences also subtract element-wise.
    residuals = np.asarray(y_test) - np.asarray(y_pred)

    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True)
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Residuals Distribution')
    plt.show()
# Main workflow
def main():
    """Run the full workflow: load, split, train, evaluate and plot.

    Prints the fitted coefficients and intercept, the test-set mean
    squared error and R-squared, and the first rows of the dataset, then
    shows the feature-distribution, actual-vs-predicted and residual plots.
    """
    # Load and prepare data
    data = load_data()
    X_train, X_test, y_train, y_test = prepare_data(data)

    # Train the model
    model = train_linear_regression(X_train, y_train)
    print("Trained coefficients:", model.coef_)
    print("Intercept:", model.intercept_)

    # Predict on the test set
    y_pred = predict(model, X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

    # Display the original data features for reference
    print("Original features (first 5 rows):")
    print(data.head())

    # Plot visualizations
    plot_feature_distributions(data)
    plot_actual_vs_predicted(y_test, y_pred)
    plot_residuals(y_test, y_pred)
# Run the main function
# Only run the workflow when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# https://colab.research.google.com/drive/1D79M0mc1VqFBI-BSWQk8leEX87LjbAmI#printMode=true 2/3
# 7/30/24, 3:55 PM Untitled4.ipynb - Colab
# Notebook output captured from the run:
#
# Trained coefficients: [ 37.90402135 -241.96436231 542.42875852 347.70384391 -931.48884588
# 518.06227698 163.41998299 275.31790158 736.1988589 48.67065743]
# Intercept: 151.34560453985995
# Mean Squared Error: 2900.193628493482
# R-squared: 0.4526027629719195
# Original features (first 5 rows):
# age sex bmi bp s1 s2 s3 \
# 0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401
# 1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412
# 2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356
# 3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038
# 4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142
# s4 s5 s6 target
# 0 -0.002592 0.019907 -0.017646 151.0
# 1 -0.039493 -0.068332 -0.092204 75.0
# 2 -0.002592 0.002861 -0.025930 141.0
# 3 0.034309 0.022688 -0.009362 206.0
# 4 -0.002592 -0.031988 -0.046641 135.0
# https://colab.research.google.com/drive/1D79M0mc1VqFBI-BSWQk8leEX87LjbAmI#printMode=true 3/3