#
# Project: Predictive Academic Performance Model
# Description: This Python script demonstrates a simple machine learning model
# to predict a student's final grade based on a simulated dataset.
# It uses the pandas library for data handling and scikit-learn
# for the machine learning model itself.
#
# How to Run:
# 1. Ensure you have Python installed.
# 2. Install the required libraries:
# pip install numpy pandas scikit-learn
# 3. Save this code as a .py file (e.g., 'predictive_model.py').
# 4. Run it from your terminal:
# python predictive_model.py
#
# This code is fully self-contained for a competition presentation.
#
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# --- 1. Create a Simulated Dataset ---
# A real project would load these records from a CSV file; this demo keeps
# them in the script so it runs without any external data.
# Each record below is one student, with fields in this order:
#   (Hours_Studied, Attendance_Percentage, Midterm_Grade, Final_Grade)
# The first three fields are the features; 'Final_Grade' is the target
# variable (what we want to predict).
print("--- Creating a simulated dataset... ---")
_column_names = (
    'Hours_Studied',
    'Attendance_Percentage',
    'Midterm_Grade',
    'Final_Grade',
)
_student_records = [
    (2, 85, 70, 75),
    (5, 95, 85, 90),
    (3, 88, 75, 80),
    (7, 98, 90, 95),
    (4, 92, 80, 85),
    (6, 90, 88, 92),
    (8, 99, 95, 98),
    (3, 80, 65, 68),
    (5, 94, 83, 87),
    (9, 97, 92, 96),
    (1, 75, 60, 62),
    (6, 91, 89, 90),
    (4, 89, 78, 82),
    (8, 96, 93, 97),
    (2, 82, 72, 76),
]
# Transpose the per-student records into one list of values per column.
data = {
    name: list(values)
    for name, values in zip(_column_names, zip(*_student_records))
}
# Load the column dictionary into a pandas DataFrame, a tabular structure
# with one named column per key.
df = pd.DataFrame(data)
print("\nSimulated Dataset:")
print(df)
print("\n" + "="*50 + "\n")
# --- 2. Data Preparation ---
# Split the table into the value to predict (y) and the model inputs (X):
# 'Final_Grade' is the target; the three remaining columns are the features.
print("--- Preparing data for the model... ---")
y = df['Final_Grade']
X = df[['Hours_Studied', 'Attendance_Percentage', 'Midterm_Grade']]
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Passing an integer random_state keeps the split the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print("\n" + "="*50 + "\n")
# --- 3. Model Training ---
print("--- Training the Linear Regression model... ---")
# Create the Linear Regression estimator and fit it to the training split
# in one step. fit() learns the relationship between X_train and y_train
# and returns the fitted estimator, which is what `model` ends up holding.
model = LinearRegression().fit(X_train, y_train)
print("Model training complete.")
print("\n" + "="*50 + "\n")
# --- 4. Model Evaluation ---
print("--- Evaluating the model's performance... ---")
# Predict final grades for the held-out test rows.
y_pred = model.predict(X_test)
# Score the predictions against the true test grades:
#   R-squared (R2): a value closer to 1.0 indicates a better fit.
#   Mean Squared Error (MSE): a lower value indicates a better fit.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# Emit both metrics and the section divider, one item per line.
print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    "\n" + "="*50 + "\n",
    sep="\n",
)
# --- 5. Making a New Prediction ---
# Use the trained model to predict the final grade of a student that is
# not part of the dataset.
print("--- Making a new prediction... ---")
# One hypothetical student: hours studied, attendance %, midterm grade.
new_student_data = np.array([[6, 95, 85]])
# Label the row with the training features' own column index so the names
# and their order always match what the model was fitted on.
new_student_df = pd.DataFrame(new_student_data, columns=X.columns)
# predict() returns one value per input row; there is a single row here.
predicted_grade = model.predict(new_student_df)
# to_string() renders a multi-line table (header line + value line). Print it
# starting on its own line: when appended to the label, only the header line
# is shifted right, so the values no longer line up under their headers.
print("New student data:")
print(new_student_df.to_string(index=False))
print(f"\nPredicted Final Grade for this student: {predicted_grade[0]:.2f}")
print("\n" + "="*50 + "\n")
# Optional: show the fitted coefficients, one row per feature column of X,
# as a peek at how the model works.
print("--- Model Coefficients (insight into the model) ---")
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
print(coefficients)
print("\n(This shows the influence of each feature on the final grade.)")