import logging
from datetime import datetime
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
# Configure the root logger once at import time: INFO and above, rendered as
# "LEVEL: message". Every function in this file logs through the root logger.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# --------------------------------------
# Module 1: Data Loading (Strict Validation)
# --------------------------------------
def load_data(file_path: str, target_column: str) -> pd.DataFrame:
    """Load an Excel sheet and verify it is usable for regression.

    Args:
        file_path: Path of the Excel workbook to read.
        target_column: Name of the column that will be predicted.

    Returns:
        The loaded frame, unmodified.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then
            re-raised).
        ValueError: If any column is non-numerical, or if ``target_column``
            is not one of the columns.
    """
    # Keep the try body to the one call that can hit a missing file.
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        logging.error("Excel file not found")
        raise

    # is_numeric_dtype also copes with pandas extension dtypes (nullable
    # Int64, category, ...) on which np.issubdtype raises TypeError.
    # Booleans are excluded explicitly to keep the strict numbers-only rule.
    non_numeric = [
        str(name)
        for name, dtype in df.dtypes.items()
        if not pd.api.types.is_numeric_dtype(dtype)
        or pd.api.types.is_bool_dtype(dtype)
    ]
    if non_numeric:
        raise ValueError(
            "All columns must be numerical; non-numerical: "
            + ", ".join(non_numeric)
        )

    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found")

    logging.info("Data loaded: %d rows, %d cols", df.shape[0], df.shape[1])
    return df
# --------------------------------------
# Module 2: Preprocessing (Explicit Row Removal)
# --------------------------------------
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with missing values, then drop exact duplicate rows.

    Args:
        df: Raw input frame; it is not modified.

    Returns:
        A new frame without NaN-containing rows and without duplicates.
        The number and share of removed rows is logged at INFO level.
    """
    initial_rows = df.shape[0]
    df_clean = df.dropna().drop_duplicates()
    removed = initial_rows - df_clean.shape[0]
    # An empty input would otherwise raise ZeroDivisionError in the
    # percentage below; report 0.0% instead.
    removed_share = removed / initial_rows if initial_rows else 0.0
    logging.info(f"Removed {removed} rows ({removed_share:.1%})")
    return df_clean
# --------------------------------------
# Module 3: Feature Scaling (Leakage-Proof)
# --------------------------------------
def create_scaler(X_train: pd.DataFrame) -> MinMaxScaler:
    """Fit a ``MinMaxScaler`` on the training features only.

    Fitting on the training split alone keeps test-set statistics out of
    the scaling parameters.

    Args:
        X_train: Training feature matrix.

    Returns:
        The fitted scaler.
    """
    scaler = MinMaxScaler().fit(X_train)
    logging.info("Scaler fitted on training data")
    return scaler
# --------------------------------------
# Module 4: Data Splitting (Reproducible)
# --------------------------------------
def split_data(X: pd.DataFrame, y: pd.Series, test_size: float = 0.2,
               random_state: int = 42) -> Tuple:
    """Split features and target into train and test partitions.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Fraction of rows assigned to the test partition.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    return train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )
# --------------------------------------
# Module 5: Model Training (With Pipeline)
# --------------------------------------
def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> Pipeline:
    """Fit a min-max-scaling + linear-regression pipeline.

    The scaler lives inside the pipeline, so it is fitted on the training
    data passed here and re-applied automatically at prediction time.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted pipeline (``MinMaxScaler`` followed by
        ``LinearRegression``); call ``predict`` on it directly.
    """
    model = make_pipeline(
        MinMaxScaler(),
        LinearRegression(),
    ).fit(X_train, y_train)
    logging.info("Model trained")
    return model
# --------------------------------------
# Module 6: Evaluation (With Baseline)
# --------------------------------------
def evaluate_model(model: LinearRegression, X_test: pd.DataFrame,
                   y_test: pd.Series) -> Dict[str, float]:
    """Score the model on the test split next to a mean-value baseline.

    Args:
        model: Fitted estimator exposing ``predict`` (a bare regressor or a
            pipeline ending in one).
        X_test: Test feature matrix.
        y_test: True target values for ``X_test``.

    Returns:
        Dict with keys ``"R2"``, ``"Baseline R2"``, ``"RMSE"``,
        ``"Baseline RMSE"`` and ``"MAE"``, all plain Python floats.

    Note:
        The baseline predicts the mean of ``y_test`` itself, so its R2 is
        0 by construction; it is a reference point, not a trained model.
    """
    y_pred = model.predict(X_test)

    # Build the baseline as float explicitly: np.full_like would inherit
    # y_test's dtype and silently truncate the mean for integer targets.
    baseline_pred = np.full(len(y_test), y_test.mean(), dtype=float)

    return {
        "R2": float(r2_score(y_test, y_pred)),
        "Baseline R2": float(r2_score(y_test, baseline_pred)),
        "RMSE": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "Baseline RMSE": float(
            np.sqrt(mean_squared_error(y_test, baseline_pred))
        ),
        "MAE": float(mean_absolute_error(y_test, y_pred)),
    }
# --------------------------------------
# Module 7: Prediction (Input Validation)
# --------------------------------------
def predict_input(model: LinearRegression, feature_names: list,
                  X_train: pd.DataFrame) -> None:
    """Prompt for one value per feature and print the model's prediction.

    Values outside the range seen in ``X_train`` are accepted but logged as
    a warning. Input that is not a finite number aborts the prompt with
    "Invalid numerical input" and no prediction is made.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_names: Feature column names, in the order the model was
            trained on.
        X_train: Training features, used only for the per-feature min/max
            range check.
    """
    inputs = []
    for feat in feature_names:
        # Only the conversion is guarded, so a ValueError raised later by
        # the model is not misreported as bad user input.
        try:
            val = float(input(f"Enter {feat}: "))
        except ValueError:
            print("Invalid numerical input")
            return
        # float() accepts "nan" and "inf", which cannot be scored.
        if not np.isfinite(val):
            print("Invalid numerical input")
            return

        # Validate against the range observed during training.
        min_val = X_train[feat].min()
        max_val = X_train[feat].max()
        if not (min_val <= val <= max_val):
            logging.warning(f"{feat} value outside training range")
        inputs.append(val)

    # Pass a named single-row frame so the columns line up with the
    # feature names the model was fitted on.
    row = pd.DataFrame([inputs], columns=feature_names)
    prediction = model.predict(row)[0]
    print(f"Prediction: {prediction:.4f}")
# --------------------------------------
# Module 8: Visualization (Timestamped)
# --------------------------------------
def plot_results(y_true: pd.Series, y_pred: np.ndarray) -> None:
    """Save an actual-vs-predicted scatter plot to a timestamped PNG.

    The file is written to the current working directory as
    ``results_<YYYYmmdd_HHMMSS>.png``. A dashed red diagonal spanning the
    range of ``y_true`` marks where perfect predictions would fall.

    Args:
        y_true: True target values.
        y_pred: Predicted values aligned with ``y_true``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    fig = plt.figure(figsize=(8, 5))
    try:
        sns.scatterplot(x=y_true, y=y_pred)
        lo, hi = y_true.min(), y_true.max()
        plt.plot([lo, hi], [lo, hi], 'r--')
        plt.savefig(f"results_{timestamp}.png")
    finally:
        # Always release the figure; pyplot otherwise keeps it alive and
        # repeated calls accumulate open figures.
        plt.close(fig)
    logging.info("Plot saved")
# --------------------------------------
# Main Pipeline (Leakage-Proof Order)
# --------------------------------------
def main(file_path: str, target_column: str) -> None:
    """Run the full workflow: load, clean, split, train, evaluate, plot, predict.

    The data is split before any fitting, and scaling happens inside the
    trained pipeline, so no test-set statistics reach the model.

    Args:
        file_path: Path of the Excel workbook holding the data set.
        target_column: Name of the column to predict.

    Any exception is logged with its traceback and then suppressed; this
    function is the script's top-level boundary and does not re-raise.
    """
    try:
        # 1. Load with validation
        df = load_data(file_path, target_column)

        # 2. Clean data
        df_clean = preprocess_data(df)

        # 3. Split first, before anything is fitted
        X = df_clean.drop(columns=[target_column])
        y = df_clean[target_column]
        X_train, X_test, y_train, y_test = split_data(X, y)

        # 4. Train model (with internal scaling)
        model = train_model(X_train, y_train)

        # 5. Evaluate
        metrics = evaluate_model(model, X_test, y_test)
        print("Metrics:", metrics)

        # 6. Visualize
        plot_results(y_test, model.predict(X_test))

        # 7. Interactive prediction mode
        predict_input(model, X_train.columns.tolist(), X_train)
    except Exception as e:
        # logging.exception keeps the traceback that logging.error dropped.
        logging.exception("Fatal error: %s", e)
if __name__ == "__main__":
    # NOTE(review): "target_column" looks like a placeholder name; confirm
    # the real target column of reactor_data.xlsx before running.
    main("reactor_data.xlsx", "target_column")