0% found this document useful (0 votes)
31 views6 pages

Code Structure

The document outlines a Python script for a machine learning pipeline using linear regression, including data loading, preprocessing, model training, evaluation, and prediction. It emphasizes strict validation, logging, and leakage-proof practices throughout the process. The main function orchestrates the workflow, handling exceptions and providing metrics and visualizations.

Uploaded by

Hanan Arif
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
31 views6 pages

Code Structure

The document outlines a Python script for a machine learning pipeline using linear regression, including data loading, preprocessing, model training, evaluation, and prediction. It emphasizes strict validation, logging, and leakage-proof practices throughout the process. The main function orchestrates the workflow, handling exceptions and providing metrics and visualizations.

Uploaded by

Hanan Arif
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

import logging
from datetime import datetime
from typing import Dict, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Configure logging
# NOTE(review): the level token was lost in extraction; INFO is assumed because
# the pipeline reports its progress through logging.info — confirm.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# --------------------------------------

# Module 1: Data Loading (Strict Validation)

# --------------------------------------

def load_data(file_path: str, target_column: str) -> pd.DataFrame:
    """Load an Excel sheet and check that it is usable for regression.

    Args:
        file_path: Path of the Excel workbook, handed to ``pd.read_excel``.
        target_column: Name of the column that must be present in the sheet.

    Returns:
        The loaded frame, unmodified.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then
            re-raised).
        ValueError: If any column is non-numeric or ``target_column`` is
            missing.
    """
    # Keep the try body down to the one call that touches the filesystem so
    # the handler cannot swallow unrelated errors.
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        logging.error("Excel file not found")
        raise

    # Enforce numerical-only requirement. select_dtypes also understands
    # pandas extension dtypes, which a per-dtype np.issubdtype test may reject.
    non_numeric = df.select_dtypes(exclude="number").columns
    if len(non_numeric) > 0:
        raise ValueError("All columns must be numerical")

    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found")

    logging.info(f"Data loaded: {df.shape[0]} rows, {df.shape[1]} cols")
    return df

# --------------------------------------

# Module 2: Preprocessing (Explicit Row Removal)

# --------------------------------------

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows containing missing values, then exact duplicate rows.

    Args:
        df: Frame to clean; it is not modified.

    Returns:
        A new frame without NaN-containing rows and without duplicates.
        The number and share of removed rows is logged at INFO level.
    """
    initial_rows = df.shape[0]
    df_clean = df.dropna().drop_duplicates()
    removed = initial_rows - df_clean.shape[0]

    # An empty input would otherwise divide by zero in the percentage.
    share = removed / initial_rows if initial_rows else 0.0
    logging.info(f"Removed {removed} rows ({share:.1%})")
    return df_clean

# --------------------------------------

# Module 3: Feature Scaling (Leakage-Proof)

# --------------------------------------

def create_scaler(X_train: pd.DataFrame) -> MinMaxScaler:
    """Create a MinMaxScaler fitted ONLY on the training features.

    Args:
        X_train: Training feature matrix used to fit the scaler.

    Returns:
        The fitted scaler.
    """
    scaler = MinMaxScaler().fit(X_train)
    logging.info("Scaler fitted on training data")
    return scaler

# --------------------------------------

# Module 4: Data Splitting (Reproducible)

# --------------------------------------

def split_data(X: pd.DataFrame, y: pd.Series, test_size: float = 0.2,
               random_state: int = 42) -> Tuple:
    """Split features and target into train and test parts with a fixed seed.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns; the caller in this
        file unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )

# --------------------------------------

# Module 5: Model Training (With Pipeline)

# --------------------------------------

def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> Pipeline:
    """Fit a MinMaxScaler + LinearRegression pipeline on the training data.

    Scaling lives inside the pipeline, so the scaler is fitted on exactly the
    data passed here and re-applied automatically at prediction time.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted pipeline (not a bare ``LinearRegression``).
    """
    model = make_pipeline(
        MinMaxScaler(),
        LinearRegression(),
    ).fit(X_train, y_train)
    logging.info("Model trained")
    return model

# --------------------------------------

# Module 6: Evaluation (With Baseline)

# --------------------------------------

def evaluate_model(model: LinearRegression, X_test: pd.DataFrame,
                   y_test: pd.Series) -> Dict[str, float]:
    """Compute test metrics next to a naive mean-prediction baseline.

    Args:
        model: Fitted estimator exposing ``predict`` (the pipeline returned by
            ``train_model`` in this file).
        X_test: Held-out feature matrix.
        y_test: Held-out target values.

    Returns:
        Dict with the keys ``"R2"``, ``"Baseline R2"``, ``"RMSE"``,
        ``"Baseline RMSE"`` and ``"MAE"``, all as plain floats.
    """
    # Baseline (mean prediction). Built as an explicit float array:
    # np.full_like inherits y_test's dtype and would truncate the mean when
    # the target column holds integers.
    # NOTE(review): the baseline is the mean of y_test itself, so its R2 is 0
    # by construction; a training-set mean would be the stricter baseline.
    baseline_pred = np.full(len(y_test), float(y_test.mean()))

    # Model predictions
    y_pred = model.predict(X_test)

    return {
        "R2": float(r2_score(y_test, y_pred)),
        "Baseline R2": float(r2_score(y_test, baseline_pred)),
        "RMSE": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "Baseline RMSE": float(np.sqrt(mean_squared_error(y_test, baseline_pred))),
        "MAE": float(mean_absolute_error(y_test, y_pred)),
    }

# --------------------------------------

# Module 7: Prediction (Input Validation)

# --------------------------------------

def predict_input(model: "LinearRegression", feature_names: list,
                  X_train: pd.DataFrame) -> None:
    """Prompt for one value per feature and print the model's prediction.

    Each entered value is compared with the min/max of the matching
    ``X_train`` column; values outside that range are only warned about,
    not rejected.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_names: Feature names, in the column order the model expects.
        X_train: Training features, used for the range check.

    Returns:
        None. Prints the prediction, or ``"Invalid numerical input"`` when an
        entry cannot be parsed as a float.
    """
    inputs = []
    for feat in feature_names:
        # Only the conversion is guarded, so errors raised elsewhere are not
        # misreported as bad user input.
        try:
            val = float(input(f"Enter {feat}: "))
        except ValueError:
            print("Invalid numerical input")
            return

        # Validate against known ranges
        min_val = X_train[feat].min()
        max_val = X_train[feat].max()
        if not (min_val <= val <= max_val):
            logging.warning(f"{feat} value outside training range")
        inputs.append(val)

    # Pass a frame carrying the feature names so the input has the same
    # layout as the data the model was fitted on.
    row = pd.DataFrame([inputs], columns=feature_names)
    try:
        prediction = model.predict(row)[0]
    except ValueError as err:
        # Keep the original no-raise contract, but report the real cause.
        logging.error(f"Prediction failed: {err}")
        return
    print(f"Prediction: {prediction:.4f}")

# --------------------------------------

# Module 8: Visualization (Timestamped)

# --------------------------------------

def plot_results(y_true: pd.Series, y_pred: np.ndarray) -> None:
    """Save an actual-vs-predicted scatter plot to a timestamped PNG.

    The file is written to the current working directory as
    ``results_<YYYYmmdd_HHMMSS>.png``. A dashed red diagonal spanning the
    range of ``y_true`` marks where perfect predictions would fall.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    fig = plt.figure(figsize=(8, 5))
    try:
        # NOTE(review): the seaborn call name was lost in extraction;
        # scatterplot is assumed from the x=/y= keywords — confirm.
        sns.scatterplot(x=y_true, y=y_pred)
        low, high = y_true.min(), y_true.max()
        plt.plot([low, high], [low, high], 'r--')
        plt.savefig(f"results_{timestamp}.png")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    logging.info("Plot saved")

# --------------------------------------

# Main Pipeline (Leakage-Proof Order)

# --------------------------------------

def main(file_path: str, target_column: str) -> None:
    """Run the whole pipeline: load, clean, split, train, evaluate, plot, predict.

    The data is split before any fitting, and scaling happens inside the
    trained pipeline, so nothing is fitted on test rows.

    Args:
        file_path: Path of the Excel workbook to load.
        target_column: Name of the column to predict.

    Returns:
        None. Any exception raised by a step is logged (with traceback) and
        not re-raised.
    """
    try:
        # 1. Load with validation
        df = load_data(file_path, target_column)

        # 2. Clean data
        df_clean = preprocess_data(df)

        # 3. Split first!
        X = df_clean.drop(target_column, axis=1)
        y = df_clean[target_column]
        X_train, X_test, y_train, y_test = split_data(X, y)

        # 4. Train model (with internal scaling)
        model = train_model(X_train, y_train)

        # 5. Evaluate
        metrics = evaluate_model(model, X_test, y_test)
        print("Metrics:", metrics)

        # 6. Visualize
        plot_results(y_test, model.predict(X_test))

        # 7. Prediction mode
        predict_input(model, X.columns.tolist(), X_train)
    except Exception as e:
        # Top-level boundary: keep the run from crashing, but record the
        # traceback so the failing step can be identified.
        logging.exception(f"Fatal error: {str(e)}")

# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main("reactor_data.xlsx", "target_column")

You might also like