0% found this document useful (0 votes)
11 views25 pages

Current Code

The document outlines a process for building and training a Physics-Informed Neural Network (PINN) using Python and PyTorch to analyze hydrate curves and predict plug probabilities. It includes steps for data loading, model definition, training, and inference, with emphasis on ensuring the integrity of the data and model during the process. The final output includes saving the trained model and generating predictions for validation data.

Uploaded by

thedrillingpoint
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views25 pages

Current Code

The document outlines a process for building and training a Physics-Informed Neural Network (PINN) using Python and PyTorch to analyze hydrate curves and predict plug probabilities. It includes steps for data loading, model definition, training, and inference, with emphasis on ensuring the integrity of the data and model during the process. The final output includes saving the trained model and generating predictions for validation data.

Uploaded by

thedrillingpoint
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 25

Cell 1:

# --- Setup (local, no external module) ---

import os, math, numpy as np, pandas as pd, torch, torch.nn as nn, torch.nn.functional as F

# Put the CSV with the hydrate curve in the SAME FOLDER as this notebook:

CURVE_CSV = "hydrate_curve_clean.csv" # <-- upload this file next

DATA_CSV = "fair_pinn_synth_data.csv" # <-- your data file

# Inline Teq builder (replaces hydrate_teq.py)

def build_Teq(csv_path):

df = pd.read_csv(csv_path).sort_values("Pressure").drop_duplicates(subset="Pressure")

Px = df["Pressure"].values

Ty = df["Temperature"].values

def Teq(P):

P = np.asarray(P, dtype=float)

P = np.clip(P, Px.min(), Px.max())

return np.interp(P, Px, Ty)

return Teq

# Load T_eq(P)

Teq = build_Teq(CURVE_CSV)

print("Teq sample:", Teq([5.0, 10.0]))


Cell 2:

# --- Data Loading ---

# Expected columns in the CSV:

# time_s, P_bar, T_K, Sg, phase_frac, pipe_d_m, holdup_liq, y_hh, y_dpr, y_cls

df = pd.read_csv(DATA_CSV)

# Basic checks

required_base = ["time_s","P_bar","T_K","Sg","phase_frac","pipe_d_m","holdup_liq"]

for c in required_base:

if c not in df.columns:

raise ValueError(f"Missing required column: {c}")

# Optional targets

has_hh = "y_hh" in df.columns and df["y_hh"].notna().any()

has_dpr = "y_dpr" in df.columns and df["y_dpr"].notna().any()

has_cls = "y_cls" in df.columns and df["y_cls"].notna().any()

# Compute DeltaT = T - Teq(P)

Teq_vals = Teq(df["P_bar"].values)

df["DeltaT"] = df["T_K"].values - Teq_vals

# Features for base network (you can add/remove later)

feature_cols = ["time_s","P_bar","T_K","Sg","phase_frac","pipe_d_m","holdup_liq","DeltaT"]

X = df[feature_cols].astype(float).values
# Targets

Y_hh = df["y_hh"].astype(float).values if has_hh else None

Y_dpr = df["y_dpr"].astype(float).values if has_dpr else None

Y_cls = df["y_cls"].astype(float).values if has_cls else None

# Train/val split (simple)

idx = np.arange(len(df))

split = int(0.8*len(df))

train_idx, val_idx = idx[:split], idx[split:]

def to_tensor(x): return torch.tensor(x, dtype=torch.float32)

X_train, X_val = to_tensor(X[train_idx]), to_tensor(X[val_idx])

targets = {}

if has_hh: targets["hh"] = (to_tensor(Y_hh[train_idx]).unsqueeze(1),


to_tensor(Y_hh[val_idx]).unsqueeze(1))

if has_dpr: targets["dpr"] = (to_tensor(Y_dpr[train_idx]).unsqueeze(1),


to_tensor(Y_dpr[val_idx]).unsqueeze(1))

if has_cls: targets["cls"] = (to_tensor(Y_cls[train_idx]).unsqueeze(1),


to_tensor(Y_cls[val_idx]).unsqueeze(1))

# Physics context tensors (must align with feature order)

def get_context(X_tensor):

# IMPORTANT: keep the graph intact

X_tensor.requires_grad_(True) # now any slice (time) stays connected to the graph

cols = {name:i for i,name in enumerate(feature_cols)}

ctx = {
"time": X_tensor[:, cols["time_s"] : cols["time_s"]+1], # no clone/detach

"P": X_tensor[:, cols["P_bar"]],

"T": X_tensor[:, cols["T_K"]],

"Sg": X_tensor[:, cols["Sg"]],

"phase_frac": X_tensor[:, cols["phase_frac"]],

"pipe_d": X_tensor[:, cols["pipe_d_m"]],

"holdup_liq": X_tensor[:, cols["holdup_liq"]],

"time_idx": cols["time_s"], # optional, if you want to slice inside model

return ctx

print("Data loaded. has_hh:", has_hh, "has_dpr:", has_dpr, "has_cls:", has_cls)


Cell 3:

# ===== PINN Definition (replace whole cell with this) =====

import torch

import torch.nn as nn

import torch.nn.functional as F

def mlp(in_dim, out_dim, hidden=3, width=64, act=nn.ReLU):

layers = [nn.Linear(in_dim, width), act()]

for _ in range(hidden-1):

layers += [nn.Linear(width, width), act()]

layers += [nn.Linear(width, out_dim)]

return nn.Sequential(*layers)

class Kinetics(nn.Module):

def __init__(self, k1_init=1.0, k2_init=-1.0e4):

super().__init__()

self.k1 = nn.Parameter(torch.tensor(k1_init, dtype=torch.float32))

self.k2 = nn.Parameter(torch.tensor(k2_init, dtype=torch.float32))

def forward(self, T, P, Sg, phase_frac, pipe_d, holdup_liq, DeltaT=None):

# Simple illustrative kinetics; you can swap with your correlation later.

if DeltaT is None:

DeltaT = T*0.0

AS = torch.clamp(phase_frac * 10.0 / (pipe_d + 1e-6), min=1e-6)

k = AS * self.k1 * torch.exp(self.k2 / torch.clamp(T, min=1.0)) \

* torch.clamp(DeltaT, -100.0, 100.0)


return k

class PINN(nn.Module):

def __init__(self, base_feat_dim, feature_cols):

"""

feature_cols: list used to build x_base (must include 'time_s')

"""

super().__init__()

self.feature_cols = feature_cols

assert "time_s" in self.feature_cols, "feature_cols must contain 'time_s'"

self.time_idx = self.feature_cols.index("time_s")

self.hh_net = mlp(base_feat_dim, 1, hidden=4, width=128)

self.dpr_net = mlp(base_feat_dim + 1, 1, hidden=4, width=128)

self.cls_net = mlp(base_feat_dim + 2, 1, hidden=3, width=96)

self.kin = Kinetics()

def forward(self, x_base, context):

# Ensure autograd tracks inputs

x_base.requires_grad_(True)

# --- forward passes ---

hh = self.hh_net(x_base)

dpr = self.dpr_net(torch.cat([x_base, hh], dim=1))

logit = self.cls_net(torch.cat([x_base, hh, dpr], dim=1))

prob_plug = torch.sigmoid(logit)
# --- dhh/dt via gradient wrt all inputs, then select time column ---

dhh_dX = torch.autograd.grad(

outputs=hh,

inputs=x_base,

grad_outputs=torch.ones_like(hh),

create_graph=True,

retain_graph=True,

allow_unused=False

)[0] # (N, base_feat_dim)

dhh_dt = dhh_dX[:, self.time_idx:self.time_idx+1] # (N,1)

# --- kinetics residual ---

k = self.kin(

context["T"], context["P"], context["Sg"],

context["phase_frac"], context["pipe_d"], context["holdup_liq"]

pde_res = dhh_dt - k

return hh, dpr, prob_plug, pde_res

Cell 4:
# --- Training ---

device = "cpu"

model = PINN(base_feat_dim=len(feature_cols), feature_cols=feature_cols).to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-4)

mse = nn.MSELoss()

bce = nn.BCELoss()

# --- Sanitize y_cls targets ---

if "cls" in targets:

print("Sanitizing y_cls targets...")

print("Unique values before:", torch.unique(targets["cls"][0]))

# force binary 0/1

t_train, t_val = targets["cls"]

t_train = (t_train.float().clamp(0, 1) > 0.5).float()

t_val = (t_val.float().clamp(0, 1) > 0.5).float()

targets["cls"] = (t_train, t_val)

print("Unique values after:", torch.unique(targets["cls"][0]))

def batchify(X_tensor, batch_size=128):

n = X_tensor.shape[0]

for i in range(0, n, batch_size):

yield X_tensor[i:i+batch_size]

def compute_loss(model, Xb, targets_dict, ctx):

hh, dpr, pplug, pde = model(Xb, ctx)


loss = 0.0

logs = {}

if "hh" in targets_dict:

yh = targets_dict["hh"]

loss_hh = mse(hh, yh)

loss = loss + loss_hh

logs["hh"] = float(loss_hh.item())

if "dpr" in targets_dict:

yd = targets_dict["dpr"]

loss_dpr = mse(dpr, yd)

loss = loss + loss_dpr

logs["dpr"] = float(loss_dpr.item())

if "cls" in targets_dict:

yc = targets_dict["cls"]

# 🔑 Ensure yc is float in [0,1]

yc = yc.float().clamp(0,1)

loss_cls = bce(pplug, yc)

loss = loss + loss_cls

logs["cls"] = float(loss_cls.item())

# PDE residual

loss_pde = mse(pde, torch.zeros_like(pde))

loss = loss + 1.0 * loss_pde

logs["pde"] = float(loss_pde.item())

logs["total"] = float(loss.item())

return loss, logs


EPOCHS = 200

for ep in range(1, EPOCHS+1):

model.train()

tot = {"total":0.0,"hh":0.0,"dpr":0.0,"cls":0.0,"pde":0.0}

cnt = 0

for Xb in batchify(X_train, batch_size=256):

opt.zero_grad()

ctx = get_context(Xb)

tdict = {}

if "hh" in targets: tdict["hh"] = targets["hh"][0][:Xb.shape[0]]

if "dpr" in targets: tdict["dpr"] = targets["dpr"][0][:Xb.shape[0]]

if "cls" in targets: tdict["cls"] = targets["cls"][0][:Xb.shape[0]]

loss, logs = compute_loss(model, Xb, tdict, ctx)

loss.backward()

opt.step()

for k in tot:

tot[k] += logs.get(k, 0.0)

cnt += 1

if ep % 20 == 0 or ep == 1:

print(f"Epoch {ep:4d} | total={tot['total']/cnt:.4f} hh={tot['hh']/cnt:.4f}


dpr={tot['dpr']/cnt:.4f} cls={tot['cls']/cnt:.4f} pde={tot['pde']/cnt:.4f}")

print("Training done.")

torch.save(model.state_dict(), "fair_pinn_model.pt")

print("Model saved as fair_pinn_model.pt")


Cell 5:
# --- Inference on validation split (plug probability) ---

model.eval()

# Ensure autograd is enabled for PDE residual (no torch.no_grad() here)

X_val = X_val.clone().detach().requires_grad_(True) # keep shapes the same

ctx_val = get_context(X_val)

hh_val, dpr_val, pplug_val, pde_val = model(X_val, ctx_val)

print("Predicted plug probabilities (first 10):",

pplug_val.squeeze().detach().cpu().numpy()[:10])

print("Mean PDE residual (val):", float(pde_val.abs().mean().detach().cpu().numpy()))


Cell 6A:

# 1) Restart kernel manually now (Kernel → Restart) then run this cell

import os, torch, gc

os.environ["OMP_NUM_THREADS"] = "1"

os.environ["OPENBLAS_NUM_THREADS"] = "1"

torch.set_num_threads(1)

gc.collect()

print("Threads limited. torch threads:", torch.get_num_threads())


Cell 6B:

import torch, os, gc

MODEL_PT = "fair_pinn_model.pt"

print("Model file exists:", os.path.exists(MODEL_PT), "size(bytes):", os.path.getsize(MODEL_PT)


if os.path.exists(MODEL_PT) else "n/a")

state = torch.load(MODEL_PT, map_location="cpu")

print("Loaded state keys (sample):", list(state.keys())[:10], " total keys:", len(state.keys()))

del state

gc.collect()

print("Checkpoint loaded & freed.")


Cell 6C:

# 3) batched prediction and save probabilities (NO plotting)

import pandas as pd, numpy as np, torch, os, gc

from math import ceil

CURVE_CSV = "hydrate_curve_clean.csv"

DATA_CSV = "fair_pinn_dummy_data.csv"

MODEL_PT = "fair_pinn_model.pt"

# build Teq (safe, numpy)

curve = pd.read_csv(CURVE_CSV).sort_values("Pressure").drop_duplicates("Pressure")

Px, Ty = curve["Pressure"].values, curve["Temperature"].values

def Teq_np(P):

P = np.asarray(P, dtype=float)

P = np.clip(P, Px.min(), Px.max())

return np.interp(P, Px, Ty)

# load val slice

df = pd.read_csv(DATA_CSV)

split = int(0.8 * len(df))

val_df = df.iloc[split:].copy()

val_df["DeltaT"] = val_df["T_K"].values - Teq_np(val_df["P_bar"].values)

feature_cols = ["time_s","P_bar","T_K","Sg","phase_frac","pipe_d_m","holdup_liq","DeltaT"]

# create torch dataset

X = torch.tensor(val_df[feature_cols].astype(float).to_numpy(), dtype=torch.float32)
# make sure PINN class is defined (Cell 3 must have run)

try:

model # just to check

except NameError:

raise RuntimeError("PINN class or trained model not in memory. Please run Cell 3 (PINN
Definition) before this step.")

# load weights into model instance (fresh)

m = PINN(base_feat_dim=len(feature_cols), feature_cols=feature_cols)

m.load_state_dict(torch.load(MODEL_PT, map_location="cpu"))

m.eval()

# run inference in small batches to avoid memory spikes

batch_size = 16

n = X.shape[0]

probs = []

for i in range(0, n, batch_size):

xb = X[i:i+batch_size]

with torch.no_grad():

hh = m.hh_net(xb)

dpr = m.dpr_net(torch.cat([xb, hh], dim=1))

logit = m.cls_net(torch.cat([xb, hh, dpr], dim=1))

p = torch.sigmoid(logit).squeeze().cpu().numpy()

probs.append(p)

# optional: print progress minimal


if (i // batch_size) % 5 == 0:

print(f"Processed rows {i}..{min(i+batch_size,n)-1}")

probs = np.concatenate([np.atleast_1d(p) for p in probs])

print("Predictions shape:", probs.shape)

# Save probabilities next to CSV for inspection

out_df = val_df.reset_index(drop=True).copy()

out_df["pred_prob"] = probs

out_csv = "plots/val_predictions.csv"

os.makedirs("plots", exist_ok=True)

out_df.to_csv(out_csv, index=False)

print("Saved predictions to", out_csv)

gc.collect()
Cell 6E:

# Improved Pillow plotting with sensible y-limits, ticks, labels and markers

from PIL import Image, ImageDraw, ImageFont

import pandas as pd, numpy as np, os

csv = "plots/val_predictions.csv"

if not os.path.exists(csv):

raise FileNotFoundError(csv + " not found. Run inference (Cell B3) first.")

df = pd.read_csv(csv)

os.makedirs("plots", exist_ok=True)

# build Teq

curve =
pd.read_csv("hydrate_curve_clean.csv").sort_values("Pressure").drop_duplicates("Pressure")

Px, Ty = curve["Pressure"].values, curve["Temperature"].values

def Teq_np(P):

P = np.asarray(P, dtype=float)

P = np.clip(P, Px.min(), Px.max())

return np.interp(P, Px, Ty)

df["Teq"] = Teq_np(df["P_bar"].values)

# small helper to draw ticks and labels

def _draw_ticks_and_labels(draw, left, top, right, bottom, xmin, xmax, ymin, ymax, font):

# x ticks (5)

ticks = 5
for i in range(ticks+1):

t = i / ticks

px = left + int(t*(right-left))

xval = xmin + t*(xmax-xmin)

draw.line([(px, bottom), (px, bottom+6)], fill=(80,80,80))

s = f"{xval:.0f}" if abs(xmax-xmin) > 100 else f"{xval:.2f}"

draw.text((px-18, bottom+8), s, fill=(0,0,0), font=font)

# y ticks (5)

for i in range(ticks+1):

t = i / ticks

py = top + int(t*(bottom-top))

yval = ymax - t*(ymax-ymin)

draw.line([(left-6, py), (left, py)], fill=(80,80,80))

s = f"{yval:.1f}" if abs(ymax-ymin) > 1 else f"{yval:.2f}"

draw.text((6, py-8), s, fill=(0,0,0), font=font)

# generic canvas plotter (series: list of arrays; labels: list, mask support for markers)

def pillow_plot(x, series, labels, outpath, title="", figsize=(1200,380)):

W, H = figsize

left, top, right, bottom = 140, 50, W-60, H-70

im = Image.new("RGB", (W, H), "white")

draw = ImageDraw.Draw(im)

try:

font = ImageFont.truetype("arial.ttf", 12)

except Exception:

font = ImageFont.load_default()
# compute global y limits across all numeric series

nums = []

for s in series:

arr = np.asarray(s if s is not None else [])

if arr.size > 0:

nums.append(arr)

if len(nums) == 0:

nums = [np.array([0.0,1.0])]

all_y = np.hstack(nums)

ymin, ymax = float(all_y.min()), float(all_y.max())

if np.isclose(ymin, ymax):

ymin -= 0.5; ymax += 0.5

# add small margin

yrange = ymax - ymin

ymin -= 0.08 * yrange

ymax += 0.08 * yrange

xmin, xmax = float(x.min()), float(x.max())

if np.isclose(xmin, xmax):

xmin -= 0.5; xmax += 0.5

# map helper

def map_xy(xv, yv):

# xv, yv as numpy arrays

xs = (xv - xmin) / (xmax - xmin + 1e-12)


ys = (yv - ymin) / (ymax - ymin + 1e-12)

left_px = left; right_px = right; top_px = top; bottom_px = bottom

plotW = right_px - left_px; plotH = bottom_px - top_px

pts = [(left_px + int(xi*plotW), top_px + int((1.0-yi)*plotH)) for xi, yi in zip(xs, ys)]

return pts

colors = [(31,119,180),(255,127,14),(214,39,40),(44,160,44)]

# draw each series

for i, s in enumerate(series):

sarr = np.asarray(s if s is not None else [])

if sarr.size == 0:

continue

pts = map_xy(x, sarr)

draw.line(pts, fill=colors[i % len(colors)], width=2)

# if any labels contain mask dict, draw red markers at top area

for lab in labels:

if isinstance(lab, dict) and lab.get("mask") is not None:

mask = np.asarray(lab["mask"], dtype=bool)

for xi, m in zip(x, mask):

if m:

# compute px

t = (xi - xmin) / (xmax - xmin + 1e-12)

px = left + int(t*(right-left))

py = top + 6

draw.ellipse([px-5, py-5, px+5, py+5], fill=(214,39,40))


# draw axes box

draw.rectangle([left, top, right, bottom], outline=(60,60,60), width=1)

# draw ticks and numeric labels

_draw_ticks_and_labels(draw, left, top, right, bottom, xmin, xmax, ymin, ymax, font)

# title

draw.text((left, 12), title, fill=(0,0,0), font=font)

# legend

lx = right - 220; ly = 14

for i, lab in enumerate(labels):

text = lab if isinstance(lab, str) else "y_cls markers"

draw.rectangle([lx, ly + i*18, lx+12, ly+12 + i*18], fill=colors[i % len(colors)])

draw.text((lx+18, ly + i*18), text, fill=(0,0,0), font=font)

im.save(outpath)

# Plot A: T_eq vs Measured T

x = df["time_s"].to_numpy()

seriesA = [df["Teq"].to_numpy(), df["T_K"].to_numpy()]

labelsA = ["T_eq(P)", "Measured T"]

pillow_plot(x, seriesA, labelsA, "plots/plot_T_vs_Teq.png", title="Measured T vs Hydrate


Equilibrium T (validation)")

# Plot B: Predicted prob with y_cls markers

seriesB = [df["pred_prob"].to_numpy()]
labelsB = ["Predicted prob"]

if "y_cls" in df.columns and df["y_cls"].notna().any():

mask = df["y_cls"].astype(float).to_numpy() == 1

# include mask as a legend entry to draw red dots

labelsB = [{"mask": mask}]

pillow_plot(x, seriesB, labelsB, "plots/plot_probability.png", title="Predicted plug probability


(validation)")

print("Saved improved plots to plots/plot_T_vs_Teq.png and plots/plot_probability.png")


Cell 7:

import pandas as pd, numpy as np, os

# load predictions

fn = "plots/val_predictions.csv"

if not os.path.exists(fn):

raise FileNotFoundError(f"{fn} not found. Run inference (Cell B3) first.")

df = pd.read_csv(fn)

print("Columns in val_predictions.csv:", list(df.columns))

# If Teq is missing, rebuild it from hydrate_curve_clean.csv

if "Teq" not in df.columns:

curve_fn = "hydrate_curve_clean.csv"

if not os.path.exists(curve_fn):

raise FileNotFoundError(f"{curve_fn} not found. Upload it or change name.")

curve = pd.read_csv(curve_fn).sort_values("Pressure").drop_duplicates("Pressure")

Px, Ty = curve["Pressure"].values, curve["Temperature"].values

def Teq_np(P):

P = np.asarray(P, dtype=float)

P = np.clip(P, Px.min(), Px.max())

return np.interp(P, Px, Ty)

df["Teq"] = Teq_np(df["P_bar"].values)

# optional: save augmented CSV back

df.to_csv(fn, index=False)
print("Added 'Teq' column and saved back to", fn)

else:

print("'Teq' already present in CSV")

# Now print the summary stats (safe access)

print("\nRows:", len(df))

print("pred_prob min, mean, max:", df["pred_prob"].min(), df["pred_prob"].mean(),


df["pred_prob"].max())

if "y_cls" in df.columns:

print("y_cls value counts:\n", df["y_cls"].value_counts(dropna=False))

# Print first rows with safe column selection (use columns that exist)

cols_to_show = [c for c in ["time_s","P_bar","T_K","Teq","DeltaT","pred_prob","y_cls"] if c in


df.columns]

print("\nSample rows:")

print(df[cols_to_show].head())

# Quick classification metrics (threshold 0.5) if y_cls exists

if "y_cls" in df.columns:

y_true = df["y_cls"].astype(int).to_numpy()

y_hat = (df["pred_prob"].to_numpy() >= 0.5).astype(int)

tp = ((y_true==1)&(y_hat==1)).sum()

fp = ((y_true==0)&(y_hat==1)).sum()

tn = ((y_true==0)&(y_hat==0)).sum()

fn = ((y_true==1)&(y_hat==0)).sum()

acc = (tp+tn)/len(y_true)
prec = tp/(tp+fp) if (tp+fp)>0 else 0.0

rec = tp/(tp+fn) if (tp+fn)>0 else 0.0

print(f"\nClassification (threshold 0.5) acc={acc:.3f} prec={prec:.3f} rec={rec:.3f}")

print(f"TP={tp} FP={fp} TN={tn} FN={fn}")

else:

print("\nNo y_cls column present; skipping classification metrics.")

You might also like