Cell 1:
# --- Setup (local, no external module) ---
import os, math, numpy as np, pandas as pd, torch, torch.nn as nn, torch.nn.functional as F
# Put the CSV with the hydrate curve in the SAME FOLDER as this notebook:
CURVE_CSV = "hydrate_curve_clean.csv" # <-- upload this file next
DATA_CSV = "fair_pinn_synth_data.csv" # <-- your data file
# Inline Teq builder (replaces hydrate_teq.py)
def build_Teq(csv_path):
df = pd.read_csv(csv_path).sort_values("Pressure").drop_duplicates(subset="Pressure")
Px = df["Pressure"].values
Ty = df["Temperature"].values
def Teq(P):
P = np.asarray(P, dtype=float)
P = np.clip(P, Px.min(), Px.max())
return np.interp(P, Px, Ty)
return Teq
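# Optional (a hedged sketch, not part of the original pipeline): if the curve CSV
# is missing, write a fabricated placeholder so the next line does not fail.
# The values below are made up for smoke testing only -- replace with the real curve.
if not os.path.exists(CURVE_CSV):
    P_grid = np.linspace(1.0, 100.0, 50)          # pressure grid, bar (assumed units)
    T_grid = 273.15 + 4.0 * np.log1p(P_grid)      # fabricated monotonic curve
    pd.DataFrame({"Pressure": P_grid, "Temperature": T_grid}).to_csv(CURVE_CSV, index=False)
    print("Wrote placeholder curve to", CURVE_CSV)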
# Load T_eq(P)
Teq = build_Teq(CURVE_CSV)
print("Teq sample:", Teq([5.0, 10.0]))
Cell 2:
# --- Data Loading ---
# Expected columns in the CSV:
# time_s, P_bar, T_K, Sg, phase_frac, pipe_d_m, holdup_liq, y_hh, y_dpr, y_cls
df = pd.read_csv(DATA_CSV)
# Basic checks
required_base = ["time_s","P_bar","T_K","Sg","phase_frac","pipe_d_m","holdup_liq"]
for c in required_base:
if c not in df.columns:
raise ValueError(f"Missing required column: {c}")
# Optional targets
has_hh = "y_hh" in df.columns and df["y_hh"].notna().any()
has_dpr = "y_dpr" in df.columns and df["y_dpr"].notna().any()
has_cls = "y_cls" in df.columns and df["y_cls"].notna().any()
# Compute DeltaT = T - Teq(P)
Teq_vals = Teq(df["P_bar"].values)
df["DeltaT"] = df["T_K"].values - Teq_vals
# Features for base network (you can add/remove later)
feature_cols = ["time_s","P_bar","T_K","Sg","phase_frac","pipe_d_m","holdup_liq","DeltaT"]
X = df[feature_cols].astype(float).values
# Targets
Y_hh = df["y_hh"].astype(float).values if has_hh else None
Y_dpr = df["y_dpr"].astype(float).values if has_dpr else None
Y_cls = df["y_cls"].astype(float).values if has_cls else None
# Train/val split (simple)
idx = np.arange(len(df))
split = int(0.8*len(df))
train_idx, val_idx = idx[:split], idx[split:]
def to_tensor(x): return torch.tensor(x, dtype=torch.float32)
X_train, X_val = to_tensor(X[train_idx]), to_tensor(X[val_idx])
targets = {}
if has_hh: targets["hh"] = (to_tensor(Y_hh[train_idx]).unsqueeze(1),
to_tensor(Y_hh[val_idx]).unsqueeze(1))
if has_dpr: targets["dpr"] = (to_tensor(Y_dpr[train_idx]).unsqueeze(1),
to_tensor(Y_dpr[val_idx]).unsqueeze(1))
if has_cls: targets["cls"] = (to_tensor(Y_cls[train_idx]).unsqueeze(1),
to_tensor(Y_cls[val_idx]).unsqueeze(1))
# Physics context tensors (must align with feature order)
def get_context(X_tensor):
    # IMPORTANT: keep the graph intact
    X_tensor.requires_grad_(True)  # now any slice (time) stays connected to the graph
    cols = {name: i for i, name in enumerate(feature_cols)}
    def col(name):
        # keep-dim slice -> shape (N, 1); no clone/detach, and it keeps the
        # PDE residual at (N, 1) instead of broadcasting (N, 1) - (N,) to (N, N)
        i = cols[name]
        return X_tensor[:, i:i+1]
    ctx = {
        "time": col("time_s"),
        "P": col("P_bar"),
        "T": col("T_K"),
        "Sg": col("Sg"),
        "phase_frac": col("phase_frac"),
        "pipe_d": col("pipe_d_m"),
        "holdup_liq": col("holdup_liq"),
        "DeltaT": col("DeltaT"),        # driving force used by the kinetics term
        "time_idx": cols["time_s"],     # optional, if you want to slice inside the model
    }
    return ctx
print("Data loaded. has_hh:", has_hh, "has_dpr:", has_dpr, "has_cls:", has_cls)
Cell 3:
# ===== PINN Definition (replace whole cell with this) =====
import torch
import torch.nn as nn
import torch.nn.functional as F
def mlp(in_dim, out_dim, hidden=3, width=64, act=nn.ReLU):
layers = [nn.Linear(in_dim, width), act()]
for _ in range(hidden-1):
layers += [nn.Linear(width, width), act()]
layers += [nn.Linear(width, out_dim)]
return nn.Sequential(*layers)
class Kinetics(nn.Module):
def __init__(self, k1_init=1.0, k2_init=-1.0e4):
super().__init__()
self.k1 = nn.Parameter(torch.tensor(k1_init, dtype=torch.float32))
self.k2 = nn.Parameter(torch.tensor(k2_init, dtype=torch.float32))
def forward(self, T, P, Sg, phase_frac, pipe_d, holdup_liq, DeltaT=None):
# Simple illustrative kinetics; you can swap with your correlation later.
if DeltaT is None:
DeltaT = T*0.0
AS = torch.clamp(phase_frac * 10.0 / (pipe_d + 1e-6), min=1e-6)
k = AS * self.k1 * torch.exp(self.k2 / torch.clamp(T, min=1.0)) \
* torch.clamp(DeltaT, -100.0, 100.0)
return k
class PINN(nn.Module):
def __init__(self, base_feat_dim, feature_cols):
"""
feature_cols: list used to build x_base (must include 'time_s')
"""
super().__init__()
self.feature_cols = feature_cols
assert "time_s" in self.feature_cols, "feature_cols must contain 'time_s'"
self.time_idx = self.feature_cols.index("time_s")
self.hh_net = mlp(base_feat_dim, 1, hidden=4, width=128)
self.dpr_net = mlp(base_feat_dim + 1, 1, hidden=4, width=128)
self.cls_net = mlp(base_feat_dim + 2, 1, hidden=3, width=96)
self.kin = Kinetics()
def forward(self, x_base, context):
# Ensure autograd tracks inputs
x_base.requires_grad_(True)
# --- forward passes ---
hh = self.hh_net(x_base)
dpr = self.dpr_net(torch.cat([x_base, hh], dim=1))
logit = self.cls_net(torch.cat([x_base, hh, dpr], dim=1))
prob_plug = torch.sigmoid(logit)
# --- dhh/dt via gradient wrt all inputs, then select time column ---
dhh_dX = torch.autograd.grad(
outputs=hh,
inputs=x_base,
grad_outputs=torch.ones_like(hh),
create_graph=True,
retain_graph=True,
allow_unused=False
)[0] # (N, base_feat_dim)
dhh_dt = dhh_dX[:, self.time_idx:self.time_idx+1] # (N,1)
        # --- kinetics residual ---
        k = self.kin(
            context["T"], context["P"], context["Sg"],
            context["phase_frac"], context["pipe_d"], context["holdup_liq"],
            DeltaT=context.get("DeltaT"),  # without the driving force, k collapses to zero
        )
        pde_res = dhh_dt - k
        return hh, dpr, prob_plug, pde_res
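A minimal forward-pass smoke test, assuming Cell 2 has already defined `feature_cols` and `get_context`; it only confirms that all four outputs come back with shape (N, 1):
Cell 3b (optional):
# Smoke test: push a random batch through the PINN and check output shapes
_Xdemo = torch.randn(8, len(feature_cols))
_m = PINN(base_feat_dim=len(feature_cols), feature_cols=feature_cols)
_hh, _dpr, _p, _res = _m(_Xdemo, get_context(_Xdemo))
print(_hh.shape, _dpr.shape, _p.shape, _res.shape)  # expect torch.Size([8, 1]) for each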
Cell 4:
# --- Training ---
device = "cpu"
model = PINN(base_feat_dim=len(feature_cols), feature_cols=feature_cols).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
mse = nn.MSELoss()
bce = nn.BCELoss()
# --- Sanitize y_cls targets ---
if "cls" in targets:
print("Sanitizing y_cls targets...")
print("Unique values before:", torch.unique(targets["cls"][0]))
# force binary 0/1
t_train, t_val = targets["cls"]
t_train = (t_train.float().clamp(0, 1) > 0.5).float()
t_val = (t_val.float().clamp(0, 1) > 0.5).float()
targets["cls"] = (t_train, t_val)
print("Unique values after:", torch.unique(targets["cls"][0]))
def batchify(X_tensor, batch_size=128):
    # yields (start, end, batch) so target rows can be matched to feature rows
    n = X_tensor.shape[0]
    for i in range(0, n, batch_size):
        yield i, min(i + batch_size, n), X_tensor[i:i+batch_size]
def compute_loss(model, Xb, targets_dict, ctx):
hh, dpr, pplug, pde = model(Xb, ctx)
loss = 0.0
logs = {}
if "hh" in targets_dict:
yh = targets_dict["hh"]
loss_hh = mse(hh, yh)
loss = loss + loss_hh
logs["hh"] = float(loss_hh.item())
if "dpr" in targets_dict:
yd = targets_dict["dpr"]
loss_dpr = mse(dpr, yd)
loss = loss + loss_dpr
logs["dpr"] = float(loss_dpr.item())
if "cls" in targets_dict:
yc = targets_dict["cls"]
# 🔑 Ensure yc is float in [0,1]
yc = yc.float().clamp(0,1)
loss_cls = bce(pplug, yc)
loss = loss + loss_cls
logs["cls"] = float(loss_cls.item())
# PDE residual
loss_pde = mse(pde, torch.zeros_like(pde))
loss = loss + 1.0 * loss_pde
logs["pde"] = float(loss_pde.item())
logs["total"] = float(loss.item())
return loss, logs
EPOCHS = 200
for ep in range(1, EPOCHS+1):
model.train()
tot = {"total":0.0,"hh":0.0,"dpr":0.0,"cls":0.0,"pde":0.0}
cnt = 0
    for start, end, Xb in batchify(X_train, batch_size=256):
        opt.zero_grad()
        ctx = get_context(Xb)
        # slice targets with the SAME rows as the batch; taking the first
        # len(Xb) rows every time would feed wrong labels after the first batch
        tdict = {}
        if "hh" in targets: tdict["hh"] = targets["hh"][0][start:end]
        if "dpr" in targets: tdict["dpr"] = targets["dpr"][0][start:end]
        if "cls" in targets: tdict["cls"] = targets["cls"][0][start:end]
loss, logs = compute_loss(model, Xb, tdict, ctx)
loss.backward()
opt.step()
for k in tot:
tot[k] += logs.get(k, 0.0)
cnt += 1
    if ep % 20 == 0 or ep == 1:
        print(f"Epoch {ep:4d} | total={tot['total']/cnt:.4f} hh={tot['hh']/cnt:.4f} "
              f"dpr={tot['dpr']/cnt:.4f} cls={tot['cls']/cnt:.4f} pde={tot['pde']/cnt:.4f}")
print("Training done.")
torch.save(model.state_dict(), "fair_pinn_model.pt")
print("Model saved as fair_pinn_model.pt")
Cell 5:
# --- Inference on validation split (plug probability) ---
model.eval()
# Ensure autograd is enabled for PDE residual (no torch.no_grad() here)
X_val = X_val.clone().detach().requires_grad_(True) # keep shapes the same
ctx_val = get_context(X_val)
hh_val, dpr_val, pplug_val, pde_val = model(X_val, ctx_val)
print("Predicted plug probabilities (first 10):",
pplug_val.squeeze().detach().cpu().numpy()[:10])
print("Mean PDE residual (val):", float(pde_val.abs().mean().detach().cpu().numpy()))
Cell 6A:
# 1) Restart kernel manually now (Kernel → Restart) then run this cell
import os, torch, gc
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
torch.set_num_threads(1)
gc.collect()
print("Threads limited. torch threads:", torch.get_num_threads())
Cell 6B:
import torch, os, gc
MODEL_PT = "fair_pinn_model.pt"
print("Model file exists:", os.path.exists(MODEL_PT), "size(bytes):", os.path.getsize(MODEL_PT)
if os.path.exists(MODEL_PT) else "n/a")
state = torch.load(MODEL_PT, map_location="cpu")
print("Loaded state keys (sample):", list(state.keys())[:10], " total keys:", len(state.keys()))
del state
gc.collect()
print("Checkpoint loaded & freed.")
Cell 6C:
# 3) batched prediction and save probabilities (NO plotting)
import pandas as pd, numpy as np, torch, os, gc
from math import ceil
CURVE_CSV = "hydrate_curve_clean.csv"
DATA_CSV = "fair_pinn_synth_data.csv"  # must match the data file used for training in Cell 1
MODEL_PT = "fair_pinn_model.pt"
# build Teq (safe, numpy)
curve = pd.read_csv(CURVE_CSV).sort_values("Pressure").drop_duplicates("Pressure")
Px, Ty = curve["Pressure"].values, curve["Temperature"].values
def Teq_np(P):
P = np.asarray(P, dtype=float)
P = np.clip(P, Px.min(), Px.max())
return np.interp(P, Px, Ty)
# load val slice
df = pd.read_csv(DATA_CSV)
split = int(0.8 * len(df))
val_df = df.iloc[split:].copy()
val_df["DeltaT"] = val_df["T_K"].values - Teq_np(val_df["P_bar"].values)
feature_cols = ["time_s","P_bar","T_K","Sg","phase_frac","pipe_d_m","holdup_liq","DeltaT"]
# create torch dataset
X = torch.tensor(val_df[feature_cols].astype(float).to_numpy(), dtype=torch.float32)
# make sure PINN class is defined (Cell 3 must have run)
try:
    PINN  # the class itself must be defined for load_state_dict below
except NameError:
    raise RuntimeError("PINN class not in memory. Please run Cell 3 (PINN Definition) before this step.")
# load weights into model instance (fresh)
m = PINN(base_feat_dim=len(feature_cols), feature_cols=feature_cols)
m.load_state_dict(torch.load(MODEL_PT, map_location="cpu"))
m.eval()
# run inference in small batches to avoid memory spikes
batch_size = 16
n = X.shape[0]
probs = []
for i in range(0, n, batch_size):
xb = X[i:i+batch_size]
with torch.no_grad():
hh = m.hh_net(xb)
dpr = m.dpr_net(torch.cat([xb, hh], dim=1))
logit = m.cls_net(torch.cat([xb, hh, dpr], dim=1))
p = torch.sigmoid(logit).squeeze().cpu().numpy()
probs.append(p)
# optional: print progress minimal
if (i // batch_size) % 5 == 0:
print(f"Processed rows {i}..{min(i+batch_size,n)-1}")
probs = np.concatenate([np.atleast_1d(p) for p in probs])
print("Predictions shape:", probs.shape)
# Save probabilities next to CSV for inspection
out_df = val_df.reset_index(drop=True).copy()
out_df["pred_prob"] = probs
out_csv = "plots/val_predictions.csv"
os.makedirs("plots", exist_ok=True)
out_df.to_csv(out_csv, index=False)
print("Saved predictions to", out_csv)
gc.collect()
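The batched loop above calls the three sub-networks directly under `torch.no_grad()`, which deliberately skips the PDE branch (the residual needs autograd for dhh/dt). If the residual is wanted at inference time as well, a sketch of the full-model path, assuming `get_context` from Cell 2 is still in memory:
Cell 6C-2 (optional):
# Full forward pass with autograd enabled (heavier than the loop above)
res_means = []
for i in range(0, n, batch_size):
    xb = X[i:i+batch_size].clone().requires_grad_(True)
    _, _, _, pde_b = m(xb, get_context(xb))
    res_means.append(float(pde_b.abs().mean()))
print("Mean |PDE residual| for first batches:", res_means[:5])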
Cell 6E:
# Improved Pillow plotting with sensible y-limits, ticks, labels and markers
from PIL import Image, ImageDraw, ImageFont
import pandas as pd, numpy as np, os
csv = "plots/val_predictions.csv"
if not os.path.exists(csv):
    raise FileNotFoundError(csv + " not found. Run inference (Cell 6C) first.")
df = pd.read_csv(csv)
os.makedirs("plots", exist_ok=True)
# build Teq
curve = pd.read_csv("hydrate_curve_clean.csv").sort_values("Pressure").drop_duplicates("Pressure")
Px, Ty = curve["Pressure"].values, curve["Temperature"].values
def Teq_np(P):
P = np.asarray(P, dtype=float)
P = np.clip(P, Px.min(), Px.max())
return np.interp(P, Px, Ty)
df["Teq"] = Teq_np(df["P_bar"].values)
# small helper to draw ticks and labels
def _draw_ticks_and_labels(draw, left, top, right, bottom, xmin, xmax, ymin, ymax, font):
# x ticks (5)
ticks = 5
for i in range(ticks+1):
t = i / ticks
px = left + int(t*(right-left))
xval = xmin + t*(xmax-xmin)
draw.line([(px, bottom), (px, bottom+6)], fill=(80,80,80))
s = f"{xval:.0f}" if abs(xmax-xmin) > 100 else f"{xval:.2f}"
draw.text((px-18, bottom+8), s, fill=(0,0,0), font=font)
# y ticks (5)
for i in range(ticks+1):
t = i / ticks
py = top + int(t*(bottom-top))
yval = ymax - t*(ymax-ymin)
draw.line([(left-6, py), (left, py)], fill=(80,80,80))
s = f"{yval:.1f}" if abs(ymax-ymin) > 1 else f"{yval:.2f}"
draw.text((6, py-8), s, fill=(0,0,0), font=font)
# generic canvas plotter (series: list of arrays; labels: list, mask support for markers)
def pillow_plot(x, series, labels, outpath, title="", figsize=(1200,380)):
W, H = figsize
left, top, right, bottom = 140, 50, W-60, H-70
im = Image.new("RGB", (W, H), "white")
draw = ImageDraw.Draw(im)
try:
font = ImageFont.truetype("arial.ttf", 12)
except Exception:
font = ImageFont.load_default()
# compute global y limits across all numeric series
nums = []
for s in series:
arr = np.asarray(s if s is not None else [])
if arr.size > 0:
nums.append(arr)
if len(nums) == 0:
nums = [np.array([0.0,1.0])]
all_y = np.hstack(nums)
ymin, ymax = float(all_y.min()), float(all_y.max())
if np.isclose(ymin, ymax):
ymin -= 0.5; ymax += 0.5
# add small margin
yrange = ymax - ymin
ymin -= 0.08 * yrange
ymax += 0.08 * yrange
xmin, xmax = float(x.min()), float(x.max())
if np.isclose(xmin, xmax):
xmin -= 0.5; xmax += 0.5
# map helper
def map_xy(xv, yv):
# xv, yv as numpy arrays
xs = (xv - xmin) / (xmax - xmin + 1e-12)
ys = (yv - ymin) / (ymax - ymin + 1e-12)
left_px = left; right_px = right; top_px = top; bottom_px = bottom
plotW = right_px - left_px; plotH = bottom_px - top_px
pts = [(left_px + int(xi*plotW), top_px + int((1.0-yi)*plotH)) for xi, yi in zip(xs, ys)]
return pts
colors = [(31,119,180),(255,127,14),(214,39,40),(44,160,44)]
# draw each series
for i, s in enumerate(series):
sarr = np.asarray(s if s is not None else [])
if sarr.size == 0:
continue
pts = map_xy(x, sarr)
draw.line(pts, fill=colors[i % len(colors)], width=2)
# if any labels contain mask dict, draw red markers at top area
for lab in labels:
if isinstance(lab, dict) and lab.get("mask") is not None:
mask = np.asarray(lab["mask"], dtype=bool)
for xi, m in zip(x, mask):
if m:
# compute px
t = (xi - xmin) / (xmax - xmin + 1e-12)
px = left + int(t*(right-left))
py = top + 6
draw.ellipse([px-5, py-5, px+5, py+5], fill=(214,39,40))
# draw axes box
draw.rectangle([left, top, right, bottom], outline=(60,60,60), width=1)
# draw ticks and numeric labels
_draw_ticks_and_labels(draw, left, top, right, bottom, xmin, xmax, ymin, ymax, font)
# title
draw.text((left, 12), title, fill=(0,0,0), font=font)
    # legend (mask entries get the same red as their markers)
    lx = right - 220; ly = 14
    for i, lab in enumerate(labels):
        is_mask = isinstance(lab, dict)
        text = lab if not is_mask else "y_cls markers"
        swatch = (214, 39, 40) if is_mask else colors[i % len(colors)]
        draw.rectangle([lx, ly + i*18, lx+12, ly+12 + i*18], fill=swatch)
        draw.text((lx+18, ly + i*18), text, fill=(0,0,0), font=font)
im.save(outpath)
# Plot A: T_eq vs Measured T
x = df["time_s"].to_numpy()
seriesA = [df["Teq"].to_numpy(), df["T_K"].to_numpy()]
labelsA = ["T_eq(P)", "Measured T"]
pillow_plot(x, seriesA, labelsA, "plots/plot_T_vs_Teq.png",
            title="Measured T vs Hydrate Equilibrium T (validation)")
# Plot B: Predicted prob with y_cls markers
seriesB = [df["pred_prob"].to_numpy()]
labelsB = ["Predicted prob"]
if "y_cls" in df.columns and df["y_cls"].notna().any():
    mask = df["y_cls"].astype(float).to_numpy() == 1
    # append the marker mask as a second legend entry (keep the line's own label)
    labelsB = ["Predicted prob", {"mask": mask}]
pillow_plot(x, seriesB, labelsB, "plots/plot_probability.png",
            title="Predicted plug probability (validation)")
print("Saved improved plots to plots/plot_T_vs_Teq.png and plots/plot_probability.png")
Cell 7:
import pandas as pd, numpy as np, os
# load predictions
fn = "plots/val_predictions.csv"
if not os.path.exists(fn):
    raise FileNotFoundError(f"{fn} not found. Run inference (Cell 6C) first.")
df = pd.read_csv(fn)
print("Columns in val_predictions.csv:", list(df.columns))
# If Teq is missing, rebuild it from hydrate_curve_clean.csv
if "Teq" not in df.columns:
curve_fn = "hydrate_curve_clean.csv"
if not os.path.exists(curve_fn):
raise FileNotFoundError(f"{curve_fn} not found. Upload it or change name.")
curve = pd.read_csv(curve_fn).sort_values("Pressure").drop_duplicates("Pressure")
Px, Ty = curve["Pressure"].values, curve["Temperature"].values
def Teq_np(P):
P = np.asarray(P, dtype=float)
P = np.clip(P, Px.min(), Px.max())
return np.interp(P, Px, Ty)
df["Teq"] = Teq_np(df["P_bar"].values)
# optional: save augmented CSV back
df.to_csv(fn, index=False)
print("Added 'Teq' column and saved back to", fn)
else:
print("'Teq' already present in CSV")
# Now print the summary stats (safe access)
print("\nRows:", len(df))
print("pred_prob min, mean, max:", df["pred_prob"].min(), df["pred_prob"].mean(),
df["pred_prob"].max())
if "y_cls" in df.columns:
print("y_cls value counts:\n", df["y_cls"].value_counts(dropna=False))
# Print first rows with safe column selection (use columns that exist)
cols_to_show = [c for c in ["time_s","P_bar","T_K","Teq","DeltaT","pred_prob","y_cls"] if c in
df.columns]
print("\nSample rows:")
print(df[cols_to_show].head())
# Quick classification metrics (threshold 0.5) if y_cls exists
if "y_cls" in df.columns:
y_true = df["y_cls"].astype(int).to_numpy()
y_hat = (df["pred_prob"].to_numpy() >= 0.5).astype(int)
tp = ((y_true==1)&(y_hat==1)).sum()
fp = ((y_true==0)&(y_hat==1)).sum()
tn = ((y_true==0)&(y_hat==0)).sum()
fn = ((y_true==1)&(y_hat==0)).sum()
acc = (tp+tn)/len(y_true)
prec = tp/(tp+fp) if (tp+fp)>0 else 0.0
rec = tp/(tp+fn) if (tp+fn)>0 else 0.0
print(f"\nClassification (threshold 0.5) acc={acc:.3f} prec={prec:.3f} rec={rec:.3f}")
print(f"TP={tp} FP={fp} TN={tn} FN={fn}")
else:
print("\nNo y_cls column present; skipping classification metrics.")