# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Load the raw dataset and print a structural summary of it (index range,
# column count, dtype counts, memory usage).
# NOTE(review): the CSV path below is a '[Link]' placeholder left behind by
# text extraction; the real file name cannot be recovered from this file and
# must be restored before the script can run.
df = pd.read_csv('[Link]')
df.info()
# Output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 38291 entries, 0 to 38290
# Columns: 428 entries, Time & Date to cpatToolfaceTar
# dtypes: float64(34), int64(390), object(4)
# memory usage: 125.0+ MB
# Collect the columns that carry little information: more than 49.9% of their
# values are missing, or every value equals zero.
cols = df.columns.tolist()
null_limit = 0.499 * len(df)  # loop-invariant, so computed once
drop_col = [
    col
    for col in cols
    if df[col].isnull().sum() > null_limit or (df[col] == 0).all()
]
# Bare expression: shows the number of flagged columns when run as a
# notebook cell; it has no effect when run as a plain script.
len(drop_col)
# Output: 376
# Remove the sparse / all-zero columns collected in drop_col.
df = df.drop(drop_col, axis=1)
# Keep only int64 and float64 columns; columns of any other dtype are
# discarded here.
df = df.select_dtypes(include=['int64', 'float64'])
# Drop a further hand-picked set of columns by name.
# NOTE(review): the reason these six were chosen is not recorded in this file.
df = df.drop(
    [
        'Prop Spec Vol',
        'Prp Spc Vol Ann',
        'N2 Temp Ann',
        'CO2 Temp Ann',
        'Trt Num',
        'Ann Pr @WH LOR',
    ],
    axis=1,
)
# df = df.drop('Time & Date', axis=1)
# Annotated correlation heatmap of every remaining column.
plt.figure(figsize=(50, 50))
corr = df.corr()  # computed once and reused for both the mask and the plot
# True on and above the diagonal; seaborn leaves masked (True) cells blank, so
# only the strictly-lower triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 18}, pad=12)
# Output: Text(0.5, 1.0, 'Correlation Heatmap')
# Snapshot the frame before pruning; df_2 keeps every column that is present
# at this point.
df_2 = df.copy()

# Greedy correlation pruning: while any pair of remaining columns has an
# absolute correlation above the limit, drop one column and re-check.
# NOTE(review): every column of df takes part here, including
# 'Treating Pr LOR', which is later used as the regression target. Columns can
# therefore be dropped for correlating with the target, and the target itself
# is not protected from being dropped. Confirm this is intended.
final_cols = df.columns.tolist()
corr_limit = 0.5

# Each pairwise correlation depends only on the two columns involved, so the
# matrix is computed once and sliced per iteration instead of recomputed.
abs_corr = df[final_cols].corr().abs()
# Zero the diagonal so self-correlation never counts as a "high" pair.
# DataFrame.mask is used instead of writing through `.values`, which is not
# guaranteed to be a writable view (it is read-only under Copy-on-Write).
abs_corr = abs_corr.mask(np.eye(len(abs_corr), dtype=bool), 0)

while True:
    corr_matrix = abs_corr.loc[final_cols, final_cols]
    max_corr = corr_matrix.to_numpy().max()
    if max_corr <= corr_limit:
        break

    # Take the first column (in column order) that has any partner above the
    # limit and drop the first such partner (in row order).
    drop_corr = None
    for col in corr_matrix.columns:
        high_corrs = corr_matrix[col][corr_matrix[col] > corr_limit]
        if not high_corrs.empty:
            drop_corr = high_corrs.index[0]
            break

    if drop_corr is None:
        # Only reachable when max_corr is NaN and nothing compares above the
        # limit; stop rather than loop forever.
        break

    print(f"Dropping '{drop_corr}' due to correlation > {corr_limit}")
    final_cols.remove(drop_corr)
# Output:
# Dropping 'Trt Time' due to correlation > 0.5
# Dropping 'Stg Time' due to correlation > 0.5
# Dropping 'Slurry Rate' due to correlation > 0.5
# Dropping 'CT Running Len' due to correlation > 0.5
# Dropping 'CT Run Speed' due to correlation > 0.5
# Dropping 'SCC Cycle' due to correlation > 0.5
# Dropping 'Max Allow CT Pr' due to correlation > 0.5
# Dropping 'Clean Density' due to correlation > 0.5
# Dropping 'Cln Density Ann' due to correlation > 0.5
# Dropping 'Slurry Density' due to correlation > 0.5
# Dropping 'Slurry Den Ann' due to correlation > 0.5
# Dropping 'Stage At Bldr' due to correlation > 0.5
# Dropping 'SF Calc Limit' due to correlation > 0.5
# Dropping 'SF Allow Limit' due to correlation > 0.5
# Dropping 'SFAllowInWell' due to correlation > 0.5
# Dropping 'Annulus Pr @WH' due to correlation > 0.5
# Dropping 'Max Allow WH Pr' due to correlation > 0.5
# Dropping 'N2 Pressure' due to correlation > 0.5
# Dropping 'N2 Pressure Ann' due to correlation > 0.5
# Dropping 'CO2 Pressure' due to correlation > 0.5
# Dropping 'CO2 Press Ann' due to correlation > 0.5
# Dropping 'Tubing Guide Pr' due to correlation > 0.5
# Dropping 'Pr @ Slry Den' due to correlation > 0.5
# Dropping 'Pr@Slry Den Ann' due to correlation > 0.5
# Dropping 'CO2 Temp' due to correlation > 0.5
# Dropping 'SF Reel Inlet' due to correlation > 0.5
# Dropping 'SF Tubing Guide' due to correlation > 0.5
# Dropping 'SF Abv Stripper' due to correlation > 0.5
# Dropping 'SF Blw Stripper' due to correlation > 0.5
# Dropping 'Max OD Growth' due to correlation > 0.5
# Dropping 'CT Depth' due to correlation > 0.5
# Dropping 'CT Pipe Weight' due to correlation > 0.5
# Dropping 'Warning Code' due to correlation > 0.5
# Bare expression kept from the notebook cell; it has no effect in a script.
len(final_cols)
# Restrict the working frame to the columns that survived correlation pruning.
df = df[final_cols]

# Annotated correlation heatmap of the pruned column set.
plt.figure(figsize=(12, 8))
corr = df.corr()  # computed once and reused for both the mask and the plot
# True on and above the diagonal; seaborn leaves masked (True) cells blank, so
# only the strictly-lower triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 18}, pad=12)
# Output: Text(0.5, 1.0, 'Correlation Heatmap')
# --- Dataset 1: correlation-pruned columns (df) ---------------------------
# 'Treating Pr LOR' is the regression target; every other column is a feature.
X = df.drop('Treating Pr LOR', axis=1)
y = df['Treating Pr LOR']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the scaler on the training rows only, then apply the same fitted
# transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- Dataset 2: unpruned snapshot (df_2) ----------------------------------
# Same target, same test_size and random_state as above.
X_2 = df_2.drop('Treating Pr LOR', axis=1)
y_2 = df_2['Treating Pr LOR']
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.2, random_state=42
)
# A separate scaler is used because this dataset has a different column set.
scaler2 = StandardScaler()
X_train_2 = scaler2.fit_transform(X_train_2)
X_test_2 = scaler2.transform(X_test_2)
# Fit a linear regression on the scaled training split of the pruned dataset
# and report its error on the held-out test split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
# Output:
# Mean Squared Error: 31261.686890693905
# R-squared: 0.13832227402498298
# Fit a second linear regression on the scaled training split of the unpruned
# dataset (the *_2 variables) and report its error on the held-out test split.
lr2 = LinearRegression()
lr2.fit(X_train_2, y_train_2)
y_pred2 = lr2.predict(X_test_2)
mse2 = mean_squared_error(y_test_2, y_pred2)
r2_2 = r2_score(y_test_2, y_pred2)
print(f"Mean Squared Error: {mse2}")
print(f"R-squared: {r2_2}")
# Output:
# Mean Squared Error: 29792.2029402794
# R-squared: 0.17882621717997915