Homework
# 1. Load the 'tips' example dataset through the Seaborn package
#    and preview its first rows.
import seaborn as sns
df = sns.load_dataset("tips")
df.head(n=5)
{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\
n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}
LINEAR MODEL
predicted_tip = f(total_bill)
Model 1 when m = 0.1 and c = -0.5
# Model 1: straight line predicted_tip = m * total_bill + c.
m, c = 0.1, -0.5
df['predicted_tip'] = df['total_bill'].mul(m).add(c)
# Show the frame with the new prediction column.
df
{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\
n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"predicted_tip\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0.8902411954856857,\n \"min\": -
0.193,\n \"max\": 4.581,\n \"num_unique_values\": 229,\n
\"samples\": [\n 1.7120000000000002,\n
1.5230000000000001\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}
# Evaluation metrics: mean squared error between the observed tip and
# the current 'predicted_tip' column, plus its square root (RMSE).
import numpy as np
mse = np.mean(np.square(df['tip'] - df['predicted_tip']))
rmse = np.sqrt(mse)
print(f'MSE : {mse:.2f}\nRMSE : {rmse:.2f}')
MSE : 3.35
RMSE : 1.83
Model 2 when m = 0.05 and c = 0.1
# Model 2: slope m = 0.05, intercept c = 0.1.
m = 0.05
c = 0.1
# The linear model predicts the tip from the bill
# (predicted_tip = f(total_bill)), so the feature must be 'total_bill'.
# Using 'tip' here would feed the target back in as its own predictor.
# NOTE(review): any output recorded with the earlier tip-based formula
# is stale and should be regenerated by re-running the cells.
df['predicted_tip'] = df['total_bill'] * m + c
df
{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\
n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"predicted_tip\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0.06918190945005911,\n \"min\":
0.15000000000000002,\n \"max\": 0.6,\n
\"num_unique_values\": 123,\n \"samples\": [\n
0.2675,\n 0.17500000000000002\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"df"}
# Evaluation metrics: MSE is the mean of the squared residuals
# (tip - predicted_tip); RMSE is its square root.
mse = df['tip'].sub(df['predicted_tip']).pow(2).mean()
rmse = np.sqrt(mse)
print(f'MSE : {mse:.2f}\nRMSE : {rmse:.2f}')
MSE : 9.27
RMSE : 3.05
LOGISTIC REGRESSION
time = f(tip)
# Start from a freshly loaded frame so columns added earlier are gone.
df = sns.load_dataset("tips")
# Slope and intercept of the linear score that is passed to the sigmoid.
m, c = 0.07, -0.8
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))``.

    ``z`` may be a scalar or anything ``np.exp`` accepts (for example a
    NumPy array or a pandas Series); the computation is element-wise and
    every result lies in the open interval (0, 1).
    """
    return 1 / (1 + np.exp(-z))
# Probability of the positive class (Dinner) from the linear score m * tip + c.
df['prob_dinner'] = sigmoid(df['tip'] * m + c)
# The sigmoid output lies in (0, 1), so the decision threshold is 0.5;
# comparing against 5 could never be true and labelled every row 0.
# NOTE(review): with m = 0.07 and c = -0.8 the largest recorded probability
# is about 0.475, so every row is still class 0 at 0.5 -- the parameters
# themselves need tuning to get a useful classifier.
df['predicted_time'] = (df['prob_dinner'] >= 0.5).astype(int)
df
{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\
n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"prob_dinner\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0.022580799768589357,\n \"min\":
0.3251947274176865,\n \"max\": 0.47502081252106,\n
\"num_unique_values\": 123,\n \"samples\": [\n
0.3622758233996694,\n 0.3329217229043748\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"predicted_time\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 0,\n
\"num_unique_values\": 1,\n \"samples\": [\n 0\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n }\n ]\n}","type":"dataframe","variable_name":"df"}
# Encode the ground truth: 1 when the meal is Dinner, 0 otherwise.
df['actual_time'] = df['time'].eq('Dinner').astype(int)

# Confusion-matrix counts, treating Dinner (1) as the positive class.
TP = (df['actual_time'].eq(1) & df['predicted_time'].eq(1)).sum()
FP = (df['actual_time'].eq(0) & df['predicted_time'].eq(1)).sum()
FN = (df['actual_time'].eq(1) & df['predicted_time'].eq(0)).sum()

# Fall back to 0 instead of dividing by zero when a denominator is empty.
precision = 0 if (TP + FP) == 0 else TP / (TP + FP)
recall = 0 if (TP + FN) == 0 else TP / (TP + FN)
print(f'Precision : {precision:.4f}')
print(f'Recall : {recall:.4f}')
Precision : 0.0000
Recall : 0.0000
DECISION TREE
1. Basic (Dinner if tip > 1.2)
# Single-split decision rule: predict Dinner (1) when tip > threshold.
df = sns.load_dataset('tips')
# Ground truth: 1 when the meal is Dinner, 0 otherwise.
df['actual_time'] = (df['time'] == 'Dinner').astype(int)
threshold = 1.2
df['predicted_time_DT'] = (df['tip'] > threshold).astype(int)
# Combine boolean masks with the logical '&' operator rather than
# arithmetic '*', matching the logistic-regression evaluation.
TP = ((df['actual_time'] == 1) & (df['predicted_time_DT'] == 1)).sum()
FP = ((df['actual_time'] == 0) & (df['predicted_time_DT'] == 1)).sum()
FN = ((df['actual_time'] == 1) & (df['predicted_time_DT'] == 0)).sum()
# Fall back to 0 when a denominator is zero to avoid a division error.
precision = TP / (TP + FP) if (TP + FP) else 0
recall = TP / (TP + FN) if (TP + FN) else 0
print(f'Precision : {precision:.4f}')
print(f'Recall : {recall:.4f}')
Precision : 0.7131
Recall : 0.9602
# Display the frame, including the 'actual_time' and 'predicted_time_DT' columns.
df
{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\
n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"actual_time\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0,\n \"min\": 0,\n
\"max\": 1,\n \"num_unique_values\": 2,\n \"samples\":
[\n 0,\n 1\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"predicted_time_DT\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n
\"max\": 1,\n \"num_unique_values\": 2,\n \"samples\":
[\n 1,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}
# Two-level decision rule: split on tip first, then on total_bill.
df = sns.load_dataset('tips')
# Ground truth: 1 when the meal is Dinner, 0 otherwise.
df['actual_time'] = (df['time'] == 'Dinner').astype(int)
# Each mask is one leaf that predicts Dinner:
#   tip < 1.5         and total_bill < 10
#   1.5 <= tip < 4.5  and total_bill < 12
#   tip >= 4.5        and total_bill < 19
mask1 = (df['tip'] < 1.5) & (df['total_bill'] < 10)
mask2 = (df['tip'] >= 1.5) & (df['tip'] < 4.5) & (df['total_bill'] < 12)
# 'tip >= 4.5' already implies 'tip >= 1.5', so the extra test is dropped.
mask3 = (df['tip'] >= 4.5) & (df['total_bill'] < 19)
df['predicted_time_DT2'] = (mask1 | mask2 | mask3).astype(int)
# Combine boolean masks with logical '&' rather than arithmetic '*'.
TP = ((df['actual_time'] == 1) & (df['predicted_time_DT2'] == 1)).sum()
FP = ((df['actual_time'] == 0) & (df['predicted_time_DT2'] == 1)).sum()
FN = ((df['actual_time'] == 1) & (df['predicted_time_DT2'] == 0)).sum()
# Fall back to 0 when a denominator is zero to avoid a division error.
precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
print(f'Precision : {precision:.4f}')
print(f'recall : {recall:.4f}')
Precision : 0.5676
recall : 0.1193
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Fetch the 'tips' example dataset through seaborn's loader
# (needs network access unless a cached copy is already available).
df = sns.load_dataset("tips")
# Ground-truth label: 1 = Dinner, 0 = Lunch.
df["actual_time"] = df["time"].eq("Dinner").astype(int)
# Decision rule: a row is predicted Dinner when it lands in any of the
# three leaves below (tip band combined with a total_bill ceiling).
mask1 = df["tip"].lt(1.5) & df["total_bill"].lt(10)
mask2 = df["tip"].ge(1.5) & df["tip"].lt(4.5) & df["total_bill"].lt(12)
mask3 = df["tip"].ge(4.5) & df["total_bill"].lt(19)
df["predicted_time_DT2"] = (mask1 | mask2 | mask3).astype(int)
# Side-by-side scatter plots of tip against total bill:
#   left panel  - points coloured by the actual label
#   right panel - points coloured by the DT2 prediction
plt.figure(figsize=(12, 5))
panels = [
    ("actual_time", "Actual Time (Dinner=1, Lunch=0)"),
    ("predicted_time_DT2", "Predicted Time (DT2)"),
]
for position, (label_column, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    # The same column drives both colour and marker style, so the two
    # classes stay distinguishable without relying on colour alone.
    sns.scatterplot(
        data=df,
        x="total_bill",
        y="tip",
        hue=label_column,
        palette={0: "red", 1: "green"},
        style=label_column,
    )
    plt.title(panel_title)
    plt.xlabel("Total Bill")
    plt.ylabel("Tip")
plt.tight_layout()
plt.show()