0% found this document useful (0 votes)
25 views9 pages

Python 3

The document outlines the use of the Seaborn package to load and analyze the 'tips' dataset, including linear and logistic regression models to predict tips and dining times based on various features. It provides evaluation metrics such as Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) for different models. The document also details the structure of the dataset, including columns like total_bill, tip, sex, smoker, day, time, size, and predicted values.

Uploaded by

prakashas404
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
25 views9 pages

Python 3

The document outlines the use of the Seaborn package to load and analyze the 'tips' dataset, including linear and logistic regression models to predict tips and dining times based on various features. It provides evaluation metrics such as Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) for different models. The document also details the structure of the dataset, including columns like total_bill, tip, sex, smoker, day, time, size, and predicted values.

Uploaded by

prakashas404
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

Homework

#1. Use Seaborn package to load 'tips' dataset


import seaborn as sns

# Fetch the 'tips' example dataset through seaborn's dataset loader.
df = sns.load_dataset('tips')

# Notebook display of the first rows of the frame.
df.head()

{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\


n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}

LINEAR MODEL
predict_tip = f(total_bill)

Model 1 when m = 0.1 and c = -0.5

# Model 1: fixed slope (m) and intercept (c) of the line
# predicted_tip = m * total_bill + c.
m, c = 0.1, -0.5

# Scale every bill by the slope, then shift by the intercept.
df['predicted_tip'] = df['total_bill'].mul(m).add(c)

df

{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\


n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"predicted_tip\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0.8902411954856857,\n \"min\": -
0.193,\n \"max\": 4.581,\n \"num_unique_values\": 229,\n
\"samples\": [\n 1.7120000000000002,\n
1.5230000000000001\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}

# Evaluation metrics

import numpy as np

# Residual between the observed tip and the model's prediction, per row.
residuals = df['tip'] - df['predicted_tip']

# Mean squared error, and its square root in the same units as the tip.
mse = residuals.pow(2).mean()
rmse = np.sqrt(mse)

print(f'MSE : {mse:.2f}\nRMSE : {rmse:.2f}')

MSE : 3.35
RMSE : 1.83

Model 2 when m = 0.05 and c = 0.1

# Model 2: same linear form as Model 1, predicted_tip = m * total_bill + c,
# with a different slope and intercept.
m = 0.05
c = 0.1

# The predictor must be total_bill (predict_tip = f(total_bill)). The original
# cell fed df['tip'] into the line, i.e. used the target as its own feature.
# NOTE: the recorded outputs shown after this cell (including MSE 9.27 /
# RMSE 3.05) were produced by that earlier version; re-run to refresh them.
df['predicted_tip'] = df['total_bill'] * m + c

df

{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\


n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"predicted_tip\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0.06918190945005911,\n \"min\":
0.15000000000000002,\n \"max\": 0.6,\n
\"num_unique_values\": 123,\n \"samples\": [\n
0.2675,\n 0.17500000000000002\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"df"}

# Evaluation metrics

# Per-row error of the current predicted_tip column against the real tip.
errors = df['tip'].sub(df['predicted_tip'])

# Average of the squared errors, then its root.
mse = (errors * errors).mean()
rmse = np.sqrt(mse)

print(f'MSE : {mse:.2f}\nRMSE : {rmse:.2f}')

MSE : 9.27
RMSE : 3.05

LOGISTIC REGRESSION

time = f(tip)

# Reload the dataset so df no longer carries the columns added by earlier cells.
df = sns.load_dataset('tips')

# Fixed slope and intercept of the linear score (tip * m + c) that is passed
# through the sigmoid further down.
m = 0.07
c = -0.8

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of *z*.

    The computation goes through ``np.exp``, so *z* may be a scalar, a NumPy
    array or a pandas Series; array-like inputs are transformed element-wise.
    """
    return 1 / (1 + np.exp(-z))

# Probability of Dinner from the logistic model on the tip amount.
df['prob_dinner'] = sigmoid(df['tip'] * m + c)

# Classify as Dinner (1) once the probability reaches 0.5. The original
# cut-off of 5 can never be met by a sigmoid output (it is at most 1), so it
# forced every prediction to 0 regardless of m and c. With m = 0.07 and
# c = -0.8 the recorded probabilities peak near 0.475, so the recorded output
# (all zeros) is unchanged by this fix.
df['predicted_time'] = (df['prob_dinner'] >= 0.5).astype(int)

df

{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\


n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"prob_dinner\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0.022580799768589357,\n \"min\":
0.3251947274176865,\n \"max\": 0.47502081252106,\n
\"num_unique_values\": 123,\n \"samples\": [\n
0.3622758233996694,\n 0.3329217229043748\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"predicted_time\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 0,\n
\"num_unique_values\": 1,\n \"samples\": [\n 0\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n }\n ]\n}","type":"dataframe","variable_name":"df"}

# Ground truth label: 1 for Dinner, 0 for anything else.
df['actual_time'] = df['time'].eq('Dinner').astype(int)

# Boolean views of "really Dinner" and "predicted Dinner"; both columns only
# hold 0/1, so the negation of each mask is the "== 0" case.
is_dinner = df['actual_time'] == 1
said_dinner = df['predicted_time'] == 1

# Confusion-matrix counts needed for precision and recall.
TP = (is_dinner & said_dinner).sum()
FP = (~is_dinner & said_dinner).sum()
FN = (is_dinner & ~said_dinner).sum()

# Fall back to 0 when a denominator is empty instead of dividing by zero.
predicted_positives = TP + FP
actual_positives = TP + FN
precision = TP / predicted_positives if predicted_positives != 0 else 0
recall = TP / actual_positives if actual_positives != 0 else 0

print(f'Precision : {precision:.4f}')
print(f'Recall : {recall:.4f}')

Precision : 0.0000
Recall : 0.0000

DECISION TREE

1. Basic (Dinner if tip > 1.2)


# Fresh copy of the data with the ground-truth label (1 = Dinner).
df = sns.load_dataset('tips')
df['actual_time'] = df['time'].eq('Dinner').astype(int)

# Single-split rule: predict Dinner whenever the tip exceeds the threshold.
threshold = 1.2
df['predicted_time_DT'] = df['tip'].gt(threshold).astype(int)

# Boolean views of the two 0/1 columns; negation stands for the "== 0" case.
actual_dinner = df['actual_time'] == 1
predicted_dinner = df['predicted_time_DT'] == 1

# Confusion-matrix counts.
TP = (actual_dinner & predicted_dinner).sum()
FP = (~actual_dinner & predicted_dinner).sum()
FN = (actual_dinner & ~predicted_dinner).sum()

# A zero denominator yields 0 rather than a division error.
predicted_positives = TP + FP
actual_positives = TP + FN
precision = TP / predicted_positives if predicted_positives else 0
recall = TP / actual_positives if actual_positives else 0

print(f'Precision : {precision:.4f}')
print(f'Recall : {recall:.4f}')

Precision : 0.7131
Recall : 0.9602

df

{"summary":"{\n \"name\": \"df\",\n \"rows\": 244,\n \"fields\": [\


n {\n \"column\": \"total_bill\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 8.902411954856856,\n
\"min\": 3.07,\n \"max\": 50.81,\n
\"num_unique_values\": 229,\n \"samples\": [\n 22.12,\
n 20.23,\n 14.78\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"tip\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1.3836381890011826,\n
\"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\":
123,\n \"samples\": [\n 3.35,\n 1.5,\n
6.73\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"sex\",\n \"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Male\",\n \"Female\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"smoker\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
2,\n \"samples\": [\n \"Yes\",\n \"No\"\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"day\",\n \"properties\": {\
n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Sat\",\n \"Fri\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"time\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 2,\n
\"samples\": [\n \"Lunch\",\n \"Dinner\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"size\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 6,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2,\n
3\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"actual_time\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0,\n \"min\": 0,\n
\"max\": 1,\n \"num_unique_values\": 2,\n \"samples\":
[\n 0,\n 1\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"predicted_time_DT\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n
\"max\": 1,\n \"num_unique_values\": 2,\n \"samples\":
[\n 1,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}

# Fresh copy of the data with the ground-truth label (1 = Dinner).
df = sns.load_dataset('tips')

df['actual_time'] = (df['time'] == 'Dinner').astype(int)

# Three rules, one per tip band, each with its own upper limit on the bill.
mask1 = (df['tip'] < 1.5) & (df['total_bill'] < 10)

mask2 = (df['tip'] >= 1.5) & (df['tip'] < 4.5) & (df['total_bill'] < 12)

# tip >= 4.5 already implies tip >= 1.5, so the redundant lower bound from the
# original cell is dropped; the selected rows are the same.
mask3 = (df['tip'] >= 4.5) & (df['total_bill'] < 19)

# A row is predicted as Dinner when any of the three rules fires.
df['predicted_time_DT2'] = (mask1 | mask2 | mask3).astype(int)

# Confusion-matrix counts; boolean masks are combined with `&` (logical AND)
# rather than arithmetic `*`, which gives the same counts but reads as intent.
TP = ((df['actual_time'] == 1) & (df['predicted_time_DT2'] == 1)).sum()
FP = ((df['actual_time'] == 0) & (df['predicted_time_DT2'] == 1)).sum()
FN = ((df['actual_time'] == 1) & (df['predicted_time_DT2'] == 0)).sum()

# A zero denominator yields 0 rather than a division error.
precision = TP / (TP + FP) if (TP + FP) != 0 else 0

recall = TP / (TP + FN) if (TP + FN) != 0 else 0

# Output labels are kept exactly as recorded in the notebook output.
print(f'Precision : {precision:.4f}')
print(f'recall : {recall:.4f}')

Precision : 0.5676
recall : 0.1193

import seaborn as sns


import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset through seaborn's loader
df = sns.load_dataset("tips")  # works if you're online

# Label: 1 = Dinner, 0 = Lunch
df["actual_time"] = df["time"].eq("Dinner").astype(int)

# Decision logic: each rule pairs a tip band with an upper limit on the bill
tip = df["tip"]
bill = df["total_bill"]
mask1 = (tip < 1.5) & (bill < 10)
mask2 = (tip >= 1.5) & (tip < 4.5) & (bill < 12)
mask3 = (tip >= 4.5) & (bill < 19)

# Predicted Dinner when any rule matches
any_rule = mask1 | mask2 | mask3
df["predicted_time_DT2"] = any_rule.astype(int)

# ────────────────────────────────
# Plot 1: Actual classification
# Plot 2: Predicted classification
# ────────────────────────────────
# One 12x5 figure holding two side-by-side panels.
fig, (actual_ax, predicted_ax) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, label column, panel title) for the actual and the predicted classes.
panels = [
    (actual_ax, "actual_time", "Actual Time (Dinner=1, Lunch=0)"),
    (predicted_ax, "predicted_time_DT2", "Predicted Time (DT2)"),
]

# Same scatter of tip against total bill in both panels; only the column that
# drives colour and marker style differs.
for ax, column, title in panels:
    sns.scatterplot(
        data=df,
        x="total_bill",
        y="tip",
        hue=column,
        palette={0: "red", 1: "green"},
        style=column,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Total Bill")
    ax.set_ylabel("Tip")

plt.tight_layout()
plt.show()

You might also like