APRIORI ALGORITHM

# Install necessary packages
!pip install mlxtend pandas

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Sample data: each row is a transaction, columns are items
# (True = bought, False = not bought; mlxtend expects boolean columns)
data = {
    'milk':   [True, True, False, True, False],
    'bread':  [True, True, True, True, True],
    'cheese': [False, True, False, True, True],
    'butter': [True, False, True, True, False]
}

# Create DataFrame
df = pd.DataFrame(data)

# Find frequent itemsets with a minimum support of 0.5
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)

print("Frequent Itemsets:")
print(frequent_itemsets)

# Generate association rules with a minimum confidence of 1.0 (100%)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=1.0)

print("\nAssociation Rules:")
print(rules)
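
A quick sanity check (a minimal sketch, not part of the original listing): the support of an itemset is the fraction of transactions containing all of its items, so it can be recomputed by hand from the same DataFrame and compared with the apriori output.

# Sanity check: support of {milk, bread} computed directly from df
# (should match the corresponding row of frequent_itemsets above)
support_milk_bread = (df['milk'] & df['bread']).sum() / len(df)
print("Support of {milk, bread}:", support_milk_bread)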

FP-GROWTH
# Imports
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pandas as pd
# Sample dataset
data = {
    'Bread':  [True, True, True, False, False],
    'Milk':   [True, True, False, True, True],
    'Butter': [False, True, True, True, True],
    'Cheese': [True, False, True, True, False],
    'Yogurt': [False, True, True, False, True]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Apply FP-Growth
frequent_itemsets = fpgrowth(df, min_support=0.5, use_colnames=True)

# Print the result
print("Frequent itemsets (using FP-Growth):")
print(frequent_itemsets)
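
The association_rules import above is otherwise unused in this listing; the itemsets FP-Growth returns have the same format as Apriori's, so rules can be generated from them in exactly the same way. A minimal sketch (the 0.8 threshold is illustrative):

# Generate rules from the FP-Growth itemsets
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
print("\nAssociation Rules (from FP-Growth itemsets):")
print(rules[['antecedents', 'consequents', 'support', 'confidence']])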

DECISION TREE
# Imports
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train the Decision Tree model
model = DecisionTreeClassifier(max_depth=2)
model.fit(X_train, y_train)

# Plot the decision tree
plt.figure(figsize=(10, 6))
plot_tree(
    model,
    filled=True,
    feature_names=iris.feature_names,
    class_names=iris.target_names
)
plt.show()
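
The split above reserves a test set that the listing never uses. A minimal sketch of scoring the tree on it (accuracy_score is an extra import, not in the original):

# Evaluate the tree on the held-out test set
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.2f}")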

NAIVE BAYES

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)

# Train the Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Generate and display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=iris.target_names)

# Plot the confusion matrix
disp.plot(cmap=plt.cm.Blues)
plt.title("Naive Bayes - Confusion Matrix")
plt.grid(False)
plt.show()
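
Beyond hard class labels, GaussianNB also exposes the per-class posterior probabilities behind each prediction. A minimal sketch inspecting them for the first test sample:

# Posterior probabilities for the first test sample
proba = model.predict_proba(X_test[:1])[0]
for name, p in zip(iris.target_names, proba):
    print(f"P({name}) = {p:.3f}")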

KNN ALGORITHM
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.inspection import DecisionBoundaryDisplay

# Load data (first two features only, so the decision boundary can be drawn in 2D)
iris = load_iris()
X, y = iris.data[:, :2], iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train KNN
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

# Decision boundary
disp = DecisionBoundaryDisplay.from_estimator(
    model, X, response_method="predict", cmap=plt.cm.Set1,
    xlabel=iris.feature_names[0], ylabel=iris.feature_names[1]
)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
plt.title("KNN Decision Boundary (k=3)")
plt.show()

# Predict and evaluate
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm, display_labels=iris.target_names).plot(cmap=plt.cm.Blues)
plt.title("KNN - Confusion Matrix")
plt.grid(False)
plt.show()
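
The choice k=3 above is arbitrary; k is a hyperparameter that is usually tuned. A minimal sketch comparing a few values with 5-fold cross-validation (cross_val_score is an extra import, not in the original):

# Compare several k values by cross-validation
from sklearn.model_selection import cross_val_score
for k in (1, 3, 5, 7):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
    print(f"k={k}: mean CV accuracy = {scores.mean():.3f}")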

LINEAR REGRESSION
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset (the Diabetes dataset is a standard regression example)
data = load_diabetes()
X = data.data    # Features
y = data.target  # Target (disease progression)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")


print(f"R² Score: {r2:.2f}")

# Plot predicted vs actual
plt.scatter(y_test, y_pred, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Linear Regression: Actual vs Predicted")
plt.show()
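
The fitted model is just a set of coefficients, one per feature, plus an intercept. A minimal sketch mapping them back to the dataset's feature names:

# Inspect the learned coefficients
for name, coef in zip(data.feature_names, model.coef_):
    print(f"{name}: {coef:.2f}")
print(f"Intercept: {model.intercept_:.2f}")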

EUCLIDEAN DISTANCE

import math
import matplotlib.pyplot as plt

# Function to calculate Euclidean distance
def euclidean_distance(x1, y1, x2, y2):
    return math.sqrt((x2 - x1)**2 + (y2 - y1)**2)

# Points
x1, y1 = 2, 3
x2, y2 = 5, 7

# Calculate distance
distance = euclidean_distance(x1, y1, x2, y2)
print(f"The Euclidean distance between ({x1},{y1}) and ({x2},{y2}) is {distance:.2f}")

# Plotting
plt.figure(figsize=(6, 6))
plt.scatter([x1, x2], [y1, y2], color="blue", label="Points")
plt.plot([x1, x2], [y1, y2], color="green", linestyle="dashed")

# Annotate points
plt.text(x1, y1, f"({x1},{y1})", fontsize=12, verticalalignment='bottom', horizontalalignment='right')
plt.text(x2, y2, f"({x2},{y2})", fontsize=12, verticalalignment='bottom', horizontalalignment='left')

# Labels and grid
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title(f"Euclidean Distance: {distance:.2f}")
plt.grid(True)
plt.legend()
plt.show()
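
The same formula generalizes to n dimensions as the square root of the summed squared coordinate differences, which NumPy computes as the norm of the difference vector. A minimal sketch (numpy is an extra import):

# n-dimensional Euclidean distance via NumPy
import numpy as np
a = np.array([2, 3])
b = np.array([5, 7])
print("Euclidean distance (NumPy):", np.linalg.norm(a - b))  # 5.0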

MINKOWSKI DISTANCE
import math
import matplotlib.pyplot as plt

# Function to calculate Minkowski distance
def minkowski_distance(x1, y1, x2, y2, p):
    return (abs(x2 - x1)**p + abs(y2 - y1)**p)**(1/p)

# Points
x1, y1 = 2, 3
x2, y2 = 5, 7
p = 3 # You can change p to 1 (Manhattan), 2 (Euclidean), etc.

# Calculate Minkowski distance
distance = minkowski_distance(x1, y1, x2, y2, p)
print(f"The Minkowski distance (p={p}) between ({x1},{y1}) and ({x2},{y2}) is {distance:.2f}")

# Plotting
plt.figure(figsize=(6, 6))
plt.scatter([x1, x2], [y1, y2], color="purple", label="Points")
plt.plot([x1, x2], [y1, y2], color="orange", linestyle="dashed")

# Annotate points
plt.text(x1, y1, f"({x1},{y1})", fontsize=12, verticalalignment='bottom', horizontalalignment='right')
plt.text(x2, y2, f"({x2},{y2})", fontsize=12, verticalalignment='bottom', horizontalalignment='left')

# Labels and grid
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title(f"Minkowski Distance (p={p}): {distance:.2f}")
plt.grid(True)
plt.legend()
plt.show()
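
As the comment above notes, the Minkowski distance contains the other two metrics as special cases; a quick sanity check with the same function:

# p=1 reproduces the Manhattan distance, p=2 the Euclidean distance
print("p=1:", minkowski_distance(x1, y1, x2, y2, 1))  # 7.0 (Manhattan)
print("p=2:", minkowski_distance(x1, y1, x2, y2, 2))  # 5.0 (Euclidean)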

MANHATTAN DISTANCE

import matplotlib.pyplot as plt

# Function to calculate Manhattan distance
def manhattan_distance(x1, y1, x2, y2):
    return abs(x2 - x1) + abs(y2 - y1)

# Points
x1, y1 = 2, 3
x2, y2 = 5, 7

# Calculate Manhattan distance
distance = manhattan_distance(x1, y1, x2, y2)
print(f"The Manhattan distance between ({x1},{y1}) and ({x2},{y2}) is {distance:.2f}")

# Plotting
plt.figure(figsize=(6, 6))
plt.scatter([x1, x2], [y1, y2], color="red", label="Points")

# Show the horizontal and vertical path that illustrates the Manhattan distance
plt.plot([x1, x2], [y1, y1], color='gray', linestyle='dotted')
plt.plot([x2, x2], [y1, y2], color='gray', linestyle='dotted')

# Annotate points
plt.text(x1, y1, f"({x1},{y1})", fontsize=12, verticalalignment='bottom', horizontalalignment='right')
plt.text(x2, y2, f"({x2},{y2})", fontsize=12, verticalalignment='bottom', horizontalalignment='left')

# Labels and grid
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title(f"Manhattan Distance: {distance:.2f}")
plt.grid(True)
plt.legend()
plt.show()
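
SciPy ships the same metric under the name cityblock, which makes a handy cross-check against the hand-written function. A minimal sketch (scipy is an extra import):

# Cross-check with SciPy's cityblock metric
from scipy.spatial.distance import cityblock
print("Manhattan (SciPy):", cityblock([x1, y1], [x2, y2]))  # 7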

K MEANS
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Generate sample data
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Apply KMeans
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)  # explicit n_init for consistent behaviour across sklearn versions
y_kmeans = kmeans.fit_predict(X)

# Plot
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='red', label='Centroids')
plt.title("K-Means Clustering")
plt.legend()
plt.show()
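
Here n_clusters=4 is known because the data was generated with four centers; on real data it is usually chosen with a heuristic such as the elbow method. A minimal sketch plotting inertia against k:

# Elbow method: inertia (within-cluster sum of squares) vs. k
inertias = [KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
            for k in range(1, 8)]
plt.plot(range(1, 8), inertias, marker='o')
plt.xlabel("k")
plt.ylabel("Inertia")
plt.title("Elbow Method for K-Means")
plt.show()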

HIERARCHICAL CLUSTERING
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

# Generate sample data
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Dendrogram
plt.figure(figsize=(8, 4))
dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.title("Dendrogram (Hierarchical Clustering)")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()

# Agglomerative Clustering
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')  # ward linkage implies Euclidean distance; the affinity= parameter was removed in recent scikit-learn
y_hc = hc.fit_predict(X)

# Plot clusters
plt.scatter(X[:, 0], X[:, 1], c=y_hc, cmap='rainbow')
plt.title("Hierarchical Clustering")
plt.show()
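
Instead of re-clustering with AgglomerativeClustering, the same ward linkage can be cut directly into flat clusters with SciPy's fcluster. A minimal sketch (Counter is an extra import):

# Cut the ward linkage into 4 flat clusters
from collections import Counter
flat_labels = sch.fcluster(sch.linkage(X, method='ward'), t=4, criterion='maxclust')
print("Cluster sizes:", dict(Counter(flat_labels)))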

DBSCAN
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Generate non-spherical data
X, _ = make_moons(n_samples=300, noise=0.05, random_state=0)
X = StandardScaler().fit_transform(X)

# Apply DBSCAN
db = DBSCAN(eps=0.3, min_samples=5)
y_db = db.fit_predict(X)

# Plot DBSCAN results
plt.scatter(X[:, 0], X[:, 1], c=y_db, cmap='plasma')
plt.title("DBSCAN Clustering")
plt.show()
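
Unlike K-Means, DBSCAN infers the number of clusters from eps and min_samples, and marks points it cannot assign as noise with the label -1. A minimal sketch summarising the result:

# Count clusters and noise points (noise is labelled -1)
n_clusters = len(set(y_db)) - (1 if -1 in y_db else 0)
n_noise = list(y_db).count(-1)
print(f"Clusters found: {n_clusters}, noise points: {n_noise}")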

GREEN CREDIT DATA
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from mlxtend.frequent_patterns import apriori, association_rules

# Step 1: Load your CSV file
df = pd.read_csv("sas.csv")
print("Initial Shape:", df.shape)

# Step 2: Forward-fill any missing values
df = df.ffill()  # fillna(method="ffill") is deprecated in recent pandas

# Step 3: Encode categorical columns
for col in df.select_dtypes(include='object').columns:
    df[col] = pd.factorize(df[col])[0]

# Step 4: Normalize the dataset
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Step 5: Correlation heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df_scaled.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Step 6: K-Means Clustering
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(df_scaled)
print("Silhouette Score (K-Means):", silhouette_score(df_scaled, labels))
# Step 7: Hierarchical Clustering
linked = linkage(df_scaled, method='ward')
plt.figure(figsize=(10, 6))
dendrogram(linked, no_labels=True)
plt.title("Dendrogram - Hierarchical Clustering")
plt.tight_layout()
plt.show()

# Step 8: Association Rule Mining (Apriori)
# Discretize numerical columns into quartile bins
df_bin = df.copy()
for col in df_bin.columns:
    try:
        df_bin[col] = pd.qcut(df_bin[col].rank(method="first"), q=4, labels=False)
    except ValueError:
        pass  # leave columns that cannot be binned unchanged

df_bin = df_bin.astype(str)  # Convert to strings for one-hot encoding
df_encoded = pd.get_dummies(df_bin)

# Apply the Apriori algorithm
frequent_items = apriori(df_encoded, min_support=0.3, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_items, metric="lift", min_threshold=1.0)

# Show sample rules
print("\nSample Association Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())
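
With min_threshold=1.0 on lift, the rules frame can still be large; in practice it is filtered further by confidence and lift. A minimal sketch (the 0.8 and 1.2 cut-offs are illustrative):

# Keep only high-confidence, high-lift rules
strong = rules[(rules['confidence'] >= 0.8) & (rules['lift'] > 1.2)]
print(f"{len(strong)} strong rules out of {len(rules)}")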
