ML - LAB 3
KNN on text data
Name: Smit Ahire
Roll No: 05
PRN: 12311496
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the raw dataset, decoding the file as latin1.
# NOTE(review): the real CSV path was lost when the notebook was exported (only
# the '[Link]' placeholder survived). The v1/v2/'Unnamed: N' columns suggest the
# SMS spam collection file, presumably 'spam.csv' -- confirm and restore it.
df = pd.read_csv('[Link]', encoding='latin1')
# Preview the first rows to check the column layout
df.head()
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
Next steps: Generate code with df toggle_off View recommended plots New interactive sheet
# Drop the three 'Unnamed' columns, which hold only NaN in the preview above
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
# Give the remaining columns descriptive names: v1 is the label, v2 the message
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df
target text
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
... ... ...
5567 spam This is the 2nd time we have tried 2 contact u...
5568 ham Will Ì_ b going to esplanade fr home?
5569 ham Pity, * was in mood for that. So...any other s...
5570 ham The guy did some bitching but I acted like i'd...
5571 ham Rofl. Its true to its name
5572 rows × 2 columns
Next steps: Generate code with df toggle_off View recommended plots New interactive sheet
# Encode the string labels as integers: ham -> 0, spam -> 1.
# Series.map yields NaN for any value missing from the mapping, so this cell
# should run only once on the freshly loaded string labels.
label_codes = {'ham': 0, 'spam': 1}
df['target'] = df['target'].map(label_codes)
df
spark What can I help you build? add_circle send
target text
0 0 Go until jurong point, crazy.. Available only ...
1 0 Ok lar... Joking wif u oni...
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
3 0 U dun say so early hor... U c already then say...
4 0 Nah I don't think he goes to usf, he lives aro...
... ... ...
5567 1 This is the 2nd time we have tried 2 contact u...
5568 0 Will Ì_ b going to esplanade fr home?
5569 0 Pity, * was in mood for that. So...any other s...
5570 0 The guy did some bitching but I acted like i'd...
5571 0 Rofl. Its true to its name
5572 rows × 2 columns
Next steps: Generate code with df toggle_off View recommended plots New interactive sheet
# Features are the raw message texts; labels are the encoded ham/spam targets
X, y = df['text'], df['target']
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# `binary=True` ensures that the feature matrix contains only 0s and 1s,
# indicating the presence or absence of a word. This representation is suitable
# for Hamming distance calculations.
vectorizer = CountVectorizer(binary=True)
# Fit the vectorizer on the training text data and transform it.
# The vocabulary is learned from the training split only.
X_train_vectorized = vectorizer.fit_transform(X_train)
# Transform the test text data using the fitted vectorizer (no re-fitting, so
# both matrices share the same columns)
X_test_vectorized = vectorizer.transform(X_test)
# Convert the sparse matrices (output of CountVectorizer) to dense arrays.
# This is necessary because the 'hamming' metric in KNeighborsClassifier does
# not support sparse input directly.
X_train_dense = X_train_vectorized.toarray()
X_test_dense = X_test_vectorized.toarray()
# Initialize KNeighborsClassifier with 5 neighbors and 'hamming' distance metric
knn = KNeighborsClassifier(n_neighbors=5, metric='hamming')
# Train the KNN model using the dense training data
knn.fit(X_train_dense, y_train)
▾ KNeighborsClassifier i ?
KNeighborsClassifier(metric='hamming')
# Make predictions on the dense test set
y_pred = knn.predict(X_test_dense)
# Evaluate the model's performance: overall accuracy plus the per-class
# precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(report)
Accuracy: 0.9192825112107623
Classification Report:
precision recall f1-score support
0 0.91 1.00 0.96 965
1 1.00 0.40 0.57 150
accuracy 0.92 1115
macro avg 0.96 0.70 0.76 1115
weighted avg 0.93 0.92 0.90 1115
# Build the confusion matrix from the true and predicted test labels
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
# Plotting the Confusion Matrix as an annotated heatmap with integer counts
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=['Predicted Ham', 'Predicted Spam'],
    yticklabels=['Actual Ham', 'Actual Spam'],
)
plt.title('Confusion Matrix for KNN with Hamming Distance')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Save before show() so the figure is written to disk while it is still active
plt.savefig('confusion_matrix.png')
plt.show()
Confusion Matrix:
[[965 0]
[ 90 60]]