# Visualize top categories
top_n = 5
top_categories = df['News Categories'].value_counts().nlargest(top_n).index
df_top = df[df['News Categories'].isin(top_categories)]
sns.countplot(x='News Categories', data=df_top, palette='viridis')
plt.title(f'Top {top_n} News Categories')
plt.xlabel('Categories')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.show()
# Preprocess the 'News Categories'
import string
punc = string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))
df['News Categories'] = df['News Categories'].apply(remove_punc)
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
# Drop the original Date column
df = df.drop('Date', axis=1)
# Lowercase the Content
df['Content'] = df['Content'].str.lower()
# Check for HTML tags
from bs4 import BeautifulSoup
def has_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return bool(soup.find())
df['has_html_tags'] = df['Content'].apply(has_html_tags)
count_true = df['has_html_tags'].sum()  # number of rows containing HTML tags
df = df.drop('has_html_tags', axis=1)
# Remove emojis
import regex
def has_emoji(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return bool(emoji_pattern.search(text))
def remove_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)
df['Content'] = df['Content'].apply(remove_emojis)
# Remove URLs
import re
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)
df['Content'] = df['Content'].apply(remove_url)
# Remove punctuation from Content
df['Content'] = df['Content'].apply(remove_punc)
# Remove stopwords
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)
df['Content'] = df['Content'].apply(remove_stopwords)
# Replace abbreviations (Content is already lowercased, so match on the lowercase form)
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    # ... (other abbreviations)
    'TTYL': 'talk to you later'
}
def replace_abbreviations(text, abbreviation_dict):
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation.lower(), full_form)
    return text
df['Content'] = df['Content'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))
# Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
def tokenize_text(text):
    words_list = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    return ' '.join(' '.join(words) for words in words_list)
df['Content'] = df['Content'].apply(tokenize_text)
# Prepare for model training
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
X = df['Content']
y = df['category_grouped']
# Encode the labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Balanced class weights (computed for reference; MultinomialNB does not use them directly)
class_weights_train = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# Multinomial Naive Bayes with Bag of Words
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")
# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
# Cross-validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
cv_scores = cross_val_score(model, X, y_encoded, cv=StratifiedKFold(n_splits=3, shuffle=True), scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
param_dist = {
'countvectorizer__max_features': [5000, 10000, None],
'countvectorizer__ngram_range': [(1, 1), (1, 2)],
'multinomialnb__alpha': uniform(0.1, 2.0)
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, scoring='accuracy', cv=cv, verbose=1, n_jobs=1)
random_search.fit(X, y_encoded)
best_params = random_search.best_params_
print("Best Parameters:", best_params)
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy:.3f}")
# Inverse transform the predicted labels to get the original class labels
predicted_labels_original = le.inverse_transform(y_pred_best)
correct_predictions = sum(y_test == y_pred_best)
wrong_predictions = len(y_test) - correct_predictions
print(f'Correct Predictions: {correct_predictions}, Wrong Predictions: {wrong_predictions}')
# Visualization of predictions
labels = ['Correct Predictions', 'Wrong Predictions']
values = [correct_predictions, wrong_predictions]
plt.bar(labels, values, color=['green', 'red'])
plt.title('Correct vs Wrong Predictions')
plt.xlabel('Prediction Outcome')
plt.ylabel('Number of Samples')
plt.show()
# Final DataFrame with text and predicted labels
final_df = pd.DataFrame({'Content': X_test, 'Predicted_Labels': predicted_labels_original, 'Actual_Labels': le.inverse_transform(y_test)})
print(final_df.head())
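# Quick usage sketch (the sample headline below is made up for illustration): new text
# should pass through the same cleaning steps used on df['Content'] before prediction.
sample_text = "government announces new budget plan for healthcare spending"
sample_clean = remove_stopwords(remove_punc(remove_url(sample_text.lower())))
predicted_category = le.inverse_transform(best_model.predict([sample_clean]))[0]
print(f"Predicted category for sample text: {predicted_category}")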