# Visualize top categories
top_n = 5
top_categories = df['News Categories'].value_counts().nlargest(top_n).index
df_top = df[df['News Categories'].isin(top_categories)]
sns.countplot(x='News Categories', data=df_top, palette='viridis')
plt.title(f'Top {top_n} News Categories')
plt.xlabel('Categories')
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.show()
# Preprocess the 'News Categories'
import string
punc = string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans('', '', punc))
df['News Categories'] = df['News Categories'].apply(remove_punc)
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
# Drop the original Date column
df = df.drop('Date', axis=1)
# Lowercase the Content
df['Content'] = df['Content'].str.lower()
# Check for HTML tags
from bs4 import BeautifulSoup
def has_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return bool(soup.find())
df['has_html_tags'] = df['Content'].apply(has_html_tags)
count_true = df['has_html_tags'].sum()  # number of rows containing HTML tags
df = df.drop('has_html_tags', axis=1)
# Remove emojis
import regex
def has_emoji(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return bool(emoji_pattern.search(text))
def remove_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.sub('', text)
df['Content'] = df['Content'].apply(remove_emojis)
# Remove URLs
import re
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)
df['Content'] = df['Content'].apply(remove_url)
# Remove punctuation from Content
df['Content'] = df['Content'].apply(remove_punc)
# Remove stopwords
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)
df['Content'] = df['Content'].apply(remove_stopwords)
# Replace abbreviations (Content is already lowercased, so match on the lowercase form)
abbreviation_dict = {
    'LOL': 'laugh out loud',
    'BRB': 'be right back',
    # ... (other abbreviations)
    'TTYL': 'talk to you later'
}
def replace_abbreviations(text, abbreviation_dict):
    for abbreviation, full_form in abbreviation_dict.items():
        text = text.replace(abbreviation.lower(), full_form)
    return text
df['Content'] = df['Content'].apply(lambda x: replace_abbreviations(x, abbreviation_dict))
# Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
def tokenize_text(text):
    words_list = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    return ' '.join(' '.join(words) for words in words_list)
df['Content'] = df['Content'].apply(tokenize_text)
# Prepare for model training
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
X = df['Content']
y = df['category_grouped']
# Encode the labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Balanced class weights (computed for reference; MultinomialNB does not use them directly)
class_weights_train = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# Multinomial Naive Bayes with Bag of Words
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB with Bag of Words accuracy: {accuracy:.3f}")
# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))
# Cross-validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
cv_scores = cross_val_score(model, X, y_encoded, cv=StratifiedKFold(n_splits=3, shuffle=True), scoring='accuracy')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
param_dist = {
'countvectorizer__max_features': [5000, 10000, None],
'countvectorizer__ngram_range': [(1, 1), (1, 2)],
'multinomialnb__alpha': uniform(0.1, 2.0)
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, scoring='accuracy', cv=cv, verbose=1, n_jobs=1)
random_search.fit(X, y_encoded)
best_params = random_search.best_params_
print("Best Parameters:", best_params)
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy:.3f}")
# Inverse transform the predicted labels to get the original class labels
predicted_labels_original = le.inverse_transform(y_pred_best)
correct_predictions = sum(y_test == y_pred_best)
wrong_predictions = len(y_test) - correct_predictions
print(f'Correct Predictions: {correct_predictions}, Wrong Predictions: {wrong_predictions}')
# Visualization of predictions
labels = ['Correct Predictions', 'Wrong Predictions']
values = [correct_predictions, wrong_predictions]
plt.bar(labels, values, color=['green', 'red'])
plt.title('Correct vs Wrong Predictions')
plt.xlabel('Prediction Outcome')
plt.ylabel('Number of Samples')
plt.show()
# Final DataFrame with text and predicted labels
final_df = pd.DataFrame({'Content': X_test, 'Predicted_Labels': predicted_labels_original, 'Actual_Labels': le.inverse_transform(y_test)})
print(final_df.head())
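# Quick usage sketch (the sample headline below is made up for illustration): new text
# should pass through the same cleaning steps used on df['Content'] before prediction.
sample_text = "government announces new budget plan for healthcare spending"
sample_clean = remove_stopwords(remove_punc(remove_url(sample_text.lower())))
predicted_category = le.inverse_transform(best_model.predict([sample_clean]))[0]
print(f"Predicted category for sample text: {predicted_category}")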