# Text mining and sentiment analysis of the spam.csv SMS dataset
library(tm)            # corpus handling and text cleaning
library(SnowballC)     # stemming utilities (loaded, though no stemming step is used below)
library(wordcloud)     # word cloud plot
library(RColorBrewer)  # colour palette for the word cloud
library(syuzhet)       # NRC sentiment scoring
library(ggplot2)       # bar charts
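
# Load the dataset; column v2 holds the raw message text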
data <- read.csv('spam.csv', stringsAsFactors = FALSE)
str(data)
spam_text <- data$v2
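
# Build a corpus and clean it: lowercase, then strip punctuation, numbers,
# stop words, and extra whitespace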
spam_corpus <- Corpus(VectorSource(spam_text))
clean_corpus <- tm_map(spam_corpus, content_transformer(tolower))
clean_corpus <- tm_map(clean_corpus, removePunctuation)
clean_corpus <- tm_map(clean_corpus, removeNumbers)
clean_corpus <- tm_map(clean_corpus, removeWords, stopwords("english"))
custom_stopwords <- c("u", "so")  # additional custom stop words
clean_corpus <- tm_map(clean_corpus, removeWords, custom_stopwords)
clean_corpus <- tm_map(clean_corpus, stripWhitespace)
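
# Term-document matrix and word frequencies, sorted from most to least frequent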
dtm <- TermDocumentMatrix(clean_corpus)
dtm_matrix <- as.matrix(dtm)
word_freq <- sort(rowSums(dtm_matrix), decreasing = TRUE)
word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)
top_5_words <- head(word_freq_df, 5)
print(top_5_words)
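
# Bar chart of the five most frequent words (x11() opens a new plotting window)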
x11()
ggplot(top_5_words, aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Top 5 Most Frequent Words in Spam Text",
       x = "Words", y = "Frequency") +
  theme_minimal()
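
# Word cloud of up to 100 terms; set.seed() makes the layout reproducible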
set.seed(1234)
x11()
wordcloud(words = word_freq_df$word, freq = word_freq_df$freq,
          min.freq = 1, max.words = 100, random.order = FALSE,
          rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
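
# NRC sentiment scores; the first eight columns are the emotion categories
# (anger, anticipation, disgust, fear, joy, sadness, surprise, trust)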
sentiment_scores <- get_nrc_sentiment(spam_text)
sentiment_totals <- data.frame(sentiment = colnames(sentiment_scores)[1:8],
                               score = colSums(sentiment_scores[, 1:8]))
sentiment_totals <- sentiment_totals[order(sentiment_totals$score,
                                           decreasing = TRUE), ]
print(sentiment_totals)
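
# Keep only sentiments that actually occur and plot their frequencies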
sentiment_freq <- data.frame(sentiment = colnames(sentiment_scores),
                             frequency = colSums(sentiment_scores))
sentiment_freq <- sentiment_freq[sentiment_freq$frequency > 0, ]
x11()
ggplot(sentiment_freq, aes(x = reorder(sentiment, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "coral") +
  labs(title = "Frequency of Sentiments in Spam Text",
       x = "Sentiment", y = "Frequency") +
  theme_minimal()