0% found this document useful (0 votes)
20 views9 pages

Ment Analysis Text Classification

Uploaded by

Nipuni
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
20 views9 pages

Ment Analysis Text Classification

Uploaded by

Nipuni
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

ment-analysis-text-classification

March 24, 2024

[24]: import pandas as pd


import nltk
import re #regex

#Splitting the data into training and testing


from sklearn.model_selection import train_test_split

#model
from sklearn.naive_bayes import MultinomialNB

#evaluation metrics
from sklearn import metrics

#stemming
from nltk.stem import PorterStemmer

#stopwords
from nltk.corpus import stopwords

# pandas and numpy


import pandas as pd
import numpy as np

#import count vectorizer


from sklearn.feature_extraction.text import CountVectorizer

#tokenizers
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

#classification results
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

#visualizations
import seaborn as sns

1
import matplotlib.pyplot as plt

[25]: #Loading the Dataset


data = pd.read_csv('Feedback.csv')

[26]: data.head()

[26]: Text Sentiment


0 I love spending time with my family. Positive
1 This movie is absolutely terrible. Negative
2 The food at that restaurant was amazing. Positive
3 I had a horrible experience at the dentist. Negative
4 The weather today is perfect. Positive

[27]: #row and column count


data.shape

[27]: (20, 2)

[28]: # count of the negative and positive sentiments


data['Sentiment'].value_counts()

[28]: Positive 10
Negative 10
Name: Sentiment, dtype: int64

[29]: # assign the count vectorizer to a variable


countvectorizer = CountVectorizer()

# Document-term matrix: fit the vectorizer on the raw texts, then expose the
# dense counts as a DataFrame whose columns are the learned vocabulary terms.
DTM = pd.DataFrame(
    data=countvectorizer.fit_transform(data["Text"]).toarray(),
    index=None,
    columns=countvectorizer.get_feature_names_out(),
)

DTM

[29]: absolutely amazing and at awful bad being best book breathtaking \
0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0
2 0 1 0 1 0 0 0 0 0 0
3 0 0 0 1 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0
5 0 0 0 1 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 1 0
7 0 0 0 0 0 0 0 0 0 0
8 0 1 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 1 0 0 0
10 0 0 0 0 0 0 0 1 0 0

2
11 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 1
13 0 0 0 1 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 1 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0
17 0 0 1 0 0 0 0 0 0 0
18 0 0 0 1 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0

… too top traffic ve view was wasn waste weather with


0 … 0 0 0 0 0 0 0 0 0 1
1 … 0 0 0 0 0 0 0 0 0 0
2 … 0 0 0 0 0 1 0 0 0 0
3 … 0 0 0 0 0 0 0 0 0 0
4 … 0 0 0 0 0 0 0 0 1 0
5 … 0 0 0 0 0 1 0 0 0 0
6 … 0 0 0 0 0 0 0 0 0 0
7 … 0 0 0 0 0 0 0 0 0 1
8 … 0 0 0 0 0 1 0 0 0 0
9 … 0 0 1 0 0 0 0 0 0 0
10 … 0 0 0 1 0 0 0 0 0 0
11 … 0 0 0 0 0 0 0 0 0 1
12 … 0 1 0 0 1 1 0 0 0 0
13 … 0 0 0 0 0 1 0 0 0 0
14 … 0 0 0 0 0 0 0 0 0 0
15 … 1 0 1 0 0 0 1 0 0 0
16 … 0 0 0 0 0 0 0 0 0 1
17 … 0 0 0 0 0 1 0 1 0 0
18 … 0 0 0 0 0 0 0 0 0 0
19 … 0 0 0 0 0 0 0 0 0 0

[20 rows x 76 columns]

[32]: DTM['Sentiment']=data['Sentiment']

DTM

[32]: absolutely amazing and at awful bad being best book breathtaking \
0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0
2 0 1 0 1 0 0 0 0 0 0
3 0 0 0 1 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0
5 0 0 0 1 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 1 0
7 0 0 0 0 0 0 0 0 0 0

3
8 0 1 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 1 0 0 0
10 0 0 0 0 0 0 0 1 0 0
11 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 1
13 0 0 0 1 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 1 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0
17 0 0 1 0 0 0 0 0 0 0
18 0 0 0 1 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0

… top traffic ve view was wasn waste weather with Sentiment


0 … 0 0 0 0 0 0 0 0 1 Positive
1 … 0 0 0 0 0 0 0 0 0 Negative
2 … 0 0 0 0 1 0 0 0 0 Positive
3 … 0 0 0 0 0 0 0 0 0 Negative
4 … 0 0 0 0 0 0 0 1 0 Positive
5 … 0 0 0 0 1 0 0 0 0 Negative
6 … 0 0 0 0 0 0 0 0 0 Positive
7 … 0 0 0 0 0 0 0 0 1 Negative
8 … 0 0 0 0 1 0 0 0 0 Positive
9 … 0 1 0 0 0 0 0 0 0 Negative
10 … 0 0 1 0 0 0 0 0 0 Positive
11 … 0 0 0 0 0 0 0 0 1 Negative
12 … 1 0 0 1 1 0 0 0 0 Positive
13 … 0 0 0 0 1 0 0 0 0 Negative
14 … 0 0 0 0 0 0 0 0 0 Positive
15 … 0 1 0 0 0 1 0 0 0 Positive
16 … 0 0 0 0 0 0 0 0 1 Negative
17 … 0 0 0 0 1 0 1 0 0 Negative
18 … 0 0 0 0 0 0 0 0 0 Positive
19 … 0 0 0 0 0 0 0 0 0 Negative

[20 rows x 77 columns]

[8]: #preprocessing text function

def preprocess_text(text):
    """Clean a Series of raw texts and encode it as a bag-of-words matrix.

    Every document is lower-cased, tokenized with NLTK, stripped of English
    stop words, stemmed with the Porter stemmer, joined back into a single
    string and finally stripped of every character that is not a letter,
    a digit or whitespace. The cleaned documents are then used to fit a
    fresh ``CountVectorizer``.

    Parameters
    ----------
    text : pandas.Series of str
        Raw documents, one per row (the notebook passes ``data["Text"]``).

    Returns
    -------
    X : sparse matrix
        Document-term count matrix with one row per input document.
    vocabulary : dict
        Mapping from each learned term to its column index in ``X``.

    Notes
    -----
    A new vectorizer is fitted on every call, so the vocabulary reflects
    only the documents passed in that call. The NLTK tokenizer and
    stop-word resources must already be available.
    """
    # Build the loop-invariant helpers once instead of once per pipeline stage.
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.PorterStemmer()

    def _clean(document):
        """Return one document as a cleaned, space-separated string of stems."""
        tokens = nltk.word_tokenize(document.lower())
        stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
        # Punctuation is removed only after tokenizing and stemming, so pieces
        # of contractions survive as their own terms (the vocabulary printed
        # in the notebook contains 'ca' and 'nt').
        return re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(stems))

    cleaned = text.apply(_clean)

    # Vectorize the cleaned text using CountVectorizer
    countvectorizer = CountVectorizer()
    X = countvectorizer.fit_transform(cleaned)

    # Return the vectorized text and the vocabulary
    return X, countvectorizer.vocabulary_

[9]: #Summarizing the Encoded Texts into a sparse matrix


text_counts=preprocess_text(data["Text"])[0]

[10]: #sparse matrix converting it to an array.


# Reuse the matrix already stored in text_counts rather than re-running the
# whole tokenize/stem/vectorize pipeline for every inspection.
text_counts.toarray()

text_counts.toarray().shape

[10]: (20, 58)

[11]: #Printing the identified Unique words along with their indices
preprocess_text(data["Text"])[1]

[11]: {'love': 27,


'spend': 44,
'time': 50,
'famili': 18,
'movi': 30,
'absolut': 0,
'terribl': 49,
'food': 20,
'restaur': 41,
'amaz': 1,
'horribl': 22,
'experi': 17,
'dentist': 12,
'weather': 57,

5
'today': 51,
'perfect': 34,
'custom': 11,
'servic': 43,
'store': 46,
'aw': 2,
'realli': 40,
'enjoy': 15,
'book': 5,
'disappoint': 14,
'concert': 9,
'ca': 7,
'nt': 33,
'stand': 45,
'stuck': 47,
'traffic': 53,
'best': 4,
'pizza': 35,
've': 54,
'ever': 16,
'qualiti': 39,
'product': 36,
'view': 55,
'top': 52,
'mountain': 29,
'breathtak': 6,
'new': 31,
'design': 13,
'room': 42,
'bad': 3,
'surprisingli': 48,
'frustrat': 21,
'lack': 25,
'progress': 37,
'project': 38,
'complet': 8,
'wast': 56,
'money': 28,
'fantast': 19,
'last': 26,
'night': 32,
'internet': 24,
'connect': 10,
'hotel': 23}

[12]: #Splitting the data into training and testing


# x = text_counts

6
# y = data['Sentiment']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts, data['Sentiment'], test_size=0.2, random_state=5
)

Training the model


[13]: #Creating the Naïve Bayes Classifier Model
MNB = MultinomialNB()

# Train the model with training data


MNB.fit(X_train, Y_train)

[13]: MultinomialNB()

Predict the class of the unseen data


[14]: #get the model predictions for the test set
y_pred = MNB.predict(X_test)
y_pred

[14]: array(['Negative', 'Negative', 'Positive', 'Negative'], dtype='<U8')

[15]: # compare the outputs


# Keep the comparison columns under their own name: rebinding `data` here
# would replace the feedback DataFrame that the earlier cells read from.
comparison = {'Actual': Y_test,
              'Predicted': y_pred}

outputs = pd.DataFrame(comparison)
outputs

[15]: Actual Predicted


2 Positive Negative
5 Negative Negative
17 Negative Positive
19 Negative Negative

Get Evaluation Metrics


[16]: #accuracy values
accuracy_score(Y_test,y_pred)

[16]: 0.5

[17]: #obtain the confusion matrix


confusion_matrix(Y_test,y_pred)

[17]: array([[2, 1],


[1, 0]], dtype=int64)

7
[18]: #confusion matrix visualization
# Draw the confusion matrix of the test-set predictions as a heatmap with the
# count written in each cell, then label the axes and show the figure.
sns.heatmap(confusion_matrix(Y_test,y_pred),annot=True,fmt="g")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

Classification Report
[19]: print(classification_report(Y_test,y_pred))

precision recall f1-score support

Negative 0.67 0.67 0.67 3


Positive 0.00 0.00 0.00 1

accuracy 0.50 4
macro avg 0.33 0.33 0.33 4
weighted avg 0.50 0.50 0.50 4

[ ]:

8
[ ]:

[ ]:

[ ]:

You might also like