NLP Projects
Text Concordance

import nltk
from nltk.corpus import gutenberg
from nltk.text import Text

corpus = gutenberg.words("shakespeare-macbeth.txt")
text = Text(corpus)
text.concordance("monstrous")

Output

Displaying 1 of 1 matches:
Who cannot want the thought , how monstrous It was for Malcolme , and for Dona.

Vocabulary Count

import nltk
nltk.download('punkt')  # tokenizer models needed by word_tokenize

text = "welcome to the world"
words = nltk.word_tokenize(text)
num_words = len(words)
num_the = words.count('the')
unique_words = set(words)
num_unique_words = len(unique_words)
percent_unique = (num_unique_words / num_words) * 100
print(words)
print("the number of words:", num_words)
print('number of occurrences of "the":', num_the)
print("number of unique words:", num_unique_words)
print("percentage of unique words:", percent_unique)

Output

['welcome', 'to', 'the', 'world']
the number of words: 4
number of occurrences of "the": 1
number of unique words: 4
percentage of unique words: 100.0
Text Preprocessing

import nltk
nltk.download('punkt')      # tokenizer models
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import words

text = 'This is a sample text that we used to demonstrate NLTK text processing 123'
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

alpha_tokens = [token.lower() for token in tokens if token.isalpha()]
english_words = set(words.words())
valid_tokens = [token for token in alpha_tokens if token in english_words]
filtered_tokens = [token for token in valid_tokens if token not in stop_words]
stemmer_tokens = [stemmer.stem(token) for token in filtered_tokens]

print("Original text :", text)
print("Tokenized text :", tokens)
print("Filtered text :", filtered_tokens)
print("Validated text :", valid_tokens)
print("Alpha text :", alpha_tokens)
print("Stemmed text :", stemmer_tokens)

Output

Original text : This is a sample text that we used to demonstrate NLTK text processing 123
Tokenized text : ['This', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'NLTK', 'text', 'processing', '123']
Filtered text : ['sample', 'text', 'used', 'demonstrate', 'text']
Validated text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'text']
Alpha text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'nltk', 'text', 'processing']

Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

for i in range(len(corpus)):
    print(f"BoW representation of Document {i+1}: {X[i].toarray()[0]}")

Output

BoW representation of Document 1: [0 1 1 1 0 0 1 0 1]
BoW representation of Document 2: [0 2 0 1 0 1 1 0 1]
BoW representation of Document 3: [1 0 0 1 1 0 1 1 1]
BoW representation of Document 4: [0 1 1 1 0 0 1 0 1]
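
The columns of each BoW vector follow the vectorizer's learned vocabulary (alphabetical); a small optional check continuing the example above (get_feature_names_out assumes scikit-learn >= 1.0, older versions expose get_feature_names):

print(vectorizer.get_feature_names_out())
# e.g. ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
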
TF-IDF

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import math

def calculate_tf(word, document):
    word_frequency = document.count(word)
    return word_frequency / len(document)

def calculate_idf(word, corpus):
    num_documents_containing_word = len([True for document in corpus if word in document])
    if num_documents_containing_word == 0:
        return 0
    else:
        return math.log10(len(corpus) / num_documents_containing_word)

def calculate_tfidf(document, corpus):
    PS = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [PS.stem(word.lower()) for word in word_tokenize(document) if word.lower() not in stop_words]
    word_tfidf_values = {}
    for word in words:
        if word not in word_tfidf_values:
            tf = calculate_tf(word, words)
            idf = calculate_idf(word, corpus)
            word_tfidf_values[word] = tf * idf
    return word_tfidf_values

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]
document = "This is the second document"
tfidf_vector = calculate_tfidf(document, corpus)
print(tfidf_vector)

Output

{'second': 0.3010299956639812, 'document': 0.06246936830414996}
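
scikit-learn's TfidfVectorizer computes the same idea end to end, but with a smoothed IDF and L2-normalised rows, so its numbers will differ from the hand-rolled log10 version above; a minimal sketch reusing the same corpus list:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)          # one TF-IDF vector per document
print(tfidf.get_feature_names_out())     # vocabulary (column order)
print(X.toarray()[1])                    # weights for the second document
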

Pos Tagging

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

line = "quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(line)
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
pos_tags = nltk.pos_tag(lemmatized_tokens)
pos_word_corpus = [(word, tag) for word, tag in pos_tags]

for word, tag in pos_word_corpus:
    print(word, ":", tag)

Output

quick : JJ
brown : NN
fox : JJ
jump : NN
lazy : NN
dog : NN
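
The tags printed above are Penn Treebank tags; NLTK can describe any of them once the tagset help resource is downloaded (resource name assumed to be 'tagsets'):

import nltk
nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')   # prints the definition and examples for the JJ (adjective) tag
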
Named Entity Recognition

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = "Josh works for Twitter in California."
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
for entity in entities:
    if hasattr(entity, 'label'):
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))

Output

PERSON Josh
GPE Twitter
GPE California
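
ne_chunk also accepts binary=True, which collapses all entity types into a single NE label when only the spans matter; a small optional sketch reusing the tagged tokens above:

binary_entities = nltk.chunk.ne_chunk(tagged, binary=True)
for entity in binary_entities:
    if hasattr(entity, 'label'):
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))
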
Pos Tagging via HMM

import nltk
nltk.download('brown')
from nltk.corpus import brown

def train_hmm_tagger():
    tagged_sentence = brown.tagged_sents(categories='news')
    size = int(len(tagged_sentence) * 0.9)
    trained_sents = tagged_sentence[:size]
    test_sents = tagged_sentence[size:]
    symbols = set([word for sentence in tagged_sentence for word, _ in sentence])
    states = set([tag for sentence in tagged_sentence for _, tag in sentence])
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
    hmm_tagger = trainer.train_supervised(trained_sents)
    return hmm_tagger

def pos_tag_sentence(sentence, hmm_tagger):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = hmm_tagger.tag(tokens)
    return tagged_tokens

hmm_tagger = train_hmm_tagger()
sentence = input("Enter the sentence to be tagged?")
tagged = pos_tag_sentence(sentence, hmm_tagger)
print(tagged)

Output

Enter the sentence to be tagged?
The sky is so beautiful.

[('The', 'AT'), ('sky', 'NN'), ('is', 'BEZ'), ('so', 'QL'), ('beautiful', 'JJ')]
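
train_hmm_tagger sets aside test_sents but never scores the tagger; a small optional sketch that re-derives the same 90/10 split and evaluates on a short slice (Viterbi tagging of the full held-out set is slow; on older NLTK use evaluate instead of accuracy):

tagged_sentence = brown.tagged_sents(categories='news')
size = int(len(tagged_sentence) * 0.9)
test_sents = tagged_sentence[size:]
print(hmm_tagger.accuracy(test_sents[:50]))   # fraction of tokens tagged correctly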

Chatbot

import nltk
from nltk.chat.util import Chat, reflections

pairs = [[r"Hello|hi|hey|hola",
          ["Hello, I am Aura, your AI assistant. How may I help you?"]],
         [r"How are you|How are you doing",
          ["I'm good, how about you?"]],
         [r"What song always gets you in a good mood?",
          ['"Happy" by Pharrell Williams never fails to put a smile on my face.']],
         [r"Suggest a trending song",
          ['Good 4 U by Olivia Rodrigo',
           'Montero (Call Me By Your Name) by Lil Nas X',
           'Save Your Tears by The Weeknd',
           'Levitating by Dua Lipa']],
         [r"quit", ["Good bye"]],
         [r"(.*)", ["Could you try again?"]]]

bot = Chat(pairs, reflections)
bot.converse()

Output

>hi
Hello, I am Aura, your AI assistant. How may I help you?
>how are you
I'm good, how about you?
>What song always gets you in a good mood?
"Happy" by Pharrell Williams never fails to put a smile on my face.
>Suggest a trending song
Save Your Tears by The Weeknd
>bye
Good bye
TEXT CLASSIFICATION USING LOGISTIC REGRESSION

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def preprocess(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [word_tokenize(sentence) for sentence in text]
    filtered_words = [[ps.stem(word) for word in tokenized
                       if word not in stop_words and word.isalpha()]
                      for tokenized in words]
    filtered_sentences = [' '.join(sentence) for sentence in filtered_words]
    return filtered_sentences

sentences = ["The food is tasty", "the quality of food is low",
             "i will never recommend their food",
             "I got sick after having their food",
             "I was in cloudnine after tasting their food",
             "My favourite is their desserts",
             "the food was not cooked properly"]
classes = [1, 0, 0, 0, 1, 1, 0]
test_sentences = ["food is not cooked properly", "I feel sick after having food",
                  "I love their desserts", "was in cloudnine after tasting their food"]

vectorizer = CountVectorizer()
sentences = preprocess(sentences)
vect1 = vectorizer.fit_transform(sentences)
# Splitting data for testing
# train_data, test_data, train_labels, test_labels = train_test_split(vect1, classes, test_size=0.2, random_state=42)
nb = LogisticRegression()
nb.fit(vect1, classes)
test_sentences = preprocess(test_sentences)
vect2 = vectorizer.transform(test_sentences)
pred_classes = nb.predict(vect2)
print(pred_classes)

Output

[0 0 1 1]
TEXT CLASSIFICATION USING NAÏVE BAYES

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')    # needed by word_tokenize
nltk.download('wordnet')  # needed by WordNetLemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens
              if token not in stop_words and token.isalpha()]
    return dict(nltk.FreqDist(tokens))

pos_reviews = [(movie_reviews.raw(fileid), 'positive') for fileid in movie_reviews.fileids('pos')]
neg_reviews = [(movie_reviews.raw(fileid), 'negative') for fileid in movie_reviews.fileids('neg')]
tot_rev = pos_reviews + neg_reviews
processed_data = [(preprocess(text), category) for (text, category) in tot_rev]
train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

classifier = NaiveBayesClassifier.train(train_data)
new_text = ["The movie was amazing", "the movie was terrible", "The movie was awful"]
for text in new_text:
    new_features = preprocess(text)
    predicted_category = classifier.classify(new_features)
    print(f"The predicted category for '{text}' is '{predicted_category}'")

Output

The predicted category for 'The movie was amazing' is 'positive'
The predicted category for 'the movie was terrible' is 'negative'
The predicted category for 'The movie was awful' is 'negative'
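
The split above produces val_data that is never used; a small optional sketch scoring the classifier on it and listing its most informative features:

from nltk.classify import accuracy

print("Validation accuracy:", accuracy(classifier, val_data))
classifier.show_most_informative_features(10)   # top features by likelihood ratio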
