1. Convert the text into tokens and find the word frequency
from collections import Counter
import re
def tokenize(text):
    # Lowercase the text and split it into word tokens
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens
def word_frequency(tokens):
    # Count how often each token occurs
    frequency = Counter(tokens)
    return frequency
text = "This is a simple text. This text is for testing the word frequency program. This is simple."
tokens = tokenize(text)
frequency = word_frequency(tokens)
for word, count in frequency.items():
    print(f"{word}: {count}")
2. Perform Lemmatization and Stemming
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
def tokenize(text):
    tokens = word_tokenize(text.lower())
    return tokens
def lemmatize(tokens):
    # Reduce each token to its dictionary (lemma) form
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens
def stem(tokens):
    # Strip affixes using the Porter stemming algorithm
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
text = "The striped bats are hanging on their feet for best."
tokens = tokenize(text)
lemmatized_tokens = lemmatize(tokens)
print("Lemmatized Tokens:")
print(lemmatized_tokens)
stemmed_tokens = stem(tokens)
print("\nStemmed Tokens:")
print(stemmed_tokens)
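Note that WordNetLemmatizer treats every token as a noun by default, so verbs such as "hanging" are not reduced to "hang". A minimal sketch that passes a part-of-speech hint (here simply 'v' for every token, purely for illustration; a full pipeline would map pos_tag output to WordNet POS codes):
lemmatizer = WordNetLemmatizer()
# pos='v' lemmatizes each token as a verb, e.g. "hanging" -> "hang"
verb_lemmas = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
print("Verb-lemmatized Tokens:")
print(verb_lemmas)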
3. Implement Bi-gram
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
nltk.download('punkt')
def generate_bigrams(text):
    # Tokenize the lowercased text and pair up adjacent tokens
    tokens = word_tokenize(text.lower())
    bigram_list = list(bigrams(tokens))
    return bigram_list
text = "The striped bats are hanging on their feet for best."
bigrams_result = generate_bigrams(text)
print("Bigrams:")
for bigram in bigrams_result:
    print(bigram)
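The same idea generalizes to higher-order n-grams through nltk.util.ngrams. A minimal sketch for trigrams, reusing the text above:
from nltk.util import ngrams

tokens = word_tokenize(text.lower())
# n=3 gives trigrams; any window size n can be passed
trigrams_result = list(ngrams(tokens, 3))
print("Trigrams:")
for trigram in trigrams_result:
    print(trigram)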
4. Identify parts of speech using the Penn Treebank tag set
!pip install nltk
import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
sentence = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(sentence)
tagged_words = pos_tag(words)
print("Sentence:", sentence)
print("Tagged Words:", tagged_words)
POS tags defined in the Penn Treebank tag set
1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP$ Possessive wh-pronoun
36. WRB Wh-adverb
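The same tag definitions can also be printed directly from NLTK. A minimal sketch, assuming the 'tagsets' resource is available for download in your environment:
import nltk
nltk.download('tagsets')
# Print the definition and example words for a single tag...
nltk.help.upenn_tagset('JJ')
# ...or for a family of tags matched by a regular expression
nltk.help.upenn_tagset('VB.*')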
5. Implement HMM for POS tagging and Build a Chunker
import nltk
from nltk.corpus import treebank
from collections import defaultdict
nltk.download('treebank')
tagged_sentences = treebank.tagged_sents()
states = set()
observations = set()
for sentence in tagged_sentences:
    for word, tag in sentence:
        states.add(tag)
        observations.add(word.lower())
states = list(states)
observations = list(observations)
transition_counts = defaultdict(lambda: defaultdict(int))
for sentence in tagged_sentences:
    prev_tag = "<s>"
    for word, tag in sentence:
        transition_counts[prev_tag][tag] += 1
        prev_tag = tag
    transition_counts[prev_tag]["</s>"] += 1
transition_probabilities = defaultdict(lambda: defaultdict(float))
for prev_tag, next_tags in transition_counts.items():
    total_count = sum(next_tags.values())
    for next_tag, count in next_tags.items():
        transition_probabilities[prev_tag][next_tag] = count / total_count
emission_counts = defaultdict(lambda: defaultdict(int))
for sentence in tagged_sentences:
    for word, tag in sentence:
        emission_counts[tag][word.lower()] += 1
emission_probabilities = defaultdict(lambda: defaultdict(float))
for tag, words in emission_counts.items():
    total_count = sum(words.values())
    for word, count in words.items():
        emission_probabilities[tag][word] = count / total_count
initial_counts = defaultdict(int)
for sentence in tagged_sentences:
    initial_counts[sentence[0][1]] += 1
initial_probabilities = defaultdict(float)
total_count = sum(initial_counts.values())
for tag, count in initial_counts.items():
    initial_probabilities[tag] = count / total_count
def viterbi(observations, states, start_p, trans_p, emit_p):
    V = [{}]
    path = {}
    # Initialisation: probability of starting in each state and emitting the first word
    for state in states:
        V[0][state] = start_p[state] * emit_p[state].get(observations[0], 0)
        path[state] = [state]
    # Recursion: extend the best path to each state at time t
    for t in range(1, len(observations)):
        V.append({})
        newpath = {}
        for y in states:
            (prob, state) = max((V[t-1][y0] * trans_p[y0].get(y, 0) * emit_p[y].get(observations[t], 0), y0)
                                for y0 in states)
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath
    # Termination: pick the best final state and return its path
    n = len(observations) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return (prob, path[state])
sentence = "the quick brown fox jumps over the lazy dog"
observations = sentence.lower().split()
prob, tags = viterbi(observations, states, initial_probabilities, transition_probabilities, emission_probabilities)
print("Sentence:", sentence)
print("Tags:", tags)
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser
def chunk_sentence(sentence, grammar):
    words = word_tokenize(sentence)
    tagged = pos_tag(words)
    parser = RegexpParser(grammar)
    chunks = parser.parse(tagged)
    return chunks
grammar = r"""
NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
PP: {<IN><NP>}               # Chunk prepositions followed by NP
CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""
sentence = "The quick brown fox jumps over the lazy dog"
chunks = chunk_sentence(sentence, grammar)
print(chunks)
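The parser returns an nltk.Tree, so individual chunks can be pulled out by label. A minimal sketch that lists only the noun-phrase chunks found above:
# Iterate over NP subtrees and print the words each one covers
for subtree in chunks.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))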
6. Find the synonym of a word and antonym of a word using WordNet
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms
def get_antonyms(word):
    antonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return antonyms
word = "happy"
synonyms = get_synonyms(word)
antonyms = get_antonyms(word)
print(f"Synonyms of '{word}':", synonyms)
print(f"Antonyms of '{word}':", antonyms)
7. Implement semantic role labeling to identify named entities (Same as Program-9)
Open the Command Prompt
Install Python 3.12.4
Change to the directory that contains your program files
C:\> pip install spacy
C:\> python -m spacy download en_core_web_sm
C:\> python --version
C:\> pip --version
C:\> pip show spacy
C:\> python -c "import spacy"   (verifies that spaCy can be imported)
C:\> pip install jupyter
C:\> jupyter notebook
• This will open the Jupyter environment
• Create a file named "test_spacy.ipynb"
Paste the following code into a single cell
import spacy
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")
# Example text
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
# Process the text
doc = nlp(text)
# Extract named entities
named_entities = [(ent.text, ent.label_) for ent in doc.ents]
# Print named entities
print("Named Entities:", named_entities)
Alternatively, save the same code as test_spacy.py and run it from the prompt: C:\> python test_spacy.py
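To make the entity labels easier to read, spaCy can expand each label code into a short description. A minimal sketch that extends the cell above:
# spacy.explain maps label codes such as 'GPE' or 'DATE' to plain-English descriptions
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))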
8. Implement POS tagging using LSTM
!pip install tensorflow
import tensorflow as tf
print(tf.__version__)
!pip install numpy tensorflow keras scikit-learn nltk
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, TimeDistributed
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
nltk.download('treebank')
nltk.download('punkt')
def load_data():
    sentences = treebank.sents()
    tags = treebank.tagged_sents()
    words = [word.lower() for sentence in sentences for word in sentence]
    tags_flattened = [tag for sentence in tags for _, tag in sentence]
    unique_words = sorted(set(words))
    unique_tags = sorted(set(tags_flattened))
    # Reserve index 0 for padding and index 1 for unknown words
    word_to_index = {word: i + 2 for i, word in enumerate(unique_words)}
    tag_to_index = {tag: i + 1 for i, tag in enumerate(unique_tags)}
    word_to_index['<PAD>'] = 0
    word_to_index['<UNK>'] = 1
    tag_to_index['<PAD>'] = 0
    index_to_word = {i: word for word, i in word_to_index.items()}
    index_to_tag = {i: tag for tag, i in tag_to_index.items()}
    # Lowercase the lookup so it matches the lowercased vocabulary built above
    X = [[word_to_index.get(word.lower(), 1) for word in sentence] for sentence in sentences]
    y = [[tag_to_index[tag] for word, tag in sentence] for sentence in tags]
    # Pad every sentence to the same length and one-hot encode the tags
    max_len = max(len(sentence) for sentence in X)
    X = pad_sequences(X, maxlen=max_len, padding='post')
    y = pad_sequences(y, maxlen=max_len, padding='post')
    y = to_categorical(y, num_classes=len(tag_to_index))
    return X, y, word_to_index, tag_to_index, index_to_word, index_to_tag
# Load data and mappings
X, y, word_to_index, tag_to_index, index_to_word, index_to_tag = load_data()
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = Sequential()
model.add(Embedding(input_dim=len(word_to_index), output_dim=50, input_length=X_train.shape[1]))
model.add(LSTM(units=100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
model.add(TimeDistributed(Dense(len(tag_to_index), activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.1, verbose=1)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
# Predict POS tags for a new sentence
def predict_pos(sentence):
    tokens = word_tokenize(sentence.lower())
    # Map each token to its index (1 = <UNK> for out-of-vocabulary words)
    indices = [word_to_index.get(token, 1) for token in tokens]
    indices = pad_sequences([indices], maxlen=X_train.shape[1], padding='post')
    predictions = model.predict(indices)
    # Take the highest-probability tag at each timestep; zip truncates to the real tokens
    predicted_tags = [index_to_tag[np.argmax(tag)] for tag in predictions[0]]
    return list(zip(tokens, predicted_tags))
# Example usage
sentence = "Barack Obama was born in Hawaii."
print(predict_pos(sentence))
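Because every sentence is padded to the same length, the reported accuracy also counts the easy <PAD> positions. One common mitigation is to let the embedding layer mask padded timesteps; a minimal sketch of an alternative model definition under the same data setup (not part of the program above):
model = Sequential()
# mask_zero=True tells the downstream layers to ignore timesteps whose input index is 0 (<PAD>)
model.add(Embedding(input_dim=len(word_to_index), output_dim=50, mask_zero=True))
model.add(LSTM(units=100, return_sequences=True))
model.add(TimeDistributed(Dense(len(tag_to_index), activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])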
10. Develop a movie review system (sentiment analysis on movie data)
!pip install numpy pandas scikit-learn nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
from nltk.corpus import movie_reviews
from sklearn.datasets import load_files
# Load the movie reviews dataset from NLTK
def load_movie_reviews():
    reviews = []
    labels = []
    for fileid in movie_reviews.fileids():
        category = fileid.split('/')[0]  # 'pos' or 'neg'
        with movie_reviews.open(fileid) as f:
            review = f.read()
        reviews.append(review)
        labels.append(category)
    return pd.DataFrame({'review': reviews, 'sentiment': labels})
data = load_movie_reviews()
# Map sentiments to binary labels
data['sentiment'] = data['sentiment'].map({'pos': 1, 'neg': 0})
# Split the data into training and testing sets
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build a pipeline with CountVectorizer and MultinomialNB
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Converts text to feature vectors
    ('classifier', MultinomialNB())     # Naive Bayes classifier
])
# Train the model
pipeline.fit(X_train, y_train)
# Make predictions on the test set
y_pred = pipeline.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))
def predict_sentiment(review):
    prediction = pipeline.predict([review])
    sentiment = 'Positive' if prediction[0] == 1 else 'Negative'
    return sentiment
# Example usage
new_review = "The movie was fantastic! I really enjoyed the performances."
print(f"Review: {new_review}")
print(f"Predicted Sentiment: {predict_sentiment(new_review)}")