Natural Language Processing
Lab Assignment
R.BhanuKiran
22BCE9560
L45+L46
1. Implement Relation Extraction from a corpus
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

# Download necessary resources
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

def extract_relations(text):
    """Extract subject-relation-object (SRO) triples from a given text."""
    relations = []

    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)

        subject, relation, obj = None, None, None

        for token, tag in pos_tags:
            if tag.startswith('NN') and not subject:
                subject = token  # First noun is assumed to be the subject
            elif tag.startswith('VB') and subject and not relation:
                relation = token  # First verb after the subject is the relation
            elif tag.startswith('NN') and subject and relation:
                obj = token  # Next noun after the verb is the object
                relations.append({'sentence': sentence, 'subject': subject,
                                  'relation': relation, 'object': obj})
                subject, relation, obj = None, None, None  # Reset for a potential next triple

    return pd.DataFrame(relations, columns=['sentence', 'subject', 'relation', 'object'])

corpus = [
    "Elon Musk founded SpaceX.",
    "NASA launched the Artemis mission.",
    "OpenAI developed ChatGPT.",
    "Apple designs innovative products."
]

results = pd.concat([extract_relations(text) for text in corpus], ignore_index=True)

if not results.empty:
    print(results.to_string(index=False))
else:
    print("No valid subject-relation-object triples found.")
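As an illustrative check of the heuristic (hedged; the exact tags depend on the installed NLTK tagger), a single sentence from the corpus should yield one triple:

# Illustrative only: with the usual Penn Treebank tags ('Elon'/NNP,
# 'founded'/VBD, 'SpaceX'/NNP), one row is expected.
print(extract_relations("Elon Musk founded SpaceX.").to_string(index=False))
# Expected: subject 'Elon', relation 'founded', object 'SpaceX'
# (only the first noun is kept as the subject, so 'Musk' is dropped).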
Output:
2. Implement Event Extraction from the same corpus
import nltk
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)
nltk.download('maxent_ne_chunker_tab', quiet=True)

def extract_events(text):
    """Extract events with participants, time, and location."""
    events = []
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        named_entities = nltk.ne_chunk(pos_tags)
        time = extract_time(sentence)
        # Map each entity label (PERSON, ORGANIZATION, GPE, ...) to its text
        entities = {
            chunk.label(): ' '.join(w for w, _ in chunk.leaves())
            for chunk in named_entities if hasattr(chunk, 'label')
        }
        for word, tag in pos_tags:
            if tag.startswith('VB') and tag not in ['VBG', 'VBN']:
                events.append({
                    'sentence': sentence,
                    'event': word,
                    'category': categorize_event(word),
                    'participants': entities.get('PERSON', '') or entities.get('ORGANIZATION', ''),
                    'time': time,
                    'location': entities.get('GPE', '')
                })
    return pd.DataFrame(events, columns=['sentence', 'event', 'category',
                                         'participants', 'time', 'location'])

def extract_time(text):
    """Extract time-related expressions."""
    patterns = [
        r'\b\d{4}\b',
        r'\b(yesterday|today|tomorrow)\b',
        r'\b(last|next|this)\s+\w+\b'
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group()
    return None

def categorize_event(verb):
    """Classify events based on verb meaning."""
    event_categories = {
        "Movement": {"go", "travel", "arrive", "leave"},
        "Communication": {"say", "announce", "report", "declare"},
        "Transaction": {"buy", "sell", "purchase", "acquire"},
        "Creation": {"make", "build", "develop", "invent"}
    }
    for category, verbs in event_categories.items():
        if verb.lower() in verbs:
            return category
    return "Other"

corpus = [
    "OpenAI developed a new language model last year.",
    "NASA announced a Mars mission yesterday.",
    "Tesla will launch a self-driving update next month in California.",
    "Microsoft acquired a gaming company in 2020.",
]

results = pd.concat([extract_events(text) for text in corpus], ignore_index=True)

if not results.empty:
    print(results.to_string(index=False))
else:
    print("No significant events extracted.")
Output:
3. Design Rule-based, Dictionary-based, and Machine Learning-based NER taggers
import nltk
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

corpus = [
    "Apple Inc. was founded by Steve Jobs in California in 1976.",
    "Microsoft developed Windows operating system in the United States.",
    "Amazon CEO Jeff Bezos visited their new headquarters in Seattle last week.",
    "Tesla's Elon Musk announced a new factory in Berlin, Germany.",
    "The European Union and the United Kingdom signed a trade deal in December 2020.",
    "Dr. Smith prescribed medication for John's condition at Mayo Clinic."
]

def rule_based_ner(text):
    """Extract named entities using regex-based rules."""
    entities = []
    patterns = {
        r'\b([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*) (Inc\.|Corp\.|Ltd\.|LLC|Company)\b': "ORG",
        r'\b(Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.) ([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*)\b': "PERSON",
        r'\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}(st|nd|rd|th)?, \d{4}\b': "DATE"
    }
    for pattern, label in patterns.items():
        matches = re.finditer(pattern, text)
        for match in matches:
            entities.append({"text": match.group(), "label": label, "method": "rule-based"})
    return entities

class DictionaryNER:
    """Named Entity Recognition using a pre-defined dictionary."""
    def __init__(self):
        self.dictionary = {
            "apple": "ORG", "microsoft": "ORG", "amazon": "ORG", "tesla": "ORG",
            "steve jobs": "PERSON", "jeff bezos": "PERSON", "elon musk": "PERSON",
            "california": "LOC", "seattle": "LOC", "berlin": "LOC", "germany": "LOC"
        }

    def find_entities(self, text):
        """Find named entities using dictionary-based matching."""
        entities = []
        text_lower = text.lower()
        for entity, label in self.dictionary.items():
            if entity in text_lower:
                start = text_lower.find(entity)
                end = start + len(entity)
                entities.append({"text": text[start:end], "label": label, "method": "dictionary-based"})
        return entities

class MLBasedNER:
    """NER using Machine Learning with Logistic Regression."""
    def __init__(self):
        self.vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 5))
        self.model = LogisticRegression(max_iter=1000)
        self.is_trained = False

    def prepare_training_data(self, texts):
        """Prepare tokenized data with POS tagging and named-entity labels."""
        X, y = [], []
        for text in texts:
            tokens = word_tokenize(text)
            pos_tags = nltk.pos_tag(tokens)
            named_entities = nltk.ne_chunk(pos_tags)
            for token, pos in pos_tags:
                X.append(token)
                label = "O"
                for ne in named_entities:
                    if hasattr(ne, 'label') and isinstance(ne, nltk.tree.Tree):
                        for word, tag in ne.leaves():
                            if word == token:
                                label = ne.label()
                                break
                y.append(label)
        return X, y

    def train(self, texts):
        """Train the ML-based NER model."""
        X, y = self.prepare_training_data(texts)
        if len(set(y)) < 2:
            print("Not enough data to train ML model.")
            return
        X_features = self.vectorizer.fit_transform(X)
        self.model.fit(X_features, y)
        self.is_trained = True

    def predict(self, text):
        """Predict named entities in new text using the trained model."""
        if not self.is_trained:
            return []
        tokens = word_tokenize(text)
        X_test = self.vectorizer.transform(tokens)
        predicted_labels = self.model.predict(X_test)
        entities = [
            {"text": token, "label": label, "method": "ml-based"}
            for token, label in zip(tokens, predicted_labels) if label != 'O'
        ]
        return entities

dictionary_ner = DictionaryNER()
ml_ner = MLBasedNER()
ml_ner.train(corpus)

for sentence in corpus:
    print("Sentence:", sentence)
    print("Rule-based NER:", rule_based_ner(sentence))
    print("Dictionary-based NER:", dictionary_ner.find_entities(sentence))
    print("ML-based NER:", ml_ner.predict(sentence))
    print("-" * 80)
Output:
4. Implement a custom NER tagger for the Week-8 related programs and display the output
Code-I
import nltk
import networkx as nx
import matplotlib.pyplot as plt
from nltk.corpus import brown
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from nltk.chunk import ne_chunk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def build_dependency_graph():
    """Build and plot a weighted dependency graph for two example sentences."""
    edges = [
        ("root", "Book", 12), ("root", "that", 4), ("root", "flight", 4),
        ("Book", "that", 5), ("that", "Book", 6), ("that", "flight", 8),
        ("flight", "that", 7), ("flight", "Book", 5), ("Book", "flight", 7),
        ("root", "John", 9), ("root", "saw", 10), ("root", "Mary", 9),
        ("John", "saw", 20), ("saw", "John", 30), ("saw", "Mary", 30),
        ("Mary", "saw", 0), ("John", "Mary", 3), ("Mary", "John", 11)
    ]
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(edges)

    plt.figure(figsize=(10, 6))
    pos = nx.spring_layout(graph)
    nx.draw(graph, pos, with_labels=True, node_color='lightblue',
            edge_color='gray', node_size=2000, font_size=10)
    edge_labels = {(u, v): d for u, v, d in edges}
    nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels, font_size=8)
    plt.title("Dependency Graph using CLE Algorithm")
    plt.show()
    return graph

def extract_sensitive_words():
    """Scan a sample of the Brown corpus for keywords from sensitive categories."""
    sensitive_categories = {
        "personal": ["name", "address", "phone", "email", "gender", "age"],
        "financial": ["salary", "income", "credit", "loan", "bank"],
        "social": ["friends", "family", "community", "relationship"]
    }
    brown_sample = brown.words()[:5000]
    detected_words = defaultdict(list)
    for word in brown_sample:
        for category, keywords in sensitive_categories.items():
            if word.lower() in keywords:
                detected_words[category].append(word)
    return detected_words

def map_sensitivity_scores(detected_words):
    """Attach a numeric sensitivity score to each detected word."""
    score_mapping = {"personal": 5, "financial": 4, "social": 3}
    scored_words = []
    for category, words in detected_words.items():
        for word in words:
            scored_words.append((word, category, score_mapping[category]))
    return scored_words

def retrieve_similar_words(detected_words):
    """Collect WordNet synonyms for the detected sensitive words."""
    synonyms = defaultdict(set)
    for category, words in detected_words.items():
        for word in words:
            synsets = wordnet.synsets(word)
            for syn in synsets:
                for lemma in syn.lemmas():
                    synonyms[category].add(lemma.name().replace('_', ' '))
    return synonyms

def named_entity_recognition(text):
    """Run NLTK's NE chunker and group the entities by type."""
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    named_entities = ne_chunk(pos_tags)
    persons, organizations, locations = set(), set(), set()
    for chunk in named_entities:
        if hasattr(chunk, "label"):
            entity_name = " ".join(c[0] for c in chunk)
            if chunk.label() == "PERSON":
                persons.add(entity_name)
            elif chunk.label() == "ORGANIZATION":
                organizations.add(entity_name)
            elif chunk.label() == "GPE":
                locations.add(entity_name)
    return {"PERSON": persons, "ORGANIZATION": organizations, "LOCATION": locations}

dependency_graph = build_dependency_graph()
sensitive_words = extract_sensitive_words()
scored_words = map_sensitivity_scores(sensitive_words)
similar_sensitive_words = retrieve_similar_words(sensitive_words)

sample_text = "John works at Google and lives in New York."
ner_results = named_entity_recognition(sample_text)

print("Detected Sensitive Words:")
for word, category, score in scored_words:
    print(f"{word} - {category} (Score: {score})")

print("\nWords Similar to Sensitive Words:")
for category, words in similar_sensitive_words.items():
    print(f"{category}: {', '.join(words)}")

print("\nNamed Entities Identified:")
for entity_type, entities in ner_results.items():
    print(f"{entity_type}: {', '.join(entities)}")
Code-II
import nltk
import pandas as pd
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec

# Download necessary NLTK data
nltk.download("gutenberg")
nltk.download("punkt")

def extract_sensitive_words(text, file_name):
    """Extract words from the text that belong to a sensitive category."""
    sensitive_categories = {
        "personal": ["name", "age", "address", "gender", "identity"],
        "financial": ["bank", "credit", "debt", "loan", "salary"],
        "social": ["friend", "family", "community", "social", "relationship"]
    }
    words = set(nltk.word_tokenize(text.lower()))
    extracted_words = [(word, category, file_name)
                       for word in words
                       for category, word_list in sensitive_categories.items()
                       if word in word_list]
    return extracted_words

def process_corpus():
    """Run sensitive-word extraction over the selected Gutenberg files."""
    corpus_files = ['bible-kjv.txt', 'shakespeare-hamlet.txt']
    extracted_data = []
    for file in corpus_files:
        corpus_text = gutenberg.raw(file)
        extracted_data.extend(extract_sensitive_words(corpus_text, file))
    return extracted_data

def assign_sensitivity_score(extracted_data):
    """Attach a numeric sensitivity score to each extracted word."""
    scores = {"personal": 5, "financial": 4, "social": 3}
    return [(word, category, file_name, scores[category])
            for word, category, file_name in extracted_data]

def find_similar_words(corpus, target_words):
    """Train Word2Vec on the corpus and return the most similar words."""
    sentences = [nltk.word_tokenize(sent.lower()) for sent in nltk.sent_tokenize(corpus)]
    if len(sentences) < 2:
        print("Not enough sentences for Word2Vec training.")
        return {}
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    similar_words = {}
    for word in target_words:
        if word in model.wv.index_to_key:
            similar_words[word] = model.wv.most_similar(word, topn=5)
    return similar_words

if __name__ == "__main__":
    extracted_sensitive_words = process_corpus()
    sensitivity_scores = assign_sensitivity_score(extracted_sensitive_words)
    target_words = list(set(word for word, _, _, _ in sensitivity_scores))

    bible_corpus = gutenberg.raw('bible-kjv.txt')
    similar_words = find_similar_words(bible_corpus, target_words)

    df_sensitive_words = pd.DataFrame(sensitivity_scores,
                                      columns=["Word", "Category", "File Name", "Sensitivity Score"])
    df_sensitive_words.drop_duplicates(inplace=True)

    print("Extracted Sensitive Words:")
    print(df_sensitive_words.head(20).to_markdown(index=False))

    print("\nSimilar Words:")
    for word, similar_list in similar_words.items():
        print(f"{word}: {[sim_word for sim_word, _ in similar_list]}")
Output: