1. Read the paragraph and obtain the frequency of words.
code:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
paragraph = "Sukumar is good at coding and practicing a lot of problems in leetcode. Sukumar is a very nice guy."
words = word_tokenize(paragraph)
fdist = FreqDist(words)
for word, frequency in fdist.items():
    print(f"{word}: {frequency}")
2. Write a program to split a document into sentences.
code:
import nltk
from nltk.tokenize import sent_tokenize
# Sample document
document = "sukumar is good boy. Sukumar in vitap"
# Tokenize the document into sentences
sentences = sent_tokenize(document)
# Print each sentence
for sentence in sentences:
    print(sentence)
3. Perform tokenizing and stemming by reading the input string.
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Sample input string
input_string = "i am running"
# Tokenize the input string into words
words = word_tokenize(input_string)
# Initialize the PorterStemmer
stemmer = PorterStemmer()
# Perform stemming on each word
stemmed_words = [stemmer.stem(word) for word in words]
# Print the original words and their stemmed forms
for original, stemmed in zip(words, stemmed_words):
print(f"{original} -> {stemmed}")
4. Remove the stopwords and rare words from the document.
code:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Sample document
document = "running in the forest is most dangerous than any ting in world of
human. sukumar sukumar hero model model run"
# Tokenize the document into words
words = word_tokenize(document)
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
# Calculate the frequency distribution of words
fdist = FreqDist(filtered_words)
# Define a threshold for rare words (e.g., words that occur less than 2 times)
rare_words = [word for word, frequency in fdist.items() if frequency < 2]
# Remove rare words from the filtered words
filtered_words = [word for word in filtered_words if word not in rare_words]
# Join the filtered words back into a document
filtered_document = ' '.join(filtered_words)
print(filtered_document)
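FreqDist also provides hapaxes(), which returns the words occurring exactly once, so the rare-word step above (threshold of 2) could equivalently be written as this minimal alternative sketch reusing fdist:
# hapaxes() lists the words with frequency 1, i.e. the rare words under a threshold of 2
rare_words = set(fdist.hapaxes())
filtered_words = [word for word in filtered_words if word not in rare_words]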
5. Find the parts of speech in the document.
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Sample document
document = "NLTK is a leading platform for building Python programs. It provides
easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet."
# Tokenize the document into words
words = word_tokenize(document)
# Perform part-of-speech tagging
pos_tags = pos_tag(words)
# Print the part-of-speech tags
for word, tag in pos_tags:
    print(f"{word}: {tag}")
6. Write a program to read the words from a string variable/text and perform tokenizing and Lancaster stemming by reading the input string.
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
# Sample input string
input_string = "NLTK is a leading platform for building Python programs."
# Tokenize the input string into words
words = word_tokenize(input_string)
# Initialize the LancasterStemmer
stemmer = LancasterStemmer()
# Perform stemming on each word
stemmed_words = [stemmer.stem(word) for word in words]
# Print the original words and their stemmed forms
for original, stemmed in zip(words, stemmed_words):
print(f"{original} -> {stemmed}")
7. N-grams
code:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
import re
s= """Natural language processing is the ability of a computer program to
understand
human language as it is spoken and written referred to as natural language. It is
a
component of Artificial intelligence."""
s = [Link]()
s = [Link](r'[^a-zA-Z0-9\s]',' ',s)
tokens = [token for token in [Link](" ") if token!=""]
output = list(ngrams(tokens, 5))
print(output)
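The same ngrams() helper yields bigrams and trigrams by changing n; a short sketch reusing the tokens list above:
# Generate bigrams and trigrams from the same token list
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))
print(bigrams[:5])
print(trigrams[:5])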
10. Unigram, Bigram, and Trigram taggers
code:
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
# Download the Treebank corpus if not already downloaded
nltk.download('treebank')
# Get tagged sentences from the Treebank corpus
tagged_sentences = treebank.tagged_sents()
# Split the tagged sentences into train and test sets
train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]
# Train Unigram, Bigram, and Trigram taggers
unigram_tagger = UnigramTagger(train_sents)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)
# Evaluate the taggers on the test set
print(f"Unigram tagger accuracy: {unigram_tagger.evaluate(test_sents)}")
print(f"Bigram tagger accuracy: {bigram_tagger.evaluate(test_sents)}")
print(f"Trigram tagger accuracy: {trigram_tagger.evaluate(test_sents)}")
# Tag a sample sentence
sentence = "Barack Obama was born in Hawaii."
words = nltk.word_tokenize(sentence)
tags = trigram_tagger.tag(words)
print(tags)
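To avoid None tags for words never seen in training, a DefaultTagger can sit at the bottom of the backoff chain; a minimal sketch (the 'NN' fallback tag is an arbitrary choice):
from nltk.tag import DefaultTagger
# Tag every otherwise-unknown word as a noun
default_tagger = DefaultTagger('NN')
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)
print(trigram_tagger.tag(nltk.word_tokenize("Barack Obama was born in Hawaii.")))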
11. Affix Tagger
code:
import nltk
from nltk.corpus import treebank
from nltk.tag import AffixTagger
# Download the Treebank corpus if not already downloaded
nltk.download('treebank')
# Get tagged sentences from the Treebank corpus
tagged_sentences = treebank.tagged_sents()
# Split the tagged sentences into train and test sets
train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]
# Specify the affix tagger parameters (a positive affix_length uses word prefixes)
prefix_length = 3
min_stem_length = 2
# Train an affix tagger on word prefixes
affix_tagger = AffixTagger(train_sents, affix_length=prefix_length,
                           min_stem_length=min_stem_length)
# Tag a sample sentence
sentence = "Barack Obama was born in Hawaii."
words = nltk.word_tokenize(sentence)
tags = affix_tagger.tag(words)
print(tags)
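The affix tagger can be scored on the held-out sentences in the same way as the n-gram taggers; a short sketch using the test_sents split above:
# Evaluate the affix tagger on the test set
print(f"Affix tagger accuracy: {affix_tagger.evaluate(test_sents)}")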
12. Dependency parser
code:
import nltk
# Define a simple context-free grammar for parsing
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det N | N
VP -> V NP | V
Det -> 'the'
N -> 'dog' | 'cat' | 'man' | 'ball'
V -> 'chased' | 'saw' | 'caught'
""")
# Input sentences
input_sentences = ['the dog chased the cat', 'the man saw the ball']
# Create a chart parser
parser = nltk.ChartParser(grammar)
# Iterate over input sentences
for sent in input_sentences:
    # Tokenize the sentence
    tokens = nltk.word_tokenize(sent)
    # Parse the sentence with the chart parser
    for tree in parser.parse(tokens):
        # Print the original sentence
        print("Input Sentence:", sent)
        # Print the resulting parse tree
        print("Parse Tree:")
        print(tree)
        print()
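Because the chart parser above produces constituency trees, an actual dependency parse needs a dependency grammar. A minimal sketch using NLTK's DependencyGrammar and ProjectiveDependencyParser, with hand-written head -> dependent rules chosen (as an assumption) to cover the toy vocabulary above:
from nltk.grammar import DependencyGrammar
from nltk.parse import ProjectiveDependencyParser
# Hand-written head -> dependent rules for the toy sentences
dep_grammar = DependencyGrammar.fromstring("""
'chased' -> 'dog' | 'cat'
'saw' -> 'man' | 'ball'
'dog' -> 'the'
'cat' -> 'the'
'man' -> 'the'
'ball' -> 'the'
""")
dep_parser = ProjectiveDependencyParser(dep_grammar)
for sent in input_sentences:
    print("Input Sentence:", sent)
    for dep_tree in dep_parser.parse(sent.split()):
        print(dep_tree)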
13. Chunk parsing
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
text = "The quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
chunk_grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP followed by VP
"""
chunk_parser = nltk.RegexpParser(chunk_grammar)
chunks = chunk_parser.parse(pos_tags)
print(chunks)
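Individual chunks can be pulled out of the resulting tree with subtrees(); a short sketch listing just the NP chunks found above:
# Extract only the noun-phrase chunks from the chunk tree
for subtree in chunks.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))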
14. NER
code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
doc = "Harry Potter, the young wizard with a lightning-shaped scar, attended
Hogwarts School, faced challenges, and triumphed over the dark wizard Voldemort,
bringing an end to the magical conflict."
words = word_tokenize(doc)
pos_tags = pos_tag(words)
ne_tags = ne_chunk(pos_tags)
for chunk in ne_tags:
    if hasattr(chunk, 'label'):
        print(chunk.label(), ':', ' '.join(c[0] for c in chunk))