NLP Lab Programs
1. Tokenize a text into sentences and words
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the Punkt models that both tokenizers rely on
nltk.download('punkt')

# Sample text to tokenize
text = "NLP makes machines understand language. Tokenization is the first step."

# Split the text into sentences, then into individual word tokens
print("Sentences:", sent_tokenize(text))
print("Words:", word_tokenize(text))
output:
2. Extract the sentences of a text document
import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt sentence-tokenizer data
nltk.download('punkt')

# Read the text from a file.
# FIX: the bodies of the `with` and `for` blocks were not indented in the
# original, which raises IndentationError; restored proper indentation.
file_path = "example.txt"  # Replace with your file path
with open(file_path, 'r', encoding='utf-8') as file:  # explicit encoding avoids platform-dependent decoding
    text = file.read()

# Sentence Tokenization
sentences = sent_tokenize(text)

# Display each sentence with a 1-based index
print("Sentences in the document:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}: {sentence}")
Note: before running, save a text file named example.txt in the Jupyter Notebook working directory.
output:
3. Tokenize text using stop words as delimiters (stop words are removed from the token list)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer models and the stop-word corpus
nltk.download('punkt')
nltk.download('stopwords')

# Sample sentence
text = "I enjoy learning Python and coding."

# English stop words as a set for O(1) membership checks
stop_words = set(stopwords.words('english'))

# Break the sentence into word tokens
words = word_tokenize(text)

# Keep only tokens that are not stop words (case-insensitive comparison)
tokens_without_stopwords = []
for word in words:
    if word.lower() not in stop_words:
        tokens_without_stopwords.append(word)

# Show both the raw and the filtered token lists
print("Original Tokens:", words)
print("Tokens without Stop Words:", tokens_without_stopwords)
output:
4. Remove stop words and punctuation from a text
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer models and the stop-word corpus
nltk.download('punkt')
nltk.download('stopwords')

# Sample sentence
text = "Python is great! It's simple and powerful."

# English stop words as a set for fast lookups
stop_words = set(stopwords.words('english'))

# Break the sentence into word tokens
words = word_tokenize(text)


def _keep(token):
    # Keep a token unless it is a stop word (case-insensitive)
    # or a single punctuation character.
    return token.lower() not in stop_words and token not in string.punctuation


# Filter out stop words and punctuation tokens
tokens_cleaned = [word for word in words if _keep(word)]

# Show the cleaned token list
print("Tokens without Stop Words and Punctuation:", tokens_cleaned)
output:
5. Perform stemming with the Porter stemmer
# Import the stemmer and tokenizer modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

# Choose some words to be stemmed.
# FIX: the loop body was not indented in the original, which raises
# IndentationError; restored proper indentation.
words = ["pythonprogramming", "programs", "programmer", "event", "thankyou"]
for w in words:
    print(w, " : ", ps.stem(w))
output: