text-processing
March 24, 2024
[1]: import nltk
#tokenizing
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
#stopwords
from nltk.corpus import stopwords
#regexp
import re
# pandas dataframe
import pandas as pd
#import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
[2]: nltk.download()
showing info [Link]
[2]: True
[3]: #load the data used in the book examples into the Python environment:
from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, …, text9 and sent1, …, sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
This command loaded 9 of the text examples available from the corpora package (only
a small sample of what is available). It assigned the variable names text1 through
text9 to these examples and already gave them values. If you type a variable name,
you get a short description of the text.
[4]: text1
[4]: <Text: Moby Dick by Herman Melville 1851>
Note that the first sentence of Moby Dick is “Call me Ishmael.” and that this
sentence has already been separated into tokens in the variable sent1.
[5]: #The variables sent1 through sent9 have been set to the list of tokens of the first sentence of each text.
sent1
[5]: ['Call', 'me', 'Ishmael', '.']
[ ]:
0.1 Counting
[8]: #gives the total number of words in the text
len(text1)
[8]: 260819
[7]: #to find out how many unique words there are, not counting repetitions
sorted(set(text1)) #gives all the distinct tokens of text1
#Or we can just find the length of such a list, here for text3:
len(sorted(set(text3)))
[7]: 2789
[12]: #Or we can specify just to print the first 30 words in the list of sorted words:
sorted(set(text3))[:30]
[12]: ['!',
"'",
'(',
')',
',',
',)',
'.',
'.)',
':',
';',
';)',
'?',
'?)',
'A',
'Abel',
'Abelmizraim',
'Abidah',
'Abide',
'Abimael',
'Abimelech',
'Abr',
'Abrah',
'Abraham',
'Abram',
'Accad',
'Achbor',
'Adah',
'Adam',
'Adbeel',
'Admah']
[13]: #to count how many times the word 'Moby' appears in text1
text1.count("Moby")
[13]: 84
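Building on these counts, a rough measure of lexical diversity is the ratio of total
tokens to distinct tokens (a minimal sketch, assuming text1 is still loaded from
nltk.book):
#average number of times each distinct word is used in Moby Dick
len(text1) / len(set(text1))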
[ ]:
0.2 Processing Text
Let's use the Gutenberg corpus. NLTK includes a small selection of texts from the
Project Gutenberg electronic text archive, which contains some 25,000 free electronic
books.
[19]: # You can then view some books obtained from the Gutenberg on-line book project:
nltk.corpus.gutenberg.fileids()
[19]: ['austen-emma.txt',
'austen-persuasion.txt',
'austen-sense.txt',
'bible-kjv.txt',
'blake-poems.txt',
'bryant-stories.txt',
'burgess-busterbrown.txt',
'carroll-alice.txt',
'chesterton-ball.txt',
'chesterton-brown.txt',
'chesterton-thursday.txt',
'edgeworth-parents.txt',
'melville-moby_dick.txt',
'milton-paradise.txt',
'shakespeare-caesar.txt',
'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt',
'whitman-leaves.txt']
[22]: #view the first file
file1 = nltk.corpus.gutenberg.fileids()[0]
file1
[22]: 'austen-emma.txt'
[33]: #We can get the original text, using the raw function:
emmatext = nltk.corpus.gutenberg.raw(file1)
emmatext[:120] #Since this is quite long, we can view just part of it, e.g. the first 120 characters
#len(emmatext) #count of total characters
[33]: '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse,
handsome, clever, and rich, with a comfortable home\nan'
0.3 1. Tokenization
NLTK has several tokenizers available to break the raw text into tokens; we will use one that
separates by white space and also by special characters (punctuation)
0.3.1 Word Tokenization
[32]: emmatokens = nltk.wordpunct_tokenize(emmatext)
len(emmatokens) #total token count
#view the tokenized text
emmatokens[:15]
[32]: ['[',
'Emma',
'by',
'Jane',
'Austen',
'1816',
']',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
',',
'handsome']
[34]: #Example
sentence="I have no money at the moment."
nltk.wordpunct_tokenize(sentence)
[34]: ['I', 'have', 'no', 'money', 'at', 'the', 'moment', '.']
[36]: #using word_tokenize
text = "God is Great! I won a lottery."
print(word_tokenize(text))
['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']
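The two word tokenizers treat punctuation inside words differently; a small sketch
(the sentence is only an illustration, not from the original notebook) contrasts them
on a contraction:
s = "Don't hesitate to ask."
print(nltk.wordpunct_tokenize(s)) # splits on all punctuation: ['Don', "'", 't', ...]
print(word_tokenize(s))           # keeps the contraction as "n't": ['Do', "n't", ...]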
[39]: #using the Regexp tokenizer
text = "God is Great! I won a lottery."
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize(text)
[39]: ['God', 'is', 'Great', 'I', 'won', 'a', 'lottery']
0.3.2 Sentence Tokenization
[44]: #sentence tokenization using the nltk library
text1 = "God is Great! I won a lottery."
print(sent_tokenize(text1))
['God is Great!', 'I won a lottery.']
[45]: text2 = "Let us understand the difference between sentence & word tokenizer. It is going to be a simple example."
text2.split(". ")
[45]: ['Let us understand the difference between sentence & word tokenizer',
'It is going to be a simple example.']
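A plain string split like the one above breaks on abbreviations, while sent_tokenize
(the Punkt model) usually handles them; a minimal sketch with an illustrative
sentence:
sample = "I met Dr. Smith today. He was very helpful."  # illustrative example
print(sent_tokenize(sample)) # keeps "Dr. Smith" inside the first sentence
print(sample.split(". "))    # wrongly splits after "Dr"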
[ ]:
0.4 2. Stopwords
[19]: #look at the stopwords list for English
print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
"you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
"couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
"shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
"wouldn't"]
[49]: sent1 = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had
indeed the vaguest idea where the wood and river in question were."""
# set of stop words
stop_words = set(stopwords.words('english'))
# tokens of words
word_tokens = word_tokenize(sent1)
word_tokens[:10]
[49]: ['He',
'determined',
'to',
'drop',
'his',
'litigation',
'with',
'the',
'monastry',
',']
[50]: #empty list to collect the text with stop words removed
filtered_sentence = []
# filter out the stop words
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print("\nOriginal Sentence \n")
print(" ".join(word_tokens))
print("\nFiltered Sentence \n")
print(" ".join(filtered_sentence))
Original Sentence
He determined to drop his litigation with the monastry , and relinguish his
claims to the wood-cuting and fishery rihgts at once . He was the more ready to
do this becuase the rights had become much less valuable , and he had indeed the
vaguest idea where the wood and river in question were .
Filtered Sentence
He determined drop litigation monastry , relinguish claims wood-cuting fishery
rihgts . He ready becuase rights become much less valuable , indeed vaguest idea
wood river question .
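Note that 'He' survives the filtering because the stop word list is all lowercase; a
small variant (same word_tokens as above) compares tokens case-insensitively:
#lowercase each token before checking it against the stop word list
filtered_ci = [w for w in word_tokens if w.lower() not in stop_words]
print(" ".join(filtered_ci))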
0.5 3. Normalizing Word Formats
0.6 3.1 Lowercase
[51]: #Example
sentence="I have NO moNey at tHE moMent."
sentence.lower()
[51]: 'i have no money at the moment.'
[53]: #for already tokenized text
emmawords = [w.lower() for w in emmatokens]
emmawords[:15]
[53]: ['[',
'emma',
'by',
'jane',
'austen',
'1816',
']',
'volume',
'i',
'chapter',
'i',
'emma',
'woodhouse',
',',
'handsome']
[55]: # We can further view the words by getting the unique words and sorting them:
emmavocab = sorted(set(emmawords))
emmavocab[:10]
[55]: ['!', '!"', '!"--', "!'", "!'--", '!)--', '!--', '!--"', '!--(', '!--`']
[25]: #uppercased
sentence.upper()
#check Table 3.2 for more operations on strings (Chapter 3, Section 3.2 of the NLTK book)
[25]: 'I HAVE NO MONEY AT THE MOMENT.'
[26]: #select a set of words from the tokenized text
shortwords=emmawords[11:111]
shortwords[:10]
[26]: ['emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',']
[27]: #get the frequency count for each word
shortdist = FreqDist(shortwords)
for word in shortdist.keys():
    print(word, shortdist[word])
emma 1
woodhouse 1
, 8
handsome 1
clever 1
and 4
rich 1
with 2
a 3
comfortable 1
home 1
happy 1
disposition 1
seemed 1
to 3
unite 1
some 1
of 6
the 4
best 1
blessings 1
existence 1
; 2
had 3
lived 1
nearly 1
twenty 1
- 1
one 1
years 1
in 2
world 1
very 2
little 1
distress 1
or 1
vex 1
her 4
. 2
she 1
was 1
youngest 1
two 1
daughters 1
most 1
affectionate 1
indulgent 1
father 1
consequence 1
sister 1
' 1
s 1
marriage 1
been 1
mistress 1
his 1
house 1
from 1
early 1
period 1
mother 1
died 1
too 1
long 1
ago 1
for 1
have 1
more 1
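FreqDist also provides most_common for the top-ranked tokens; a one-line sketch on
the same shortdist:
#the five most frequent tokens and their counts
shortdist.most_common(5)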
0.7 3.2 Stemming
NLTK has two stemmers, Porter and Lancaster, described in section 3.6 of the NLTK
book. To use these stemmers, you first create them
[58]: porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[61]: #regular-cased text - porter stemmer
emmaregstem = [porter.stem(t) for t in emmatokens]
emmaregstem[1:10]
[61]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']
[30]: #lowercased text
emmalowerstem = [porter.stem(t) for t in emmawords]
emmalowerstem[1:10]
[30]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']
[31]: #regular-cased text - lancaster stemmer
emmaregstem1 = [lancaster.stem(t) for t in emmatokens]
emmaregstem1[1:10]
[31]: ['emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt']
[70]: #building our own simple stemmer by making a list of suffixes to take off
def stem(word):
    for suffix in ['ing','ly','ed','ious','ies','ive','es','s']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

#try the above stemmer with 'friends'
stem('friends')
[70]: 'friend'
[71]: stem('relatives')
[71]: 'relativ'
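To compare how aggressively the two stemmers cut words down, a short sketch (the word
list is just an illustration) prints the Porter and Lancaster stems side by side:
#compare the two stemmers on a few sample words
for w in ['running', 'generously', 'happiness', 'relatives']:
    print(w, porter.stem(w), lancaster.stem(w))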
0.8 3.3 Lemmatizing
NLTK has a lemmatizer that uses the WordNet on-line thesaurus as a dictionary to look
up the root form of each word.
[74]: wnl = nltk.WordNetLemmatizer()
emmalemma = [wnl.lemmatize(t) for t in emmawords]
emmalemma[1:10]
[74]: ['emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']
[82]: wnl.lemmatize('friends')
wnl.lemmatize('relatives')
[82]: 'relative'
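One detail worth noting (a minimal sketch, not from the original notebook): the
WordNet lemmatizer assumes each word is a noun unless a part of speech is supplied,
so verbs only get reduced when pos='v' is passed:
print(wnl.lemmatize('running'))          # treated as a noun, left unchanged
print(wnl.lemmatize('running', pos='v')) # treated as a verb, reduced to 'run'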
0.9 4. Regex: Regular Expressions for Detecting Word Patterns
[83]: emmatext[:100]
[83]: '[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse,
handsome, clever, and rich, with a'
[85]: #use the string replace method to replace all the newline characters '\n' with a space ' '
newemmatext = emmatext.replace('\n', ' ')
shorttext = newemmatext[:150]
#redefined the variable shorttext to be the first 150 characters
#without newlines
shorttext
[85]: '[Emma by Jane Austen 1816] VOLUME I CHAPTER I Emma Woodhouse, handsome,
clever, and rich, with a comfortable home and happy disposition, seemed to'
[38]: pword = re.compile(r'\w+')
#re.findall will find the substrings that match anywhere in the string.
re.findall(pword, shorttext)
[38]: ['Emma',
'by',
'Jane',
'Austen',
'1816',
'VOLUME',
'I',
'CHAPTER',
'I',
'Emma',
'Woodhouse',
'handsome',
'clever',
'and',
'rich',
'with',
'a',
'comfortable',
'home',
'and',
'happy',
'disposition',
'seemed',
'to']
[39]: #re.findall will find the substrings that match anywhere in specialtext.
specialtext = 'U.S.A. poster-print costs $12.40, with 10% off.'
re.findall(pword, specialtext)
[39]: ['U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'with', '10', 'off']
[40]: #to match tokens, allowing words to have an internal hyphen
ptoken = re.compile(r'(\w+(-\w+)*)')
re.findall(ptoken, specialtext)
[40]: [('U', ''),
('S', ''),
('A', ''),
('poster-print', '-print'),
('costs', ''),
('12', ''),
('40', ''),
('with', ''),
('10', ''),
('off', '')]
[41]: #to match abbreviations that might have a "." inside, like U.S.A.
#We only allow capitalized letters
pabbrev = re.compile(r'(([A-Z]\.)+)')
re.findall(pabbrev, specialtext)
[41]: [('U.S.A.', 'A.')]
[42]: #combine it with the words pattern to match either words or abbreviations
ptoken = re.compile(r'(\w+(-\w+)*|([A-Z]\.)+)')
re.findall(ptoken, specialtext)
[42]: [('U', '', ''),
('S', '', ''),
('A', '', ''),
('poster-print', '-print', ''),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]
[43]: #the order of the alternatives really matters if
#an earlier pattern matches part of what you want to match
ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*)')
re.findall(ptoken, specialtext)
[43]: [('U.S.A.', 'A.', ''),
('poster-print', '', '-print'),
('costs', '', ''),
('12', '', ''),
('40', '', ''),
('with', '', ''),
('10', '', ''),
('off', '', '')]
[44]: #add an alternative to match currency amounts
ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?)')
re.findall(ptoken, specialtext)
[44]: [('U.S.A.', 'A.', '', ''),
('poster-print', '', '-print', ''),
('costs', '', '', ''),
('$12.40', '', '', '.40'),
('with', '', '', ''),
('10', '', '', ''),
('off', '', '', '')]
Regular Expression Tokenizer using NLTK Tokenizer
[45]: #We can make a prettier regular expression that is equivalent to this one by
#using Python’s triple quotes that allows a string to go across multiple
#lines without adding a newline character
# abbreviations, e.g. U.S.A.
# words with internal hyphens
# currency, like $12.40
ptoken = re.compile(r'''([A-Z]\.)+
| \w+(-\w+)*
| \$?\d+(\.\d+)?
''', re.X)
[46]: # abbreviations, e.g. U.S.A.
# words with optional internal hyphens
# currency and percentages, e.g. $12.40, 82%
# ellipsis ex: hmm..., well...
# these are separate tokens; includes ], [
pattern = r''' (?x) [A-Z][a-z]+\.| (?:[A-Z]\.)+|
| \w+(?:-\w+)*
| \$?\d+(?:\.\d+)?%?
| \.\.\.
| [][.,;"'?():-_']'''
[47]: nltk.regexp_tokenize(shorttext[:30], pattern)
[47]: ['',
'[',
'',
'Emma',
'',
'',
'by',
'',
'',
'Jane',
'',
'',
'Austen',
'',
'',
'1816',
'',
']',
'',
'',
'',
'VO',
'']
[48]: nltk.regexp_tokenize(specialtext, pattern)
[48]: ['U.S.A.',
'',
'',
'poster-print',
'',
'',
'costs',
'',
'',
'$12.40',
'',
',',
'',
'',
'with',
'',
'',
'10',
'',
'',
'',
'off',
'',
'.',
'']
0.10 Document Term Matrix- DTM
[87]: # Let's start with a 'toy' corpus
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]
[90]: #assign the count vectorizer to a variable
countvectorizer = CountVectorizer()
DTM = pd.DataFrame(countvectorizer.fit_transform(CORPUS).toarray(),
                   columns=countvectorizer.get_feature_names_out(), index=None)
DTM
[90]: and beautiful blue cheese is love sky so the
0 0 0 1 0 1 0 1 0 1
1 1 1 1 0 2 0 2 0 0
2 0 1 1 0 1 0 1 1 1
3 0 0 1 1 0 1 0 0 0
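Once fitted, the same CountVectorizer can turn an unseen document into counts over
the learned vocabulary; a minimal sketch (the new document is just an example):
new_doc = ['the sky is so beautiful']  # illustrative example
countvectorizer.transform(new_doc).toarray()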
[ ]: