Clint-Roy Muvirimi-Mukarakate
H1802386
AI Practical Assignment
import nltk
from nltk import *
# Sample sentence used throughout the basic NLP demonstrations below.
sent = 'Zim is a republic nation. We are proud Zimbabeans'
print(len(sent))     # Number of characters, spaces included: 49
print(sent[0:5])     # First five characters: 'Zim i' (slice end is exclusive)
print(sent[11:19])   # Characters 11-18: 'public n'
print('*' * 78)      # Visual separator between demo sections
# Tokens: split the sentence into words/punctuation once and reuse the list
# (the original tokenized the same sentence twice).
tokens = nltk.word_tokenize(sent)
print(tokens)
print('*' * 78)
# Vocabulary: the unique tokens, in sorted order.
vocab = sorted(set(tokens))
print(vocab)
print('*' * 78)
from string import punctuation

# Drop single-character punctuation tokens (e.g. '.') from the vocabulary.
vocab_wo_punct = [token for token in vocab if token not in punctuation]
print(vocab_wo_punct)
print('*' * 78)
# Part-of-speech tag each vocabulary word; yields (word, tag) pairs,
# e.g. ('nation', 'NN'). Tagging isolated words loses sentence context.
pos_list = pos_tag(vocab_wo_punct)
print(pos_list)
print('*' * 78)
# Root stemming: reduce each word to its stem with the Snowball stemmer,
# e.g. 'republic' -> 'republ'. The stemmer is constructed once (the original
# built it twice and discarded an unused stem("Studying") call).
stemObj = SnowballStemmer("english")
stemmed_vocab = [stemObj.stem(word) for word in vocab_wo_punct]
print(stemmed_vocab)
print('*' * 78)
# Lemmatization: reduce a word to its dictionary base form (here a verb).
# The original discarded the result; print it so the demo shows 'go'.
lemmaObj = WordNetLemmatizer()
print(lemmaObj.lemmatize("went", pos='v'))
# Stop words: remove common English function words ('a', 'is', 'are', ...)
# from the vocabulary.
from nltk.corpus import stopwords
stop_words_set = set(stopwords.words("english"))
wo_stop_words = [word for word in vocab_wo_punct if word not in stop_words_set]
print(wo_stop_words)
print('*' * 78)
# Frequency distribution: count how often each token occurs in a short text.
# Note: printing a FreqDist only shows a summary line
# ("<FreqDist with N samples...>"); use .most_common() to see the counts.
texts = ("I saw John comming. He was with Mary. I talked to John and Mary. John "
         "said he met Marry on the way. John and Marry were going to school")
print(nltk.FreqDist(nltk.word_tokenize(texts)))
print('*' * 78)
# N-grams: sliding windows of n consecutive items over the cleaned vocabulary.
bigrams = ngrams(vocab_wo_punct, 2)   # n=2 -> bigrams (pairs)
print(list(bigrams))
print('*' * 78)
trigrams = ngrams(vocab_wo_punct, 3)  # n=3 -> trigrams (triples)
print(list(trigrams))
print('*' * 78)
# Text-cleaning pipeline applied to a saved Wikipedia article.
# Use a context manager so the file handle is always closed (the original
# opened it with open() and never closed it), and read() instead of
# accumulating readlines() by string concatenation.
with open(r'C:\Users\Clint\Documents\AI\wikipedia.txt') as file:
    text = file.read()
# Step 1: trim leading/trailing whitespace.
trimmed_text = text.strip()
print(trimmed_text)
print('*' * 78)
# Step 2: normalise case to lower.
converted_text = trimmed_text.lower()
print(converted_text)
print('*' * 78)
# Step 3: tokenize the text.
tokenize_list = word_tokenize(converted_text)
print(tokenize_list)
print('*' * 78)
# Alternative tokenization with the word-punct tokenizer, which splits on
# every run of punctuation (compare '.[' vs '.', '[' in the output).
punct_tokenized_list = wordpunct_tokenize(converted_text)
print(punct_tokenized_list)
print('*' * 78)
# Vocabulary: the set of unique tokens.
vocab_list = set(tokenize_list)
print(vocab_list)
print('*' * 78)
# Step 4: remove English stop words via set difference.
set_wo_stopwords = vocab_list - set(stopwords.words("english"))
print(set_wo_stopwords)
print('*' * 78)
# Step 5: remove single-character punctuation tokens.
set_wo_punctuation = set_wo_stopwords - set(punctuation)
print(set_wo_punctuation)
print('*' * 78)
# Step 6: normalise the remaining words by stemming.
print("Step 6 Normalising the text and / or lemmatization")
stemObjs = SnowballStemmer("english")
stemmed_list = [stemObjs.stem(word) for word in set_wo_punctuation]
print(stemmed_list)
print('*' * 78)
Outputs
Windows PowerShell
Copyright (C) Microsoft Corporation. All rights reserved.
Try the new cross-platform PowerShell https://aka.ms/pscore6
PS C:\Users\Clint> & C:/Users/Clint/AppData/Local/Programs/Python/Python310/python.exe
"c:/Users/Clint/Documents/AI/Prac Assignment.py"
49
Zim i
public n
******************************************************************************
['Zim', 'is', 'a', 'republic', 'nation', '.', 'We', 'are', 'proud', 'Zimbabeans']
******************************************************************************
['.', 'We', 'Zim', 'Zimbabeans', 'a', 'are', 'is', 'nation', 'proud', 'republic']
******************************************************************************
['We', 'Zim', 'Zimbabeans', 'a', 'are', 'is', 'nation', 'proud', 'republic']
******************************************************************************
[('We', 'PRP'), ('Zim', 'VBP'), ('Zimbabeans', 'VBZ'), ('a', 'DT'), ('are', 'VBP'), ('is', 'VBZ'), ('nation', 'NN'),
('proud', 'JJ'), ('republic', 'NN')]
******************************************************************************
['we', 'zim', 'zimbabean', 'a', 'are', 'is', 'nation', 'proud', 'republ']
******************************************************************************
['We', 'Zim', 'Zimbabeans', 'nation', 'proud', 'republic']
******************************************************************************
<FreqDist with 22 samples and 33 outcomes>
******************************************************************************
[('We', 'Zim'), ('Zim', 'Zimbabeans'), ('Zimbabeans', 'a'), ('a', 'are'), ('are', 'is'), ('is', 'nation'), ('nation',
'proud'), ('proud', 'republic')]
******************************************************************************
[('We', 'Zim', 'Zimbabeans'), ('Zim', 'Zimbabeans', 'a'), ('Zimbabeans', 'a', 'are'), ('a', 'are', 'is'), ('are', 'is',
'nation'), ('is', 'nation', 'proud'), ('nation', 'proud', 'republic')]
******************************************************************************
This article is about natural language processing done by computers. For the natural language
processing done by the human brain, see Language processing in the brain.
An automated online assistant providing customer service on a web page, an example of an application
where natural language processing is a major component.[1]
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language, in particular how to program
computers to process and analyze large amounts of natural language data. The goal is a computer
capable of "understanding" the contents of documents, including the contextual nuances of the
language within them. The technology can then accurately extract information and insights contained in
the documents as well as categorize and organize the documents themselves.
Challenges in natural language processing frequently involve speech recognition, natural language
understanding, and natural language generation.
******************************************************************************
this article is about natural language processing done by computers. for the natural language processing
done by the human brain, see language processing in the brain.
an automated online assistant providing customer service on a web page, an example of an application
where natural language processing is a major component.[1]
natural language processing (nlp) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language, in particular how to program
computers to process and analyze large amounts of natural language data. the goal is a computer
capable of "understanding" the contents of documents, including the contextual nuances of the
language within them. the technology can then accurately extract information and insights contained in
the documents as well as categorize and organize the documents themselves.
challenges in natural language processing frequently involve speech recognition, natural language
understanding, and natural language generation.
******************************************************************************
['this', 'article', 'is', 'about', 'natural', 'language', 'processing', 'done', 'by', 'computers', '.', 'for', 'the',
'natural', 'language', 'processing', 'done', 'by', 'the', 'human', 'brain', ',', 'see', 'language', 'processing', 'in',
'the', 'brain', '.', 'an', 'automated', 'online', 'assistant', 'providing', 'customer', 'service', 'on', 'a', 'web',
'page', ',', 'an', 'example', 'of', 'an', 'application', 'where', 'natural', 'language', 'processing', 'is', 'a',
'major', 'component', '.', '[', '1', ']', 'natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'a', 'subfield', 'of',
'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the',
'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to',
'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data',
'.', 'the', 'goal', 'is', 'a', 'computer', 'capable', 'of', '``', 'understanding', "''", 'the', 'contents', 'of',
'documents', ',', 'including', 'the', 'contextual', 'nuances', 'of', 'the', 'language', 'within', 'them', '.', 'the',
'technology', 'can', 'then', 'accurately', 'extract', 'information', 'and', 'insights', 'contained', 'in', 'the',
'documents', 'as', 'well', 'as', 'categorize', 'and', 'organize', 'the', 'documents', 'themselves', '.',
'challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'speech', 'recognition', ',',
'natural', 'language', 'understanding', ',', 'and', 'natural', 'language', 'generation', '.']
******************************************************************************
['this', 'article', 'is', 'about', 'natural', 'language', 'processing', 'done', 'by', 'computers', '.', 'for', 'the',
'natural', 'language', 'processing', 'done', 'by', 'the', 'human', 'brain', ',', 'see', 'language', 'processing', 'in',
'the', 'brain', '.', 'an', 'automated', 'online', 'assistant', 'providing', 'customer', 'service', 'on', 'a', 'web',
'page', ',', 'an', 'example', 'of', 'an', 'application', 'where', 'natural', 'language', 'processing', 'is', 'a',
'major', 'component', '.[', '1', ']', 'natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'a', 'subfield', 'of',
'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the',
'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to',
'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data',
'.', 'the', 'goal', 'is', 'a', 'computer', 'capable', 'of', '"', 'understanding', '"', 'the', 'contents', 'of',
'documents', ',', 'including', 'the', 'contextual', 'nuances', 'of', 'the', 'language', 'within', 'them', '.', 'the',
'technology', 'can', 'then', 'accurately', 'extract', 'information', 'and', 'insights', 'contained', 'in', 'the',
'documents', 'as', 'well', 'as', 'categorize', 'and', 'organize', 'the', 'documents', 'themselves', '.',
'challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve',
'speech', 'recognition', ',', 'natural', 'language', 'understanding', ',', 'and', 'natural', 'language',
'generation', '.']
******************************************************************************
{')', '[', 'challenges', 'major', 'including', 'automated', 'can', 'insights', 'computers', 'online', 'web', ',',
'involve', 'then', 'as', 'analyze', 'natural', 'within', '.', 'for', 'component', 'process', 'themselves', 'with',
'about', 'example', 'done', 'where', 'brain', 'understanding', 'language', 'intelligence', "''", 'documents',
'this', 'contained', 'and', 'frequently', 'well', 'assistant', 'amounts', 'article', 'customer', 'see', '``',
'organize', 'nuances', 'in', 'speech', 'application', 'is', 'between', 'subfield', 'contextual',
'page', 'service', 'generation', 'linguistics', 'information', '1', 'nlp', 'processing', 'on', 'data', 'computer',
'by', ']', 'program', 'science', 'artificial', 'concerned', 'contents', 'interactions', 'technology', 'extract', 'of',
'large', 'recognition', 'an', 'categorize', 'capable', 'them', 'how', 'accurately', 'goal', 'a', 'to', 'the',
'particular', 'providing', '(', 'human'}
******************************************************************************
{')', 'artificial', 'concerned', "''", 'speech', 'application', 'documents', 'challenges', '[', 'contents',
'interactions', 'major', 'including', 'automated', 'technology', 'extract', 'insights', 'computers', 'online',
'subfield', 'large', 'contained', 'recognition', 'web', ',', 'categorize', 'capable', 'involve', 'contextual',
'frequently', 'page', 'service', 'analyze', 'natural', 'within', '.', 'well', 'assistant', 'generation', 'linguistics',
'information', 'component', 'amounts', 'process', '1', 'language', 'nlp', 'processing', 'accurately', 'article',
'goal', 'example', 'data', 'customer', 'organize', 'computer', ']', 'program', 'see', 'science', 'done',
'particular', 'providing', 'brain', '(', 'understanding', '``', 'nuances', 'intelligence', 'human'}
******************************************************************************
{'concerned', 'artificial', "''", 'speech', 'application', 'documents', 'challenges', 'contents', 'interactions',
'major', 'including', 'automated', 'technology', 'extract', 'insights', 'computers', 'subfield', 'online', 'large',
'contained', 'recognition', 'web', 'categorize', 'capable', 'involve', 'contextual', 'frequently', 'page',
'service', 'analyze', 'natural', 'within', 'well', 'assistant', 'generation', 'linguistics', 'information',
'understanding', 'component', 'amounts', 'process', '1', 'nlp', '``', 'processing', 'accurately', 'article', 'goal',
'example', 'data', 'customer', 'computer', 'program', 'see', 'science', 'done', 'particular', 'providing',
'brain', 'language', 'organize', 'nuances', 'intelligence', 'human'}
******************************************************************************
Step 6 Normalising the text and / or lemmatization
['concern', 'artifici', "''", 'speech', 'applic', 'document', 'challeng', 'content', 'interact', 'major', 'includ',
'autom', 'technolog', 'extract', 'insight', 'comput', 'subfield',
'onlin', 'larg', 'contain', 'recognit', 'web', 'categor', 'capabl', 'involv', 'contextu', 'frequent', 'page', 'servic',
'analyz', 'natur', 'within', 'well', 'assist', 'generat', 'linguist', 'inform', 'understand', 'compon', 'amount',
'process', '1', 'nlp', '``', 'process', 'accur', 'articl', 'goal', 'exampl', 'data', 'custom', 'comput', 'program',
'see', 'scienc', 'done', 'particular', 'provid', 'brain', 'languag', 'organ', 'nuanc', 'intellig', 'human']
******************************************************************************