mapreduce.ipynb
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Download the 'punkt_tab' data package (required by recent NLTK releases for word_tokenize)
nltk.download('punkt_tab')
# Sample large text (replace this with your own text)
large_text = """
Natural Language Processing (NLP) is a subfield of artificial intelligence (AI) focused on enabling machines to understand and process human languages.
It encompasses a range of tasks such as text analysis, machine translation, and sentiment analysis. NLP applications are widely used in chatbots, virtual assistants,
language translation services, and more. Despite significant advancements, NLP still faces challenges such as ambiguity, context understanding, and linguistic diversity.
"""
### Preprocessing Techniques ###
# 1. Remove special characters, numbers, and extra spaces
def clean_text(text):
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.strip()                  # Remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)     # Collapse extra spaces
    return text
# 2. Tokenize the text into words
def tokenize_text(text):
    return word_tokenize(text)
# 3. Convert tokens to lowercase
def to_lowercase(tokens):
    return [token.lower() for token in tokens]
# 4. Remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]
# 5. Lemmatize tokens
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
# 6. Stem tokens (Optional, for comparison)
def stem_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
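# Aside (illustrative sketch, not from the original notebook): the lemmatizer
# and the stemmer disagree on verb forms because WordNetLemmatizer assumes a
# noun POS unless told otherwise, while the Porter stemmer strips suffixes
# unconditionally.
_lemmatizer = WordNetLemmatizer()
print(_lemmatizer.lemmatize("focused"))           # 'focused' (noun POS assumed)
print(_lemmatizer.lemmatize("focused", pos='v'))  # 'focus'
print(PorterStemmer().stem("focused"))            # 'focus'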
### Applying the Preprocessing Steps ###
# Step 1: Clean text
cleaned_text = clean_text(large_text)
print("\n--- Cleaned Text ---\n", cleaned_text)
# Step 2: Tokenize
tokens = tokenize_text(cleaned_text)
print("\n--- Tokens ---\n", tokens)
# Step 3: Convert to lowercase
lower_tokens = to_lowercase(tokens)
print("\n--- Lowercase Tokens ---\n", lower_tokens)
# Step 4: Remove Stopwords
filtered_tokens = remove_stopwords(lower_tokens)
print("\n--- Tokens without Stopwords ---\n", filtered_tokens)
# Step 5: Lemmatize
lemmatized_tokens = lemmatize_tokens(filtered_tokens)
print("\n--- Lemmatized Tokens ---\n", lemmatized_tokens)
# Step 6: Stem (Optional)
stemmed_tokens = stem_tokens(filtered_tokens)
print("\n--- Stemmed Tokens ---\n", stemmed_tokens)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt_tab.zip.
--- Cleaned Text ---
 Natural Language Processing NLP is a subfield of artificial intelligence AI focused on enabling machines to understand and process human languages It en…

--- Tokens ---
 ['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'AI', 'focused', 'on', 'enabling', 'machines', …

--- Lowercase Tokens ---
 ['natural', 'language', 'processing', 'nlp', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'ai', 'focused', 'on', 'enabling', 'machines', …

--- Tokens without Stopwords ---
 ['natural', 'language', 'processing', 'nlp', 'subfield', 'artificial', 'intelligence', 'ai', 'focused', 'enabling', 'machines', 'understand', 'process', …

--- Lemmatized Tokens ---
 ['natural', 'language', 'processing', 'nlp', 'subfield', 'artificial', 'intelligence', 'ai', 'focused', 'enabling', 'machine', 'understand', 'process', …

--- Stemmed Tokens ---
 ['natur', 'languag', 'process', 'nlp', 'subfield', 'artifici', 'intellig', 'ai', 'focus', 'enabl', 'machin', 'understand', 'process', 'human', 'languag', …
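The six steps above compose into a single reusable function. A minimal sketch (the preprocess helper below is an illustration added here, not part of the original notebook):

def preprocess(text, use_stemming=False):
    """Run the full cleaning pipeline and return normalized tokens."""
    tokens = remove_stopwords(to_lowercase(tokenize_text(clean_text(text))))
    return stem_tokens(tokens) if use_stemming else lemmatize_tokens(tokens)

# preprocess(large_text)                     -> same as the lemmatized tokens above
# preprocess(large_text, use_stemming=True)  -> same as the stemmed tokens above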
from collections import defaultdict
# Sample large article
article = """
MapReduce is a programming model that is widely used for processing and generating large datasets.
This model was introduced by Google and is now a core concept in big data. MapReduce involves two key functions: Map and Reduce.
The Map function processes key-value pairs and produces intermediate key-value pairs.
The Reduce function merges all intermediate values associated with the same key.
Together, MapReduce enables distributed processing of large datasets across a cluster of computers.
"""
# ----------------------------------------
# Step 1: Mapper
# ----------------------------------------
def mapper(text):
    """
    The mapper splits the text into words and emits key-value pairs (word, 1).
    """
    print("### Step 1: Mapper ###")
    words = text.lower().replace('.', '').replace(',', '').split()  # Normalize text
    mapped_data = [(word, 1) for word in words]  # Emit (word, 1) for each word
    print("Mapper Output:", mapped_data[:10], "...")  # Print first 10 pairs for illustration
    return mapped_data
# ----------------------------------------
# Step 2: Shuffle and Sort
# ----------------------------------------
def shuffle_and_sort(mapped_data):
    """
    The shuffle and sort phase groups the key-value pairs by key.
    """
    print("\n### Step 2: Shuffle and Sort ###")
    grouped_data = defaultdict(list)
    for word, count in mapped_data:
        grouped_data[word].append(count)
    print("Shuffled and Sorted Output (Sample):", dict(list(grouped_data.items())[:5]))  # Print first 5 groups
    return grouped_data
# ----------------------------------------
# Step 3: Reducer
# ----------------------------------------
def reducer(shuffled_data):
    """
    The reducer aggregates the values for each key by summing them up.
    """
    print("\n### Step 3: Reducer ###")
    reduced_data = {word: sum(counts) for word, counts in shuffled_data.items()}
    print("Reducer Output (Sample):", dict(list(reduced_data.items())[:5]))  # Print first 5 reduced results
    return reduced_data
# ----------------------------------------
# Combine the Steps in a MapReduce Pipeline
# ----------------------------------------
def mapreduce_pipeline(text):
    """
    Executes the full MapReduce process: Map, Shuffle/Sort, Reduce.
    """
    print("### MapReduce Pipeline ###\n")
    # Step 1: Map
    mapped_data = mapper(text)
    # Step 2: Shuffle and Sort
    shuffled_data = shuffle_and_sort(mapped_data)
    # Step 3: Reduce
    reduced_data = reducer(shuffled_data)
    return reduced_data
# ----------------------------------------
# Run the MapReduce Pipeline
# ----------------------------------------
result = mapreduce_pipeline(article)
# ----------------------------------------
# Display Final Results
# ----------------------------------------
print("\n--- Final Word Count ---")
for word, count in sorted(result.items(), key=lambda x: x[1], reverse=True):  # Sort by frequency
    print(f"{word}: {count}")
### MapReduce Pipeline ###
### Step 1: Mapper ###
Mapper Output: [('mapreduce', 1), ('is', 1), ('a', 1), ('programming', 1), ('model', 1), ('that', 1), ('is', 1), ('widely', 1), ('used', 1), ('for', 1)] ...
### Step 2: Shuffle and Sort ###
Shuffled and Sorted Output (Sample): {'mapreduce': [1, 1, 1], 'is': [1, 1, 1], 'a': [1, 1, 1], 'programming': [1], 'model': [1, 1]}
### Step 3: Reducer ###
Reducer Output (Sample): {'mapreduce': 3, 'is': 3, 'a': 3, 'programming': 1, 'model': 2}
--- Final Word Count ---
and: 4
mapreduce: 3
is: 3
a: 3
the: 3
model: 2
processing: 2
large: 2
datasets: 2
key: 2
map: 2
reduce: 2
function: 2
key-value: 2
pairs: 2
intermediate: 2
of: 2
programming: 1
that: 1
widely: 1
used: 1
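The pipeline above runs in a single process, but the same structure is what lets real MapReduce systems distribute work across machines: each node maps its own chunk, and the shuffle brings matching keys together before the reduce. A minimal sketch of that idea using Python's standard concurrent.futures (the chunking strategy and the parallel_mapreduce helper are illustrative assumptions, not part of the original notebook):

from concurrent.futures import ProcessPoolExecutor

def parallel_mapreduce(text, n_chunks=4):
    """Map text chunks in parallel, then shuffle and reduce the merged pairs."""
    lines = text.strip().split('\n')
    size = max(1, len(lines) // n_chunks)
    chunks = ['\n'.join(lines[i:i + size]) for i in range(0, len(lines), size)]
    with ProcessPoolExecutor() as pool:
        mapped_parts = list(pool.map(mapper, chunks))  # one mapper call per chunk
    merged = [pair for part in mapped_parts for pair in part]  # gather phase
    return reducer(shuffle_and_sort(merged))

# parallel_mapreduce(article) should produce the same counts as mapreduce_pipeline(article)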