GEN AI LAB PROGRAMS
1. Computing the TF-IDF Matrix using NumPy
Task: Write a Python function to compute the TF-IDF matrix for the given set of
documents using only NumPy.
Code:
import numpy as np

def compute_tf_idf(documents, vocabulary):
    N = len(documents)
    V = len(vocabulary)
    # Initialize TF matrix (N x V)
    tf = np.zeros((N, V))
    # Build term frequency matrix
    for i, doc in enumerate(documents):
        words = doc.lower().split()
        for word in words:
            if word in vocabulary:
                j = vocabulary.index(word)
                tf[i, j] += 1
        tf[i] = tf[i] / len(words)  # Normalize TF by document length
    # Compute Document Frequency (DF)
    df = np.zeros(V)
    for j, term in enumerate(vocabulary):
        df[j] = sum(1 for doc in documents if term in doc.lower().split())
    # Compute Inverse Document Frequency (IDF)
    idf = np.log(N / (df + 1))  # Add 1 to avoid division by zero
    # Compute TF-IDF matrix
    tf_idf = tf * idf  # Element-wise multiplication
    return tf_idf
# Example usage:
documents = [
"cat sat on the mat",
"dog sat on the log",
"cat and dog played together"
]
vocabulary = list(set(" ".join(documents).lower().split()))
tf_idf_matrix = compute_tf_idf(documents, vocabulary)
print("Vocabulary:", vocabulary)
print("TF-IDF Matrix:\n", tf_idf_matrix)
Output:
Vocabulary: ['mat', 'together', 'sat', 'dog', 'cat', 'the', 'played', 'on', 'log', 'and']
TF-IDF Matrix:
[[0.08109302 0. 0. 0. 0. 0.
0. 0. 0. 0. ]
[0. 0. 0. 0. 0. 0.
0. 0. 0.08109302 0. ]
[0. 0.08109302 0. 0. 0. 0.
0.08109302 0. 0. 0.08109302]]
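Note: the loop-based version above can also be written with NumPy broadcasting. The sketch below (compute_tf_idf_vectorized is an illustrative helper, not part of the lab program) reuses the documents and vocabulary defined above and applies the same smoothed IDF formula, so it should reproduce the matrix printed above for the same vocabulary ordering.

import numpy as np

def compute_tf_idf_vectorized(documents, vocabulary):
    tokenized = [doc.lower().split() for doc in documents]
    N = len(documents)
    # (N, V) raw term counts, one row per document
    counts = np.array([[words.count(term) for term in vocabulary] for words in tokenized],
                      dtype=float)
    doc_lengths = np.array([len(words) for words in tokenized], dtype=float)
    tf = counts / doc_lengths[:, None]   # normalize each row by document length
    df = (counts > 0).sum(axis=0)        # number of documents containing each term
    idf = np.log(N / (df + 1))           # same smoothed IDF as the loop version
    return tf * idf

print(compute_tf_idf_vectorized(documents, vocabulary))  # should match the matrix above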
2. Generating n-grams for a Sentence
Task: Write a Python function to generate n-grams for a given sentence.
Code:
def generate_ngrams(sentence, n):
    words = sentence.lower().split()
    ngrams = []
    for i in range(len(words) - n + 1):
        ngram = tuple(words[i:i + n])
        ngrams.append(ngram)
    return ngrams

# Example usage:
sentence = "The quick brown fox jumps over the lazy dog."
n = 3
ngrams = generate_ngrams(sentence, n)
print(f"{n}-grams:")
for gram in ngrams:
    print(gram)
Output:
3-grams:
('the', 'quick', 'brown')
('quick', 'brown', 'fox')
('brown', 'fox', 'jumps')
('fox', 'jumps', 'over')
('jumps', 'over', 'the')
('over', 'the', 'lazy')
('the', 'lazy', 'dog.')
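Note: because split() only breaks on whitespace, punctuation stays attached to the word (hence 'dog.' in the last trigram). As an aside, the same n-grams can be produced with zip; the sketch below (generate_ngrams_zip is an illustrative alternative, not part of the lab program) gives identical output.

def generate_ngrams_zip(sentence, n):
    words = sentence.lower().split()
    # zip the word list against itself shifted by 0..n-1 positions
    return list(zip(*(words[i:] for i in range(n))))

print(generate_ngrams_zip("The quick brown fox jumps over the lazy dog.", 3))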
3. Computing a 3-gram Language Model
Task: Write a Python function to compute a 3-gram language model.
Code:
from collections import defaultdict

def compute_trigram_language_model(documents):
    trigram_counts = defaultdict(int)
    total_trigrams = 0
    for doc in documents:
        words = doc.lower().split()
        for i in range(len(words) - 2):
            trigram = tuple(words[i:i + 3])
            trigram_counts[trigram] += 1
            total_trigrams += 1
    # Compute probabilities
    trigram_probabilities = {}
    for trigram, count in trigram_counts.items():
        trigram_probabilities[trigram] = count / total_trigrams
    return trigram_probabilities

# Example usage:
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The quick blue fox jumps over the lazy cat",
    "The lazy dog sleeps under the blue sky"
]
trigram_model = compute_trigram_language_model(documents)
print("Trigram Probabilities:")
for trigram, prob in trigram_model.items():
    print(f"{trigram}: {prob}")
Output:
Trigram Probabilities:
('the', 'quick', 'brown'): 0.05
('quick', 'brown', 'fox'): 0.05
('brown', 'fox', 'jumps'): 0.05
('fox', 'jumps', 'over'): 0.1
('jumps', 'over', 'the'): 0.1
('over', 'the', 'lazy'): 0.1
('the', 'lazy', 'dog'): 0.1
('the', 'quick', 'blue'): 0.05
('quick', 'blue', 'fox'): 0.05
('blue', 'fox', 'jumps'): 0.05
('the', 'lazy', 'cat'): 0.05
('lazy', 'dog', 'sleeps'): 0.05
('dog', 'sleeps', 'under'): 0.05
('sleeps', 'under', 'the'): 0.05
('under', 'the', 'blue'): 0.05
('the', 'blue', 'sky'): 0.05
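Note: a minimal usage sketch, assuming trigram_model is the dictionary returned above. It looks up each trigram of a new sentence; trigrams unseen in the corpus simply get probability 0, since no smoothing is applied. score_sentence is an illustrative helper, not part of the lab program.

def score_sentence(sentence, trigram_model):
    words = sentence.lower().split()
    trigrams = [tuple(words[i:i + 3]) for i in range(len(words) - 2)]
    return [(t, trigram_model.get(t, 0.0)) for t in trigrams]  # unseen trigrams -> 0.0

for trigram, prob in score_sentence("the quick brown fox", trigram_model):
    print(trigram, prob)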
4. Creating a Word Embedding Matrix
Task:
1. Implement the function create_embedding_matrix(corpus, embedding_dim).
2. Test the function and get_word_vector with the given corpus and embedding_dim=3.
Code:
import numpy as np

def create_embedding_matrix(corpus, embedding_dim):
    # Preprocessing
    vocabulary = {}
    index = 0
    for sentence in corpus:
        words = sentence.lower().split()
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = index
                index += 1
    V = len(vocabulary)
    # Initialize embedding matrix with random values between 0 and 1
    E = np.random.rand(V, embedding_dim)
    # Create word to index mapping (already done in vocabulary)
    word_to_index = vocabulary
    # Define get_word_vector function
    def get_word_vector(word):
        word = word.lower()
        if word in word_to_index:
            idx = word_to_index[word]
            return E[idx]
        else:
            return np.zeros(embedding_dim)
    return E, vocabulary, get_word_vector
# Example usage:
corpus = [
"I love machine learning",
"Machine learning is amazing",
"I love learning new things"
]
embedding_dim = 3
E, vocabulary, get_word_vector = create_embedding_matrix(corpus, embedding_dim)
print("Vocabulary:", vocabulary)
print("Embedding Matrix E:\n", E)
# Test get_word_vector
word = "learning"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)
# Test with a word not in the vocabulary
word = "unknown"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)
Output:
Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
Embedding Matrix E:
[[0.70366694 0.37323165 0.8339942 ]
[0.30824863 0.25459773 0.29978671]
[0.17141767 0.55727104 0.19208332]
[0.36011277 0.62322428 0.86099527]
[0.1327652 0.03365305 0.35291037]
[0.80062233 0.84881622 0.73158583]
[0.70957902 0.75419446 0.53513209]
[0.78353907 0.28600711 0.20810742]]
Embedding for 'learning': [0.36011277 0.62322428 0.86099527]
Embedding for 'unknown': [0. 0. 0.]
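Note: a quick sketch of consuming the returned vectors, here via cosine similarity (cosine_similarity is an illustrative helper, not part of the lab program). Since the matrix is randomly initialized, the value itself is not meaningful; the sketch only shows how get_word_vector can be used downstream.

import numpy as np

def cosine_similarity(u, v):
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

print("sim(machine, learning):",
      cosine_similarity(get_word_vector("machine"), get_word_vector("learning")))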
5. Creating a Word Embedding Matrix with Pre-trained Embeddings
Task:
1. Implement the function create_embedding_matrix_with_pretrained(corpus,
pretrained_embeddings, embedding_dim).
2. Test the function with the given corpus and pre-trained embeddings.
Code:
import numpy as np

def create_embedding_matrix_with_pretrained(corpus, pretrained_embeddings, embedding_dim):
    # Preprocessing
    vocabulary = {}
    index = 0
    for sentence in corpus:
        words = sentence.lower().split()
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = index
                index += 1
    V = len(vocabulary)
    # Initialize embedding matrix
    E = np.zeros((V, embedding_dim))
    # Assign embeddings
    for word, idx in vocabulary.items():
        if word in pretrained_embeddings:
            E[idx] = np.array(pretrained_embeddings[word])
        else:
            E[idx] = np.random.rand(embedding_dim)  # Random initialization
    # Define get_word_vector function
    def get_word_vector(word):
        word = word.lower()
        if word in vocabulary:
            idx = vocabulary[word]
            return E[idx]
        else:
            return np.zeros(embedding_dim)
    return E, vocabulary, get_word_vector
# Example usage:
corpus = [
"I love machine learning",
"Machine learning is amazing",
"I love learning new things"
]
pretrained_embeddings = {
"machine": [0.1, 0.2, 0.3],
"learning": [0.2, 0.3, 0.4],
"amazing": [0.3, 0.4, 0.5],
"love": [0.4, 0.5, 0.6]
}
embedding_dim = 3
E, vocabulary, get_word_vector = create_embedding_matrix_with_pretrained(
corpus, pretrained_embeddings, embedding_dim)
print("Vocabulary:", vocabulary)
print("Embedding Matrix E:\n", E)
# Test get_word_vector
word = "machine"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)
word = "i"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector) # Randomly initialized
word = "unknown"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector) # Returns zeros
Output:
Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
Embedding Matrix E:
[[0.20433889 0.45932819 0.62836074]
[0.4 0.5 0.6 ]
[0.1 0.2 0.3 ]
[0.2 0.3 0.4 ]
[0.84748087 0.37440758 0.93981111]
[0.3 0.4 0.5 ]
[0.40474447 0.82834371 0.13308173]
[0.44601989 0.85308688 0.05198728]]
Embedding for 'machine': [0.1 0.2 0.3]
Embedding for 'i': [0.20433889 0.45932819 0.62836074]
Embedding for 'unknown': [0. 0. 0.]
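Note: in practice the pretrained_embeddings dictionary is usually parsed from a GloVe-style text file, where each line is a word followed by its vector components. A minimal parsing sketch is shown below; the embedded text reuses the toy values from the example above and is not a real pretrained file.

glove_like_text = """machine 0.1 0.2 0.3
learning 0.2 0.3 0.4
amazing 0.3 0.4 0.5
love 0.4 0.5 0.6"""

pretrained_embeddings = {}
for line in glove_like_text.splitlines():
    parts = line.split()
    pretrained_embeddings[parts[0]] = [float(x) for x in parts[1:]]  # word -> vector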
6. Generating One-Hot Encodings
Task:
1. Implement the function create_one_hot_encodings(corpus).
2. Test the function with the given corpus.
Code:
import numpy as np

def create_one_hot_encodings(corpus):
    # Preprocessing
    vocabulary = {}
    index = 0
    for sentence in corpus:
        words = sentence.lower().split()
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = index
                index += 1
    V = len(vocabulary)
    # Build a one-hot vector for each word in the vocabulary
    one_hot_encodings = {}
    for word, idx in vocabulary.items():
        one_hot_vector = np.zeros(V)
        one_hot_vector[idx] = 1
        one_hot_encodings[word] = one_hot_vector
    return vocabulary, one_hot_encodings
# Example usage:
corpus = [
"I love machine learning",
"Machine learning is amazing",
"I love learning new things"
]
vocabulary, one_hot_encodings = create_one_hot_encodings(corpus)
print("Vocabulary:", vocabulary)
print("\nOne-Hot Encodings:")
for word, one_hot_vector in one_hot_encodings.items():
    print(f"Word: '{word}' - One-Hot Vector: {one_hot_vector}")
Output:
Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
One-Hot Encodings:
Word: 'i' - One-Hot Vector: [1. 0. 0. 0. 0. 0. 0. 0.]
Word: 'love' - One-Hot Vector: [0. 1. 0. 0. 0. 0. 0. 0.]
Word: 'machine' - One-Hot Vector: [0. 0. 1. 0. 0. 0. 0. 0.]
Word: 'learning' - One-Hot Vector: [0. 0. 0. 1. 0. 0. 0. 0.]
Word: 'is' - One-Hot Vector: [0. 0. 0. 0. 1. 0. 0. 0.]
Word: 'amazing' - One-Hot Vector: [0. 0. 0. 0. 0. 1. 0. 0.]
Word: 'new' - One-Hot Vector: [0. 0. 0. 0. 0. 0. 1. 0.]
Word: 'things' - One-Hot Vector: [0. 0. 0. 0. 0. 0. 0. 1.]
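Note: the same one-hot vectors can be read off the rows of an identity matrix, which is a common shortcut. The sketch below reuses the vocabulary returned above; one_hot_encodings_eye is an illustrative name.

import numpy as np

identity = np.eye(len(vocabulary))
one_hot_encodings_eye = {word: identity[idx] for word, idx in vocabulary.items()}
print(one_hot_encodings_eye["machine"])  # [0. 0. 1. 0. 0. 0. 0. 0.]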
7. Implementing the Skip-Gram Model
Task:
1. Implement the function generate_skip_gram_pairs(sentences, window_size).
2. Test it with the given sentences and window_size = 2.
Code:
def generate_skip_gram_pairs(sentences, window_size):
    # Preprocessing: Build the vocabulary and word indices
    vocabulary = {}
    index = 0
    for sentence in sentences:
        words = sentence.lower().split()
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = index
                index += 1
    # Generate skip-gram training pairs
    training_pairs = []
    for sentence in sentences:
        words = sentence.lower().split()
        for i, target_word in enumerate(words):
            # Define the context window
            start = max(0, i - window_size)
            end = min(len(words), i + window_size + 1)
            for j in range(start, end):
                if i != j:
                    context_word = words[j]
                    training_pairs.append((target_word, context_word))
    return vocabulary, training_pairs
# Example usage:
sentences = [
"I love machine learning",
"Machine learning is amazing",
"I love learning new things"
]
window_size = 2
vocabulary, training_pairs = generate_skip_gram_pairs(sentences, window_size)
print("Vocabulary:", vocabulary)
print("\nSkip-Gram Training Pairs:")
for pair in training_pairs:
    print(pair)
Output:
Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
Skip-Gram Training Pairs:
('i', 'love')
('i', 'machine')
('love', 'i')
('love', 'machine')
('love', 'learning')
('machine', 'i')
('machine', 'love')
('machine', 'learning')
('learning', 'love')
('learning', 'machine')
('machine', 'learning')
('machine', 'is')
('learning', 'machine')
('learning', 'is')
('learning', 'amazing')
('is', 'machine')
('is', 'learning')
('is', 'amazing')
('amazing', 'learning')
('amazing', 'is')
('i', 'love')
('i', 'learning')
('love', 'i')
('love', 'learning')
('love', 'new')
('learning', 'i')
('learning', 'love')
('learning', 'new')
('learning', 'things')
('new', 'love')
('new', 'learning')
('new', 'things')
('things', 'learning')
('things', 'new')
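Note: before training, the word pairs are typically mapped to integer indices with the vocabulary returned above; a minimal sketch (indexed_pairs is an illustrative name):

indexed_pairs = [(vocabulary[target], vocabulary[context]) for target, context in training_pairs]
print(indexed_pairs[:5])  # [(0, 1), (0, 2), (1, 0), (1, 2), (1, 3)]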
8. Generating CBOW Training Pairs
Task:
1. Implement the function generate_cbow_pairs(sentences, window_size).
2. Test it with the given sentences and window_size = 2.
Code:
def generate_cbow_pairs(sentences, window_size):
    # Preprocessing: Build the vocabulary and word indices
    vocabulary = {}
    index = 0
    for sentence in sentences:
        words = sentence.lower().split()
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = index
                index += 1
    # Generate CBOW training pairs
    training_pairs = []
    for sentence in sentences:
        words = sentence.lower().split()
        for i, target_word in enumerate(words):
            # Define the context window
            start = max(0, i - window_size)
            end = min(len(words), i + window_size + 1)
            context_words = []
            for j in range(start, end):
                if i != j:
                    context_words.append(words[j])
            if context_words:
                training_pairs.append((tuple(context_words), target_word))
    return vocabulary, training_pairs
# Example usage:
sentences = [
"I love machine learning",
"Machine learning is amazing",
"I love learning new things"
]
window_size = 2
vocabulary, training_pairs = generate_cbow_pairs(sentences, window_size)
print("Vocabulary:", vocabulary)
print("\nCBOW Training Pairs:")
for pair in training_pairs:
    print(f"Context: {pair[0]}, Target: {pair[1]}")
Output:
Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
CBOW Training Pairs:
Context: ('love', 'machine'), Target: i
Context: ('i', 'machine', 'learning'), Target: love
Context: ('i', 'love', 'learning'), Target: machine
Context: ('love', 'machine'), Target: learning
Context: ('learning', 'is'), Target: machine
Context: ('machine', 'is', 'amazing'), Target: learning
Context: ('machine', 'learning', 'amazing'), Target: is
Context: ('learning', 'is'), Target: amazing
Context: ('love', 'learning'), Target: i
Context: ('i', 'learning', 'new'), Target: love
Context: ('i', 'love', 'new', 'things'), Target: learning
Context: ('love', 'learning', 'things'), Target: new
Context: ('learning', 'new'), Target: things
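Note: as with the skip-gram pairs, the CBOW pairs are usually converted to indices before training; a minimal sketch using the vocabulary returned above (indexed_cbow is an illustrative name):

indexed_cbow = [([vocabulary[w] for w in context], vocabulary[target])
                for context, target in training_pairs]
print(indexed_cbow[0])  # ([1, 2], 0): context ('love', 'machine'), target 'i'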
9. Implementing a Simple Vanilla RNN
Task:
1. Implement the function rnn_forward(x, Wxh, Whh, Why, bh, by, h0).
2. Test the function with random weights, biases, and an initial hidden state.
Code:
import numpy as np

def rnn_forward(x, Wxh, Whh, Why, bh, by, h0):
    h = h0
    hs = []
    ys = []
    for t in range(len(x)):
        xt = np.array([[x[t]]])  # Input at time t (make it a column vector)
        h = np.tanh(np.dot(Whh, h) + np.dot(Wxh, xt) + bh)  # Hidden state
        y = np.dot(Why, h) + by  # Output
        hs.append(h)
        ys.append(y)
    return ys, hs
# Example usage:
# Input sequence
x = [1, 2, 3]
# Hyperparameters
input_size = 1 # Since x is a sequence of numbers
hidden_size = 4 # You can choose any size for hidden state
output_size = 1 # Output is a single number at each time step
# Random initialization of weights and biases
np.random.seed(0) # For reproducibility
Wxh = np.random.randn(hidden_size, input_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(output_size, hidden_size) * 0.01
bh = np.zeros((hidden_size, 1))
by = np.zeros((output_size, 1))
h0 = np.zeros((hidden_size, 1))
# Run the RNN forward function
ys, hs = rnn_forward(x, Wxh, Whh, Why, bh, by, h0)
print("Outputs at each time step:")
for t, y in enumerate(ys):
    print(f"Time step {t+1}: y = {y.flatten()}")
Output:
Outputs at each time step:
Time step 1: y = [-0.00050584]
Time step 2: y = [-0.00101643]
Time step 3: y = [-0.00152624]
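Note: because h0 is all zeros and the first input is 1, the first time step reduces to h1 = tanh(Wxh) and y1 = Why·h1 + by, which gives a quick way to sanity-check the implementation. A minimal sketch using the arrays defined above:

# First-step sanity check: with h0 = 0 and x[0] = 1, the recurrence simplifies
h1 = np.tanh(np.dot(Wxh, np.array([[1]])) + np.dot(Whh, h0) + bh)
y1 = np.dot(Why, h1) + by
print(y1.flatten())  # should match the Time step 1 output above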
10. Implementing the Self-Attention Mechanism using NumPy
Task: Write a Python function that implements the self-attention mechanism using only NumPy.
Code:
import numpy as np

def softmax(x, axis=-1):
    """Compute the softmax of each element along the specified axis of x."""
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))  # For numerical stability
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def self_attention(X, Wq, Wk, Wv):
    """
    Implement the self-attention mechanism.

    Args:
        X: Input matrix of shape (n, d), where n is the number of input vectors
           and d is the dimension of each vector.
        Wq: Query weight matrix of shape (d, dout).
        Wk: Key weight matrix of shape (d, dout).
        Wv: Value weight matrix of shape (d, dout).

    Returns:
        Output matrix of shape (n, dout).
    """
    # Compute Queries (Q), Keys (K), and Values (V)
    Q = np.dot(X, Wq)  # Shape: (n, dout)
    K = np.dot(X, Wk)  # Shape: (n, dout)
    V = np.dot(X, Wv)  # Shape: (n, dout)
    # Compute attention scores: Q * K.T, then scale by sqrt(dout)
    d_k = Q.shape[1]  # dout
    attention_scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Shape: (n, n)
    # Apply softmax to attention scores
    attention_weights = softmax(attention_scores, axis=-1)  # Shape: (n, n)
    # Compute final output: Attention weights * V
    output = np.dot(attention_weights, V)  # Shape: (n, dout)
    return output
# Example usage:
np.random.seed(0) # For reproducibility
# Input matrix X (n=4 vectors, d=3 features per vector)
X = np.random.rand(4, 3) # Shape: (4, 3)
# Learnable weight matrices Wq, Wk, Wv
d=3 # Input dimension
dout = 2 # Output dimension
Wq = np.random.rand(d, dout) # Shape: (3, 2)
Wk = np.random.rand(d, dout) # Shape: (3, 2)
Wv = np.random.rand(d, dout) # Shape: (3, 2)
# Call the self_attention function
output = self_attention(X, Wq, Wk, Wv)
print("Input Matrix X:")
print(X)
print("\nWeight Matrix Wq:")
print(Wq)
print("\nWeight Matrix Wk:")
print(Wk)
print("\nWeight Matrix Wv:")
print(Wv)
print("\nSelf-Attention Output:")
print(output)
Output:
Input Matrix X:
[[0.5488135 0.71518937 0.60276338]
[0.54488318 0.4236548 0.64589411]
[0.43758721 0.891773 0.96366276]
[0.38344152 0.79172504 0.52889492]]
Weight Matrix Wq:
[[0.56804456 0.92559664]
[0.07103606 0.0871293 ]
[0.0202184 0.83261985]]
Weight Matrix Wk:
[[0.77815675 0.87001215]
[0.97861834 0.79915856]
[0.46147936 0.78052918]]
Weight Matrix Wv:
[[0.11827443 0.63992102]
[0.14335329 0.94466892]
[0.52184832 0.41466194]]
Self-Attention Output:
[[0.53569849 1.29450415]
[0.53551973 1.29413435]
[0.53849796 1.29925955]
[0.53131543 1.28657939]]
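Note: a quick verification sketch using the arrays defined above: each row of the attention-weight matrix should sum to 1, since the softmax is applied row-wise.

Q, K = np.dot(X, Wq), np.dot(X, Wk)
attention_weights = softmax(np.dot(Q, K.T) / np.sqrt(Q.shape[1]), axis=-1)
print(attention_weights.sum(axis=1))  # each row sums to 1 (up to floating-point precision)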