# Install gensim (Word2Vec) and NLTK (tokenization)
!pip install gensim nltk
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Download tokenizer resources
nltk.download("punkt")
# ----------------------------
# 1. Sample Document Corpus
# ----------------------------
corpus = [
    "Deep learning is a subset of machine learning",
    "Neural networks are the backbone of deep learning",
    "Word embeddings capture semantic meaning of words",
    "Reinforcement learning is used for decision making",
    "Python is widely used for machine learning and AI"
]
# ----------------------------
# 2. Preprocessing - Tokenize
# ----------------------------
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]
print("Tokenized Corpus:", tokenized_corpus)
# ----------------------------
# 3. Train Word2Vec Model
# ----------------------------
model = Word2Vec(
    sentences=tokenized_corpus,  # input text
    vector_size=50,              # embedding dimensions
    window=3,                    # context window size
    sg=1,                        # 1 = Skip-gram, 0 = CBOW
    min_count=1,                 # minimum word frequency (1 = keep every word)
    workers=4,                   # parallel training threads
    epochs=100                   # training iterations
)
# ----------------------------
# 4. Save and Load Model
# ----------------------------
model.save("word2vec_model.model")
loaded_model = Word2Vec.load("word2vec_model.model")
# ----------------------------
# 5. Generate Embeddings
# ----------------------------
word = "learning"
vector = loaded_model.wv[word]
print(f"\nVector Representation of '{word}':\n", vector)
# ----------------------------
# 6. Find Similar Words
# ----------------------------
similar_words = loaded_model.wv.most_similar("learning")
print("\nWords similar to 'learning':")
for word, score in similar_words:
    print(f"{word} → {score:.4f}")
Tokenized Corpus: [['deep', 'learning', 'is', 'a', 'subset', 'of', 'machine', 'learning'],
['neural', 'networks', 'are', 'the', 'backbone', 'of', 'deep', 'learning'],
['word', 'embeddings', 'capture', 'semantic', 'meaning', 'of', 'words'],
['reinforcement', 'learning', 'is', 'used', 'for', 'decision', 'making'],
['python', 'is', 'widely', 'used', 'for', 'machine', 'learning', 'and', 'ai']]
Vector Representation of 'learning':
[-2.59147864e-03 7.30598229e-04 9.51948576e-03 1.91300642e-02
-1.95838269e-02 -1.64280050e-02 1.37428828e-02 1.97431687e-02
-1.25339665e-02 -9.08659119e-03 1.46197099e-02 -4.21433197e-03
-7.07245385e-03 1.23048238e-02 -9.53463651e-03 -2.26634275e-03
8.85236356e-03 3.58688971e-03 -2.10996512e-02 -2.25846618e-02
1.52685875e-02 1.20135937e-02 1.80702936e-02 1.89628452e-03
1.38820671e-02 -6.68328675e-03 -2.52554868e-03 1.27551164e-02
-1.76950172e-02 -7.37456046e-03 -1.38710439e-02 -2.95944279e-03
2.02024095e-02 -1.53445862e-02 -5.59908012e-03 -4.49628336e-03
1.78563129e-02 -1.22257946e-02 1.05311898e-04 -8.80396646e-03
-1.91957895e-02 1.06142303e-02 -1.88530274e-02 -8.87032412e-03
3.55112040e-03 4.20613615e-05 -1.50183570e-02 1.92238390e-02
1.15314703e-02 1.92625672e-02]
Words similar to 'learning':
semantic → 0.3309
ai → 0.2913
networks → 0.2487
making → 0.2378
of → 0.2287
the → 0.2098
backbone → 0.1948
decision → 0.1834
and → 0.1460
subset → 0.1449
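# ----------------------------
# 7. (Optional) Pairwise Similarity - illustrative sketch, not part of the original run
# ----------------------------
# Assumes the `loaded_model` trained above is still in memory; on a five-sentence
# corpus the scores are noisy and will differ between runs.
print("similarity(deep, learning):", loaded_model.wv.similarity("deep", "learning"))
print("similarity(python, ai):", loaded_model.wv.similarity("python", "ai"))
print("Vocabulary size:", len(loaded_model.wv.index_to_key))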
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train/255.0, x_test/255.0
y_train, y_test = to_categorical(y_train), to_categorical(y_test)
# Build DNN model
model = Sequential([
    Flatten(input_shape=(28, 28)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax')
])
# Compile and train
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32, verbose=1)
# Evaluate
print("Test Accuracy:", model.evaluate(x_test, y_test)[1]*100, "%")
/usr/local/lib/python3.12/dist-packages/keras/src/layers/reshaping/flatten.py:37: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(**kwargs)
Epoch 1/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 7s 3ms/step - accuracy: 0.8779 - loss: 0.4226
Epoch 2/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 7s 4ms/step - accuracy: 0.9685 - loss: 0.1057
Epoch 3/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 8s 4ms/step - accuracy: 0.9786 - loss: 0.0689
Epoch 4/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 7s 4ms/step - accuracy: 0.9841 - loss: 0.0505
Epoch 5/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - accuracy: 0.9873 - loss: 0.0394
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.9738 - loss: 0.0891
Test Accuracy: 97.69999980926514 %
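# (Optional) Inspect a single prediction - a minimal sketch assuming the trained
# `model`, `x_test`, and `y_test` from the cell above are still in memory.
import numpy as np
probs = model.predict(x_test[:1])  # softmax probabilities for the first test image
print("Predicted digit:", np.argmax(probs[0]), "| True digit:", np.argmax(y_test[0]))
# Note: the UserWarning above can be avoided by starting the model with an explicit
# Input layer, e.g. Sequential([tf.keras.Input(shape=(28, 28)), Flatten(), ...]).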
# Import libraries
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
# Load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Reshape and normalize
x_train = x_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
x_test = x_test.reshape(-1, 28, 28, 1).astype('float32') / 255.0
# One-hot encoding of labels
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)
# Build CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])
# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Train model
model.fit(x_train, y_train, epochs=5, batch_size=32, verbose=1)
# Evaluate model
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test Accuracy:", test_acc * 100, "%")
/usr/local/lib/python3.12/dist-packages/keras/src/layers/convolutional/base_conv.py:113: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Epoch 1/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 50s 26ms/step - accuracy: 0.9123 - loss: 0.2924
Epoch 2/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 79s 25ms/step - accuracy: 0.9874 - loss: 0.0430
Epoch 3/5
1875/1875 ━━━━━━━━━━━━━━━━━━━━ 47s 25ms/step - accuracy: 0.9908 - loss: 0.0272
Epoch 4/5
964/1875 ━━━━━━━━━━━━━━━━━━━━ 23s 25ms/step - accuracy: 0.9933 - loss: 0.0185
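# (Optional) Inspect the trained CNN - a minimal sketch assuming the `model`,
# `x_test`, and `y_test` defined above are in memory and training has finished.
import numpy as np
model.summary()  # layer output shapes and parameter counts
pred = model.predict(x_test[:1])
print("Predicted digit:", np.argmax(pred[0]), "| True digit:", np.argmax(y_test[0]))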
# Install required packages (if not available)
!pip install tensorflow matplotlib
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
# ----------------------------
# 1. Load and preprocess dataset
# ----------------------------
(x_train, _), (x_test, _) = tf.keras.datasets.mnist.load_data()
# Normalize and flatten
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0
x_train = x_train.reshape(len(x_train), 28*28)
x_test = x_test.reshape(len(x_test), 28*28)
# ----------------------------
# 2. Build Encoder
# ----------------------------
encoder = models.Sequential([
    layers.Input(shape=(784,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu')  # compressed representation
])
# ----------------------------
# 3. Build Decoder
# ----------------------------
decoder = models.Sequential([
    layers.Input(shape=(32,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(784, activation='sigmoid')  # reconstruct image
])
# ----------------------------
# 4. Build Autoencoder (Encoder + Decoder)
# ----------------------------
autoencoder = models.Sequential([encoder, decoder])
autoencoder.compile(optimizer='adam', loss='mse')
# ----------------------------
# 5. Train Autoencoder
# ----------------------------
history = autoencoder.fit(
    x_train, x_train,
    epochs=5,
    batch_size=256,
    shuffle=True,
    validation_data=(x_test, x_test)
)
# ----------------------------
# 6. Test Compression & Reconstruction
# ----------------------------
encoded_imgs = encoder.predict(x_test[:10])      # 32-dimensional compressed codes
decoded_imgs = autoencoder.predict(x_test[:10])  # reconstructed 784-pixel images
# ----------------------------
# 7. Visualize Original vs Reconstructed Images
# ----------------------------
n = 10 # number of images to display
plt.figure(figsize=(20, 4))
for i in range(n):
    # Original
    ax = plt.subplot(2, n, i + 1)
    plt.imshow(x_test[i].reshape(28, 28), cmap="gray")
    plt.title("Original")
    plt.axis("off")
    # Reconstructed
    ax = plt.subplot(2, n, i + 1 + n)
    plt.imshow(decoded_imgs[i].reshape(28, 28), cmap="gray")
    plt.title("Reconstructed")
    plt.axis("off")
plt.show()
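# ----------------------------
# 8. (Optional) Reconstruction Error - illustrative sketch, not part of the original run
# ----------------------------
# Assumes the trained `autoencoder` and `x_test` from above are in memory. The mean
# squared error per image measures how faithfully each digit is reconstructed;
# unusually large values are a common signal in anomaly detection.
import numpy as np
reconstructions = autoencoder.predict(x_test[:10])
mse_per_image = np.mean((x_test[:10] - reconstructions) ** 2, axis=1)
for i, err in enumerate(mse_per_image):
    print(f"Image {i}: reconstruction MSE = {err:.5f}")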