image-captions
January 6, 2023
[1]: import os
import pickle
import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
[2]: BASE_DIR = '/kaggle/input/flickr8k'
WORKING_DIR = '/kaggle/working'
[3]: # load the VGG16 model
model = VGG16()
# Restructure the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summarize
print(model.summary())
2023-01-06 15:09:59.937887: I
tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool
with default inter op setting: 2. Tune using inter_op_parallelism_threads for
best performance.
Downloading data from https://storage.googleapis.com/tensorflow/keras-
applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
553467904/553467096 [==============================] - 3s 0us/step
553476096/553467096 [==============================] - 3s 0us/step
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 224, 224, 3)] 0
_________________________________________________________________
block1_conv1 (Conv2D) (None, 224, 224, 64) 1792
_________________________________________________________________
block1_conv2 (Conv2D) (None, 224, 224, 64) 36928
_________________________________________________________________
block1_pool (MaxPooling2D) (None, 112, 112, 64) 0
_________________________________________________________________
block2_conv1 (Conv2D) (None, 112, 112, 128) 73856
_________________________________________________________________
block2_conv2 (Conv2D) (None, 112, 112, 128) 147584
_________________________________________________________________
block2_pool (MaxPooling2D) (None, 56, 56, 128) 0
_________________________________________________________________
block3_conv1 (Conv2D) (None, 56, 56, 256) 295168
_________________________________________________________________
block3_conv2 (Conv2D) (None, 56, 56, 256) 590080
_________________________________________________________________
block3_conv3 (Conv2D) (None, 56, 56, 256) 590080
_________________________________________________________________
block3_pool (MaxPooling2D) (None, 28, 28, 256) 0
_________________________________________________________________
block4_conv1 (Conv2D) (None, 28, 28, 512) 1180160
_________________________________________________________________
block4_conv2 (Conv2D) (None, 28, 28, 512) 2359808
_________________________________________________________________
block4_conv3 (Conv2D) (None, 28, 28, 512) 2359808
_________________________________________________________________
block4_pool (MaxPooling2D) (None, 14, 14, 512) 0
_________________________________________________________________
block5_conv1 (Conv2D) (None, 14, 14, 512) 2359808
_________________________________________________________________
block5_conv2 (Conv2D) (None, 14, 14, 512) 2359808
_________________________________________________________________
block5_conv3 (Conv2D) (None, 14, 14, 512) 2359808
_________________________________________________________________
block5_pool (MaxPooling2D) (None, 7, 7, 512) 0
_________________________________________________________________
flatten (Flatten) (None, 25088) 0
_________________________________________________________________
fc1 (Dense) (None, 4096) 102764544
_________________________________________________________________
fc2 (Dense) (None, 4096) 16781312
=================================================================
Total params: 134,260,544
Trainable params: 134,260,544
Non-trainable params: 0
_________________________________________________________________
None
[4]: # extract features from each image
features = {}
directory = os.path.join(BASE_DIR, 'Images')
for img_name in tqdm(os.listdir(directory)):
    # load the image file
    img_path = directory + '/' + img_name
    image = load_img(img_path, target_size=(224, 224))
    # convert image pixels to a numpy array
    image = img_to_array(image)
    # reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for VGG
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # get image ID (filename without extension)
    image_id = img_name.split('.')[0]
    # store feature
    features[image_id] = feature
0%| | 0/8091 [00:00<?, ?it/s]
2023-01-06 15:10:04.952115: I
tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR
Optimization Passes are enabled (registered 2)
[5]: # store features in pickle
pickle.dump(features, open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb'))
[6]: # load features from pickle
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)
[7]: # load captions
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)  # skip the header line
    captions_doc = f.read()
[8]: # create mapping of image to captions
mapping = {}
# process lines
for line in tqdm(captions_doc.split('\n')):
    # split the line by comma (,)
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image id
    image_id = image_id.split('.')[0]
    # convert caption list back to a string
    caption = " ".join(caption)
    # create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)
0%| | 0/40456 [00:00<?, ?it/s]
[9]: len(mapping)
[9]: 8091
[10]: # caption preprocessing function
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # convert to lower case
            caption = caption.lower()
            # delete digits, special characters, etc.
            caption = caption.replace('[^A-Za-z]', '')
            # delete additional spaces
            caption = caption.replace('\s+', ' ')
            # add start and end tags to the caption
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            captions[i] = caption
[11]: # captions before text preprocessing
mapping['1001773457_577c3a7d70']
[11]: ['A black dog and a spotted dog are fighting',
'A black dog and a tri-colored dog playing with each other on the road .',
'A black dog and a white dog with brown spots are staring at each other in the
street .',
'Two dogs of different breeds looking at each other on the road .',
'Two dogs on pavement moving toward each other .']
[12]: # preprocess the text
clean(mapping)
[13]: # captions after text preprocessing
mapping['1001773457_577c3a7d70']
[13]: ['startseq black dog and spotted dog are fighting endseq',
'startseq black dog and tri-colored dog playing with each other on the road
endseq',
'startseq black dog and white dog with brown spots are staring at each other in
the street endseq',
'startseq two dogs of different breeds looking at each other on the road
endseq',
'startseq two dogs on pavement moving toward each other endseq']
[14]: all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)
[15]: len(all_captions)
[15]: 40455
[16]: all_captions[:10]
[16]: ['startseq child in pink dress is climbing up set of stairs in an entry way
endseq',
'startseq girl going into wooden building endseq',
'startseq little girl climbing into wooden playhouse endseq',
'startseq little girl climbing the stairs to her playhouse endseq',
'startseq little girl in pink dress going into wooden cabin endseq',
'startseq black dog and spotted dog are fighting endseq',
'startseq black dog and tri-colored dog playing with each other on the road
endseq',
'startseq black dog and white dog with brown spots are staring at each other in
the street endseq',
'startseq two dogs of different breeds looking at each other on the road
endseq',
'startseq two dogs on pavement moving toward each other endseq']
[17]: # tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size=len(tokenizer.word_index)+1
[18]: vocab_size
[18]: 8485
[19]: # get maximum length of the captions available
max_length = max(len(caption.split()) for caption in all_captions)
max_length
[19]: 35
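The fitted tokenizer and max_length are needed again whenever captions are generated in a later session. A minimal sketch for persisting the tokenizer, mirroring the features.pkl step above (tokenizer.pkl is an illustrative file name, not part of the original run):
# save the fitted tokenizer so the same word indices are used at inference time
pickle.dump(tokenizer, open(os.path.join(WORKING_DIR, 'tokenizer.pkl'), 'wb'))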
# Train Test Split
[20]: image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]
[21]: # create data generator to get data in batches (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # one-hot encode the output word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0
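As a sanity check (not part of the original run), pulling one batch from the generator shows the structure the model expects: image features of size 4096, padded input sequences of length max_length, and one-hot next-word targets of size vocab_size.
# draw a single batch and inspect its shapes (a batch_size of 4 is arbitrary here)
sample_gen = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, 4)
(x1_batch, x2_batch), y_batch = next(sample_gen)
print(x1_batch.shape, x2_batch.shape, y_batch.shape)
# e.g. (N, 4096) (N, 35) (N, 8485), where N is the number of word pairs in the batch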
[22]: # encoder model
# image feature layer
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layer
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)
#decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# plot the model
plot_model(model, show_shapes=True)
[22]: (plot_model output: diagram of the two-input encoder-decoder architecture)
[23]: # train the model
epochs = 20
batch_size = 32
steps = len(train) // batch_size
for i in range(epochs):
    # create data generator
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
227/227 [==============================] - 549s 2s/step - loss: 5.2256
227/227 [==============================] - 552s 2s/step - loss: 4.0186
227/227 [==============================] - 536s 2s/step - loss: 3.5946
227/227 [==============================] - 544s 2s/step - loss: 3.3291
227/227 [==============================] - 543s 2s/step - loss: 3.1336
227/227 [==============================] - 544s 2s/step - loss: 2.9820
227/227 [==============================] - 542s 2s/step - loss: 2.8589
227/227 [==============================] - 549s 2s/step - loss: 2.7625
227/227 [==============================] - 540s 2s/step - loss: 2.6836
227/227 [==============================] - 533s 2s/step - loss: 2.6126
227/227 [==============================] - 539s 2s/step - loss: 2.5536
227/227 [==============================] - 545s 2s/step - loss: 2.4991
227/227 [==============================] - 538s 2s/step - loss: 2.4554
227/227 [==============================] - 540s 2s/step - loss: 2.4116
227/227 [==============================] - 534s 2s/step - loss: 2.3653
227/227 [==============================] - 529s 2s/step - loss: 2.3223
227/227 [==============================] - 533s 2s/step - loss: 2.2887
227/227 [==============================] - 531s 2s/step - loss: 2.2525
227/227 [==============================] - 621s 3s/step - loss: 2.2214
227/227 [==============================] - 569s 3s/step - loss: 2.1934
[24]: # save the model
model.save(WORKING_DIR+'/best_model.h5')
/opt/conda/lib/python3.7/site-packages/keras/utils/generic_utils.py:497:
CustomMaskWarning: Custom mask layers require a config and must override
get_config. When loading, the custom mask layer must be passed to the
custom_objects argument.
category=CustomMaskWarning)
[25]: def idx_to_word(integer, tokenizer):
    # map a predicted index back to its word
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
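Equivalently, the Keras Tokenizer already exposes a reverse mapping, so the linear scan above can be replaced with a direct dictionary lookup (same behavior, assuming the same tokenizer object):
def idx_to_word(integer, tokenizer):
    # index_word maps integer indices back to words; get() returns None if absent
    return tokenizer.index_word.get(integer)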
[26]: # generate a caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag to begin the generation process
    in_text = 'startseq'
    # iterate over the maximum sequence length
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # predict the next word
        yhat = model.predict([image, sequence], verbose=0)
        # get the index with the highest probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if the word is not found
        if word is None:
            break
        # append word as input for generating the next word
        in_text += " " + word
        # stop if we reach the end tag
        if word == 'endseq':
            break
    return in_text
[27]: from nltk.translate.bleu_score import corpus_bleu
# validate with test data
actual, predicted = list(), list()
for key in tqdm(test):
    # get actual captions
    captions = mapping[key]
    # predict the caption for the image
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    # split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    # append to the lists
    actual.append(actual_captions)
    predicted.append(y_pred)
# calculate BLEU scores
print("BLEU-1:%f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2:%f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
0%| | 0/810 [00:00<?, ?it/s]
BLEU-1:0.535393
BLEU-2:0.308639
[28]: from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    # load the image
    image_id = image_name.split('.')[0]
    img_path = os.path.join(BASE_DIR, "Images", image_name)
    image = Image.open(img_path)
    captions = mapping[image_id]
    print('-------------Actual------------')
    for caption in captions:
        print(caption)
    # predict a caption for the image
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------Predicted---------------')
    print(y_pred)
    plt.imshow(image)
[29]: generate_caption("1019604187_d087bf9a5f.jpg")
-------------Actual------------
startseq dog prepares to catch thrown object in field with nearby cars endseq
startseq white dog is about to catch yellow ball in its mouth endseq
startseq white dog is about to catch yellow dog toy endseq
startseq white dog is ready to catch yellow ball flying through the air endseq
startseq white dog running after yellow ball endseq
--------------Predicted---------------
startseq white dog with white vest is running through the grass endseq
[30]: generate_caption("103106960_e8a41d64f8.jpg")
-------------Actual------------
startseq boy with stick kneeling in front of goalie net endseq
startseq child in red jacket playing street hockey guarding goal endseq
startseq young kid playing the goalie in hockey rink endseq
startseq young male kneeling in front of hockey goal with hockey stick in his
right hand endseq
startseq "hockey goalie boy in red jacket crouches by goal with stick ." endseq
--------------Predicted---------------
startseq little boy in red shirt is walking on playground endseq
[31]: generate_caption("106490881_5a2dd9b7bd.jpg")
-------------Actual------------
startseq boy in his blue swim shorts at the beach endseq
startseq boy smiles for the camera at beach endseq
startseq young boy in swimming trunks is walking with his arms outstretched on
the beach endseq
startseq children playing on the beach endseq
startseq the boy is playing on the shore of an ocean endseq
--------------Predicted---------------
startseq boy in green shirt is walking on the beach endseq