# NOTE(review): the three lines below were a non-code artifact captured from the
# Hugging Face Spaces page header ("Spaces: / Sleeping / Sleeping"), preserved
# here as a comment so the module parses as valid Python.
import pickle

import gradio as gr
import numpy as np
import tensorflow as tf
from keras.optimizers import Adam
from PIL import Image
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# ---------------------------------------------------------------------------
# Image encoder: EfficientNetB7 with ImageNet weights, classifier head
# removed, frozen, and capped with global average pooling so each image maps
# to a single feature vector.  (The original comment said "vgg16", which did
# not match the code.)
# ---------------------------------------------------------------------------
pre_trained = EfficientNetB7(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
pre_trained.trainable = False  # freeze the backbone; inference only
x = tf.keras.layers.GlobalAveragePooling2D()(pre_trained.output)
pre_trained_model = Model(inputs=pre_trained.input, outputs=x)

# ---------------------------------------------------------------------------
# Caption decoder: architecture restored from JSON, weights from HDF5.
# ---------------------------------------------------------------------------
with open("30k_model_architecture.json", "r") as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
model.load_weights("30k_model_weights.h5")

# Recompile so the model is usable; note the optimizer *state* is not
# actually restored here — only a fresh Adam with lr=0.001 is attached.
optimizer = Adam()
optimizer.learning_rate.assign(0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# ---------------------------------------------------------------------------
# Tokenizer fitted on the training captions (word <-> index mappings).
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# tokenizer pickles produced by this project, never untrusted uploads.
# (The dead `tokenizer = Tokenizer()` that was immediately overwritten by the
# pickle load has been removed.)
# ---------------------------------------------------------------------------
with open("Image_Captioner_tokenizer_30k.pkl", "rb") as f:
    tokenizer = pickle.load(f)
def idx_to_word(integer, tokenizer):
    """Map a predicted token index back to its word.

    Args:
        integer: token index produced by the model (argmax over the vocab).
        tokenizer: fitted Keras ``Tokenizer`` used at training time.

    Returns:
        The corresponding word, or ``None`` if the index is unknown.
    """
    # A fitted Tokenizer maintains `index_word`, the exact inverse of
    # `word_index`.  Using it is an O(1) dict lookup instead of the original
    # O(vocab_size) linear scan over word_index.items() — this runs once per
    # generated token, so the scan was a real per-step cost.
    return tokenizer.index_word.get(integer)
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedy-decode a caption for one pre-extracted image feature vector.

    Args:
        model: trained captioning model taking ``[image_features, sequence]``.
        image: encoder feature vector for one image (with batch dimension).
        tokenizer: fitted Keras ``Tokenizer`` used at training time.
        max_length: caption length the model was trained with (pad length).

    Returns:
        The generated caption string with the ``startseq``/``endseq`` tags
        stripped.
    """
    # Seed the decoder with the start tag and grow the text one word at a time.
    in_text = 'startseq'
    for _ in range(max_length):
        # Encode the partial caption and pad it to the training length.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict the next-word distribution and take the most likely index.
        yhat = model.predict([image, sequence], verbose=0)
        word = idx_to_word(np.argmax(yhat), tokenizer)
        if word is None:
            # Index not in the vocabulary: nothing sensible to append.
            break
        in_text += " " + word
        if word == 'endseq':
            break
    # Drop the leading 'startseq' and, only if present, the trailing 'endseq'.
    # The original unconditionally dropped the last word ([1:-1]), which
    # deleted a *real* word whenever decoding stopped at max_length (or on an
    # unknown index) without ever emitting 'endseq'.
    words = in_text.split()[1:]
    if words and words[-1] == 'endseq':
        words = words[:-1]
    return ' '.join(words)
def google_image_testing(inp):
    """Gradio handler: caption an uploaded image.

    Args:
        inp: image as a numpy array, as delivered by the Gradio 'image' input.

    Returns:
        The generated caption string.
    """
    # Persist the upload as a JPEG and re-read it through Keras' loader.
    # NOTE: the JPEG round-trip is lossy and part of the pipeline's observed
    # behavior (features are extracted from the re-encoded pixels).
    Image.fromarray(inp).save("input_image.jpg")
    img = load_img("input_image.jpg", target_size=(224, 224))

    # HWC pixel array -> batched NHWC tensor, then EfficientNet preprocessing.
    arr = img_to_array(img)
    arr = arr.reshape((1,) + arr.shape)
    arr = preprocess_input(arr)

    # Encode the image once, then greedy-decode the caption from the features.
    img_feature = pre_trained_model.predict(arr, verbose=0)
    return predict_caption(model, img_feature, tokenizer, max_length=74)
# Wire the captioning handler into a minimal Gradio UI and serve it.
demo = gr.Interface(
    fn=google_image_testing,
    inputs='image',
    outputs='text',
    title='Image Captioner',
)
demo.launch(debug=True, share=True)