!pip install transformers  # (Colab)
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# BLIP-2 checkpoints live under the 'blip2-' prefix on the Hub, and the
# processor must be paired with the BLIP-2 (not BLIP-1) model class
processor = Blip2Processor.from_pretrained('Salesforce/blip2-flan-t5-xl')
model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-flan-t5-xl')
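# Quick smoke test of the captioner. This is only a sketch: 'example.png'
# is a placeholder path, not a file from the competition dataset.
from PIL import Image

image = Image.open('example.png').convert('RGB')
inputs = processor(images=image, return_tensors='pt')
generated = model.generate(**inputs, max_new_tokens=32)
print(processor.batch_decode(generated, skip_special_tokens=True)[0])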
!pip install open_clip_torch
!pip install clip-interrogator==0.6.0
import open_clip

# Pretrained tag uses underscores: 'laion2b_s32b_b79k'
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-H-14', pretrained='laion2b_s32b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-H-14')
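# clip-interrogator wraps this same ViT-H-14/laion2b checkpoint; a minimal
# sketch of calling it directly ('example.png' is again a placeholder):
from PIL import Image
from clip_interrogator import Config, Interrogator

ci = Interrogator(Config(clip_model_name='ViT-H-14/laion2b_s32b_b79k'))
print(ci.interrogate(Image.open('example.png').convert('RGB')))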
import os

# Filenames are '<imgId>.png'; strip the extension to recover the ids
images = os.listdir('/kaggle/input/stable-diffusion-image-to-prompts/images/')
imgIds = [i.split('.')[0] for i in images]
import numpy as np

EMBEDDING_LENGTH = 384  # all-MiniLM-L6-v2 embeddings are 384-dimensional
eIds = list(range(EMBEDDING_LENGTH))

# One row per (image, embedding dimension) pair: '<imgId>_<eId>'
imgId_eId = [
    '_'.join(map(str, i)) for i in zip(
        np.repeat(imgIds, EMBEDDING_LENGTH),
        np.tile(range(EMBEDDING_LENGTH), len(imgIds)))]
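# The assert below checks a `submission` frame that is not built in this
# snippet. A minimal sketch of how it could look, assuming a hypothetical
# `predicted_prompts` list (one generated prompt per image, ordered like
# imgIds) and the competition's all-MiniLM-L6-v2 sentence encoder:
import pandas as pd
from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
prompt_embeddings = st_model.encode(predicted_prompts).flatten()
submission = pd.DataFrame({'imgId_eId': imgId_eId, 'val': prompt_embeddings})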
assert sorted(imgId_eId) == sorted(submission.imgId_eId)
ground_truth = pd.read_csv('/kaggle/input/stable-diffusion-image-to-prompts/prompts.csv')

# Align the ground-truth prompts with the file order of imgIds
ground_truth = pd.merge(pd.DataFrame(imgIds, columns=['imgId']), ground_truth,
                        on='imgId', how='left')
ground_truth_embeddings = st_model.encode(ground_truth.prompt.tolist()).flatten()
gte = pd.DataFrame(
    index=imgId_eId,
    data=ground_truth_embeddings,
    columns=['val']
).rename_axis('imgId_eId')
from scipy import spatial

# Both frames share the same imgId_eId ordering, so the flattened
# vectors can be compared position by position
vec1 = gte['val']
vec2 = submission['val']
cos_sim = 1 - spatial.distance.cosine(vec1, vec2)
print(cos_sim)
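# A single cosine over the flattened vectors only approximates the
# leaderboard metric, which averages cosine similarity per image embedding.
# Sketch of the per-image version, reusing the 384-dim layout from above:
gt = ground_truth_embeddings.reshape(-1, EMBEDDING_LENGTH)
pred = submission['val'].to_numpy().reshape(-1, EMBEDDING_LENGTH)
per_image = [1 - spatial.distance.cosine(g, p) for g, p in zip(gt, pred)]
print(np.mean(per_image))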