The outputs have shape [32, 490, 16121], where 16121 is the length of my vocabulary. What does the 490 dimension represent?
Also, the outputs are probabilities, right?
(outputs)
tensor([[[-9.7001, -9.6490, -9.6463, ..., -9.6936, -9.6430, -9.7431],
[-9.6997, -9.6487, -9.6470, ..., -9.6903, -9.6450, -9.7416],
[-9.6999, -9.6477, -9.6479, ..., -9.6898, -9.6453, -9.7417],
...,
[-9.7006, -9.6449, -9.6513, ..., -9.6889, -9.6477, -9.7405],
[-9.7003, -9.6448, -9.6512, ..., -9.6893, -9.6477, -9.7410],
[-9.7007, -9.6453, -9.6513, ..., -9.6892, -9.6466, -9.7403]],
[[-9.6844, -9.6316, -9.6387, ..., -9.6880, -9.6269, -9.7657],
[-9.6834, -9.6299, -9.6404, ..., -9.6872, -9.6283, -9.7642],
[-9.6834, -9.6334, -9.6387, ..., -9.6864, -9.6290, -9.7616],
...,
[-9.6840, -9.6299, -9.6431, ..., -9.6830, -9.6304, -9.7608],
[-9.6838, -9.6297, -9.6428, ..., -9.6834, -9.6303, -9.7609],
[-9.6842, -9.6300, -9.6428, ..., -9.6837, -9.6292, -9.7599]],
[[-9.6966, -9.6386, -9.6458, ..., -9.6896, -9.6375, -9.7521],
[-9.6974, -9.6374, -9.6462, ..., -9.6890, -9.6369, -9.7516],
[-9.6974, -9.6405, -9.6456, ..., -9.6876, -9.6378, -9.7491],
...,
[-9.6978, -9.6336, -9.6493, ..., -9.6851, -9.6419, -9.7490],
[-9.6971, -9.6334, -9.6487, ..., -9.6863, -9.6411, -9.7501],
[-9.6972, -9.6338, -9.6489, ..., -9.6867, -9.6396, -9.7497]],
...,
[[-9.7005, -9.6249, -9.6588, ..., -9.6762, -9.6557, -9.7555],
[-9.7028, -9.6266, -9.6597, ..., -9.6765, -9.6574, -9.7542],
[-9.7016, -9.6240, -9.6605, ..., -9.6761, -9.6576, -9.7553],
...,
[-9.7036, -9.6237, -9.6624, ..., -9.6728, -9.6590, -9.7524],
[-9.7034, -9.6235, -9.6620, ..., -9.6735, -9.6589, -9.7530],
[-9.7038, -9.6240, -9.6622, ..., -9.6738, -9.6582, -9.7524]],
[[-9.7058, -9.6305, -9.6566, ..., -9.6739, -9.6557, -9.7466],
[-9.7061, -9.6273, -9.6569, ..., -9.6774, -9.6564, -9.7499],
[-9.7046, -9.6280, -9.6576, ..., -9.6772, -9.6575, -9.7498],
...,
[-9.7060, -9.6263, -9.6609, ..., -9.6714, -9.6561, -9.7461],
[-9.7055, -9.6262, -9.6605, ..., -9.6723, -9.6558, -9.7469],
[-9.7058, -9.6270, -9.6606, ..., -9.6725, -9.6552, -9.7460]],
[[-9.7101, -9.6312, -9.6570, ..., -9.6736, -9.6551, -9.7420],
[-9.7102, -9.6307, -9.6579, ..., -9.6733, -9.6576, -9.7418],
[-9.7078, -9.6281, -9.6598, ..., -9.6704, -9.6596, -9.7418],
...,
[-9.7084, -9.6288, -9.6605, ..., -9.6706, -9.6588, -9.7399],
[-9.7081, -9.6286, -9.6600, ..., -9.6714, -9.6584, -9.7406],
[-9.7085, -9.6291, -9.6601, ..., -9.6717, -9.6577, -9.7398]]],
device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
(output_lengths)
tensor([312, 260, 315, 320, 317, 275, 308, 291, 272, 300, 262, 227, 303, 252,
298, 256, 303, 251, 284, 259, 263, 286, 209, 262, 166, 194, 149, 212,
121, 114, 110, 57], device='cuda:0', dtype=torch.int32)
(target_lengths)
tensor([57, 55, 54, 50, 49, 49, 49, 48, 48, 47, 43, 42, 41, 40, 40, 39, 37, 37,
36, 36, 36, 35, 34, 33, 29, 27, 26, 24, 20, 19, 17, 9])
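For reference, a quick sanity check I can run on the tensor above (a sketch, assuming outputs is exactly the tensor printed there): the grad_fn=<LogSoftmaxBackward0> points at log-probabilities rather than probabilities, in which case exp() should recover rows that sum to 1 over the vocabulary dimension.

# Sanity check (sketch): probabilities or log-probabilities?
# grad_fn=<LogSoftmaxBackward0> above suggests log-probabilities, so exp()
# should give values in [0, 1] that sum to ~1 along the vocabulary axis.
probs = outputs.exp()
print(probs.sum(dim=-1))  # expect ~1.0 at every (batch, time) position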
import torch
import time
import sys
from google.colab import output
import torch.nn as nn
from conformer import Conformer
import torchmetrics
import random
cuda = torch.cuda.is_available()
device = torch.device('cuda' if cuda else 'cpu')
print('Device:', device)
################################################################################
def train_model(model, optimizer, criterion, loader, metric):
    running_loss = 0.0
    for i, (audio, audio_len, translations, translation_len) in enumerate(loader):
        # with output.use_tags('some_outputs'):
        #     sys.stdout.write('Batch: ' + str(i + 1) + '/290')
        #     sys.stdout.flush()
        # Sort the batch so targets are in descending length order, keeping audio,
        # lengths, and translations aligned. Indexing with sorted_indices also
        # handles a final batch smaller than 32, unlike preallocated zero tensors.
        sorted_translation_len, sorted_indices = torch.sort(translation_len, descending=True)
        sorted_audio = audio[sorted_indices]
        sorted_audio_len = audio_len[sorted_indices]
        sorted_translations = translations[sorted_indices]
        # Transpose inputs from (batch, dim, seq_len) to (batch, seq_len, dim)
        inputs = torch.transpose(sorted_audio.to(device), 1, 2)
        input_lengths = sorted_audio_len
        targets = sorted_translations.to(device)
        target_lengths = sorted_translation_len
        optimizer.zero_grad()
        # Forward propagate
        outputs, output_lengths = model(inputs, input_lengths)
        # Calculate CTC loss; CTCLoss expects (T, N, C) log-probabilities,
        # hence the transpose from (N, T, C)
        loss = criterion(outputs.transpose(0, 1), targets, output_lengths, target_lengths)
        loss.backward()
        optimizer.step()
        # Accumulate statistics
        running_loss += loss.item()
        output.clear(output_tags='some_outputs')
    loss_per_epoch = running_loss / (i + 1)
    return loss_per_epoch
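For context on the transpose(0, 1) before the loss above: nn.CTCLoss expects log-probabilities of shape (T, N, C) together with per-sample input and target lengths. A minimal self-contained sketch with small dummy sizes (not my real dimensions):

# CTC shape sketch: T = time frames, N = batch, C = classes (0 = blank by
# default), S = max target length. Dummy sizes; the real run is (490, 32, 16121).
import torch
import torch.nn as nn

T, N, C, S = 50, 4, 20, 10
log_probs = torch.randn(T, N, C).log_softmax(dim=-1)     # (T, N, C) log-probabilities
targets = torch.randint(1, C, (N, S), dtype=torch.long)  # class 0 reserved for blank
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.randint(1, S + 1, (N,), dtype=torch.long)
loss = nn.CTCLoss()(log_probs, targets, input_lengths, target_lengths)
print(loss.item())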
################################################################################
def eval_model(model, criterion, loader, metric):
    running_loss = 0.0
    wer_calc = 0.0
    random_index_per_epoch = random.randint(0, 178)
    for i, (audio, audio_len, translations, translation_len) in enumerate(loader):
        # with output.use_tags('some_outputs'):
        #     sys.stdout.write('Batch: ' + str(i + 1) + '/72')
        #     sys.stdout.flush()
        # Sort the batch so targets are in descending length order (as in training)
        sorted_translation_len, sorted_indices = torch.sort(translation_len, descending=True)
        sorted_audio = audio[sorted_indices]
        sorted_audio_len = audio_len[sorted_indices]
        sorted_translations = translations[sorted_indices]
        # Transpose inputs from (batch, dim, seq_len) to (batch, seq_len, dim)
        inputs = torch.transpose(sorted_audio.to(device), 1, 2)
        input_lengths = sorted_audio_len
        targets = sorted_translations.to(device)
        target_lengths = sorted_translation_len
        # Forward propagate
        outputs, output_lengths = model(inputs, input_lengths)
        # Calculate CTC loss
        loss = criterion(outputs.transpose(0, 1), targets, output_lengths, target_lengths)
        print(output_lengths)
        print(target_lengths)
        # outputs_in_words = words_vocab.convert_pred_to_words(outputs.transpose(0, 1))
        # targets_in_words = words_vocab.convert_pred_to_words(targets)
        # wer = metrics_calculation(metric, outputs_in_words, targets_in_words)
        # if i == random_index_per_epoch:
        #     print(outputs_in_words, targets_in_words)
        running_loss += loss.item()
        # wer_calc += wer
        output.clear(output_tags='some_outputs')
        break  # debugging: stop after the first batch
    loss_per_epoch = running_loss / (i + 1)
    wer_per_epoch = wer_calc / (i + 1)
    return loss_per_epoch, wer_per_epoch
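Since words_vocab.convert_pred_to_words is commented out above (words_vocab is my own helper, not shown), a rough greedy CTC decode could stand in for it. This is only a sketch: it assumes blank index 0 (the nn.CTCLoss default) and a hypothetical id2word dict mapping class indices to words:

# Greedy CTC decode (sketch): (N, T, C) log-probabilities -> list of sentences.
# Assumes blank index 0 and a hypothetical id2word mapping; both are stand-ins.
def greedy_ctc_decode(outputs, output_lengths, id2word, blank=0):
    best = outputs.argmax(dim=-1)                      # (N, T) best class per frame
    sentences = []
    for ids, length in zip(best, output_lengths):
        ids = torch.unique_consecutive(ids[:length])   # collapse repeated frames
        words = [id2word[int(i)] for i in ids if int(i) != blank]  # drop blanks
        sentences.append(' '.join(words))
    return sentences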
################################################################################
def train_eval_model(epochs):
    # Conformer model init
    model = nn.DataParallel(Conformer(num_classes=16121, input_dim=201,
                                      encoder_dim=32, num_encoder_layers=1)).to(device)
    # Optimizer from the torch.optim package
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
    # Loss function
    criterion = nn.CTCLoss().to(device)
    # Metrics init
    metric = torchmetrics.WordErrorRate()
    for epoch in range(epochs):
        print("Epoch", epoch + 1)
        ########################################################################
        # TRAINING
        model.train()
        print("Training")
        # epoch_loss = train_model(model=model, optimizer=optimizer,
        #                          criterion=criterion, loader=train_loader, metric=metric)
        # print(f'Loss: {epoch_loss:.3f}')
        ########################################################################
        # EVALUATION
        model.eval()
        print("Validation")
        epoch_val_loss, epoch_val_wer = eval_model(model=model, criterion=criterion,
                                                   loader=test_loader, metric=metric)
        print(f'Loss: {epoch_val_loss:.3f}')
        print(f'WER: {epoch_val_wer:.3f}')
################################################################################
def metrics_calculation(metric, predictions, targets):
    print(predictions)
    print(targets)
    wer = metric(predictions, targets)
    return wer
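For reference, the WordErrorRate metric used here takes strings (or lists of strings) for both predictions and targets; a tiny usage example:

# WER usage sketch: one deleted word ('there') against a 3-word reference -> 1/3.
metric = torchmetrics.WordErrorRate()
print(metric(["hello world"], ["hello there world"]))  # tensor(0.3333)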
train_eval_model(1)