Hi,
While running demo.py, I found a data error in the COCO val dataset.
In coco_val_localized_narratives.jsonl, line 4414, the record has no trace data: its "traces" array is empty.
`{
"dataset_id":"mscoco_val2017",
"image_id":"381639",
"annotator_id":89,
"caption":"In the image we can see girl standing and holding a doll in her hand. These are the road cone. There are even other people who are getting into airplane, there is a building. This is a tree and sky.",
"timed_caption":[
{
"utterance":"In the",
"start_time":0,
"end_time":0
},
{
"utterance":"image",
"start_time":0,
"end_time":4.4
},
{
"utterance":"we",
"start_time":4.4,
"end_time":5
},
{
"utterance":"can",
"start_time":5,
"end_time":5.3
},
{
"utterance":"see",
"start_time":5.3,
"end_time":5.5
},
{
"utterance":"girl",
"start_time":5.5,
"end_time":6.3
},
{
"utterance":"standing",
"start_time":6.3,
"end_time":6.9
},
{
"utterance":"and",
"start_time":6.9,
"end_time":7.6
},
{
"utterance":"holding",
"start_time":7.6,
"end_time":8.1
},
{
"utterance":"a",
"start_time":8.1,
"end_time":8.2
},
{
"utterance":"doll",
"start_time":8.2,
"end_time":8.5
},
{
"utterance":"in",
"start_time":8.5,
"end_time":9.1
},
{
"utterance":"her",
"start_time":9.1,
"end_time":9.3
},
{
"utterance":"hand.",
"start_time":9.3,
"end_time":9.5
},
{
"utterance":"These",
"start_time":9.5,
"end_time":10.5
},
{
"utterance":"are",
"start_time":10.5,
"end_time":10.7
},
{
"utterance":"the",
"start_time":10.7,
"end_time":10.9
},
{
"utterance":"road",
"start_time":10.9,
"end_time":11.2
},
{
"utterance":"cone.",
"start_time":11.2,
"end_time":11.6
},
{
"utterance":"There",
"start_time":11.6,
"end_time":12.2
},
{
"utterance":"are",
"start_time":12.2,
"end_time":12.3
},
{
"utterance":"even",
"start_time":12.3,
"end_time":12.8
},
{
"utterance":"other",
"start_time":12.8,
"end_time":13
},
{
"utterance":"people",
"start_time":13,
"end_time":13.5
},
{
"utterance":"who",
"start_time":13.5,
"end_time":13.5
},
{
"utterance":"are",
"start_time":13.5,
"end_time":14
},
{
"utterance":"getting",
"start_time":14,
"end_time":14.5
},
{
"utterance":"into",
"start_time":14.5,
"end_time":15.2
},
{
"utterance":"airplane,",
"start_time":15.2,
"end_time":15.8
},
{
"utterance":"there",
"start_time":15.8,
"end_time":16.7
},
{
"utterance":"is",
"start_time":16.7,
"end_time":16.9
},
{
"utterance":"a",
"start_time":16.9,
"end_time":17.1
},
{
"utterance":"building.",
"start_time":17.1,
"end_time":17.6
},
{
"utterance":"This",
"start_time":17.6,
"end_time":18.5
},
{
"utterance":"is",
"start_time":18.5,
"end_time":18.8
},
{
"utterance":"a",
"start_time":18.8,
"end_time":18.9
},
{
"utterance":"tree",
"start_time":18.9,
"end_time":19.4
},
{
"utterance":"and",
"start_time":19.4,
"end_time":19.9
},
{
"utterance":"sky.",
"start_time":19.9,
"end_time":20.2
}
],
"traces":[
],
"voice_recording":"coco_val/coco_val_381639_89.ogg"
}`