Documenting the types of inputs and data structures used for each metric
accuracy
predictions = Sequence(Value("int32"))
references = Sequence(Value("int32"))
or, if in "multilabel" mode:
predictions= Value("int32")
references = Value("int32")
bertscore
predictions = Value("string", id="sequence")
references = Sequence(Value("string", id="sequence"), id="references")
bleu
predictions = Sequence(Value("string", id="token"), id="sequence")
references = Sequence(Sequence(Value("string", id="token"), id="sequence"), id="references")
bleurt
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
cer
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
chrf
predictions = Value("string", id="sequence")
references = Sequence(Value("string", id="sequence"), id="references")
code_eval
predictions = Sequence(Value("string"))
references = Value("string")
comet
sources = Value("string", id="sequence")
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
competition_math
predictions = Value("string")
references = Value("string")
coval
predictions = Sequence(Value("string"))
references = Sequence(Value("string"))
N.B. The sentences have to be in CoNLL format, which may be tricky to handle in some cases
cuad
{
"predictions": {
"id": Value("string"),
"prediction_text": Sequence(Value("string")),
},
"references": {
"id": Value("string"),
"answers": Sequence(
{
"text": Value("string"),
"answer_start": Value("int32"),
}
),
},
}
exact_match
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
f1
predictions = Sequence(Value("int32"))
references = Sequence(Value("int32"))
frugalscore
references = Value("string")
predictions = Value("string")
gleu
predictions = Sequence(Value("string", id="token"), id="sequence")
references = Sequence(Sequence(Value("string", id="token"), id="sequence"), id="references")
glue
predictions = Value("int64" if self.config_name != "stsb" else "float32")
references = Value("int64" if self.config_name != "stsb" else "float32")
The type of input depends on the GLUE subset used.
google_bleu
predictions = Sequence(Value("string", id="token"), id="sequence")
references = Sequence(Sequence(Value("string", id="token"), id="sequence"), id="references")
indic_glue
predictions = Value("int64") if self.config_name != "cvit-mkb-clsr" else Sequence(Value("float32"))
references = Value("int64") if self.config_name != "cvit-mkb-clsr" else Sequence(Value("float32"))
mae
predictions = Value("float")
references = Value("float")
or if multilist:
predictions = Sequence(Value("float"))
references = Sequence(Value("float"))
mahalanobis
"X": Sequence(Value("float", id="sequence"), id="X")
reference_distribution = np.array(reference_distribution)
N.B. The input names are different here (X and reference_distribution instead of predictions and references) -- maybe we should standardize? wdyt @lhoestq
matthews_correlation
predictions = Value("int32")
references = Value("int32")
mauve
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
mean_iou
predictions = Sequence(Sequence(Value("uint16")))
references = Sequence(Sequence(Value("uint16")))
What's a uint16? It is an unsigned 16-bit integer (values 0-65535), not Unicode; this is the only metric with such an integer-type restriction (so far).
meteor
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
mse
predictions = Value("float")
references = Value("float")
or if multilist:
predictions = Sequence(Value("float"))
references = Sequence(Value("float"))
pearsonr
references = Value("float")
predictions = Value("float")
perplexity
input_texts = Value("string")
precision
predictions = Value("int32")
references = Value("int32")
or if multilist:
predictions = Sequence(Value("int32"))
references = Sequence(Value("int32"))
recall
predictions = Value("int32")
references = Value("int32")
or if multilist:
predictions = Sequence(Value("int32"))
references = Sequence(Value("int32"))
rouge
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
sacrebleu
predictions = Value("string", id="sequence")
references = Sequence(Value("string", id="sequence"), id="references")
sari
sources = Value("string", id="sequence")
predictions = Value("string", id="sequence")
references = Sequence(Value("string", id="sequence"), id="references")
seqeval
predictions = Sequence(Value("string", id="label"), id="sequence")
references = Sequence(Value("string", id="label"), id="sequence")
N.B. Both predictions and references are in IOB format.
spearmanr
predictions = Value("float")
references = Value("float")
squad
"predictions": {
"id": Value("string"),
"prediction_text": Value("string"),
},
"references": {
"id": Value("string"),
"answers": features.Sequence(
{
"text": Value("string"),
"answer_start": Value("int32"),
}
),
}
squad_v2
"predictions": {
"id": Value("string"),
"prediction_text": Value("string"),
"no_answer_probability": Value("float32"),
},
"references": {
"id": Value("string"),
"answers": features.Sequence(
{"text": Value("string"), "answer_start": Value("int32")}
),
}
N.B. The SQuAD and SQuAD v2 formats differ in that v2 has the additional 'no_answer_probability' field in predictions.
super_glue
if self.config_name == "record":
return {
"predictions": {
"idx": {
"passage": Value("int64"),
"query": Value("int64"),
},
"prediction_text": Value("string"),
},
"references": {
"idx": {
"passage": Value("int64"),
"query": Value("int64"),
},
"answers": Sequence(Value("string")),
},
}
elif self.config_name == "multirc":
return {
"predictions": {
"idx": {
"answer": Value("int64"),
"paragraph": Value("int64"),
"question": Value("int64"),
},
"prediction": Value("int64"),
},
"references": Value("int64"),
}
else:
return {
"predictions": Value("int64"),
"references": Value("int64"),
}
ter
predictions = Value("string", id="sequence")
references = Sequence(Value("string", id="sequence"), id="references")
wer
predictions = Value("string", id="sequence")
references = Value("string", id="sequence")
wiki_split
predictions = Value("string", id="sequence")
references = Sequence(Value("string", id="sequence"), id="references")
xnli
predictions = Value("int64" if self.config_name != "sts-b" else "float32")
references = Value("int64" if self.config_name != "sts-b" else "float32")
xtreme_s
pred_type = "int64" if self.config_name in ["fleurs-lang_id", "minds14"] else "string"
predictions = Value(pred_type)
references = Value(pred_type)
N.B. the input depends on the XTREME-S dataset selected