Comments (6)
I think this might be a regression of sorts.
Consider the output from Mistral-7B-v0.1:
{
"results": {
"mmlu": {
"acc,none": 0.6534780774610744,
"acc_stderr,none": 0.02878254466378838,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.680133883189164,
"acc_stderr,none": 0.022931458812042033
},
"mmlu_formal_logic": {
"alias": " - formal_logic",
"acc,none": 0.3888888888888889,
"acc_stderr,none": 0.0436031486007746
},
"mmlu_high_school_european_history": {
"alias": " - high_school_european_history",
"acc,none": 0.7757575757575758,
"acc_stderr,none": 0.032568666616811015
},
"mmlu_high_school_us_history": {
"alias": " - high_school_us_history",
"acc,none": 0.7892156862745098,
"acc_stderr,none": 0.0286265479124374
},
"mmlu_high_school_world_history": {
"alias": " - high_school_world_history",
"acc,none": 0.7763713080168776,
"acc_stderr,none": 0.027123298205229966
},
"mmlu_international_law": {
"alias": " - international_law",
"acc,none": 0.7933884297520661,
"acc_stderr,none": 0.03695980128098825
},
"mmlu_jurisprudence": {
"alias": " - jurisprudence",
"acc,none": 0.7685185185185185,
"acc_stderr,none": 0.04077494709252627
},
"mmlu_logical_fallacies": {
"alias": " - logical_fallacies",
"acc,none": 0.7914110429447853,
"acc_stderr,none": 0.031921934489347256
},
"mmlu_moral_disputes": {
"alias": " - moral_disputes",
"acc,none": 0.7138728323699421,
"acc_stderr,none": 0.024332146779134128
},
"mmlu_moral_scenarios": {
"alias": " - moral_scenarios",
"acc,none": 0.3139664804469274,
"acc_stderr,none": 0.015521923933523635
},
"mmlu_philosophy": {
"alias": " - philosophy",
"acc,none": 0.7041800643086816,
"acc_stderr,none": 0.025922371788818788
},
"mmlu_prehistory": {
"alias": " - prehistory",
"acc,none": 0.7407407407407407,
"acc_stderr,none": 0.02438366553103545
},
"mmlu_professional_law": {
"alias": " - professional_law",
"acc,none": 0.455019556714472,
"acc_stderr,none": 0.012718456618701773
},
"mmlu_world_religions": {
"alias": " - world_religions",
"acc,none": 0.8304093567251462,
"acc_stderr,none": 0.02878210810540171
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.6690477897312819,
"acc_stderr,none": 0.029087291991729783
},
"mmlu_business_ethics": {
"alias": " - business_ethics",
"acc,none": 0.58,
"acc_stderr,none": 0.049604496374885836
},
"mmlu_clinical_knowledge": {
"alias": " - clinical_knowledge",
"acc,none": 0.6830188679245283,
"acc_stderr,none": 0.028637235639800893
},
"mmlu_college_medicine": {
"alias": " - college_medicine",
"acc,none": 0.6647398843930635,
"acc_stderr,none": 0.03599586301247077
},
"mmlu_global_facts": {
"alias": " - global_facts",
"acc,none": 0.35,
"acc_stderr,none": 0.0479372485441102
},
"mmlu_human_aging": {
"alias": " - human_aging",
"acc,none": 0.695067264573991,
"acc_stderr,none": 0.030898610882477518
},
"mmlu_management": {
"alias": " - management",
"acc,none": 0.8252427184466019,
"acc_stderr,none": 0.0376017800602662
},
"mmlu_marketing": {
"alias": " - marketing",
"acc,none": 0.8717948717948718,
"acc_stderr,none": 0.021901905115073318
},
"mmlu_medical_genetics": {
"alias": " - medical_genetics",
"acc,none": 0.74,
"acc_stderr,none": 0.04408440022768078
},
"mmlu_miscellaneous": {
"alias": " - miscellaneous",
"acc,none": 0.8186462324393359,
"acc_stderr,none": 0.013778693778464085
},
"mmlu_nutrition": {
"alias": " - nutrition",
"acc,none": 0.7647058823529411,
"acc_stderr,none": 0.024288619466046123
},
"mmlu_professional_accounting": {
"alias": " - professional_accounting",
"acc,none": 0.4787234042553192,
"acc_stderr,none": 0.029800481645628693
},
"mmlu_professional_medicine": {
"alias": " - professional_medicine",
"acc,none": 0.6654411764705882,
"acc_stderr,none": 0.028661996202335307
},
"mmlu_virology": {
"alias": " - virology",
"acc,none": 0.5602409638554217,
"acc_stderr,none": 0.03864139923699121
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.7365027553174844,
"acc_stderr,none": 0.026785760218568794
},
"mmlu_econometrics": {
"alias": " - econometrics",
"acc,none": 0.5,
"acc_stderr,none": 0.047036043419179864
},
"mmlu_high_school_geography": {
"alias": " - high_school_geography",
"acc,none": 0.7626262626262627,
"acc_stderr,none": 0.0303137105381989
},
"mmlu_high_school_government_and_politics": {
"alias": " - high_school_government_and_politics",
"acc,none": 0.8652849740932642,
"acc_stderr,none": 0.024639789097709443
},
"mmlu_high_school_macroeconomics": {
"alias": " - high_school_macroeconomics",
"acc,none": 0.6615384615384615,
"acc_stderr,none": 0.023991500500313036
},
"mmlu_high_school_microeconomics": {
"alias": " - high_school_microeconomics",
"acc,none": 0.6596638655462185,
"acc_stderr,none": 0.030778057422931673
},
"mmlu_high_school_psychology": {
"alias": " - high_school_psychology",
"acc,none": 0.818348623853211,
"acc_stderr,none": 0.016530617409266885
},
"mmlu_human_sexuality": {
"alias": " - human_sexuality",
"acc,none": 0.8015267175572519,
"acc_stderr,none": 0.03498149385462472
},
"mmlu_professional_psychology": {
"alias": " - professional_psychology",
"acc,none": 0.6830065359477124,
"acc_stderr,none": 0.01882421951270621
},
"mmlu_public_relations": {
"alias": " - public_relations",
"acc,none": 0.6636363636363637,
"acc_stderr,none": 0.04525393596302505
},
"mmlu_security_studies": {
"alias": " - security_studies",
"acc,none": 0.726530612244898,
"acc_stderr,none": 0.02853556033712844
},
"mmlu_sociology": {
"alias": " - sociology",
"acc,none": 0.8258706467661692,
"acc_stderr,none": 0.026814951200421603
},
"mmlu_us_foreign_policy": {
"alias": " - us_foreign_policy",
"acc,none": 0.87,
"acc_stderr,none": 0.033799766898963086
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.5282278816063678,
"acc_stderr,none": 0.037025799074914184
},
"mmlu_abstract_algebra": {
"alias": " - abstract_algebra",
"acc,none": 0.27,
"acc_stderr,none": 0.0446196043338474
},
"mmlu_anatomy": {
"alias": " - anatomy",
"acc,none": 0.6148148148148148,
"acc_stderr,none": 0.042039210401562783
},
"mmlu_astronomy": {
"alias": " - astronomy",
"acc,none": 0.6578947368421053,
"acc_stderr,none": 0.03860731599316091
},
"mmlu_college_biology": {
"alias": " - college_biology",
"acc,none": 0.7361111111111112,
"acc_stderr,none": 0.03685651095897532
},
"mmlu_college_chemistry": {
"alias": " - college_chemistry",
"acc,none": 0.5,
"acc_stderr,none": 0.050251890762960605
},
"mmlu_college_computer_science": {
"alias": " - college_computer_science",
"acc,none": 0.52,
"acc_stderr,none": 0.050211673156867795
},
"mmlu_college_mathematics": {
"alias": " - college_mathematics",
"acc,none": 0.36,
"acc_stderr,none": 0.048241815132442176
},
"mmlu_college_physics": {
"alias": " - college_physics",
"acc,none": 0.3431372549019608,
"acc_stderr,none": 0.04724007352383888
},
"mmlu_computer_security": {
"alias": " - computer_security",
"acc,none": 0.78,
"acc_stderr,none": 0.041633319989322626
},
"mmlu_conceptual_physics": {
"alias": " - conceptual_physics",
"acc,none": 0.5829787234042553,
"acc_stderr,none": 0.03223276266711712
},
"mmlu_electrical_engineering": {
"alias": " - electrical_engineering",
"acc,none": 0.5517241379310345,
"acc_stderr,none": 0.041443118108781526
},
"mmlu_elementary_mathematics": {
"alias": " - elementary_mathematics",
"acc,none": 0.3862433862433862,
"acc_stderr,none": 0.025075981767601688
},
"mmlu_high_school_biology": {
"alias": " - high_school_biology",
"acc,none": 0.7774193548387097,
"acc_stderr,none": 0.023664216671642518
},
"mmlu_high_school_chemistry": {
"alias": " - high_school_chemistry",
"acc,none": 0.5073891625615764,
"acc_stderr,none": 0.0351760354036101
},
"mmlu_high_school_computer_science": {
"alias": " - high_school_computer_science",
"acc,none": 0.68,
"acc_stderr,none": 0.04688261722621505
},
"mmlu_high_school_mathematics": {
"alias": " - high_school_mathematics",
"acc,none": 0.37037037037037035,
"acc_stderr,none": 0.02944316932303154
},
"mmlu_high_school_physics": {
"alias": " - high_school_physics",
"acc,none": 0.32450331125827814,
"acc_stderr,none": 0.038227469376587525
},
"mmlu_high_school_statistics": {
"alias": " - high_school_statistics",
"acc,none": 0.5648148148148148,
"acc_stderr,none": 0.033812000056435254
},
"mmlu_machine_learning": {
"alias": " - machine_learning",
"acc,none": 0.5089285714285714,
"acc_stderr,none": 0.04745033255489123
}
},
"groups": {
"mmlu": {
"acc,none": 0.6534780774610744,
"acc_stderr,none": 0.02878254466378838,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.680133883189164,
"acc_stderr,none": 0.022931458812042033
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.6690477897312819,
"acc_stderr,none": 0.029087291991729783
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.7365027553174844,
"acc_stderr,none": 0.026785760218568794
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.5282278816063678,
"acc_stderr,none": 0.037025799074914184
}
}
}
Here, the mmlu score seems to be computed as a plain (unweighted) average of the four subject groups:

import numpy as np

# `data` is the parsed JSON above; exact float equality is brittle, hence isclose
assert np.isclose(
    data["groups"]["mmlu"]["acc,none"],
    np.mean([data["groups"][f"mmlu_{subject}"]["acc,none"]
             for subject in ["humanities", "other", "social_sciences", "stem"]]),
)
But based on this comment here, it should have been an average weighted by the number of documents per subject.
IMHO, it might be good to report both the unweighted and the document-weighted accuracy, to avoid any confusion.
This looks to be corrected now! For reference, here are the numbers I get with Mistral using the script I wrote to check this (a sketch of the aggregation logic follows the numbers below):
- The weighted average is 0.6259
- The regular (unweighted) average is 0.6405
- The regular average of the four weighted subject averages is 0.6346
- The regular average of the four regular subject averages is 0.6548

Per-subject averages:

| Subject         | Regular average    | Weighted average   |
|-----------------|-------------------:|-------------------:|
| stem            | 0.5323367530451689 | 0.5302886140183952 |
| other           | 0.6712050680651024 | 0.7077566784679755 |
| humanities      | 0.678644611627678  | 0.5632051009564294 |
| social_sciences | 0.7371609780969487 | 0.737081572960676  |
The weighted average and the per-subject weighted averages seem to fit well with your eval. Thanks!
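For reference, a minimal sketch of the category-level aggregation such a script can use, assuming the results JSON above is saved as results.json (a hypothetical filename) and using the standard per-category MMLU test-split sizes quoted later in this thread. The per-task weighted averages additionally need per-task doc counts, which the JSON above does not include:

import json
import numpy as np

# Standard MMLU test-split sizes per category (also quoted later in this thread).
DOC_COUNTS = {"humanities": 4705, "social_sciences": 3077, "other": 3107, "stem": 3153}

with open("results.json") as f:  # hypothetical path to the output above
    groups = json.load(f)["groups"]

accs = np.array([groups[f"mmlu_{s}"]["acc,none"] for s in DOC_COUNTS])
weights = np.array(list(DOC_COUNTS.values()))

print("regular average: ", accs.mean())
print("weighted average:", np.average(accs, weights=weights))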
Glad it's working! I would like to add a regression test specifically for this in the future, though.
The suggestion to report both weighted and unweighted (or at least to clearly indicate which is being reported) is a very good one as well. A sketch of such a regression test follows.
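A sketch of what such a regression test might look like, written as a plain pytest check over a saved results file; the fixture name mmlu_results.json and the tolerance are assumptions, not existing harness fixtures:

import json
import numpy as np
import pytest

# Standard MMLU test-split sizes per category.
DOC_COUNTS = {"humanities": 4705, "social_sciences": 3077, "other": 3107, "stem": 3153}

def test_mmlu_group_score_is_doc_weighted():
    with open("mmlu_results.json") as f:  # hypothetical saved-results fixture
        groups = json.load(f)["groups"]

    accs = [groups[f"mmlu_{s}"]["acc,none"] for s in DOC_COUNTS]
    expected = np.average(accs, weights=list(DOC_COUNTS.values()))

    # The top-level mmlu score must be the doc-weighted mean of its subgroups,
    # not the plain mean.
    assert groups["mmlu"]["acc,none"] == pytest.approx(expected, abs=1e-6)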
What will the solution be when the next researcher or model author computes the regular (unweighted) average and reports that in their blog post?
Hi, thank you for reporting this! I'm looking into it now. As far as I was aware, we aggregate using the total number of docs per subject category, but a bug may have been introduced somewhere for nested groups.
For GPT-2:
| Groups |Version|Filter|n-shot|Metric|Value | |Stderr|
|------------------|-------|------|------|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.2292|± |0.0035|
| - humanities |N/A |none |None |acc |0.2421|± |0.0062|
| - other |N/A |none |None |acc |0.2382|± |0.0076|
| - social_sciences|N/A |none |None |acc |0.2171|± |0.0074|
| - stem |N/A |none |None |acc |0.2131|± |0.0073|
The doc counts are 4705 (humanities), 3077 (social_sciences), 3107 (other), and 3153 (stem). The doc-weighted average,

(4705 * 24.21 + 3077 * 21.71 + 3107 * 23.82 + 3153 * 21.31) / (4705 + 3077 + 3107 + 3153)

gives 22.92, which is what we're reporting, versus the unweighted average,

(24.21 + 23.82 + 21.71 + 21.31) / 4

which gives 22.7625.
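The same check in a few lines of Python; the counts and accuracies are taken straight from the list and table above:

# humanities, social_sciences, other, stem: doc counts and GPT-2 accuracies (in %)
counts = [4705, 3077, 3107, 3153]
accs = [24.21, 21.71, 23.82, 21.31]

weighted = sum(c * a for c, a in zip(counts, accs)) / sum(counts)
unweighted = sum(accs) / len(accs)
print(round(weighted, 2))  # 22.92, matching the reported group score
print(unweighted)          # 22.7625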
Let me also run on Mistral though!
This is what I get on the most recent commit:
hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto (8)
| Groups |Version|Filter|n-shot|Metric|Value | |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.6246|± |0.0038|
| - humanities |N/A |none | 5|acc |0.5683|± |0.0068|
| - other |N/A |none | 5|acc |0.7016|± |0.0079|
| - social_sciences|N/A |none | 5|acc |0.7345|± |0.0078|
| - stem |N/A |none | 5|acc |0.5252|± |0.0085|
@codedecde what commit of the codebase did you run on?