
Comments (6)

codedecde commented on June 10, 2024

I think this might be a regression of sorts.
Consider the following output from Mistral-7B-v0.1:

{
  "results": {
    "mmlu": {
      "acc,none": 0.6534780774610744,
      "acc_stderr,none": 0.02878254466378838,
      "alias": "mmlu"
    },
    "mmlu_humanities": {
      "alias": " - humanities",
      "acc,none": 0.680133883189164,
      "acc_stderr,none": 0.022931458812042033
    },
    "mmlu_formal_logic": {
      "alias": "  - formal_logic",
      "acc,none": 0.3888888888888889,
      "acc_stderr,none": 0.0436031486007746
    },
    "mmlu_high_school_european_history": {
      "alias": "  - high_school_european_history",
      "acc,none": 0.7757575757575758,
      "acc_stderr,none": 0.032568666616811015
    },
    "mmlu_high_school_us_history": {
      "alias": "  - high_school_us_history",
      "acc,none": 0.7892156862745098,
      "acc_stderr,none": 0.0286265479124374
    },
    "mmlu_high_school_world_history": {
      "alias": "  - high_school_world_history",
      "acc,none": 0.7763713080168776,
      "acc_stderr,none": 0.027123298205229966
    },
    "mmlu_international_law": {
      "alias": "  - international_law",
      "acc,none": 0.7933884297520661,
      "acc_stderr,none": 0.03695980128098825
    },
    "mmlu_jurisprudence": {
      "alias": "  - jurisprudence",
      "acc,none": 0.7685185185185185,
      "acc_stderr,none": 0.04077494709252627
    },
    "mmlu_logical_fallacies": {
      "alias": "  - logical_fallacies",
      "acc,none": 0.7914110429447853,
      "acc_stderr,none": 0.031921934489347256
    },
    "mmlu_moral_disputes": {
      "alias": "  - moral_disputes",
      "acc,none": 0.7138728323699421,
      "acc_stderr,none": 0.024332146779134128
    },
    "mmlu_moral_scenarios": {
      "alias": "  - moral_scenarios",
      "acc,none": 0.3139664804469274,
      "acc_stderr,none": 0.015521923933523635
    },
    "mmlu_philosophy": {
      "alias": "  - philosophy",
      "acc,none": 0.7041800643086816,
      "acc_stderr,none": 0.025922371788818788
    },
    "mmlu_prehistory": {
      "alias": "  - prehistory",
      "acc,none": 0.7407407407407407,
      "acc_stderr,none": 0.02438366553103545
    },
    "mmlu_professional_law": {
      "alias": "  - professional_law",
      "acc,none": 0.455019556714472,
      "acc_stderr,none": 0.012718456618701773
    },
    "mmlu_world_religions": {
      "alias": "  - world_religions",
      "acc,none": 0.8304093567251462,
      "acc_stderr,none": 0.02878210810540171
    },
    "mmlu_other": {
      "alias": " - other",
      "acc,none": 0.6690477897312819,
      "acc_stderr,none": 0.029087291991729783
    },
    "mmlu_business_ethics": {
      "alias": "  - business_ethics",
      "acc,none": 0.58,
      "acc_stderr,none": 0.049604496374885836
    },
    "mmlu_clinical_knowledge": {
      "alias": "  - clinical_knowledge",
      "acc,none": 0.6830188679245283,
      "acc_stderr,none": 0.028637235639800893
    },
    "mmlu_college_medicine": {
      "alias": "  - college_medicine",
      "acc,none": 0.6647398843930635,
      "acc_stderr,none": 0.03599586301247077
    },
    "mmlu_global_facts": {
      "alias": "  - global_facts",
      "acc,none": 0.35,
      "acc_stderr,none": 0.0479372485441102
    },
    "mmlu_human_aging": {
      "alias": "  - human_aging",
      "acc,none": 0.695067264573991,
      "acc_stderr,none": 0.030898610882477518
    },
    "mmlu_management": {
      "alias": "  - management",
      "acc,none": 0.8252427184466019,
      "acc_stderr,none": 0.0376017800602662
    },
    "mmlu_marketing": {
      "alias": "  - marketing",
      "acc,none": 0.8717948717948718,
      "acc_stderr,none": 0.021901905115073318
    },
    "mmlu_medical_genetics": {
      "alias": "  - medical_genetics",
      "acc,none": 0.74,
      "acc_stderr,none": 0.04408440022768078
    },
    "mmlu_miscellaneous": {
      "alias": "  - miscellaneous",
      "acc,none": 0.8186462324393359,
      "acc_stderr,none": 0.013778693778464085
    },
    "mmlu_nutrition": {
      "alias": "  - nutrition",
      "acc,none": 0.7647058823529411,
      "acc_stderr,none": 0.024288619466046123
    },
    "mmlu_professional_accounting": {
      "alias": "  - professional_accounting",
      "acc,none": 0.4787234042553192,
      "acc_stderr,none": 0.029800481645628693
    },
    "mmlu_professional_medicine": {
      "alias": "  - professional_medicine",
      "acc,none": 0.6654411764705882,
      "acc_stderr,none": 0.028661996202335307
    },
    "mmlu_virology": {
      "alias": "  - virology",
      "acc,none": 0.5602409638554217,
      "acc_stderr,none": 0.03864139923699121
    },
    "mmlu_social_sciences": {
      "alias": " - social_sciences",
      "acc,none": 0.7365027553174844,
      "acc_stderr,none": 0.026785760218568794
    },
    "mmlu_econometrics": {
      "alias": "  - econometrics",
      "acc,none": 0.5,
      "acc_stderr,none": 0.047036043419179864
    },
    "mmlu_high_school_geography": {
      "alias": "  - high_school_geography",
      "acc,none": 0.7626262626262627,
      "acc_stderr,none": 0.0303137105381989
    },
    "mmlu_high_school_government_and_politics": {
      "alias": "  - high_school_government_and_politics",
      "acc,none": 0.8652849740932642,
      "acc_stderr,none": 0.024639789097709443
    },
    "mmlu_high_school_macroeconomics": {
      "alias": "  - high_school_macroeconomics",
      "acc,none": 0.6615384615384615,
      "acc_stderr,none": 0.023991500500313036
    },
    "mmlu_high_school_microeconomics": {
      "alias": "  - high_school_microeconomics",
      "acc,none": 0.6596638655462185,
      "acc_stderr,none": 0.030778057422931673
    },
    "mmlu_high_school_psychology": {
      "alias": "  - high_school_psychology",
      "acc,none": 0.818348623853211,
      "acc_stderr,none": 0.016530617409266885
    },
    "mmlu_human_sexuality": {
      "alias": "  - human_sexuality",
      "acc,none": 0.8015267175572519,
      "acc_stderr,none": 0.03498149385462472
    },
    "mmlu_professional_psychology": {
      "alias": "  - professional_psychology",
      "acc,none": 0.6830065359477124,
      "acc_stderr,none": 0.01882421951270621
    },
    "mmlu_public_relations": {
      "alias": "  - public_relations",
      "acc,none": 0.6636363636363637,
      "acc_stderr,none": 0.04525393596302505
    },
    "mmlu_security_studies": {
      "alias": "  - security_studies",
      "acc,none": 0.726530612244898,
      "acc_stderr,none": 0.02853556033712844
    },
    "mmlu_sociology": {
      "alias": "  - sociology",
      "acc,none": 0.8258706467661692,
      "acc_stderr,none": 0.026814951200421603
    },
    "mmlu_us_foreign_policy": {
      "alias": "  - us_foreign_policy",
      "acc,none": 0.87,
      "acc_stderr,none": 0.033799766898963086
    },
    "mmlu_stem": {
      "alias": " - stem",
      "acc,none": 0.5282278816063678,
      "acc_stderr,none": 0.037025799074914184
    },
    "mmlu_abstract_algebra": {
      "alias": "  - abstract_algebra",
      "acc,none": 0.27,
      "acc_stderr,none": 0.0446196043338474
    },
    "mmlu_anatomy": {
      "alias": "  - anatomy",
      "acc,none": 0.6148148148148148,
      "acc_stderr,none": 0.042039210401562783
    },
    "mmlu_astronomy": {
      "alias": "  - astronomy",
      "acc,none": 0.6578947368421053,
      "acc_stderr,none": 0.03860731599316091
    },
    "mmlu_college_biology": {
      "alias": "  - college_biology",
      "acc,none": 0.7361111111111112,
      "acc_stderr,none": 0.03685651095897532
    },
    "mmlu_college_chemistry": {
      "alias": "  - college_chemistry",
      "acc,none": 0.5,
      "acc_stderr,none": 0.050251890762960605
    },
    "mmlu_college_computer_science": {
      "alias": "  - college_computer_science",
      "acc,none": 0.52,
      "acc_stderr,none": 0.050211673156867795
    },
    "mmlu_college_mathematics": {
      "alias": "  - college_mathematics",
      "acc,none": 0.36,
      "acc_stderr,none": 0.048241815132442176
    },
    "mmlu_college_physics": {
      "alias": "  - college_physics",
      "acc,none": 0.3431372549019608,
      "acc_stderr,none": 0.04724007352383888
    },
    "mmlu_computer_security": {
      "alias": "  - computer_security",
      "acc,none": 0.78,
      "acc_stderr,none": 0.041633319989322626
    },
    "mmlu_conceptual_physics": {
      "alias": "  - conceptual_physics",
      "acc,none": 0.5829787234042553,
      "acc_stderr,none": 0.03223276266711712
    },
    "mmlu_electrical_engineering": {
      "alias": "  - electrical_engineering",
      "acc,none": 0.5517241379310345,
      "acc_stderr,none": 0.041443118108781526
    },
    "mmlu_elementary_mathematics": {
      "alias": "  - elementary_mathematics",
      "acc,none": 0.3862433862433862,
      "acc_stderr,none": 0.025075981767601688
    },
    "mmlu_high_school_biology": {
      "alias": "  - high_school_biology",
      "acc,none": 0.7774193548387097,
      "acc_stderr,none": 0.023664216671642518
    },
    "mmlu_high_school_chemistry": {
      "alias": "  - high_school_chemistry",
      "acc,none": 0.5073891625615764,
      "acc_stderr,none": 0.0351760354036101
    },
    "mmlu_high_school_computer_science": {
      "alias": "  - high_school_computer_science",
      "acc,none": 0.68,
      "acc_stderr,none": 0.04688261722621505
    },
    "mmlu_high_school_mathematics": {
      "alias": "  - high_school_mathematics",
      "acc,none": 0.37037037037037035,
      "acc_stderr,none": 0.02944316932303154
    },
    "mmlu_high_school_physics": {
      "alias": "  - high_school_physics",
      "acc,none": 0.32450331125827814,
      "acc_stderr,none": 0.038227469376587525
    },
    "mmlu_high_school_statistics": {
      "alias": "  - high_school_statistics",
      "acc,none": 0.5648148148148148,
      "acc_stderr,none": 0.033812000056435254
    },
    "mmlu_machine_learning": {
      "alias": "  - machine_learning",
      "acc,none": 0.5089285714285714,
      "acc_stderr,none": 0.04745033255489123
    }
  },
  "groups": {
    "mmlu": {
      "acc,none": 0.6534780774610744,
      "acc_stderr,none": 0.02878254466378838,
      "alias": "mmlu"
    },
    "mmlu_humanities": {
      "alias": " - humanities",
      "acc,none": 0.680133883189164,
      "acc_stderr,none": 0.022931458812042033
    },
    "mmlu_other": {
      "alias": " - other",
      "acc,none": 0.6690477897312819,
      "acc_stderr,none": 0.029087291991729783
    },
    "mmlu_social_sciences": {
      "alias": " - social_sciences",
      "acc,none": 0.7365027553174844,
      "acc_stderr,none": 0.026785760218568794
    },
    "mmlu_stem": {
      "alias": " - stem",
      "acc,none": 0.5282278816063678,
      "acc_stderr,none": 0.037025799074914184
    }
  }
}

Here, the mmlu score seems to be computed as an unweighted average of the four category groups:

import json
import numpy as np

data = json.load(open("results.json"))  # the JSON shown above

# Top-level mmlu acc equals the unweighted mean of the four category accs
assert np.isclose(data["groups"]["mmlu"]["acc,none"], np.mean(
    [data["groups"][f"mmlu_{subject}"]["acc,none"]
     for subject in ["humanities", "other", "social_sciences", "stem"]]
))

But based on this comment here, it should have been a weighted average by the number of documents in each subtask.

IMHO, it might be good to report both the unweighted accuracy and the document-weighted accuracy, to avoid any confusion.
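
For illustration, a minimal sketch of the two aggregations, assuming you have each subtask's accuracy and document count in hand (the subtask_accs / subtask_docs inputs are hypothetical; this is not the harness's own aggregation code):

import numpy as np

def aggregate(subtask_accs, subtask_docs):
    """Return (unweighted_mean, doc_weighted_mean) over a group's subtasks.

    subtask_accs: {task_name: accuracy}, subtask_docs: {task_name: number of documents}.
    """
    names = list(subtask_accs)
    accs = np.array([subtask_accs[n] for n in names], dtype=float)
    docs = np.array([subtask_docs[n] for n in names], dtype=float)
    return float(accs.mean()), float((accs * docs).sum() / docs.sum())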


alonbenhaim commented on June 10, 2024

This looks to be corrected now! For reference, here are the numbers I get for Mistral with the script I wrote to check this:

The weighted average is 0.6259.

The regular (unweighted) average is 0.6405.
The regular average of the four document-weighted subject averages is 0.6346.
The regular average of the four regular subject averages is 0.6548.
Per-subject regular averages: stem 0.5323, other 0.6712, humanities 0.6786, social_sciences 0.7372.
Per-subject weighted averages: stem 0.5303, other 0.7078, humanities 0.5632, social_sciences 0.7371.

The weighted average and per-subject weighted averages seem to match your eval well. Thanks!


haileyschoelkopf commented on June 10, 2024

Glad it's working--I'd like to add a regression test specifically for this in the future, though!

The suggestion to report both weighted + unweighted (or at least clearly indicate which is being reported) is a very good one as well.
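
A minimal sketch of what such a regression test might look like, using the GPT-2 category scores and doc counts posted further down in this thread as fixture data (the weighted_mean helper is illustrative, not the harness's actual aggregation function):

import numpy as np
import pytest

def weighted_mean(accs, sizes):
    # Document-count-weighted mean of per-subtask accuracies.
    accs = np.asarray(accs, dtype=float)
    sizes = np.asarray(sizes, dtype=float)
    return float((accs * sizes).sum() / sizes.sum())

def test_mmlu_group_acc_is_doc_weighted():
    # humanities, other, social_sciences, stem (GPT-2 numbers reported below in this thread)
    accs = [0.2421, 0.2382, 0.2171, 0.2131]
    sizes = [4705, 3107, 3077, 3153]
    assert weighted_mean(accs, sizes) == pytest.approx(0.2292, abs=5e-4)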


LSinev commented on June 10, 2024

What will the solution be when the next researcher or model author computes the regular (unweighted) average and reports that in their blog post?


haileyschoelkopf commented on June 10, 2024

Hi, thank you for reporting this--looking into it now! As far as I was aware, we are aggregating using the total docs per subject category, but there may be a bug introduced somewhere for nested groups.

For GPT-2:

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|------|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.2292|±  |0.0035|
| - humanities     |N/A    |none  |None  |acc   |0.2421|±  |0.0062|
| - other          |N/A    |none  |None  |acc   |0.2382|±  |0.0076|
| - social_sciences|N/A    |none  |None  |acc   |0.2171|±  |0.0074|
| - stem           |N/A    |none  |None  |acc   |0.2131|±  |0.0073|

The doc counts are:

mmlu: [4705, 3077, 3107, 3153] for ['mmlu_humanities', 'mmlu_social_sciences', 'mmlu_other', 'mmlu_stem']

(4705 * 24.21 + 3107 * 23.82 + 3077 * 21.71 + 3153 * 21.31) / (4705 + 3107 + 3077 + 3153)

gives 22.92, which is what we're reporting, versus (24.21 + 23.82 + 21.71 + 21.31) / 4, which gives 22.7625.
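
The same arithmetic in Python, for anyone double-checking:

accs = [24.21, 23.82, 21.71, 21.31]   # humanities, other, social_sciences, stem (from the table, in %)
docs = [4705, 3107, 3077, 3153]       # document counts in the same order
weighted = sum(a * n for a, n in zip(accs, docs)) / sum(docs)
unweighted = sum(accs) / len(accs)
print(round(weighted, 2), round(unweighted, 4))  # 22.92 22.7625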

Let me also run on Mistral though!


haileyschoelkopf commented on June 10, 2024

This is what I get on the most recent commit:

hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto (8)

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.6246|±  |0.0038|
| - humanities     |N/A    |none  |     5|acc   |0.5683|±  |0.0068|
| - other          |N/A    |none  |     5|acc   |0.7016|±  |0.0079|
| - social_sciences|N/A    |none  |     5|acc   |0.7345|±  |0.0078|
| - stem           |N/A    |none  |     5|acc   |0.5252|±  |0.0085|
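
For reference, a run like the one above can be reproduced with the harness's Python API along these lines (a sketch based on the v0.4.x README; argument names may differ between versions):

import lm_eval

# Sketch of the Mistral MMLU run above; arguments assumed from the v0.4.x README.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=mistralai/Mistral-7B-v0.1",
    tasks=["mmlu"],
    num_fewshot=5,
    batch_size="auto",
)
print(results["results"]["mmlu"])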

@codedecde what commit of the codebase did you run on?

