Comments (6)
I think this might be a regression of sorts.
Consider the output from Mistral-7B-v0.1:
{
"results": {
"mmlu": {
"acc,none": 0.6534780774610744,
"acc_stderr,none": 0.02878254466378838,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.680133883189164,
"acc_stderr,none": 0.022931458812042033
},
"mmlu_formal_logic": {
"alias": " - formal_logic",
"acc,none": 0.3888888888888889,
"acc_stderr,none": 0.0436031486007746
},
"mmlu_high_school_european_history": {
"alias": " - high_school_european_history",
"acc,none": 0.7757575757575758,
"acc_stderr,none": 0.032568666616811015
},
"mmlu_high_school_us_history": {
"alias": " - high_school_us_history",
"acc,none": 0.7892156862745098,
"acc_stderr,none": 0.0286265479124374
},
"mmlu_high_school_world_history": {
"alias": " - high_school_world_history",
"acc,none": 0.7763713080168776,
"acc_stderr,none": 0.027123298205229966
},
"mmlu_international_law": {
"alias": " - international_law",
"acc,none": 0.7933884297520661,
"acc_stderr,none": 0.03695980128098825
},
"mmlu_jurisprudence": {
"alias": " - jurisprudence",
"acc,none": 0.7685185185185185,
"acc_stderr,none": 0.04077494709252627
},
"mmlu_logical_fallacies": {
"alias": " - logical_fallacies",
"acc,none": 0.7914110429447853,
"acc_stderr,none": 0.031921934489347256
},
"mmlu_moral_disputes": {
"alias": " - moral_disputes",
"acc,none": 0.7138728323699421,
"acc_stderr,none": 0.024332146779134128
},
"mmlu_moral_scenarios": {
"alias": " - moral_scenarios",
"acc,none": 0.3139664804469274,
"acc_stderr,none": 0.015521923933523635
},
"mmlu_philosophy": {
"alias": " - philosophy",
"acc,none": 0.7041800643086816,
"acc_stderr,none": 0.025922371788818788
},
"mmlu_prehistory": {
"alias": " - prehistory",
"acc,none": 0.7407407407407407,
"acc_stderr,none": 0.02438366553103545
},
"mmlu_professional_law": {
"alias": " - professional_law",
"acc,none": 0.455019556714472,
"acc_stderr,none": 0.012718456618701773
},
"mmlu_world_religions": {
"alias": " - world_religions",
"acc,none": 0.8304093567251462,
"acc_stderr,none": 0.02878210810540171
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.6690477897312819,
"acc_stderr,none": 0.029087291991729783
},
"mmlu_business_ethics": {
"alias": " - business_ethics",
"acc,none": 0.58,
"acc_stderr,none": 0.049604496374885836
},
"mmlu_clinical_knowledge": {
"alias": " - clinical_knowledge",
"acc,none": 0.6830188679245283,
"acc_stderr,none": 0.028637235639800893
},
"mmlu_college_medicine": {
"alias": " - college_medicine",
"acc,none": 0.6647398843930635,
"acc_stderr,none": 0.03599586301247077
},
"mmlu_global_facts": {
"alias": " - global_facts",
"acc,none": 0.35,
"acc_stderr,none": 0.0479372485441102
},
"mmlu_human_aging": {
"alias": " - human_aging",
"acc,none": 0.695067264573991,
"acc_stderr,none": 0.030898610882477518
},
"mmlu_management": {
"alias": " - management",
"acc,none": 0.8252427184466019,
"acc_stderr,none": 0.0376017800602662
},
"mmlu_marketing": {
"alias": " - marketing",
"acc,none": 0.8717948717948718,
"acc_stderr,none": 0.021901905115073318
},
"mmlu_medical_genetics": {
"alias": " - medical_genetics",
"acc,none": 0.74,
"acc_stderr,none": 0.04408440022768078
},
"mmlu_miscellaneous": {
"alias": " - miscellaneous",
"acc,none": 0.8186462324393359,
"acc_stderr,none": 0.013778693778464085
},
"mmlu_nutrition": {
"alias": " - nutrition",
"acc,none": 0.7647058823529411,
"acc_stderr,none": 0.024288619466046123
},
"mmlu_professional_accounting": {
"alias": " - professional_accounting",
"acc,none": 0.4787234042553192,
"acc_stderr,none": 0.029800481645628693
},
"mmlu_professional_medicine": {
"alias": " - professional_medicine",
"acc,none": 0.6654411764705882,
"acc_stderr,none": 0.028661996202335307
},
"mmlu_virology": {
"alias": " - virology",
"acc,none": 0.5602409638554217,
"acc_stderr,none": 0.03864139923699121
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.7365027553174844,
"acc_stderr,none": 0.026785760218568794
},
"mmlu_econometrics": {
"alias": " - econometrics",
"acc,none": 0.5,
"acc_stderr,none": 0.047036043419179864
},
"mmlu_high_school_geography": {
"alias": " - high_school_geography",
"acc,none": 0.7626262626262627,
"acc_stderr,none": 0.0303137105381989
},
"mmlu_high_school_government_and_politics": {
"alias": " - high_school_government_and_politics",
"acc,none": 0.8652849740932642,
"acc_stderr,none": 0.024639789097709443
},
"mmlu_high_school_macroeconomics": {
"alias": " - high_school_macroeconomics",
"acc,none": 0.6615384615384615,
"acc_stderr,none": 0.023991500500313036
},
"mmlu_high_school_microeconomics": {
"alias": " - high_school_microeconomics",
"acc,none": 0.6596638655462185,
"acc_stderr,none": 0.030778057422931673
},
"mmlu_high_school_psychology": {
"alias": " - high_school_psychology",
"acc,none": 0.818348623853211,
"acc_stderr,none": 0.016530617409266885
},
"mmlu_human_sexuality": {
"alias": " - human_sexuality",
"acc,none": 0.8015267175572519,
"acc_stderr,none": 0.03498149385462472
},
"mmlu_professional_psychology": {
"alias": " - professional_psychology",
"acc,none": 0.6830065359477124,
"acc_stderr,none": 0.01882421951270621
},
"mmlu_public_relations": {
"alias": " - public_relations",
"acc,none": 0.6636363636363637,
"acc_stderr,none": 0.04525393596302505
},
"mmlu_security_studies": {
"alias": " - security_studies",
"acc,none": 0.726530612244898,
"acc_stderr,none": 0.02853556033712844
},
"mmlu_sociology": {
"alias": " - sociology",
"acc,none": 0.8258706467661692,
"acc_stderr,none": 0.026814951200421603
},
"mmlu_us_foreign_policy": {
"alias": " - us_foreign_policy",
"acc,none": 0.87,
"acc_stderr,none": 0.033799766898963086
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.5282278816063678,
"acc_stderr,none": 0.037025799074914184
},
"mmlu_abstract_algebra": {
"alias": " - abstract_algebra",
"acc,none": 0.27,
"acc_stderr,none": 0.0446196043338474
},
"mmlu_anatomy": {
"alias": " - anatomy",
"acc,none": 0.6148148148148148,
"acc_stderr,none": 0.042039210401562783
},
"mmlu_astronomy": {
"alias": " - astronomy",
"acc,none": 0.6578947368421053,
"acc_stderr,none": 0.03860731599316091
},
"mmlu_college_biology": {
"alias": " - college_biology",
"acc,none": 0.7361111111111112,
"acc_stderr,none": 0.03685651095897532
},
"mmlu_college_chemistry": {
"alias": " - college_chemistry",
"acc,none": 0.5,
"acc_stderr,none": 0.050251890762960605
},
"mmlu_college_computer_science": {
"alias": " - college_computer_science",
"acc,none": 0.52,
"acc_stderr,none": 0.050211673156867795
},
"mmlu_college_mathematics": {
"alias": " - college_mathematics",
"acc,none": 0.36,
"acc_stderr,none": 0.048241815132442176
},
"mmlu_college_physics": {
"alias": " - college_physics",
"acc,none": 0.3431372549019608,
"acc_stderr,none": 0.04724007352383888
},
"mmlu_computer_security": {
"alias": " - computer_security",
"acc,none": 0.78,
"acc_stderr,none": 0.041633319989322626
},
"mmlu_conceptual_physics": {
"alias": " - conceptual_physics",
"acc,none": 0.5829787234042553,
"acc_stderr,none": 0.03223276266711712
},
"mmlu_electrical_engineering": {
"alias": " - electrical_engineering",
"acc,none": 0.5517241379310345,
"acc_stderr,none": 0.041443118108781526
},
"mmlu_elementary_mathematics": {
"alias": " - elementary_mathematics",
"acc,none": 0.3862433862433862,
"acc_stderr,none": 0.025075981767601688
},
"mmlu_high_school_biology": {
"alias": " - high_school_biology",
"acc,none": 0.7774193548387097,
"acc_stderr,none": 0.023664216671642518
},
"mmlu_high_school_chemistry": {
"alias": " - high_school_chemistry",
"acc,none": 0.5073891625615764,
"acc_stderr,none": 0.0351760354036101
},
"mmlu_high_school_computer_science": {
"alias": " - high_school_computer_science",
"acc,none": 0.68,
"acc_stderr,none": 0.04688261722621505
},
"mmlu_high_school_mathematics": {
"alias": " - high_school_mathematics",
"acc,none": 0.37037037037037035,
"acc_stderr,none": 0.02944316932303154
},
"mmlu_high_school_physics": {
"alias": " - high_school_physics",
"acc,none": 0.32450331125827814,
"acc_stderr,none": 0.038227469376587525
},
"mmlu_high_school_statistics": {
"alias": " - high_school_statistics",
"acc,none": 0.5648148148148148,
"acc_stderr,none": 0.033812000056435254
},
"mmlu_machine_learning": {
"alias": " - machine_learning",
"acc,none": 0.5089285714285714,
"acc_stderr,none": 0.04745033255489123
}
},
"groups": {
"mmlu": {
"acc,none": 0.6534780774610744,
"acc_stderr,none": 0.02878254466378838,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.680133883189164,
"acc_stderr,none": 0.022931458812042033
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.6690477897312819,
"acc_stderr,none": 0.029087291991729783
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.7365027553174844,
"acc_stderr,none": 0.026785760218568794
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.5282278816063678,
"acc_stderr,none": 0.037025799074914184
}
}
}
Here, the mmlu score seems to be computed as a plain (unweighted) average of the four subject groups:

import numpy as np

# `data` is the parsed JSON above; exact float equality is brittle, hence isclose
assert np.isclose(
    data["groups"]["mmlu"]["acc,none"],
    np.mean([data["groups"][f"mmlu_{subject}"]["acc,none"]
             for subject in ["humanities", "other", "social_sciences", "stem"]]),
)
But based on this comment here, it should have been an average weighted by the number of documents per subject.
IMHO, it might be good to report both the unweighted and the document-weighted accuracy, to avoid any confusion.
This looks to be corrected now! For reference, here are the numbers I get with Mistral using the script I wrote to check this (a sketch of the aggregation logic follows the numbers below):
- The weighted average is 0.6259
- The regular (unweighted) average is 0.6405
- The regular average of the four weighted subject averages is 0.6346
- The regular average of the four regular subject averages is 0.6548

Per-subject averages:

| Subject         | Regular average    | Weighted average   |
|-----------------|-------------------:|-------------------:|
| stem            | 0.5323367530451689 | 0.5302886140183952 |
| other           | 0.6712050680651024 | 0.7077566784679755 |
| humanities      | 0.678644611627678  | 0.5632051009564294 |
| social_sciences | 0.7371609780969487 | 0.737081572960676  |
The weighted average and the per-subject weighted averages seem to fit well with your eval. Thanks!
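For reference, a minimal sketch of the category-level aggregation such a script can use, assuming the results JSON above is saved as results.json (a hypothetical filename) and using the standard per-category MMLU test-split sizes quoted later in this thread. The per-task weighted averages additionally need per-task doc counts, which the JSON above does not include:

import json
import numpy as np

# Standard MMLU test-split sizes per category (also quoted later in this thread).
DOC_COUNTS = {"humanities": 4705, "social_sciences": 3077, "other": 3107, "stem": 3153}

with open("results.json") as f:  # hypothetical path to the output above
    groups = json.load(f)["groups"]

accs = np.array([groups[f"mmlu_{s}"]["acc,none"] for s in DOC_COUNTS])
weights = np.array(list(DOC_COUNTS.values()))

print("regular average: ", accs.mean())
print("weighted average:", np.average(accs, weights=weights))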
Glad it's working! I would like to add a regression test specifically for this in the future, though.
The suggestion to report both weighted and unweighted (or at least to clearly indicate which is being reported) is a very good one as well. A sketch of such a regression test follows.
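A sketch of what such a regression test might look like, written as a plain pytest check over a saved results file; the fixture name mmlu_results.json and the tolerance are assumptions, not existing harness fixtures:

import json
import numpy as np
import pytest

# Standard MMLU test-split sizes per category.
DOC_COUNTS = {"humanities": 4705, "social_sciences": 3077, "other": 3107, "stem": 3153}

def test_mmlu_group_score_is_doc_weighted():
    with open("mmlu_results.json") as f:  # hypothetical saved-results fixture
        groups = json.load(f)["groups"]

    accs = [groups[f"mmlu_{s}"]["acc,none"] for s in DOC_COUNTS]
    expected = np.average(accs, weights=list(DOC_COUNTS.values()))

    # The top-level mmlu score must be the doc-weighted mean of its subgroups,
    # not the plain mean.
    assert groups["mmlu"]["acc,none"] == pytest.approx(expected, abs=1e-6)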
What will the solution be when the next researcher or model author computes the regular (unweighted) average and reports that in their blog post?
Hi, thank you for reporting this! I'm looking into it now. As far as I was aware, we aggregate using the total number of docs per subject category, but a bug may have been introduced somewhere for nested groups.
For GPT-2:
| Groups |Version|Filter|n-shot|Metric|Value | |Stderr|
|------------------|-------|------|------|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.2292|± |0.0035|
| - humanities |N/A |none |None |acc |0.2421|± |0.0062|
| - other |N/A |none |None |acc |0.2382|± |0.0076|
| - social_sciences|N/A |none |None |acc |0.2171|± |0.0074|
| - stem |N/A |none |None |acc |0.2131|± |0.0073|
The doc counts are 4705 (humanities), 3077 (social_sciences), 3107 (other), and 3153 (stem). The doc-weighted average,

(4705 * 24.21 + 3077 * 21.71 + 3107 * 23.82 + 3153 * 21.31) / (4705 + 3077 + 3107 + 3153)

gives 22.92, which is what we're reporting, versus the unweighted average,

(24.21 + 23.82 + 21.71 + 21.31) / 4

which gives 22.7625.
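The same check in a few lines of Python; the counts and accuracies are taken straight from the list and table above:

# humanities, social_sciences, other, stem: doc counts and GPT-2 accuracies (in %)
counts = [4705, 3077, 3107, 3153]
accs = [24.21, 21.71, 23.82, 21.31]

weighted = sum(c * a for c, a in zip(counts, accs)) / sum(counts)
unweighted = sum(accs) / len(accs)
print(round(weighted, 2))  # 22.92, matching the reported group score
print(unweighted)          # 22.7625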
Let me also run on Mistral though!
This is what I get on the most recent commit:
hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto (8)
| Groups |Version|Filter|n-shot|Metric|Value | |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.6246|± |0.0038|
| - humanities |N/A |none | 5|acc |0.5683|± |0.0068|
| - other |N/A |none | 5|acc |0.7016|± |0.0079|
| - social_sciences|N/A |none | 5|acc |0.7345|± |0.0078|
| - stem |N/A |none | 5|acc |0.5252|± |0.0085|
@codedecde what commit of the codebase did you run on?