NeurIPS 2025 E2LM Competition: Early Training Evaluation of Language Models Leaderboard
{
- "headers": [
- "Submission ID",
- "Team",
- "Submitter",
- "Signal Quality",
- "Ranking Consistency",
- "Scientific Compliance",
- "Global Score",
- "Submission Time",
- "Description"
- "data": [
- [
- "dc6b27cb29",
- "morai",
- "giovanivaldrighi",
- 0.935,
- 0.928,
- 0.554,
- 0.782,
- "2025-10-01T22:35:50",
- "MMLU small with ll diff (v1)"
- [
- "8d5e3c3934",
- "morai",
- "giovanivaldrighi",
- 0.902,
- 0.899,
- 0.575,
- 0.771,
- "2025-10-03T01:42:42",
- "MMLU small with ll diff (v2)"
- [
- "984384485e",
- "morai",
- "giovanivaldrighi",
- 0.919,
- 0.957,
- 0.534,
- 0.769,
- "2025-09-30T23:20:17",
- "MMLU with ll diff (v1)"
- [
- "ed491308f5",
- "morai",
- "giovanivaldrighi",
- 0.893,
- 0.91,
- 0.574,
- 0.767,
- "2025-10-06T17:51:57",
- "MMLU with ll diff norm"
- [
- "aff36100ca",
- "ShAIkespear",
- "DaGrapix",
- 0.928,
- 0.73,
- 0.542,
- 0.754,
- "2025-09-25T21:16:33",
- "MMLU_var new"
- [
- "78710f0a1c",
- "Nobelist",
- "pkuHaowei",
- 0.95,
- 0.858,
- 0.476,
- 0.751,
- "2025-09-25T16:44:33",
- ""
- [
- "3e96b7b636",
- "ShAIkespear",
- "DaGrapix",
- 0.861,
- 0.887,
- 0.574,
- 0.749,
- "2025-09-28T22:07:45",
- "0001"
- [
- "95de27dbcb",
- "morai",
- "giovanivaldrighi",
- 0.959,
- 0.837,
- 0.419,
- 0.731,
- "2025-09-12T21:00:14",
- "MMLU with fixed perplexity (v3)"
- [
- "c1baf2ca4e",
- "morai",
- "giovanivaldrighi",
- 0.959,
- 0.837,
- 0.419,
- 0.731,
- "2025-09-11T21:36:13",
- "MMLU with fixed perplexity (v2)"
- [
- "c7654b1abc",
- "ShAIkespear",
- "ESmike",
- 0.959,
- 0.837,
- 0.419,
- 0.731,
- "2025-09-16T20:39:51",
- "MMLU_var"
- [
- "fb1bc9bdad",
- "ShAIkespear",
- "ESmike",
- 0.958,
- 0.823,
- 0.42,
- 0.729,
- "2025-09-17T21:02:04",
- "MMLU_var 2.0"
- [
- "18ce18709a",
- "Nobelist",
- "pkuHaowei",
- 0.956,
- 0.841,
- 0.419,
- 0.729,
- "2025-08-19T06:05:53",
- "Another MMLU-Var test with updated hf_token"
- [
- "56ee9534e6",
- "morai",
- "giovanivaldrighi",
- 0.888,
- 0.919,
- 0.474,
- 0.725,
- "2025-10-07T00:57:47",
- "MMLU full with ll diff norm"
- [
- "7c4156c0b4",
- "Nobelist",
- "pkuHaowei",
- 0.955,
- 0.872,
- 0.4,
- 0.724,
- "2025-09-23T16:20:43",
- "mmlu-var new v2.0"
- [
- "6fafe06861",
- "Nobelist",
- "pkuHaowei",
- 0.937,
- 0.83,
- 0.428,
- 0.722,
- "2025-08-27T13:51:43",
- "Reduce dataset size + focusing on STEM"
- [
- "9adf239175",
- "Nobelist",
- "pkuHaowei",
- 0.87,
- 0.801,
- 0.494,
- 0.713,
- "2025-09-19T17:04:06",
- "MMLU-var new"
- [
- "cd86d28256",
- "ShAIkespear",
- "ESmike",
- 0.956,
- 0.737,
- 0.396,
- 0.71,
- "2025-09-25T13:37:29",
- "MMLU_var 3.0"
- [
- "d5b347e711",
- "ShAIkespear",
- "DaGrapix",
- 0.846,
- 0.737,
- 0.447,
- 0.676,
- "2025-10-02T18:18:52",
- "0002"
- [
- "b695ee644e",
- "morai",
- "giovanivaldrighi",
- 0.741,
- 0.929,
- 0.376,
- 0.614,
- "2025-09-26T14:06:20",
- "MMLU Var with constrained prompt"
- [
- "4d37e4aeeb",
- "Cruise",
- "wmere",
- 0.614,
- 0.704,
- 0.454,
- 0.559,
- "2025-09-29T08:06:59",
- ""
- [
- "5845c5c8be",
- "morai",
- "giovanivaldrighi",
- 0.265,
- 0.722,
- 0.236,
- 0.299,
- "2025-08-22T11:53:41",
- "Task focused in STEM topics for MMLU using prompt engineering."
- [
- "22fb4b9aa6",
- "ShAIkespear",
- "ESmike",
- 0.162,
- 0.488,
- 0.071,
- 0.158,
- "2025-09-10T19:25:19",
- "Second Test"
- [
- "11619a9444",
- "ShAIkespear",
- "DaGrapix",
- null,
- 1,
- 0,
- null,
- "2025-09-19T15:12:13",
- "MMLU_var new"
- [
- "metadata": null