NeurIPS 2025 E2LM Competition: Early Training Evaluation of Language Models Leaderboard
{
- "headers": [
- "Submission ID",
- "Team",
- "Submitter",
- "Signal Quality",
- "Ranking Consistency",
- "Scientific Compliance",
- "Global Score",
- "Submission Time",
- "Description"
- "data": [
- [
- "e1490cde32",
- "morai",
- "giovanivaldrighi",
- 0.915,
- 0.885,
- 0.638,
- 0.801,
- "2025-10-21T21:13:30",
- "mmlu_v15"
- [
- "dc6b27cb29",
- "morai",
- "giovanivaldrighi",
- 0.935,
- 0.928,
- 0.554,
- 0.782,
- "2025-10-01T22:35:50",
- "MMLU small with ll diff (v1)"
- [
- "cd9b3b9524",
- "morai",
- "giovanivaldrighi",
- 0.921,
- 0.834,
- 0.587,
- 0.779,
- "2025-10-22T11:57:41",
- "mmlu_v16"
- [
- "8d5e3c3934",
- "morai",
- "giovanivaldrighi",
- 0.902,
- 0.899,
- 0.575,
- 0.771,
- "2025-10-03T01:42:42",
- "MMLU small with ll diff (v2)"
- [
- "984384485e",
- "morai",
- "giovanivaldrighi",
- 0.919,
- 0.957,
- 0.534,
- 0.769,
- "2025-09-30T23:20:17",
- "MMLU with ll diff (v1)"
- [
- "ed491308f5",
- "morai",
- "giovanivaldrighi",
- 0.893,
- 0.91,
- 0.574,
- 0.767,
- "2025-10-06T17:51:57",
- "MMLU with ll diff norm"
- [
- "e01740670c",
- "morai",
- "giovanivaldrighi",
- 0.934,
- 0.935,
- 0.511,
- 0.765,
- "2025-10-21T20:57:24",
- "mmlu_v14"
- [
- "6e373496a0",
- "morai",
- "giovanivaldrighi",
- 0.889,
- 0.773,
- 0.605,
- 0.764,
- "2025-10-10T22:08:17",
- "mmlu_v9"
- [
- "76e438a046",
- "ShAIkespear",
- "DaGrapix",
- 0.926,
- 0.752,
- 0.548,
- 0.758,
- "2025-10-13T20:53:23",
- "0007"
- [
- "a4395863a4",
- "morai",
- "giovanivaldrighi",
- 0.902,
- 0.776,
- 0.572,
- 0.758,
- "2025-10-17T21:35:04",
- "mmlu_v12"
- [
- "2bbe5c8486",
- "Scitix",
- "Lie24",
- 0.947,
- 0.936,
- 0.471,
- 0.756,
- "2025-10-20T09:30:22",
- "1"
- [
- "aff36100ca",
- "ShAIkespear",
- "DaGrapix",
- 0.928,
- 0.73,
- 0.542,
- 0.754,
- "2025-09-25T21:16:33",
- "MMLU_var new"
- [
- "440612e173",
- "Nobelist",
- "pkuHaowei",
- 0.95,
- 0.858,
- 0.476,
- 0.751,
- "2025-10-21T08:41:30",
- ""
- [
- "78710f0a1c",
- "Nobelist",
- "pkuHaowei",
- 0.95,
- 0.858,
- 0.476,
- 0.751,
- "2025-09-25T16:44:33",
- ""
- [
- "3e96b7b636",
- "ShAIkespear",
- "DaGrapix",
- 0.861,
- 0.887,
- 0.574,
- 0.749,
- "2025-09-28T22:07:45",
- "0001"
- [
- "a3deb9d739",
- "ShAIkespear",
- "DaGrapix",
- 0.916,
- 0.83,
- 0.509,
- 0.744,
- "2025-10-11T08:56:58",
- "0005"
- [
- "57a1db9bf2",
- "Scitix",
- "Lie24",
- 0.954,
- 0.972,
- 0.423,
- 0.743,
- "2025-10-23T13:38:14",
- "6"
- [
- "5d9e22bec2",
- "Nobelist",
- "pkuHaowei",
- 0.925,
- 0.865,
- 0.481,
- 0.741,
- "2025-10-27T10:06:19",
- "revert-42-correct"
- [
- "eeab1413e7",
- "morai",
- "giovanivaldrighi",
- 0.849,
- 0.748,
- 0.582,
- 0.732,
- "2025-10-15T12:24:27",
- "mmlu_v10"
- [
- "60d985bc28",
- "Scitix",
- "Lie24",
- 0.979,
- 0.986,
- 0.357,
- 0.731,
- "2025-10-20T11:14:45",
- "5"
- [
- "c1baf2ca4e",
- "morai",
- "giovanivaldrighi",
- 0.959,
- 0.837,
- 0.419,
- 0.731,
- "2025-09-11T21:36:13",
- "MMLU with fixed perplexity (v2)"
- [
- "95de27dbcb",
- "morai",
- "giovanivaldrighi",
- 0.959,
- 0.837,
- 0.419,
- 0.731,
- "2025-09-12T21:00:14",
- "MMLU with fixed perplexity (v3)"
- [
- "c7654b1abc",
- "ShAIkespear",
- "ESmike",
- 0.959,
- 0.837,
- 0.419,
- 0.731,
- "2025-09-16T20:39:51",
- "MMLU_var"
- [
- "18ce18709a",
- "Nobelist",
- "pkuHaowei",
- 0.956,
- 0.841,
- 0.419,
- 0.729,
- "2025-08-19T06:05:53",
- "Another MMLU-Var test with updated hf_token"
- [
- "fb1bc9bdad",
- "ShAIkespear",
- "ESmike",
- 0.958,
- 0.823,
- 0.42,
- 0.729,
- "2025-09-17T21:02:04",
- "MMLU_var 2.0"
- [
- "56ee9534e6",
- "morai",
- "giovanivaldrighi",
- 0.888,
- 0.919,
- 0.474,
- 0.725,
- "2025-10-07T00:57:47",
- "MMLU full with ll diff norm"
- [
- "7c4156c0b4",
- "Nobelist",
- "pkuHaowei",
- 0.955,
- 0.872,
- 0.4,
- 0.724,
- "2025-09-23T16:20:43",
- "mmlu-var new v2.0"
- [
- "8fe150c749",
- "Scitix",
- "Lie24",
- 0.971,
- 0.979,
- 0.347,
- 0.722,
- "2025-10-20T11:14:20",
- "4"
- [
- "6fafe06861",
- "Nobelist",
- "pkuHaowei",
- 0.937,
- 0.83,
- 0.428,
- 0.722,
- "2025-08-27T13:51:43",
- "Reduce dataset size + focusing on STEM"
- [
- "71be160599",
- "ShAIkespear",
- "DaGrapix",
- 0.964,
- 0.8,
- 0.401,
- 0.722,
- "2025-10-08T20:39:59",
- "0004"
- [
- "ccbf3bf7b4",
- "morai",
- "giovanivaldrighi",
- 0.947,
- 0.886,
- 0.394,
- 0.72,
- "2025-10-24T19:57:04",
- "mmlu_v17"
- [
- "1bc07adbed",
- "ShAIkespear",
- "DaGrapix",
- 0.905,
- 0.851,
- 0.452,
- 0.718,
- "2025-10-08T20:33:25",
- "0003"
- [
- "25fea29fad",
- "ShAIkespear",
- "DaGrapix",
- 0.924,
- 0.928,
- 0.403,
- 0.716,
- "2025-10-23T20:15:05",
- "0013"
- [
- "e3c8ba2e0b",
- "ShAIkespear",
- "DaGrapix",
- 0.933,
- 0.735,
- 0.44,
- 0.716,
- "2025-10-18T20:47:17",
- "0010"
- [
- "d4bb266f7e",
- "Scitix",
- "Lie24",
- 0.971,
- 0.986,
- 0.33,
- 0.716,
- "2025-10-15T12:04:38",
- "test"
- [
- "c31400f3ef",
- "morai",
- "giovanivaldrighi",
- 0.912,
- 0.744,
- 0.459,
- 0.714,
- "2025-10-24T20:25:15",
- "mmlu_v18"
- [
- "9adf239175",
- "Nobelist",
- "pkuHaowei",
- 0.87,
- 0.801,
- 0.494,
- 0.713,
- "2025-09-19T17:04:06",
- "MMLU-var new"
- [
- "cd86d28256",
- "ShAIkespear",
- "ESmike",
- 0.956,
- 0.737,
- 0.396,
- 0.71,
- "2025-09-25T13:37:29",
- "MMLU_var 3.0"
- [
- "29c501c195",
- "Episteme",
- "1OOl",
- 0.938,
- 0.842,
- 0.365,
- 0.699,
- "2025-10-20T21:25:06",
- "exp004"
- [
- "815cca37c4",
- "Scitix",
- "Lie24",
- 0.922,
- 0.906,
- 0.364,
- 0.697,
- "2025-10-27T09:03:08",
- "V2.1"
- [
- "1c3036c0bb",
- "morai",
- "giovanivaldrighi",
- 0.974,
- 0.815,
- 0.319,
- 0.696,
- "2025-10-16T18:12:54",
- "mmlu_v11"
- [
- "aae73d095e",
- "DARG",
- "imberator",
- 0.939,
- 0.85,
- 0.342,
- 0.691,
- "2025-10-13T16:40:52",
- "MMLU var subset"
- [
- "d5b347e711",
- "ShAIkespear",
- "DaGrapix",
- 0.846,
- 0.737,
- 0.447,
- 0.676,
- "2025-10-02T18:18:52",
- "0002"
- [
- "79ff9c16ed",
- "ShAIkespear",
- "DaGrapix",
- 0.792,
- 0.761,
- 0.482,
- 0.665,
- "2025-10-27T19:24:41",
- "0015"
- [
- "2eb78a7263",
- "ShAIkespear",
- "DaGrapix",
- 0.874,
- 0.847,
- 0.348,
- 0.661,
- "2025-10-19T09:37:08",
- "0011"
- [
- "8b2821e989",
- "ShAIkespear",
- "ESmike",
- 0.817,
- 0.742,
- 0.369,
- 0.63,
- "2025-10-20T22:57:25",
- "0009"
- [
- "b695ee644e",
- "morai",
- "giovanivaldrighi",
- 0.741,
- 0.929,
- 0.376,
- 0.614,
- "2025-09-26T14:06:20",
- "MMLU Var with constrained prompt"
- [
- "6ee7646bce",
- "morai",
- "giovanivaldrighi",
- 0.845,
- 0.843,
- 0.264,
- 0.612,
- "2025-10-09T17:39:51",
- "mmlu_v8"
- [
- "4616bc30c4",
- "Nobelist",
- "pkuHaowei",
- 0.63,
- 0.63,
- 0.534,
- 0.592,
- "2025-10-23T18:34:54",
- ""
- [
- "4d37e4aeeb",
- "Cruise",
- "wmere",
- 0.614,
- 0.704,
- 0.454,
- 0.559,
- "2025-09-29T08:06:59",
- ""
- [
- "376e97c0a0",
- "Nobelist",
- "pkuHaowei",
- 0.592,
- 0.32,
- 0.511,
- 0.533,
- "2025-10-23T18:57:01",
- "strategy balance"
- [
- "858c964f59",
- "Nobelist",
- "pkuHaowei",
- 0.592,
- 0.32,
- 0.511,
- 0.533,
- "2025-10-27T10:03:55",
- "revert-birth"
- [
- "9f9b33f7c8",
- "Episteme",
- "1OOl",
- 0.567,
- 0.781,
- 0.342,
- 0.498,
- "2025-10-20T22:20:17",
- "exp008"
- [
- "63bf359ae7",
- "Episteme",
- "1OOl",
- 0.528,
- 0.76,
- 0.048,
- 0.359,
- "2025-10-27T00:01:21",
- "exp005_2"
- [
- "60c6942c7b",
- "Scitix",
- "Lie24",
- 0.471,
- 0.471,
- 0.165,
- 0.349,
- "2025-10-20T11:13:55",
- "3"
- [
- "b74f0486fd",
- "Scitix",
- "Lie24",
- 0.471,
- 0.471,
- 0.165,
- 0.349,
- "2025-10-20T11:13:25",
- "2"
- [
- "acce8d8af8",
- "ShAIkespear",
- "ESmike",
- 0.399,
- 0.715,
- 0.194,
- 0.348,
- "2025-10-16T17:24:28",
- "0006"
- [
- "bff4cf1ff9",
- "Scitix",
- "Lie24",
- 0.29,
- 0.622,
- 0.271,
- 0.316,
- "2025-10-16T12:08:59",
- "3"
- [
- "5845c5c8be",
- "morai",
- "giovanivaldrighi",
- 0.265,
- 0.722,
- 0.236,
- 0.299,
- "2025-08-22T11:53:41",
- "Task focused in STEM topics for MMLU using prompt engineering."
- [
- "319b1a3a3a",
- "thu-let_it_be",
- "jiangyi233",
- 0.182,
- 0.555,
- 0.286,
- 0.261,
- "2025-10-24T03:26:30",
- "100(2)"
- [
- "c3017d1ddf",
- "thu-let_it_be",
- "jiangyi233",
- 0.182,
- 0.555,
- 0.286,
- 0.261,
- "2025-10-24T03:26:23",
- "100(2)"
- [
- "3c4c5eac04",
- "thu-let_it_be",
- "jiangyi233",
- 0.173,
- 0.546,
- 0.262,
- 0.246,
- "2025-10-26T15:53:34",
- "1026-1400"
- [
- "6ea21350e8",
- "Episteme",
- "1OOl",
- 0.305,
- 0.667,
- 0.057,
- 0.242,
- "2025-10-19T20:15:04",
- "exp002"
- [
- "11619a9444",
- "ShAIkespear",
- "DaGrapix",
- 0.386,
- 0.477,
- 0,
- 0.241,
- "2025-09-19T15:12:13",
- "MMLU_var new"
- [
- "c619560da1",
- "Episteme",
- "1OOl",
- 0.321,
- 0.552,
- 0,
- 0.215,
- "2025-10-20T21:55:19",
- "exp006"
- [
- "22fb4b9aa6",
- "ShAIkespear",
- "ESmike",
- 0.162,
- 0.488,
- 0.071,
- 0.158,
- "2025-09-10T19:25:19",
- "Second Test"
- [
- "metadata": null