Persist Bench Leaderboard
{
- "headers": [
- "Rank",
- "Type",
- "Model",
- "Cross-Domain FR",
- "Sycophancy FR",
- "Beneficial FR",
- "Normalized Failure Score"
- "data": [
- [
- 1,
- "Proprietary",
- "Claude-Opus-4.5",
- 18,
- 87.5,
- 2,
- 0.0259
- [
- 2,
- "Proprietary",
- "GPT-5.2 (High)",
- 4,
- 59,
- 23,
- 0.0333
- [
- 3,
- "Proprietary",
- "Claude-Sonnet-4.5",
- 40,
- 83.5,
- 11,
- 0.0412
- [
- 4,
- "Proprietary",
- "Gemini-3-Pro",
- 59.5,
- 100,
- 4,
- 0.0457
- [
- 5,
- "Open Weights",
- "DeepSeek-V3.2-Speciale",
- 38,
- 98.5,
- 14,
- 0.0461
- [
- 6,
- "Proprietary",
- "Grok-4.1-Fast",
- 56.5,
- 99.5,
- 6,
- 0.0462
- [
- 7,
- "Open Weights",
- "Kimi-K2-Thinking",
- 49.5,
- 97,
- 10,
- 0.0466
- [
- 8,
- "Proprietary",
- "Gemini-3-Flash",
- 79.5,
- 99,
- 4,
- 0.053
- [
- 9,
- "Open Weights",
- "GLM-4.7",
- 47,
- 99,
- 21,
- 0.0557
- [
- 10,
- "Proprietary",
- "Grok-4",
- 86,
- 100,
- 5,
- 0.0565
- [
- 11,
- "Open Weights",
- "MiniMax-M2.1",
- 33.5,
- 94,
- 30,
- 0.0575
- [
- 12,
- "Open Weights",
- "GPT-OSS-120B",
- 59.5,
- 96.5,
- 20,
- 0.059
- [
- 13,
- "Open Weights",
- "Qwen3-235B-A22B-Think",
- 91,
- 100,
- 12,
- 0.0645
- [
- 14,
- "Open Weights",
- "Qwen3-235B-A22B",
- 76,
- 99.5,
- 19,
- 0.0649
- [
- 15,
- "Proprietary",
- "GPT-4o",
- 13,
- 88,
- 53,
- 0.0687
- [
- 16,
- "Open Weights",
- "Llama-3.3-70B-Instruct",
- 17.5,
- 82,
- 55,
- 0.071
- [
- 17,
- "Open Weights",
- "Kimi-K2-0905",
- 61.5,
- 99.5,
- 33,
- 0.0717
- [
- 18,
- "Open Weights",
- "Llama-4-Maverick",
- 59,
- 96,
- 59,
- 0.0928
- [
- "metadata": null