{
  "archive": [
    {
      "kind": "JSON",
      "name": "2026-05-03-granite-shakedown.json"
    },
    {
      "kind": "JSON",
      "name": "2026-05-04-phi4-reasoning-shakedown.json"
    },
    {
      "kind": "JSON",
      "name": "casepack-revisit-2026-04-21.json"
    },
    {
      "kind": "JSON",
      "name": "full-2026-04-21.json"
    },
    {
      "kind": "JSON",
      "name": "gitref-smoke-2026-04-21.json"
    },
    {
      "kind": "MD",
      "name": "phanteks-rig-bench-plan-2026-05-07.md"
    },
    {
      "kind": "RUN",
      "name": "seller-pc-3090-alone-large-2026-04-27/"
    },
    {
      "kind": "RUN",
      "name": "seller-pc-3090-fe-fresh-pads-2026-04-28/"
    },
    {
      "kind": "JSON",
      "name": "smoke-2026-04-21-v2.json"
    },
    {
      "kind": "JSON",
      "name": "smoke-2026-04-21.json"
    }
  ],
  "latest": {
    "_filename": "2026-05-03-granite-shakedown.json",
    "_raw_url": "https://raw.githubusercontent.com/kevinhsykes/kevinsykes-ai/master/reports/benchmarks/2026-05-03-granite-shakedown.json",
    "case_slugs": [
      "strict_json_idea_tagger",
      "copy_drafting_project_card",
      "multi_file_summary_closeout"
    ],
    "cases_dir": "benchmarks\\cases",
    "ollama_host": "http://127.0.0.1:11434",
    "ollama_ps_after": "NAME                                        ID              SIZE      PROCESSOR    CONTEXT    UNTIL               \nhf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M    b93498a4389e    6.2 GB    100% GPU     4096       29 minutes from now    \ngranite3.3:8b                               fd429f23b909    6.1 GB    100% GPU     4096       29 minutes from now    \ngemma4:latest                               c6eb396dbd59    10 GB     100% GPU     4096       15 minutes from now",
    "ollama_ps_before": "NAME             ID              SIZE     PROCESSOR    CONTEXT    UNTIL               \ngemma4:latest    c6eb396dbd59    10 GB    100% GPU     4096       15 minutes from now",
    "registry_path": "benchmarks\\models.json",
    "repeats": 1,
    "requested_models": [
      "granite3.3:8b",
      "hf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M"
    ],
    "roles": {
      "coder": {
        "case_slugs": [
          "code_edit_password_compat",
          "regex_oneliner_jq_pin"
        ],
        "challengers": [],
        "incumbent": "qwen2.5-coder:14b"
      },
      "draft": {
        "case_slugs": [
          "strict_json_idea_tagger",
          "copy_drafting_project_card",
          "multi_file_summary_closeout"
        ],
        "challengers": [
          "llama3.1:8b",
          "granite3.3:8b"
        ],
        "incumbent": "gemma4:latest"
      },
      "reason": {
        "case_slugs": [
          "bug_diag_autoseed_lock",
          "inconsistency_reasoning_handoffs"
        ],
        "challengers": [],
        "incumbent": "deepseek-r1:14b"
      },
      "review": {
        "case_slugs": [
          "security_review_audit_headers"
        ],
        "challengers": [],
        "incumbent": "gemma4:26b"
      }
    },
    "runs": {
      "granite3.3:8b": {
        "cases": {
          "copy_drafting_project_card": {
            "runs": [
              {
                "check": {
                  "forbidden_hits": [],
                  "pass": true,
                  "type": "contains_none_of"
                },
                "cost_tier": "free",
                "elapsed_ms": 541.9,
                "expected_format": "plain_text",
                "family": "ibm-granite",
                "incumbent_role": "draft",
                "rc": 0,
                "severity_bucket": "pass",
                "severity_weight": 2,
                "stderr": "",
                "stdout": "\"Our AI-driven system automates inventory management, reducing manual tasks by 80%. It integrates directly into your procurement process, boosting daily transaction speed by 75%.\"",
                "turbo_escalation_case": false
              }
            ],
            "severity_weight": 2,
            "source_commit": "2d3209a",
            "summary": {
              "loud_wrong_count": 0,
              "max_ms": 541.9,
              "mean_ms": 541.9,
              "min_ms": 541.9,
              "parse_fail_count": 0,
              "pass_count": 1,
              "rc_zero_count": 1,
              "severity_score": 0.0,
              "silent_wrong_count": 0
            },
            "task_shape": "copy-drafting",
            "turbo_escalation": false
          },
          "multi_file_summary_closeout": {
            "runs": [
              {
                "check": {
                  "missing_section": "Why",
                  "pass": false,
                  "type": "ordered_sections"
                },
                "cost_tier": "free",
                "elapsed_ms": 6354.4,
                "expected_format": "markdown",
                "family": "ibm-granite",
                "incumbent_role": "draft",
                "rc": 0,
                "severity_bucket": "loud-wrong",
                "severity_weight": 2,
                "stderr": "",
                "stdout": "This document appears to be a detailed technical report or log from a software development or maintenance process, specifically focusing on troubleshooting and resolving issues related to a Cloud Run service named \"kevinsykes-ai.\" The reports cover various aspects such as diagnosis of exceptions, implementation of solutions, verification through testing, rollback procedures, and future considerations.\n\nKey points from the document include:\n\n1. **Exception Diagnosis**: An issue was identified where a 500 Internal Server Error was occurring on certain POST requests to `/studio/style-quiz` endpoints in the \"kevinsykes-ai\" Cloud Run service due to a missing Firestore permission for the Compute Service Account (CSA).\n\n2. **Solution Implementation**:\n   - Introduced defensive code layers (in `services/preference_game.py`) to wrap around Firestore calls, causing these requests to return a 503 Service Unavailable response instead of a 500 Internal Server Error when permissions were insufficient.\n   - Documented the necessary IAM policy binding command for granting `roles/datastore.user` to the CSA in a waiver memo for explicit user instruction.\n\n3. **Verification**: Tested the changes through a full test suite and verified that the staging and production environments now return 503 responses as expected when encountering permission issues, ensuring no new failures were introduced. The style quiz functionality was degraded but safely, with appropriate flagging.\n\n4. **Rollback Paths**: Provided clear instructions for both narrow rollback (removing the environment variable to disable the new behavior) and full rollback (reverting the code change and redeploying).\n\n5. **Next Steps**: Suggested further actions such as updating the public HTML to inform users about the degraded service, setting up monitoring alerts for future IAM-related issues, and cataloging the implemented pattern of defensive exception handling for reuse in other modules.\n\n6. **Zero-New-Failure Gate**: Confirmed that no new failures were introduced by the changes, meeting the gate criteria for deployment.\n\nThis document seems to be a comprehensive log detailing a successful troubleshooting and resolution process for a specific technical issue within a cloud-based application, ensuring that changes adhered to predefined rules and maintained system stability.",
                "turbo_escalation_case": false
              }
            ],
            "severity_weight": 2,
            "source_commit": "31e7af0",
            "summary": {
              "loud_wrong_count": 1,
              "max_ms": 6354.4,
              "mean_ms": 6354.4,
              "min_ms": 6354.4,
              "parse_fail_count": 0,
              "pass_count": 0,
              "rc_zero_count": 1,
              "severity_score": 0.5,
              "silent_wrong_count": 0
            },
            "task_shape": "multi-file-summary",
            "turbo_escalation": false
          },
          "strict_json_idea_tagger": {
            "runs": [
              {
                "check": {
                  "pass": true,
                  "type": "strict_json",
                  "value": {
                    "category": "admin",
                    "pipeline": "client",
                    "priority": "high",
                    "tags": [
                      "infrastructure",
                      "monitoring",
                      "background worker"
                    ]
                  }
                },
                "cost_tier": "free",
                "elapsed_ms": 555.0,
                "expected_format": "json",
                "family": "ibm-granite",
                "incumbent_role": "draft",
                "rc": 0,
                "severity_bucket": "pass",
                "severity_weight": 1,
                "stderr": "",
                "stdout": "{\n  \"category\": \"admin\",\n  \"priority\": \"high\",\n  \"pipeline\": \"client\",\n  \"tags\": [\"infrastructure\", \"monitoring\", \"background worker\"]\n}",
                "turbo_escalation_case": false
              }
            ],
            "severity_weight": 1,
            "source_commit": "b4eb6f4",
            "summary": {
              "loud_wrong_count": 0,
              "max_ms": 555.0,
              "mean_ms": 555.0,
              "min_ms": 555.0,
              "parse_fail_count": 0,
              "pass_count": 1,
              "rc_zero_count": 1,
              "severity_score": 0.0,
              "silent_wrong_count": 0
            },
            "task_shape": "strict-json-classification",
            "turbo_escalation": false
          }
        },
        "cost_tier": "free",
        "family": "ibm-granite",
        "ollama_ps_after_warmup": "NAME             ID              SIZE      PROCESSOR    CONTEXT    UNTIL               \ngranite3.3:8b    fd429f23b909    6.1 GB    100% GPU     4096       29 minutes from now    \ngemma4:latest    c6eb396dbd59    10 GB     100% GPU     4096       15 minutes from now",
        "roles": [
          "draft"
        ],
        "warmup": {
          "elapsed_ms": 5036.2,
          "rc": 0,
          "stderr": "",
          "stdout": "OK"
        }
      },
      "hf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M": {
        "cases": {
          "copy_drafting_project_card": {
            "runs": [
              {
                "check": {
                  "forbidden_hits": [],
                  "pass": true,
                  "type": "contains_none_of"
                },
                "cost_tier": "free",
                "elapsed_ms": 515.6,
                "expected_format": "plain_text",
                "family": "ibm-granite",
                "incumbent_role": "draft",
                "rc": 0,
                "severity_bucket": "pass",
                "severity_weight": 2,
                "stderr": "",
                "stdout": "Built an AI-driven automation platform that integrates directly with procurement systems, reducing manual inventory tasks by 70% and increasing order fulfillment accuracy from 85% to 98%.",
                "turbo_escalation_case": false
              }
            ],
            "severity_weight": 2,
            "source_commit": "2d3209a",
            "summary": {
              "loud_wrong_count": 0,
              "max_ms": 515.6,
              "mean_ms": 515.6,
              "min_ms": 515.6,
              "parse_fail_count": 0,
              "pass_count": 1,
              "rc_zero_count": 1,
              "severity_score": 0.0,
              "silent_wrong_count": 0
            },
            "task_shape": "copy-drafting",
            "turbo_escalation": false
          },
          "multi_file_summary_closeout": {
            "runs": [
              {
                "check": {
                  "pass": false,
                  "type": "ordered_sections",
                  "word_cap_exceeded": 500,
                  "word_count": 686
                },
                "cost_tier": "free",
                "elapsed_ms": 16333.5,
                "expected_format": "markdown",
                "family": "ibm-granite",
                "incumbent_role": "draft",
                "rc": 0,
                "severity_bucket": "loud-wrong",
                "severity_weight": 2,
                "stderr": "",
                "stdout": "**Combined Summary of Sprint\u201111 Reports**\n\n---\n\n### **T1 Sprint\u201111 Report \u2013 \u201c/admin/style\u2011quiz\u201d Diagnostics & Fix**\n- **Core Issue:** Production `POST /studio/style\u2011quiz/*` returned persistent 500s due to missing Firestore IAM role on the Cloud Run compute service account.\n- **Diagnosis Steps:**\n  1. Collected Cloud Run logs (2026\u201104\u201117) showing a `PermissionDenied (403)` from Google APIs when calling `list_sessions()`.\n  2. Verified IAM bindings for the service account (`504633026097-compute@developer.gserviceaccount.com`) \u2013 Firestore role missing.\n- **Implementation:**\n  - Added a typed exception wrapper in `services/preference_game.py` to translate `google.api_core.exceptions.GoogleAPICallError` into a domain\u2011specific `PreferenceGameUnavailable`.\n  - Updated admin (`/admin/style_quiz.py`) and studio (`/blueprints/studio.py`) view layers to catch this exception and return **503 Service Unavailable** with JSON `{ \"error\": \"preference game temporarily unavailable\" }`.\n  - Provided fallback handling for legacy code paths via `_unavailable_exc(svc)`.\n- **Testing:**\n  - Added **12 new tests** in `tests/test_style_quiz_unavailable.py` covering service\u2011layer, admin list view, and studio detail views.\n  - Full test suite now reports **1168 passed**, a net +12 relative to the baseline (1156).\n- **Verification & Smoke Tests:**\n  - **Staging:** All three studio POST endpoints respond with 503 JSON; GET /admin/style\u2011quiz returns 404 (admin auth gated off staging).  \n  - **Production (rev `kevinsykes-ai-00141-w4n`):** Prior 500s replaced by graceful 503 responses; other routes unchanged.\n- **Rollback Options:**\n  1. **Partial:** Remove the `USE_NEW_ADMIN_STYLE_QUIZ_BP` env var to disable the new defensive layer \u2013 leaves studio endpoints still degrading but prevents re\u2011introduction of the original 500 error.\n  2. **Full:** Revert commit `3eaebd4` and redeploy; however, this would expose the latent defect again unless the IAM grant is applied later.\n- **Next Action (Kevin):**\n  - Execute the Firestore IAM grant (see waiver memo) to allow persistent session writes:\n    ```bash\n    gcloud projects add-iam-policy-binding project-03955f9d-ca86-45e5-9fc \\\n      --member=serviceAccount:504633026097-compute@developer.gserviceaccount.com \\\n      --role=roles/datastore.user\n    ```\n  - After the grant, verify that `POST /studio/style\u2011quiz/start` returns a successful JSON payload with a `session_id`.  \n  - Optionally surface a degraded\u2011state banner on the public quiz UI for user visibility.\n\n---\n\n### **T5 Sprint\u201111 Report \u2013 \u201c/api/ideas/<id>/*\u201d and \u201c/api/timeline\u201d Refactor**\n- **Objective:** Consolidate idea\u2011management endpoints under a new `USE_NEW_ADMIN_STYLE_QUIZ_BP` style flag to simplify wiring, remove duplication, and improve failure handling.\n- **Key Changes:**\n  - **Endpoint Unification:** Merged duplicate logic in `admin/ideas.py` \u2192 single helper `idea_service()` that dispatches CRUD calls based on the request path.\n  - **Feature Flag Integration:** Introduced `USE_NEW_ADMIN_STYLE_QUIZ_BP` (or a dedicated flag `USE_NEW_IIDEAS_BP`) to toggle new vs legacy endpoints. Default set to **ON** in production after deployment.\n  - **Error Handling Upgrade:** All CRUD calls now catch generic `GoogleAPICallError` and map them to `PreferenceGameUnavailable`, returning consistent JSON error responses (`503 Service Unavailable`).\n  - **Testing Expansion:** Added **30 new tests** covering the unified service, edge\u2011case validation (e.g., missing project IDs), and integration with the timeline endpoint.\n- **Verification & Smoke Tests:**\n  - **Local Dev:** All tests pass (`passing 1450/1500`).  \n  - **Staging Deploy (rev `kevinsykes-ai-staging-00031-m5g`):** Endpoints respond correctly; no regressions observed on `/api/feed/public`, `/api/tracker`, or T3/T4 endpoints.  \n  - **Production Deploy (rev `kevinsykes-ai-00141-w4n`):** Feature flag active, new endpoints operational; legacy paths remain disabled per contract.\n- **Rollback Path:**\n  - Disable the feature flag (`--remove-env-vars USE_NEW_IIDEAS_BP`) to fall back to the older, now\u2011deprecated endpoint set. No code revert needed as both sets coexist behind a conditional guard.\n- **Next Action (Kevin):**\n  - Confirm deployment of the unified service via `./deploy.sh prod`.  \n  - Validate timeline queries (`GET /api/timeline?project=kevinsykes-ai`) return expected results with the new pagination logic.  \n  - Document the deprecation schedule for legacy `/api/ideas/<id>/legacy` routes in the project\u2019s migration plan.\n\n---\n\n### **Combined Takeaways**\n1. **Both sprint reports address latent production bugs** (T1 fixes a permission\u2011denied crash; T5 consolidates and refactors duplicated logic) that were previously hidden from end\u2011users.\n2. **Testing growth:** +12 tests for T1, +30 tests for T5 \u2192 total suite now at **\u22481450\u20131500 passing**, well above the baseline.\n3. **Feature\u2011flag driven rollout** ensures a safe transition\u2014flags can be toggled without redeploying code, aligning with contract rules about \u201cno redeploy required when only flag changes\u201d.\n4. **Rollback mechanisms are explicit:** environment variable removal for T5; partial/complete revert for T1 via IAM grant or full commit revert.\n\n**Overall Status:** All sprint\u201111 objectives are met, tests pass zero new failures, and the next step is Kevin\u2019s authorization of the Firestore IAM grant (T1) followed by a final verification of the unified ideas service (T5).",
                "turbo_escalation_case": false
              }
            ],
            "severity_weight": 2,
            "source_commit": "31e7af0",
            "summary": {
              "loud_wrong_count": 1,
              "max_ms": 16333.5,
              "mean_ms": 16333.5,
              "min_ms": 16333.5,
              "parse_fail_count": 0,
              "pass_count": 0,
              "rc_zero_count": 1,
              "severity_score": 0.5,
              "silent_wrong_count": 0
            },
            "task_shape": "multi-file-summary",
            "turbo_escalation": false
          },
          "strict_json_idea_tagger": {
            "runs": [
              {
                "check": {
                  "pass": true,
                  "type": "strict_json",
                  "value": {
                    "category": "admin",
                    "pipeline": "site",
                    "priority": "medium",
                    "tags": [
                      "per-model",
                      "token-usage",
                      "ingest-worker",
                      "background-job",
                      "ollama"
                    ]
                  }
                },
                "cost_tier": "free",
                "elapsed_ms": 800.6,
                "expected_format": "json",
                "family": "ibm-granite",
                "incumbent_role": "draft",
                "rc": 0,
                "severity_bucket": "pass",
                "severity_weight": 1,
                "stderr": "",
                "stdout": "{\n  \"category\": \"admin\",\n  \"priority\": \"medium\",\n  \"pipeline\": \"site\",\n  \"tags\": [\n    \"per-model\",\n    \"token-usage\",\n    \"ingest-worker\",\n    \"background-job\",\n    \"ollama\"\n  ]\n}",
                "turbo_escalation_case": false
              }
            ],
            "severity_weight": 1,
            "source_commit": "b4eb6f4",
            "summary": {
              "loud_wrong_count": 0,
              "max_ms": 800.6,
              "mean_ms": 800.6,
              "min_ms": 800.6,
              "parse_fail_count": 0,
              "pass_count": 1,
              "rc_zero_count": 1,
              "severity_score": 0.0,
              "silent_wrong_count": 0
            },
            "task_shape": "strict-json-classification",
            "turbo_escalation": false
          }
        },
        "cost_tier": "free",
        "family": "ibm-granite",
        "ollama_ps_after_warmup": "NAME                                        ID              SIZE      PROCESSOR    CONTEXT    UNTIL               \nhf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M    b93498a4389e    6.2 GB    100% GPU     4096       29 minutes from now    \ngranite3.3:8b                               fd429f23b909    6.1 GB    100% GPU     4096       29 minutes from now    \ngemma4:latest                               c6eb396dbd59    10 GB     100% GPU     4096       15 minutes from now",
        "roles": [
          "draft"
        ],
        "warmup": {
          "elapsed_ms": 5070.2,
          "rc": 0,
          "stderr": "",
          "stdout": "OK"
        }
      }
    },
    "timeout": 180.0,
    "ts": "2026-05-04T00:16:53Z",
    "turbo_escalation": {
      "budget_note": "$20/mo flat; activated 2026-04-13. Keep decision at re-eval depends on acceptance rate on cases with turbo_escalation: true (currently: security_review_audit_headers, inconsistency_reasoning_handoffs).",
      "from_role": "review",
      "reevaluation_date": "2026-04-28",
      "retry_model": "gpt-oss:120b-cloud",
      "telemetry_tag": "turbo-escalation",
      "trigger_condition": "26B verify-reject on code-review or security-review task"
    }
  },
  "registry": {
    "child": "#1075",
    "epic": "#1073",
    "explicitly_excluded": [
      {
        "model": "phi3:14b",
        "reason": "No defined job distinct from qwen2.5-coder:14b or deepseek-r1:14b \u2014 Codex cleanup advice."
      },
      {
        "model": "yi:9b",
        "reason": "No defined job, overlaps with other 8-14B general models."
      },
      {
        "model": "mistral:7b",
        "reason": "Old and overlaps with newer models."
      },
      {
        "model": "command-r7b",
        "reason": "Cohere lineage; keep only if a RAG plan materializes. Parked, not evaluated here."
      },
      {
        "model": "embeddinggemma",
        "reason": "Embeddings \u2014 separate concern from text-gen routing; may feature in a later RAG-specific registry."
      }
    ],
    "families": {
      "alibaba-qwen": {
        "cost_tier": "free",
        "count": 5,
        "reasoning_about": "Alibaba's Qwen family; distinct from gemma and deepseek training. Includes Qwen2.5-Coder line (14B + 32B) + Qwen3-Coder + Qwen3.6 dense (27B) and MoE (35B-A3B)."
      },
      "deepseek": {
        "cost_tier": "free",
        "count": 3,
        "reasoning_about": "DeepSeek reasoning distills; distinct lineage, reasoning-focused. Includes R1-Distill-Qwen-32B and R1-Distill-Llama-70B."
      },
      "google-gemma": {
        "cost_tier": "free",
        "count": 2,
        "reasoning_about": "Google DeepMind's Gemma family; Google lineage."
      },
      "ibm-granite": {
        "cost_tier": "free",
        "count": 2,
        "reasoning_about": "IBM's Granite family; enterprise-tuned. 3.3 8B + 4.1 8B (hybrid Mamba/Transformer)."
      },
      "meta-llama": {
        "cost_tier": "free",
        "count": 3,
        "reasoning_about": "Meta's Llama family; foundation of many finetunes. 3.1 8B + 3.1 70B + 3.3 70B."
      },
      "microsoft-phi": {
        "cost_tier": "free",
        "count": 1,
        "reasoning_about": "Microsoft Phi family \u2014 dense, reasoning-tuned. AIME-optimized small models."
      },
      "mistral-ai": {
        "cost_tier": "free",
        "count": 1,
        "reasoning_about": "Mistral AI European lineage; Apache-licensed dense models."
      },
      "nvidia-nemotron": {
        "cost_tier": "free",
        "count": 1,
        "reasoning_about": "NVIDIA Nemotron family \u2014 math-reasoning specialist; reasoning-on/off toggle."
      },
      "openai-gpt-oss": {
        "cost_tier": "paid",
        "count": 1,
        "reasoning_about": "OpenAI GPT-OSS lineage; runs via Ollama Turbo cloud."
      },
      "zhipu-glm": {
        "cost_tier": "free",
        "count": 1,
        "reasoning_about": "Zhipu AI / Z.AI GLM family \u2014 Chinese-lab agentic-MoE focus, MIT-licensed."
      }
    },
    "models": [
      {
        "cost_tier": "free",
        "family": "google-gemma",
        "model": "gemma4:latest",
        "notes": "Production default for first-pass drafts. 83% accept rate over 16 benchmark rounds; best-in-class summarization; 99% on JSON extraction. Keystone of the current routing stack \u2014 do not remove without a replacement.",
        "roles": [
          "draft"
        ],
        "status": {
          "draft": "incumbent"
        }
      },
      {
        "cost_tier": "free",
        "family": "google-gemma",
        "model": "gemma4:26b",
        "notes": "Security code-review specialist. Round 10 telemetry showed it catches classes 8B misses (e.g., path traversal). 26B verify-reject on sec-review or code-review tasks triggers Ollama Turbo retry per CLAUDE.md turbo-escalation rule.",
        "roles": [
          "review"
        ],
        "status": {
          "review": "incumbent"
        }
      },
      {
        "cost_tier": "free",
        "family": "alibaba-qwen",
        "model": "qwen2.5-coder:14b",
        "notes": "Coder-slot incumbent. Strong multi-file repo edit + test generation. Different training lineage from gemma family \u2014 required for /ask panel diversity (memory #895).",
        "roles": [
          "coder"
        ],
        "status": {
          "coder": "incumbent"
        }
      },
      {
        "cost_tier": "free",
        "family": "deepseek",
        "model": "deepseek-r1:14b",
        "notes": "Reasoning-distilled model for second-opinion logic tasks. Different family from gemma/qwen \u2014 adds panel diversity for /ask. Distilled from R1 reasoning traces; closest local analog to frontier reasoning.",
        "roles": [
          "reason"
        ],
        "status": {
          "reason": "incumbent"
        }
      },
      {
        "cost_tier": "free",
        "family": "meta-llama",
        "model": "llama3.1:8b",
        "notes": "Draft-slot challenger. Meta lineage \u2014 would add panel-diversity if promoted over gemma4:latest. Currently no defined job distinct from the incumbent; benchmark data will decide whether it earns a slot.",
        "roles": [
          "draft"
        ],
        "status": {
          "draft": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "ibm-granite",
        "model": "granite3.3:8b",
        "notes": "Draft-slot challenger from IBM lineage. Currently overlaps with llama3.1:8b \u2014 if one wins, the other is removable per Codex's model-stack cleanup.",
        "roles": [
          "draft"
        ],
        "status": {
          "draft": "challenger"
        }
      },
      {
        "cost_tier": "paid",
        "family": "openai-gpt-oss",
        "model": "gpt-oss:120b-cloud",
        "notes": "Ollama Turbo paid tier ($20/mo flat, activated 2026-04-13). Retry path when gemma4:26b rejects on code-review or security-review tasks per CLAUDE.md turbo-escalation rule. 2026-04-28 re-evaluation date \u2014 telemetry from cases #5 and #7 (turbo_escalation: true) informs the keep/cut decision.",
        "roles": [
          "turbo-escalation"
        ],
        "status": {
          "turbo-escalation": "incumbent"
        }
      },
      {
        "cost_tier": "free",
        "family": "alibaba-qwen",
        "model": "qwen2.5-coder:32b",
        "notes": "Larger sibling of qwen2.5-coder:14b. Q4_K_M ~20 GB, fits in 24 GB VRAM on 3090. Added 2026-04-27 for 3090-alone vs Main-AI-Rig dual-GPU baseline comparison. Not in any role yet \u2014 pure benchmark target.",
        "roles": [],
        "status": {}
      },
      {
        "cost_tier": "free",
        "family": "meta-llama",
        "model": "llama3.1:70b",
        "notes": "70B model at Q4_K_M ~40 GB; cannot fit in 24 GB single-GPU VRAM, will partial-CPU-offload on the 3090-alone seller PC. Added 2026-04-27 to establish a slow 3090-alone baseline before Main AI Rig dual-GPU build (~2026-05-07) where 40 GB combined VRAM can run it fully on-GPU. Phanteks Scenario A test subject \u2014 Tier-3-protected through Scenario A FINAL_COMPARISON.md ship + llama3.3:70b head-to-head.",
        "roles": [],
        "status": {}
      },
      {
        "cost_tier": "free",
        "family": "alibaba-qwen",
        "model": "qwen3-coder:30b",
        "notes": "Phase B candidate added 2026-05-03 (epic: Local Model Stack Refresh). MoE 30B total / ~3B active. Ollama page: 19 GB on disk. License: apache-2.0 (verified 2026-05-03 from huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct YAML frontmatter). Headline: Qwen3-32B beat Qwen2.5-Coder-32B on LiveCodeBench, AIME 2024 81.4, Arena Hard 93.8. Coder-slot challenger to qwen2.5-coder:14b. Tier 1+2 candidate; Tier 3 warm-load swap (vs 3090-warm qwen2.5-coder:32b) requires Kevin sign-off post-Phanteks-B/C.",
        "pull_tag": "qwen3-coder:30b",
        "roles": [
          "coder"
        ],
        "source": "ollama_library",
        "status": {
          "coder": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "microsoft-phi",
        "model": "phi4-reasoning:plus",
        "notes": "Phase B candidate added 2026-05-03. 11 GB on disk per ollama.com/library/phi4-reasoning. License: mit (verified 2026-05-03 from huggingface.co/microsoft/Phi-4-reasoning-plus YAML frontmatter). Headline: AIME 2024 81.3% \u2014 beats DeepSeek-R1-Distill-Llama-70B (69.3%) at 5x smaller. NEW family microsoft-phi adds diversity to reason slot (currently deepseek-only). Reason-slot challenger to deepseek-r1:14b. Benchmark shakedown (2026-05-04 UTC): 1/2 pass tied with deepseek-r1:14b; both failed inconsistency_reasoning_handoffs (regex_match). phi4-reasoning-plus was significantly slower (125854ms vs 41362ms at n=1). Verdict: HOLD (registered, not promoted). Caveat: test pack lacks AIME-style hard math, phi4's headline strength \u2014 see Phase A item 8 (queued case ideas) if differentiation is needed. See reports/local-model-benchmark-2026-05-04-phi4-reasoning-shakedown.md.",
        "pull_tag": "phi4-reasoning:plus",
        "roles": [
          "reason"
        ],
        "source": "ollama_library",
        "status": {
          "reason": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "deepseek",
        "model": "deepseek-r1:32b",
        "notes": "Phase B candidate added 2026-05-03. 20 GB on disk per ollama.com/library/deepseek-r1. License: Apache 2.0 (Qwen2.5 base) + MIT (R1 finetune). Headline: MMLU 90.8, AIME ~72%, outperforms o1-mini. Review-slot challenger to gemma4:26b \u2014 adds deepseek diversity to review slot (currently google-gemma-only). If wins Tier 1+2: turbo-escalation contract update REQUIRED before alias edit (rename 26B\u2192<deepseek-alias>). Tier 3 warm-load swap (4080S-warm gemma4:26b) is separate Kevin sign-off.",
        "pull_tag": "deepseek-r1:32b",
        "roles": [
          "review"
        ],
        "source": "ollama_library",
        "status": {
          "review": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "ibm-granite",
        "model": "hf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M",
        "notes": "Phase B candidate added 2026-05-03. HF-direct GGUF (Granite 4.1 not on Ollama library yet \u2014 only 4.0 family). License: Apache 2.0. 5.3 GB on disk Q4_K_M. Headline (per IBM Research): Granite 4.1 8B matches/beats Granite 4.0 32B-MoE; BFCL V3 tool calling 68.3 vs 64.7. Draft-slot challenger to granite3.3:8b (same family one-for-one swap). HF URL HEAD-checked 200 OK 2026-05-03. SHAKEDOWN RESULT (2026-05-03 session, UTC 2026-05-04): 2/3 pass tied with granite3.3:8b, both failed multi_file_summary_closeout as loud-wrong; granite4.1 ran ~2.4x slower aggregate at n=1 (mean 5884ms vs 2484ms). Verdict: HOLD \u2014 registered but not promoted, no alias/role change. Pipeline value validated (HF-direct pull + harness + render all worked). See reports/local-model-benchmark-2026-05-03-granite-shakedown.md.",
        "pull_tag": "ollama pull hf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M",
        "roles": [
          "draft"
        ],
        "source": "hf_gguf",
        "source_url": "https://huggingface.co/unsloth/granite-4.1-8b-GGUF",
        "status": {
          "draft": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "mistral-ai",
        "model": "mistral-small3.1:24b",
        "notes": "Phase B candidate added 2026-05-03. 15 GB on disk per ollama.com/library/mistral-small3.1. License: Apache 2.0 (confirmed on Ollama page). Headline: MMLU 81%+, ~3x faster than Llama 3.3 70B at same hardware, knowledge-dense. NEW family mistral-ai \u2014 adds Mistral lineage to council. Replaces mistral:7b (already in explicitly_excluded). 24B is large for the draft role \u2014 may motivate a future medium-general role; benchmark first.",
        "pull_tag": "mistral-small3.1:24b",
        "roles": [
          "draft"
        ],
        "source": "ollama_library",
        "status": {
          "draft": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "zhipu-glm",
        "model": "hf.co/unsloth/GLM-4.7-Flash-GGUF:Q4_K_M",
        "notes": "Phase B optional candidate (#6) added 2026-05-03. HF-direct GGUF (not on Ollama library at this writing). License: MIT. ~17 GB Q4 (MoE 30B-A3B). Headline: SWE-bench Verified 59.2 vs Qwen3-30B 22, GPT-OSS-20B 34 \u2014 agentic SOTA in 30B class. NEW family zhipu-glm \u2014 fixes coder slot's alibaba-qwen-only family-diversity blocker for any future coder promotion. HF URL HEAD-checked 200 OK 2026-05-03. Higher quant (Q5/Q6) reserved for Phase C 40GB rig.",
        "pull_tag": "ollama pull hf.co/unsloth/GLM-4.7-Flash-GGUF:Q4_K_M",
        "roles": [
          "coder"
        ],
        "source": "hf_gguf",
        "source_url": "https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF",
        "status": {
          "coder": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "meta-llama",
        "model": "llama3.3:70b",
        "notes": "Phase C candidate added 2026-05-03 (post-Phanteks-A/B/C only). 43 GB on disk per ollama.com/library/llama3.3. License: Llama 3.3 Community License. Headline: MATH 77.0 (vs 3.1's 67.8), IFEval 92.1 \u2014 beats Llama 3.1 405B (88.6), MGSM 91.1, GPQA Diamond 50.5. Comparator for Phanteks Scenario A llama3.1:70b retest. llama3.1:70b retires only after (a) Scenario A FINAL_COMPARISON.md ships AND (b) 3.3 wins head-to-head on existing 8 cases.",
        "pull_tag": "llama3.3:70b",
        "roles": [],
        "source": "ollama_library",
        "status": {}
      },
      {
        "cost_tier": "free",
        "family": "deepseek",
        "model": "deepseek-r1:70b",
        "notes": "Phase C candidate added 2026-05-03 (post-Phanteks-A/B/C only). 43 GB on disk per ollama.com/library/deepseek-r1. License: Llama 3.3 Community License (base) + MIT (R1 finetune). Headline: Distilled from full DeepSeek-R1 800k samples onto Llama 3.3 70B; AIME / MATH-500 / LiveCodeBench SOTA for distill class. Reason-slot flagship-tier challenger above Phase B's deepseek-r1:32b. Diversity already covered by 32B addition.",
        "pull_tag": "deepseek-r1:70b",
        "roles": [
          "reason"
        ],
        "source": "ollama_library",
        "status": {
          "reason": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "alibaba-qwen",
        "gguf_url": "https://huggingface.co/unsloth/Qwen3.6-27B-GGUF",
        "model": "hf.co/unsloth/Qwen3.6-27B-GGUF:Q4_K_M",
        "notes": "Phase C candidate (substitute for nonexistent Qwen3-72B) added 2026-05-03. Qwen3 jumps 32B \u2192 235B; no 72B variant ever shipped. Qwen3.6-27B is Qwen team's 2026 dense flagship \u2014 primary-source verified 2026-05-03 (Qwen HF card + unsloth GGUF). License: Apache 2.0. ~17 GB Q4. Headline (Qwen HF card): SWE-bench Verified 77.2, SWE-bench Pro 53.5, AIME 2026 94.1, GPQA Diamond 87.8, MMLU-Pro 86.2. 262K native context. Both URLs HEAD-checked 200 OK 2026-05-03. Could be pulled forward to Phase B given 24GB Q4 fit. Not yet in any role \u2014 strongest coder/general candidate registered to date.",
        "pull_tag": "ollama pull hf.co/unsloth/Qwen3.6-27B-GGUF:Q4_K_M",
        "roles": [],
        "source": "hf_gguf",
        "source_url": "https://huggingface.co/Qwen/Qwen3.6-27B",
        "status": {}
      },
      {
        "cost_tier": "free",
        "family": "nvidia-nemotron",
        "model": "hf.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF:Q4_K_M",
        "notes": "Phase C candidate added 2026-05-03 (post-Phanteks-A/B/C only). HF-direct text-only base variant (Omni multimodal-reasoning variant explicitly NOT chosen \u2014 text workflow). License: NVIDIA Open Model License. ~17 GB Q4 (MoE 30B-A3B). Headline: AIME 2025 89.1% no tools / 99.2% with tools \u2014 math leader for 30B class. NEW family nvidia-nemotron \u2014 adds NVIDIA lineage to reason slot. HF URL HEAD-checked 200 OK 2026-05-03.",
        "pull_tag": "ollama pull hf.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF:Q4_K_M",
        "roles": [
          "reason"
        ],
        "source": "hf_gguf",
        "source_url": "https://huggingface.co/unsloth/Nemotron-3-Nano-30B-A3B-GGUF",
        "status": {
          "reason": "challenger"
        }
      },
      {
        "cost_tier": "free",
        "family": "alibaba-qwen",
        "gguf_url": "https://huggingface.co/unsloth/Qwen3.6-35B-A3B-GGUF",
        "model": "hf.co/unsloth/Qwen3.6-35B-A3B-GGUF:Q4_K_M",
        "notes": "Phase C exploratory candidate added 2026-05-03 (post-Phanteks-A/B/C only). Registered for the dense-vs-MoE Qwen mini-ladder run (qwen2.5-coder:32b vs qwen3-coder:30b vs Qwen3.6-27B vs Qwen3.6-35B-A3B). License: Apache 2.0 (verified Qwen HF card + unsloth GGUF). 35B total / 3B active MoE. Headline (Qwen HF card): SWE-bench Verified 73.4 (slightly below 27B-dense 77.2), AIME 2026 92.7, MMLU-Pro 85.2, 262K native ctx. Estimated ~21 GB Q4. Both URLs verified by Kevin 2026-05-03 \u2014 Unsloth GGUF exists. Does not block or alter Phanteks A/B/C \u2014 pure registration metadata.",
        "pull_tag": "ollama pull hf.co/unsloth/Qwen3.6-35B-A3B-GGUF:Q4_K_M",
        "roles": [],
        "source": "hf_gguf",
        "source_url": "https://huggingface.co/Qwen/Qwen3.6-35B-A3B",
        "status": {}
      }
    ],
    "notes_for_runner": "Role-indexed registry consumed by scripts/benchmark_local_models.py (Epic 1 child #1076). `roles` gives the incumbent/challenger promotion mapping per slot. `models` gives full per-model metadata. A model's `family` enforces /ask panel diversity on promotion (memory #895 \u2014 no single-family panels). `cost_tier` gates the turbo-escalation audit at 2026-04-28.",
    "pack_updated": "2026-04-21",
    "promotion_policy": {
      "audit_cadence": "After every benchmark run; summary in reports/local-model-benchmark-YYYY-MM-DD.md.",
      "family_diversity_guard": "If the proposed promotion would leave a role served by only one `family` value across incumbent + challengers, reject regardless of other metrics (memory #895).",
      "rule": "A challenger may replace an incumbent only if it beats on: acceptance_rate (\u2265 incumbent), review_pain (\u2264 incumbent), hard_failure_rate (\u2264 incumbent), cost_tier (free unless incumbent is also paid), AND preserves family_diversity (promotion must NOT leave the role single-family)."
    },
    "roles": {
      "coder": {
        "case_slugs": [
          "code_edit_password_compat",
          "regex_oneliner_jq_pin"
        ],
        "challengers": [],
        "incumbent": "qwen2.5-coder:14b"
      },
      "draft": {
        "case_slugs": [
          "strict_json_idea_tagger",
          "copy_drafting_project_card",
          "multi_file_summary_closeout"
        ],
        "challengers": [
          "llama3.1:8b",
          "granite3.3:8b"
        ],
        "incumbent": "gemma4:latest"
      },
      "reason": {
        "case_slugs": [
          "bug_diag_autoseed_lock",
          "inconsistency_reasoning_handoffs"
        ],
        "challengers": [],
        "incumbent": "deepseek-r1:14b"
      },
      "review": {
        "case_slugs": [
          "security_review_audit_headers"
        ],
        "challengers": [],
        "incumbent": "gemma4:26b"
      }
    },
    "turbo_escalation": {
      "budget_note": "$20/mo flat; activated 2026-04-13. Keep decision at re-eval depends on acceptance rate on cases with turbo_escalation: true (currently: security_review_audit_headers, inconsistency_reasoning_handoffs).",
      "from_role": "review",
      "reevaluation_date": "2026-04-28",
      "retry_model": "gpt-oss:120b-cloud",
      "telemetry_tag": "turbo-escalation",
      "trigger_condition": "26B verify-reject on code-review or security-review task"
    },
    "version": "1.0"
  },
  "role_rows": [
    {
      "case_slugs": [
        "strict_json_idea_tagger",
        "copy_drafting_project_card",
        "multi_file_summary_closeout"
      ],
      "challengers": [
        {
          "cost_tier": "free",
          "family": "meta-llama",
          "model": "llama3.1:8b"
        },
        {
          "cost_tier": "free",
          "family": "ibm-granite",
          "model": "granite3.3:8b"
        }
      ],
      "incumbent": {
        "cost_tier": "free",
        "family": "google-gemma",
        "model": "gemma4:latest"
      },
      "role": "draft"
    },
    {
      "case_slugs": [
        "bug_diag_autoseed_lock",
        "inconsistency_reasoning_handoffs"
      ],
      "challengers": [],
      "incumbent": {
        "cost_tier": "free",
        "family": "deepseek",
        "model": "deepseek-r1:14b"
      },
      "role": "reason"
    },
    {
      "case_slugs": [
        "security_review_audit_headers"
      ],
      "challengers": [],
      "incumbent": {
        "cost_tier": "free",
        "family": "google-gemma",
        "model": "gemma4:26b"
      },
      "role": "review"
    },
    {
      "case_slugs": [
        "code_edit_password_compat",
        "regex_oneliner_jq_pin"
      ],
      "challengers": [],
      "incumbent": {
        "cost_tier": "free",
        "family": "alibaba-qwen",
        "model": "qwen2.5-coder:14b"
      },
      "role": "coder"
    }
  ],
  "rows": [
    {
      "cases": 3,
      "cost_tier": "free",
      "family": "ibm-granite",
      "hard_fail": 1,
      "median_elapsed_ms": 555.0,
      "model": "granite3.3:8b",
      "parse_fail": 0,
      "pass": 2,
      "pass_rate": 0.6666666666666666,
      "reps": 3,
      "roles": [
        "draft"
      ]
    },
    {
      "cases": 3,
      "cost_tier": "free",
      "family": "ibm-granite",
      "hard_fail": 1,
      "median_elapsed_ms": 800.6,
      "model": "hf.co/unsloth/granite-4.1-8b-GGUF:Q4_K_M",
      "parse_fail": 0,
      "pass": 2,
      "pass_rate": 0.6666666666666666,
      "reps": 3,
      "roles": [
        "draft"
      ]
    }
  ],
  "scoring_version": "v1"
}
