Add human-readable descriptions to per-question pass rates - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit 4d2226787818ffd5455c35bf72eeef923ae3a7ce
parent 5618e59897d0dae1dd10f47a1d8e147054069a9d
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon, 23 Mar 2026 10:09:30 +0100

Add human-readable descriptions to per-question pass rates

Each question bar now shows a concise description instead of the raw
snake_case field name. Descriptions are pre-computed in the build script
(67 questions mapped). E.g., "self_comparison_bias_addressed" becomes
"Self-evaluation bias acknowledged".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/views/findings.ts  | 10 ++++++----
M scripts/build-explorer-data.py  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--

2 files changed, 82 insertions(+), 6 deletions(-)
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -67,18 +67,20 @@ function renderQuestionRates(f: Findings): string {
     <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">${sorted.length} questions across 14 categories. Sorted by pass rate, worst first.</p>
     <h3 style="font-size:0.85rem;color:var(--red);margin-bottom:0.5rem">Worst 20</h3>
     ${worst20.map(([key, d]) => {
-      const [cat, q] = key.split('.');
+      const [cat] = key.split('.');
+      const desc = (d as any).desc || formatName(key.split('.')[1]);
       const color = d.rate < 10 ? 'var(--red)' : d.rate < 30 ? 'var(--yellow)' : 'var(--accent)';
       return `<div class="hbar">
-        <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+        <div class="hbar-label"><span>${desc} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
         <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div>
       </div>`;
     }).join('')}
     <h3 style="font-size:0.85rem;color:var(--green);margin:1rem 0 0.5rem">Best 10</h3>
     ${best10.map(([key, d]) => {
-      const [cat, q] = key.split('.');
+      const [cat] = key.split('.');
+      const desc = (d as any).desc || formatName(key.split('.')[1]);
       return `<div class="hbar">
-        <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+        <div class="hbar-label"><span>${desc} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
         <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:var(--green)"></div></div>
       </div>`;
     }).join('')}
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -480,11 +480,85 @@ def build():
 
     # --- Findings aggregations ---
 
-    # 1. Per-question pass rates
+    # 1. Per-question pass rates (with human-readable descriptions)
+    Q_DESCRIPTIONS = {
+        "artifacts.code_released": "Source code publicly released",
+        "artifacts.data_released": "Dataset publicly available",
+        "artifacts.environment_specified": "Environment/dependency specs provided",
+        "artifacts.reproduction_instructions": "Step-by-step reproduction instructions included",
+        "statistical_methodology.confidence_intervals_or_error_bars": "Confidence intervals or error bars on main results",
+        "statistical_methodology.significance_tests": "Statistical significance tests for comparative claims",
+        "statistical_methodology.effect_sizes_reported": "Effect sizes reported, not just p-values",
+        "statistical_methodology.sample_size_justified": "Sample size justified or power analysis discussed",
+        "statistical_methodology.variance_reported": "Variance or std dev reported across runs",
+        "evaluation_design.baselines_included": "Baseline comparisons included",
+        "evaluation_design.baselines_contemporary": "Baselines are contemporary and competitive",
+        "evaluation_design.ablation_study": "Ablation study showing which components matter",
+        "evaluation_design.multiple_metrics": "Multiple evaluation metrics used",
+        "evaluation_design.human_evaluation": "Human evaluation included, not just automated",
+        "evaluation_design.held_out_test_set": "Results on held-out test set, not dev/val",
+        "evaluation_design.per_category_breakdown": "Per-category or per-task breakdowns provided",
+        "evaluation_design.failure_cases_discussed": "Failure cases shown or discussed",
+        "evaluation_design.negative_results_reported": "Negative results reported",
+        "claims_and_evidence.abstract_claims_supported": "All abstract claims supported by results",
+        "claims_and_evidence.causal_claims_justified": "Causal claims backed by adequate study design",
+        "claims_and_evidence.generalization_bounded": "Generalizations bounded to tested setting",
+        "claims_and_evidence.alternative_explanations_discussed": "Alternative explanations discussed",
+        "claims_and_evidence.proxy_outcome_distinction": "Proxy vs outcome distinction acknowledged",
+        "setup_transparency.model_versions_specified": "Exact model versions specified",
+        "setup_transparency.prompts_provided": "Actual prompts/system instructions provided",
+        "setup_transparency.hyperparameters_reported": "Hyperparameters reported (temperature, etc.)",
+        "setup_transparency.scaffolding_described": "Agentic scaffolding described in detail",
+        "setup_transparency.data_preprocessing_documented": "Data preprocessing steps documented",
+        "limitations_and_scope.limitations_section_present": "Dedicated limitations section present",
+        "limitations_and_scope.threats_to_validity_specific": "Specific threats to validity discussed",
+        "limitations_and_scope.scope_boundaries_stated": "Explicit scope boundaries stated",
+        "data_integrity.raw_data_available": "Raw data available for verification",
+        "data_integrity.data_collection_described": "Data collection procedure described",
+        "data_integrity.recruitment_methods_described": "Participant/sample recruitment described",
+        "data_integrity.data_pipeline_documented": "Full data pipeline documented",
+        "conflicts_of_interest.funding_disclosed": "Funding source disclosed",
+        "conflicts_of_interest.affiliations_disclosed": "Author affiliations with evaluated product disclosed",
+        "conflicts_of_interest.funder_independent_of_outcome": "Funder independent of outcome",
+        "conflicts_of_interest.financial_interests_declared": "Financial interests declared",
+        "contamination.training_cutoff_stated": "Model training data cutoff stated",
+        "contamination.train_test_overlap_discussed": "Train/test overlap discussed",
+        "contamination.benchmark_contamination_addressed": "Benchmark contamination addressed",
+        "human_studies.pre_registered": "Study pre-registered",
+        "human_studies.irb_or_ethics_approval": "IRB or ethics approval mentioned",
+        "human_studies.demographics_reported": "Participant demographics reported",
+        "human_studies.inclusion_exclusion_criteria": "Inclusion/exclusion criteria stated",
+        "human_studies.randomization_described": "Randomization procedure described",
+        "human_studies.blinding_described": "Blinding described",
+        "human_studies.attrition_reported": "Participant attrition reported",
+        "cost_and_practicality.inference_cost_reported": "Inference cost or latency reported",
+        "cost_and_practicality.compute_budget_stated": "Total computational budget stated",
+        "experimental_rigor.seed_sensitivity_reported": "Results across multiple random seeds",
+        "experimental_rigor.number_of_runs_stated": "Number of experimental runs stated",
+        "experimental_rigor.hyperparameter_search_budget": "Hyperparameter search budget reported",
+        "experimental_rigor.best_config_selection_justified": "Best config selection justified",
+        "experimental_rigor.multiple_comparison_correction": "Multiple comparison correction applied",
+        "experimental_rigor.self_comparison_bias_addressed": "Self-evaluation bias acknowledged",
+        "experimental_rigor.compute_budget_vs_performance": "Performance reported vs compute budget",
+        "experimental_rigor.benchmark_construct_validity": "Benchmark construct validity discussed",
+        "experimental_rigor.scaffold_confound_addressed": "Scaffolding confound addressed",
+        "data_leakage.temporal_leakage_addressed": "Temporal leakage addressed",
+        "data_leakage.feature_leakage_addressed": "Feature leakage addressed",
+        "data_leakage.non_independence_addressed": "Train/test non-independence addressed",
+        "data_leakage.leakage_detection_method": "Concrete leakage detection method used",
+        "survey_methodology.prisma_or_structured_protocol": "PRISMA or structured review protocol",
+        "survey_methodology.quality_assessment_of_sources": "Quality assessment of source papers",
+        "survey_methodology.publication_bias_discussed": "Publication bias discussed",
+    }
+
     q_rates = {}
     for key, d in question_pass_counts.items():
         if d["applicable"] > 0:
-            q_rates[key] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]}
+            q_rates[key] = {
+                "rate": round(d["passed"] / d["applicable"] * 100, 1),
+                "n": d["applicable"],
+                "desc": Q_DESCRIPTIONS.get(key, ""),
+            }
 
     # 2. Year trends by category
     year_cat_trends = {}

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs

M	explorer/src/views/findings.ts	\|	10	++++++----
M	scripts/build-explorer-data.py	\|	78	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--