commit 4d2226787818ffd5455c35bf72eeef923ae3a7ce
parent 5618e59897d0dae1dd10f47a1d8e147054069a9d
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 23 Mar 2026 10:09:30 +0100
Add human-readable descriptions to per-question pass rates
Each question bar now shows a concise description instead of the raw
snake_case field name. Descriptions are pre-computed in the build script
(67 questions mapped). E.g., "self_comparison_bias_addressed" becomes
"Self-evaluation bias acknowledged".
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 82 insertions(+), 6 deletions(-)
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -67,18 +67,20 @@ function renderQuestionRates(f: Findings): string {
<p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">${sorted.length} questions across 14 categories. Sorted by pass rate, worst first.</p>
<h3 style="font-size:0.85rem;color:var(--red);margin-bottom:0.5rem">Worst 20</h3>
${worst20.map(([key, d]) => {
- const [cat, q] = key.split('.');
+ const [cat] = key.split('.');
+ const desc = (d as any).desc || formatName(key.split('.')[1]);
const color = d.rate < 10 ? 'var(--red)' : d.rate < 30 ? 'var(--yellow)' : 'var(--accent)';
return `<div class="hbar">
- <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+ <div class="hbar-label"><span>${desc} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
<div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div>
</div>`;
}).join('')}
<h3 style="font-size:0.85rem;color:var(--green);margin:1rem 0 0.5rem">Best 10</h3>
${best10.map(([key, d]) => {
- const [cat, q] = key.split('.');
+ const [cat] = key.split('.');
+ const desc = (d as any).desc || formatName(key.split('.')[1]);
return `<div class="hbar">
- <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+ <div class="hbar-label"><span>${desc} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
<div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:var(--green)"></div></div>
</div>`;
}).join('')}
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -480,11 +480,85 @@ def build():
# --- Findings aggregations ---
- # 1. Per-question pass rates
+ # 1. Per-question pass rates (with human-readable descriptions)
+ Q_DESCRIPTIONS = {
+ "artifacts.code_released": "Source code publicly released",
+ "artifacts.data_released": "Dataset publicly available",
+ "artifacts.environment_specified": "Environment/dependency specs provided",
+ "artifacts.reproduction_instructions": "Step-by-step reproduction instructions included",
+ "statistical_methodology.confidence_intervals_or_error_bars": "Confidence intervals or error bars on main results",
+ "statistical_methodology.significance_tests": "Statistical significance tests for comparative claims",
+ "statistical_methodology.effect_sizes_reported": "Effect sizes reported, not just p-values",
+ "statistical_methodology.sample_size_justified": "Sample size justified or power analysis discussed",
+ "statistical_methodology.variance_reported": "Variance or std dev reported across runs",
+ "evaluation_design.baselines_included": "Baseline comparisons included",
+ "evaluation_design.baselines_contemporary": "Baselines are contemporary and competitive",
+ "evaluation_design.ablation_study": "Ablation study showing which components matter",
+ "evaluation_design.multiple_metrics": "Multiple evaluation metrics used",
+ "evaluation_design.human_evaluation": "Human evaluation included, not just automated",
+ "evaluation_design.held_out_test_set": "Results on held-out test set, not dev/val",
+ "evaluation_design.per_category_breakdown": "Per-category or per-task breakdowns provided",
+ "evaluation_design.failure_cases_discussed": "Failure cases shown or discussed",
+ "evaluation_design.negative_results_reported": "Negative results reported",
+ "claims_and_evidence.abstract_claims_supported": "All abstract claims supported by results",
+ "claims_and_evidence.causal_claims_justified": "Causal claims backed by adequate study design",
+ "claims_and_evidence.generalization_bounded": "Generalizations bounded to tested setting",
+ "claims_and_evidence.alternative_explanations_discussed": "Alternative explanations discussed",
+ "claims_and_evidence.proxy_outcome_distinction": "Proxy vs outcome distinction acknowledged",
+ "setup_transparency.model_versions_specified": "Exact model versions specified",
+ "setup_transparency.prompts_provided": "Actual prompts/system instructions provided",
+ "setup_transparency.hyperparameters_reported": "Hyperparameters reported (temperature, etc.)",
+ "setup_transparency.scaffolding_described": "Agentic scaffolding described in detail",
+ "setup_transparency.data_preprocessing_documented": "Data preprocessing steps documented",
+ "limitations_and_scope.limitations_section_present": "Dedicated limitations section present",
+ "limitations_and_scope.threats_to_validity_specific": "Specific threats to validity discussed",
+ "limitations_and_scope.scope_boundaries_stated": "Explicit scope boundaries stated",
+ "data_integrity.raw_data_available": "Raw data available for verification",
+ "data_integrity.data_collection_described": "Data collection procedure described",
+ "data_integrity.recruitment_methods_described": "Participant/sample recruitment described",
+ "data_integrity.data_pipeline_documented": "Full data pipeline documented",
+ "conflicts_of_interest.funding_disclosed": "Funding source disclosed",
+ "conflicts_of_interest.affiliations_disclosed": "Author affiliations with evaluated product disclosed",
+ "conflicts_of_interest.funder_independent_of_outcome": "Funder independent of outcome",
+ "conflicts_of_interest.financial_interests_declared": "Financial interests declared",
+ "contamination.training_cutoff_stated": "Model training data cutoff stated",
+ "contamination.train_test_overlap_discussed": "Train/test overlap discussed",
+ "contamination.benchmark_contamination_addressed": "Benchmark contamination addressed",
+ "human_studies.pre_registered": "Study pre-registered",
+ "human_studies.irb_or_ethics_approval": "IRB or ethics approval mentioned",
+ "human_studies.demographics_reported": "Participant demographics reported",
+ "human_studies.inclusion_exclusion_criteria": "Inclusion/exclusion criteria stated",
+ "human_studies.randomization_described": "Randomization procedure described",
+ "human_studies.blinding_described": "Blinding described",
+ "human_studies.attrition_reported": "Participant attrition reported",
+ "cost_and_practicality.inference_cost_reported": "Inference cost or latency reported",
+ "cost_and_practicality.compute_budget_stated": "Total computational budget stated",
+ "experimental_rigor.seed_sensitivity_reported": "Results across multiple random seeds",
+ "experimental_rigor.number_of_runs_stated": "Number of experimental runs stated",
+ "experimental_rigor.hyperparameter_search_budget": "Hyperparameter search budget reported",
+ "experimental_rigor.best_config_selection_justified": "Best config selection justified",
+ "experimental_rigor.multiple_comparison_correction": "Multiple comparison correction applied",
+ "experimental_rigor.self_comparison_bias_addressed": "Self-evaluation bias acknowledged",
+ "experimental_rigor.compute_budget_vs_performance": "Performance reported vs compute budget",
+ "experimental_rigor.benchmark_construct_validity": "Benchmark construct validity discussed",
+ "experimental_rigor.scaffold_confound_addressed": "Scaffolding confound addressed",
+ "data_leakage.temporal_leakage_addressed": "Temporal leakage addressed",
+ "data_leakage.feature_leakage_addressed": "Feature leakage addressed",
+ "data_leakage.non_independence_addressed": "Train/test non-independence addressed",
+ "data_leakage.leakage_detection_method": "Concrete leakage detection method used",
+ "survey_methodology.prisma_or_structured_protocol": "PRISMA or structured review protocol",
+ "survey_methodology.quality_assessment_of_sources": "Quality assessment of source papers",
+ "survey_methodology.publication_bias_discussed": "Publication bias discussed",
+ }
+
q_rates = {}
for key, d in question_pass_counts.items():
if d["applicable"] > 0:
- q_rates[key] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]}
+ q_rates[key] = {
+ "rate": round(d["passed"] / d["applicable"] * 100, 1),
+ "n": d["applicable"],
+ "desc": Q_DESCRIPTIONS.get(key, ""),
+ }
# 2. Year trends by category
year_cat_trends = {}