ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit 4d2226787818ffd5455c35bf72eeef923ae3a7ce
parent 5618e59897d0dae1dd10f47a1d8e147054069a9d
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon, 23 Mar 2026 10:09:30 +0100

Add human-readable descriptions to per-question pass rates

Each question bar now shows a concise description instead of the raw
snake_case field name. Descriptions are pre-computed in the build script
(67 questions mapped). E.g., "self_comparison_bias_addressed" becomes
"Self-evaluation bias acknowledged".

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/views/findings.ts | 10 ++++++----
M scripts/build-explorer-data.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts @@ -67,18 +67,20 @@ function renderQuestionRates(f: Findings): string { <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">${sorted.length} questions across 14 categories. Sorted by pass rate, worst first.</p> <h3 style="font-size:0.85rem;color:var(--red);margin-bottom:0.5rem">Worst 20</h3> ${worst20.map(([key, d]) => { - const [cat, q] = key.split('.'); + const [cat] = key.split('.'); + const desc = (d as any).desc || formatName(key.split('.')[1]); const color = d.rate < 10 ? 'var(--red)' : d.rate < 30 ? 'var(--yellow)' : 'var(--accent)'; return `<div class="hbar"> - <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-label"><span>${desc} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div> </div>`; }).join('')} <h3 style="font-size:0.85rem;color:var(--green);margin:1rem 0 0.5rem">Best 10</h3> ${best10.map(([key, d]) => { - const [cat, q] = key.split('.'); + const [cat] = key.split('.'); + const desc = (d as any).desc || formatName(key.split('.')[1]); return `<div class="hbar"> - <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-label"><span>${desc} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> <div class="hbar-track"><div class="hbar-fill" 
style="width:${d.rate}%;background:var(--green)"></div></div> </div>`; }).join('')} diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -480,11 +480,85 @@ def build(): # --- Findings aggregations --- - # 1. Per-question pass rates + # 1. Per-question pass rates (with human-readable descriptions) + Q_DESCRIPTIONS = { + "artifacts.code_released": "Source code publicly released", + "artifacts.data_released": "Dataset publicly available", + "artifacts.environment_specified": "Environment/dependency specs provided", + "artifacts.reproduction_instructions": "Step-by-step reproduction instructions included", + "statistical_methodology.confidence_intervals_or_error_bars": "Confidence intervals or error bars on main results", + "statistical_methodology.significance_tests": "Statistical significance tests for comparative claims", + "statistical_methodology.effect_sizes_reported": "Effect sizes reported, not just p-values", + "statistical_methodology.sample_size_justified": "Sample size justified or power analysis discussed", + "statistical_methodology.variance_reported": "Variance or std dev reported across runs", + "evaluation_design.baselines_included": "Baseline comparisons included", + "evaluation_design.baselines_contemporary": "Baselines are contemporary and competitive", + "evaluation_design.ablation_study": "Ablation study showing which components matter", + "evaluation_design.multiple_metrics": "Multiple evaluation metrics used", + "evaluation_design.human_evaluation": "Human evaluation included, not just automated", + "evaluation_design.held_out_test_set": "Results on held-out test set, not dev/val", + "evaluation_design.per_category_breakdown": "Per-category or per-task breakdowns provided", + "evaluation_design.failure_cases_discussed": "Failure cases shown or discussed", + "evaluation_design.negative_results_reported": "Negative results reported", + "claims_and_evidence.abstract_claims_supported": "All abstract claims supported by 
results", + "claims_and_evidence.causal_claims_justified": "Causal claims backed by adequate study design", + "claims_and_evidence.generalization_bounded": "Generalizations bounded to tested setting", + "claims_and_evidence.alternative_explanations_discussed": "Alternative explanations discussed", + "claims_and_evidence.proxy_outcome_distinction": "Proxy vs outcome distinction acknowledged", + "setup_transparency.model_versions_specified": "Exact model versions specified", + "setup_transparency.prompts_provided": "Actual prompts/system instructions provided", + "setup_transparency.hyperparameters_reported": "Hyperparameters reported (temperature, etc.)", + "setup_transparency.scaffolding_described": "Agentic scaffolding described in detail", + "setup_transparency.data_preprocessing_documented": "Data preprocessing steps documented", + "limitations_and_scope.limitations_section_present": "Dedicated limitations section present", + "limitations_and_scope.threats_to_validity_specific": "Specific threats to validity discussed", + "limitations_and_scope.scope_boundaries_stated": "Explicit scope boundaries stated", + "data_integrity.raw_data_available": "Raw data available for verification", + "data_integrity.data_collection_described": "Data collection procedure described", + "data_integrity.recruitment_methods_described": "Participant/sample recruitment described", + "data_integrity.data_pipeline_documented": "Full data pipeline documented", + "conflicts_of_interest.funding_disclosed": "Funding source disclosed", + "conflicts_of_interest.affiliations_disclosed": "Author affiliations with evaluated product disclosed", + "conflicts_of_interest.funder_independent_of_outcome": "Funder independent of outcome", + "conflicts_of_interest.financial_interests_declared": "Financial interests declared", + "contamination.training_cutoff_stated": "Model training data cutoff stated", + "contamination.train_test_overlap_discussed": "Train/test overlap discussed", + 
"contamination.benchmark_contamination_addressed": "Benchmark contamination addressed", + "human_studies.pre_registered": "Study pre-registered", + "human_studies.irb_or_ethics_approval": "IRB or ethics approval mentioned", + "human_studies.demographics_reported": "Participant demographics reported", + "human_studies.inclusion_exclusion_criteria": "Inclusion/exclusion criteria stated", + "human_studies.randomization_described": "Randomization procedure described", + "human_studies.blinding_described": "Blinding described", + "human_studies.attrition_reported": "Participant attrition reported", + "cost_and_practicality.inference_cost_reported": "Inference cost or latency reported", + "cost_and_practicality.compute_budget_stated": "Total computational budget stated", + "experimental_rigor.seed_sensitivity_reported": "Results across multiple random seeds", + "experimental_rigor.number_of_runs_stated": "Number of experimental runs stated", + "experimental_rigor.hyperparameter_search_budget": "Hyperparameter search budget reported", + "experimental_rigor.best_config_selection_justified": "Best config selection justified", + "experimental_rigor.multiple_comparison_correction": "Multiple comparison correction applied", + "experimental_rigor.self_comparison_bias_addressed": "Self-evaluation bias acknowledged", + "experimental_rigor.compute_budget_vs_performance": "Performance reported vs compute budget", + "experimental_rigor.benchmark_construct_validity": "Benchmark construct validity discussed", + "experimental_rigor.scaffold_confound_addressed": "Scaffolding confound addressed", + "data_leakage.temporal_leakage_addressed": "Temporal leakage addressed", + "data_leakage.feature_leakage_addressed": "Feature leakage addressed", + "data_leakage.non_independence_addressed": "Train/test non-independence addressed", + "data_leakage.leakage_detection_method": "Concrete leakage detection method used", + "survey_methodology.prisma_or_structured_protocol": "PRISMA or structured 
review protocol", + "survey_methodology.quality_assessment_of_sources": "Quality assessment of source papers", + "survey_methodology.publication_bias_discussed": "Publication bias discussed", + } + q_rates = {} for key, d in question_pass_counts.items(): if d["applicable"] > 0: - q_rates[key] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]} + q_rates[key] = { + "rate": round(d["passed"] / d["applicable"] * 100, 1), + "n": d["applicable"], + "desc": Q_DESCRIPTIONS.get(key, ""), + } # 2. Year trends by category year_cat_trends = {}

Impressum · Datenschutz