calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (14076B)
      1 {
      2   "paper_slug": "agentic-ai-software-2025",
      3   "calibrator": "opus",
      4   "calibration_date": "2026-02-28",
      5   "total_questions": 50,
      6   "agreement_count": 48,
      7   "disagreement_count": 2,
      8   "agreement_rate": 0.96,
      9   "disagreements": [
     10     {
     11       "category": "artifacts",
     12       "question": "data_released",
     13       "sonnet": { "applies": true, "answer": true },
     14       "opus": { "applies": false, "answer": false },
     15       "direction": "applies_boundary",
     16       "explanation": "Sonnet credits the paper's header 'Datasets — SWE-bench' and the fact that SWE-bench is a publicly available benchmark. Opus judges that this is a position paper that does not actually use or produce any dataset — it merely discusses SWE-bench as context. The paper runs no experiments on SWE-bench. For a theoretical/position paper, data_released is structurally inapplicable since no data is used or produced by the paper itself."
     17     },
     18     {
     19       "category": "data_integrity",
     20       "question": "raw_data_available",
     21       "sonnet": { "applies": true, "answer": true },
     22       "opus": { "applies": false, "answer": false },
     23       "direction": "applies_boundary",
     24       "explanation": "Same reasoning as data_released. Sonnet credits SWE-bench as available raw data for verification. Opus judges that a position paper with no experiments produces no data to verify. The raw_data_available question asks whether underlying data can be independently verified — but there is no 'underlying data' in a position paper. SWE-bench is referenced, not used."
     25     }
     26   ],
     27   "opus_checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper explicitly states 'Code — N.A.' in the header. No repository or code archive is provided."
     33       },
     34       "data_released": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "This is a position/perspective paper that does not use or produce any dataset. The header lists 'Datasets — SWE-bench' but the paper runs no experiments on SWE-bench — it merely discusses the benchmark as context for the agentic AI landscape. For a theoretical/position paper, data release is structurally inapplicable."
     38       },
     39       "environment_specified": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "Position paper with no original experiments. No code to run and no environment to specify."
     43       },
     44       "reproduction_instructions": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "Position paper with no original experiments. Nothing to reproduce."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "Position paper with no quantitative experiments or statistical results."
     55       },
     56       "significance_tests": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No comparative statistical claims are made. The paper is a conceptual discussion without original empirical data."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "No quantitative effect measurements are reported. Position paper."
     65       },
     66       "sample_size_justified": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No data collection or sample is involved. Position paper."
     70       },
     71       "variance_reported": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No experimental runs are conducted. Position paper with no numerical results."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "Position paper. Discusses various systems (Devin, SWE-agent, OpenHands, AutoCodeRover) conceptually but conducts no comparative evaluation."
     82       },
     83       "baselines_contemporary": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No baseline comparison is conducted in this position paper."
     87       },
     88       "ablation_study": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No experiments are conducted. Ablation studies are not applicable to a position paper."
     92       },
     93       "multiple_metrics": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No empirical evaluation is conducted. Position paper."
     97       },
     98       "human_evaluation": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "No evaluation of system outputs is conducted. Position paper."
    102       },
    103       "held_out_test_set": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "No experiments are run. No datasets are collected or split."
    107       },
    108       "per_category_breakdown": {
    109         "applies": false,
    110         "answer": false,
    111         "justification": "No empirical evaluation is conducted that would yield per-category results."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper mentions overfitting in program repair and a general 'trust deficit' in AI-generated code, but does not present concrete failure cases of the systems discussed (AutoCodeRover, Devin, etc.). No systematic error analysis is provided."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "No negative results are reported. The paper advocates for the intent-inference approach and conducts no experiments that could produce negative results."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The abstract claims 'AI agents have recently shown significant promise in software engineering' but the paper provides no quantitative evidence — only conceptual discussion and citations to external work. No original results support this claim."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper makes implicit causal claims: working on program representations 'enhances trust', intent inference 'combats the overfitting problem.' These are asserted without controlled evidence within this paper."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title 'thoughts from Software Engineering community' implies community consensus, but the paper represents one researcher's perspective. Discussion of AutoCodeRover on SWE-bench is generalized to claims about agentic AI for software engineering broadly, without bounding these generalizations."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper advocates intent inference as the key approach without considering alternative explanations for AutoCodeRover's success or discussing why simpler text-based approaches (like those it contrasts against) might be adequate in some settings."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "Position paper with no LLM experiments. No model API calls are made."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "Position paper that does not run LLM experiments. No prompts are used."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No experiments are conducted. No hyperparameters are relevant."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "The paper describes AutoCodeRover's approach at a high conceptual level (code search, fault localization, patch generation) but lacks detail on tool interfaces, retry logic, feedback mechanisms, or context management. The scaffolding is described only at the level of a schematic (Figure 2) and brief prose."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No data is collected or preprocessed. Position paper."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations or threats-to-validity section exists. Challenges (trust deficit, vulnerability risks) are mentioned inline but not organized into a substantive limitations discussion."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No specific threats to validity are discussed. The paper does not acknowledge limitations of its argument, SWE-bench as an evaluation benchmark, or potential biases in advocating the intent-inference paradigm."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not clarify what it is not claiming or what settings the argument may not apply to."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "Position paper with no original data to verify. The paper references SWE-bench but does not use it in any experiment. There is no 'underlying data' produced by this paper that could be independently verified."
    195       },
    196       "data_collection_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No data collection is performed. Position paper."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No participants or samples are recruited. Position paper."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No data pipeline exists. Position paper."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding or acknowledgments section is present. No mention of grants, institutional support, or sponsors. The author is a university professor (not a solo independent researcher), so funding disclosure is expected."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The paper discloses the author's dual affiliation: 'Full-time involvement as Professor at NUS, while being Senior Advisor at SonarSource SA.' This is relevant given AutoCodeRover's integration into SonarQube (a SonarSource product)."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The author is a Senior Advisor at SonarSource SA, whose product SonarQube integrates AutoCodeRover, the system prominently promoted in this paper. This creates a non-independent commercial relationship. No independent funder is disclosed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement is provided. The SonarSource advisory role is disclosed as an affiliation but not explicitly framed as a financial interest or conflict of interest."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Position paper that does not benchmark any pre-trained model's capability. No model evaluation is conducted."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No benchmark evaluation of pre-trained models is conducted. Contamination is not applicable."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The paper references SWE-bench as an existing benchmark but does not run evaluations. Contamination is not applicable."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human subjects study is conducted. Position paper."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study is conducted. Position paper."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved. Position paper."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participant selection is involved. Position paper."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No experimental study with human participants. Position paper."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No experimental study with human participants. Position paper."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants involved. Position paper."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Position paper with no experiments. Cost reporting is not applicable."
    293       },
    294       "compute_budget_stated": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "Position paper with no computational experiments. Compute budget is not applicable."
    298       }
    299     }
    300   }
    301 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs