calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (12784B)
      1 {
      2   "paper_slug": "agentic-ai-software-2025-2",
      3   "calibration_date": "2026-02-28",
      4   "calibration_model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 49,
      7   "disagreement_count": 1,
      8   "agreement_rate": 0.98,
      9   "disagreements": [
     10     {
     11       "category": "evaluation_design",
     12       "question": "failure_cases_discussed",
     13       "sonnet": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "The paper discusses challenges at a high level (developer hesitation, trust barriers) but does not present or discuss specific failure cases of AI systems with concrete examples."
     17       },
     18       "opus": {
     19         "applies": false,
     20         "answer": false,
     21         "justification": "This is a pure position/opinion paper with no system built or evaluated. Failure cases are structurally inapplicable — there is no approach whose failures could be examined. The high-level discussion of trust barriers is conceptual argumentation, not failure analysis of a system."
     22       },
     23       "direction": "applies_boundary",
     24       "note": "Sonnet treated failure_cases_discussed as applicable to a position paper on the grounds that the paper could discuss AI system failures. Opus judged it structurally inapplicable because the paper evaluates no system and presents no empirical results — consistent with the scan agent instructions that say 'Most empirical checklist items will have applies: false' for theoretical/position papers."
     25     }
     26   ],
     27   "opus_checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": false,
     31         "answer": false,
     32         "justification": "This is a 5-page opinion/position piece with no experiments, no implementation, and no analysis scripts. There is nothing to release."
     33       },
     34       "data_released": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "No dataset is collected, analyzed, or used. The paper is purely argumentative."
     38       },
     39       "environment_specified": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "No computational experiments are conducted. No environment to specify."
     43       },
     44       "reproduction_instructions": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No experiments are run. There is nothing to reproduce."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "This is a theoretical opinion piece with no empirical results or numerical measurements."
     55       },
     56       "significance_tests": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No comparative empirical claims are made that would require significance testing."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "No empirical results are reported; effect sizes are not applicable."
     65       },
     66       "sample_size_justified": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No samples or experimental participants are used; this is a theoretical paper."
     70       },
     71       "variance_reported": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No experimental runs are conducted; variance reporting is not applicable."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "This is a position paper with no experimental evaluation; baselines are not applicable."
     82       },
     83       "baselines_contemporary": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No evaluation is conducted; baseline contemporaneity is not applicable."
     87       },
     88       "ablation_study": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No system is built or evaluated; ablation studies are structurally inapplicable."
     92       },
     93       "multiple_metrics": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No evaluation metrics are reported in this position paper."
     97       },
     98       "human_evaluation": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "No system is evaluated; human evaluation is not applicable."
    102       },
    103       "held_out_test_set": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "No experiments or test sets exist in this position paper."
    107       },
    108       "per_category_breakdown": {
    109         "applies": false,
    110         "answer": false,
    111         "justification": "No empirical results to break down by category."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "This is a pure position/opinion paper with no system built or evaluated. Failure cases are structurally inapplicable — there is no approach whose failures could be examined. The scan agent instructions state that for theoretical/position papers, 'Most empirical checklist items will have applies: false.'"
    117       },
    118       "negative_results_reported": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "No experiments are run from which negative results could be reported."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract frames this as an opinion piece about LLM agents and trust in AI SE. The body provides conceptual discussion consistent with this framing. No empirical claims in the abstract are contradicted."
    129       },
    130       "causal_claims_justified": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "The paper makes normative arguments ('we argue that...') rather than empirical causal claims. Language like 'trust is strengthened when...' is prescriptive opinion, not measured causal effect."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper makes sweeping claims about 'AI software engineers' and 'future development workflows' without bounding these to specific systems, languages, organizational types, or settings. The title 'Agentic AI Software Engineers: Programming with Trust' implies broad applicability with no stated boundaries."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "The paper presents no empirical results. The schema specifies NA for 'papers that present no empirical results (e.g., pure surveys or taxonomies).' This is a pure opinion piece."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No models are used in experiments. Models are discussed conceptually only."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No prompting experiments are conducted."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No computational experiments are run."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No new agentic system is built or evaluated. The paper discusses existing systems conceptually but does not implement scaffolding."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No data is collected or preprocessed."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations or threats-to-validity section exists. The 5-page paper ends with a brief 'Outlook' section that does not address limitations of the arguments presented."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. The paper does not acknowledge any limitations of its conceptual arguments, such as the evidentiary basis for claiming trust is the key adoption barrier."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not state what its arguments do not cover. It speaks broadly about 'AI software engineers' and 'future development workflows' without bounding to specific domains, model types, or organizational contexts."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No data is collected. This is a pure opinion piece."
    195       },
    196       "data_collection_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No data collection is performed."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No participants or samples are recruited."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No data pipeline exists in this position paper."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding or acknowledgments section is present. The paper has no mention of grants or sponsorship."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are listed on the first page. The footnote additionally notes Roychoudhury is 'Senior Advisor at SonarSource.' Per schema, listing affiliations counts as YES even if the conflict is not explicitly acknowledged."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence cannot be assessed. Additionally, Roychoudhury's advisory role at SonarSource (which acquired AutoCodeRover, a system discussed favorably) creates a potential non-independent relationship that is not addressed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement exists. Roychoudhury is Senior Advisor at SonarSource, which acquired AutoCodeRover — a system prominently featured and described favorably in the paper — but this is not disclosed as a conflict of interest."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No pre-trained model is evaluated on a benchmark. This is a position paper."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No benchmark evaluation is conducted."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No benchmark evaluation is conducted."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved in this opinion piece."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects research is conducted."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "This is a theoretical opinion piece with no method or system; cost reporting is not applicable."
    293       },
    294       "compute_budget_stated": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "No computational experiments are run."
    298       }
    299     }
    300   }
    301 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs