calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (14109B)
      1 {
      2   "calibration_metadata": {
      3     "paper_slug": "agentic-software-engineering-2025",
      4     "calibration_date": "2026-02-28",
      5     "sonnet_scan_date": null,
      6     "opus_model": "claude-opus-4-6",
      7     "agreement_rate": 1.0,
      8     "total_questions": 50,
      9     "agreements": 50,
     10     "disagreements": 0
     11   },
     12   "opus_checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No code is released. The paper is a position/vision paper proposing a conceptual framework (CRAFT values, 'whole of process' vision, vocabulary guidance) with no implementation or computational artifacts. However, applies=true because the authors could have released supplementary materials such as a structured dataset of existing frameworks surveyed."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No dataset is released. The paper references existing datasets (e.g., AIDev by Li et al.) but produces no new data and releases none. A structured compilation of the cited frameworks and visions could have been released."
     23       },
     24       "environment_specified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "No software environment is involved. This is a purely conceptual position paper with no experiments, code, or computational components."
     28       },
     29       "reproduction_instructions": {
     30         "applies": false,
     31         "answer": false,
     32         "justification": "Nothing to reproduce computationally. The paper proposes a vision, values, and vocabulary guidance with no experiments or computational artifacts."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "No empirical results are presented. This is a theoretical/position paper with no quantitative measurements."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No comparative empirical claims requiring significance tests. The paper makes no quantitative comparisons."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No empirical results to report effect sizes for. This is a position paper."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No sample of participants or data points is used. This is a theoretical paper."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No quantitative measurements are made across runs or trials. Not applicable to a theoretical/vision paper."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "This is a position/vision paper with no empirical evaluation. There are no experimental results to compare against baselines."
     67       },
     68       "baselines_contemporary": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "No baselines exist in a position paper proposing a conceptual framework. Not applicable."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No system with components is implemented or evaluated. Not applicable to a vision paper."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No empirical evaluation is conducted. Not applicable."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No system outputs are produced to evaluate. Human evaluation is structurally inapplicable to a pure vision paper."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No empirical evaluation with training/test splits. Not applicable to a theoretical paper."
     92       },
     93       "per_category_breakdown": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No empirical results exist to break down by category. Not applicable."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "No system is evaluated, so there are no failure cases. Not applicable to a position paper."
    102       },
    103       "negative_results_reported": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "No experiments were run that could yield negative results. Not applicable to a vision/position paper."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims the paper contributes (a) a 'whole of process' vision grounded in SE foundations (delivered in Sections 2-3 and Figure 2), (b) CRAFT values and principles (delivered in Section 4 and Table 1), and (c) vocabulary guidance (delivered in Section 5). All three are present in the paper body. The claims are descriptive of contributions, not empirical, so they are internally consistent."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper makes no causal claims. It proposes a vision and principles, citing prior empirical work for motivational context. Prescriptive language like 'we need to expand' is normative, not causal."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes broad prescriptive claims about what 'agentic SE needs' across the entire discipline, based on a selective reading of a small number of early empirical studies (some under review as of Jan 2026). It acknowledges ideas are 'not meant to be exhaustive or final' but does not state specific boundaries on what the vision does NOT cover or what populations/contexts are excluded. The title 'Toward Agentic Software Engineering Beyond Code' implies comprehensive scope."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": false,
    127         "answer": false,
    128         "justification": "The paper presents no empirical results. It is a pure vision/position paper proposing a framework. There are no observed results that require alternative explanations."
    129       }
    130     },
    131     "setup_transparency": {
    132       "model_versions_specified": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No AI model is used or evaluated. The paper references existing agentic systems (Devin, Jules, Codex, Claude Code) descriptively but does not use them."
    136       },
    137       "prompts_provided": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No prompting or LLM interaction is conducted. This is a theoretical position paper."
    141       },
    142       "hyperparameters_reported": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No models are used or tuned. Not applicable to a position paper."
    146       },
    147       "scaffolding_described": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No agentic scaffolding is implemented. The paper proposes a conceptual vision but builds no system."
    151       },
    152       "data_preprocessing_documented": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No data collection or preprocessing is performed. The paper is a position/opinion piece that selectively cites related work, not a systematic review with a documented pipeline."
    156       }
    157     },
    158     "limitations_and_scope": {
    159       "limitations_section_present": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No dedicated limitations or threats-to-validity section exists. Section 6 is a brief conclusion that acknowledges 'we may not know what agentic SE truly looks like until it is studied empirically in the wild' but this is a single sentence, not a substantive limitations discussion."
    163       },
    164       "threats_to_validity_specific": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "No specific threats to validity are discussed. The acknowledgment that ideas may be 'defined by practice' is a generic disclaimer, not a specific threat to this paper's methodology or conclusions."
    168       },
    169       "scope_boundaries_stated": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper does not explicitly state what its vision does NOT cover or what claims it is NOT making. Saying ideas are 'not meant to be exhaustive or final' is a generic hedge, not a specific scope boundary statement."
    173       }
    174     },
    175     "data_integrity": {
    176       "raw_data_available": {
    177         "applies": false,
    178         "answer": false,
    179         "justification": "No data is collected or analyzed. This is a position paper that cites related work informally. No raw data exists to release."
    180       },
    181       "data_collection_described": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper draws on selected literature to motivate its framework but provides no description of how literature was identified, screened, or selected. The citation of related work appears to be an informal, unsystematic selection."
    185       },
    186       "recruitment_methods_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No human participants are recruited. Not applicable to a position paper."
    190       },
    191       "data_pipeline_documented": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No data pipeline exists. The paper does not systematically collect or transform any data."
    195       }
    196     },
    197     "conflicts_of_interest": {
    198       "funding_disclosed": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No funding is mentioned. The acknowledgments section thanks individuals for feedback but does not mention any funding source or grant."
    202       },
    203       "affiliations_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The author's affiliation (Monash University, Melbourne, Australia) is clearly stated on the title page with contact email."
    207       },
    208       "funder_independent_of_outcome": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No funding is disclosed, so funder independence cannot be assessed. The schema specifies NA if unfunded."
    212       },
    213       "financial_interests_declared": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No competing interests or financial disclosure statement appears in the paper. Absence of a declaration is not the same as absence of conflict."
    217       }
    218     },
    219     "contamination": {
    220       "training_cutoff_stated": {
    221         "applies": false,
    222         "answer": false,
    223         "justification": "No pre-trained model is evaluated on any benchmark. This is a position/vision paper."
    224       },
    225       "train_test_overlap_discussed": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "No model training or benchmark evaluation is conducted. Not applicable."
    229       },
    230       "benchmark_contamination_addressed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No benchmarks are used. Not applicable to a position/vision paper."
    234       }
    235     },
    236     "human_studies": {
    237       "pre_registered": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No human participants are involved. This is a theoretical/vision paper."
    241       },
    242       "irb_or_ethics_approval": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved. Not applicable."
    246       },
    247       "demographics_reported": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved. Not applicable."
    251       },
    252       "inclusion_exclusion_criteria": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved. Not applicable."
    256       },
    257       "randomization_described": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants or experimental conditions. Not applicable."
    261       },
    262       "blinding_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants or experimental conditions. Not applicable."
    266       },
    267       "attrition_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved. Not applicable."
    271       }
    272     },
    273     "cost_and_practicality": {
    274       "inference_cost_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This is a theoretical/vision paper. No system is built or evaluated, so there are no inference costs to report."
    278       },
    279       "compute_budget_stated": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No computational experiments are conducted. Not applicable to a theoretical position paper."
    283       }
    284     }
    285   },
    286   "disagreements": [],
    287   "notes": "Perfect agreement between Opus and Sonnet on all 50 questions. This is expected for a purely theoretical/position paper with no empirical content — the paper has clear-cut characteristics that leave little room for interpretive differences. The vast majority of questions (39 out of 50) are structurally inapplicable (applies=false) because the paper has no experiments, no data, no model evaluations, no benchmarks, and no human participants. The remaining 11 applicable questions have straightforward answers: the paper satisfies abstract_claims_supported and affiliations_disclosed (both true), while failing on all others (code_released, data_released, generalization_bounded, all limitations questions, data_collection_described, funding_disclosed, financial_interests_declared)."
    288 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs