calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (14356B)
      1 {
      2   "paper_slug": "advancing-code-generation-2025",
      3   "calibration_date": "2026-02-28",
      4   "model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 49,
      7   "disagreement_count": 1,
      8   "agreement_rate": 0.98,
      9   "disagreements": [
     10     {
     11       "category": "evaluation_design",
     12       "question": "failure_cases_discussed",
     13       "sonnet": { "applies": true, "answer": true },
     14       "opus": { "applies": true, "answer": false },
     15       "direction": "sonnet_generous",
     16       "explanation": "Sonnet credits Section 3's discussion of challenges (prompt sensitivity, usability, security, evaluation limitations) as failure cases. Opus disagrees: the schema asks for 'error analysis, qualitative examples of failures, discussion of where the approach breaks down.' Section 3 discusses high-level challenges and research directions, not concrete failure cases with specific examples. There is no error analysis, no qualitative examples of specific failures, and no discussion of specific cases where approaches broke down. The challenges are stated at a conceptual/thematic level (e.g., 'prompt sensitivity can yield different responses') without showing actual failure instances. This is a vision paper providing suggestions, not a paper analyzing failure cases."
     17     }
     18   ],
     19   "opus_checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No code repository or archive is mentioned. The paper is a vision/roadmap paper with no implementation, but supplementary materials or analysis scripts could have been released."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No dataset is collected or released. The paper discusses existing benchmarks (HumanEval, MBPP, APPS) only in passing. A corpus of reviewed papers could have been released but was not."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specification is provided. The paper has no experiments and no computational artifacts requiring an environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No reproduction instructions are provided. The paper contains no experiments or systematic review methodology to reproduce."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "This is a vision/roadmap paper with no experiments or quantitative results. Statistical uncertainty measures are not applicable."
     47       },
     48       "significance_tests": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No comparative experiments are conducted. No claims of measured difference between systems are made."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No empirical results are reported; the paper does not measure any effects."
     57       },
     58       "sample_size_justified": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No data collection or sampling occurs in this paper."
     62       },
     63       "variance_reported": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "No experimental runs are performed; variance reporting is not applicable."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a vision paper proposing a conceptual framework, not an empirical evaluation. No baseline comparisons are applicable."
     74       },
     75       "baselines_contemporary": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No baselines are used; the paper proposes a conceptual framework without evaluation."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No experiments are run; ablation studies are not applicable to this vision paper."
     84       },
     85       "multiple_metrics": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No empirical evaluation is conducted in this paper."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No system outputs are produced or evaluated; there are no results from any implemented system."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No data splits are used; this is a vision paper without experiments."
     99       },
    100       "per_category_breakdown": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "No empirical evaluation is conducted. The paper organizes discussion by themes but presents no quantitative results to break down."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Section 3 discusses high-level challenges in LLM code generation (prompt sensitivity, usability, security, evaluation limitations), but these are thematic research directions, not specific failure case analyses. The paper provides no error analysis, no qualitative examples of concrete failures, and no discussion of where a specific approach breaks down with evidence. The challenges are stated at a conceptual level without showing actual failure instances."
    109       },
    110       "negative_results_reported": {
    111         "applies": false,
    112         "answer": false,
    113         "justification": "No experiments are conducted, so there are no experimental results (positive or negative) to report."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims the paper presents a 'six-layer vision framework' and 'actionable recommendations' — both are present in the paper body. The abstract does not make unsupported empirical claims; it accurately describes the content as a vision and analysis paper."
    121       },
    122       "causal_claims_justified": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "The paper makes no causal claims based on its own evidence. It discusses existing literature findings at a high level but does not make new causal inferences from original data."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper makes broad generalizations about 'LLM-based code generation' without bounding them. For instance, Section 3.1.2 states 'developers using such plugins do not necessarily see large efficiency gains' citing a single study [41], while the title and framing suggest conclusions apply broadly. The paper does not specify which models, tasks, or development contexts its claims are limited to."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper discusses challenges and proposes solutions but does not consider alternative explanations for the phenomena it describes. For example, the usability challenge discussion does not consider whether cited studies have confounds, selection bias, or other factors that might explain the observed phenomena differently."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not run any model evaluations. It mentions GPT-3.5 and GPT-4 only as examples in the literature discussion."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting as part of any experiment. Prompt strategies are discussed conceptually."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No experiments are run; no hyperparameters are applicable."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The paper describes scaffolding architectures conceptually (the six-layer framework) but does not implement or evaluate any scaffolding. No third-party scaffolding is evaluated."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No data is collected or processed in this paper. The paper informally discusses existing literature but does not present a systematic review with a documented selection pipeline."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations or threats-to-validity section. The paper ends with a brief conclusion (Section 4) that does not discuss limitations of the vision or roadmap itself."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats-to-validity discussion is present. The paper does not reflect on potential biases in its literature coverage, incompleteness of its framework, or limitations of its analysis."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not state explicit scope boundaries. It discusses LLM-based code generation broadly without clarifying what types of systems, tasks, or domains are in or out of scope."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No underlying data is collected or released. The paper references existing literature but does not provide a corpus of papers reviewed or any systematic data."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not describe how it selected the papers it discusses. No search strategy, database queries, inclusion/exclusion criteria, or literature search methodology is presented."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. The paper is a vision/roadmap paper, not a human-subjects study. This item is not applicable (N/A) per schema guidance."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "There is no documented pipeline for how papers were identified, screened, and included. The review appears informal with no systematic methodology described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No acknowledgments section or funding disclosure is present. The authors are affiliated with the University of Sydney and CSIRO's Data61, both publicly funded institutions, yet no funding source is mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Haolin Jin and Huaming Chen at the University of Sydney; Qinghua Lu and Liming Zhu at CSIRO's Data61, Australia."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funding is disclosed. Without knowing the funder, independence cannot be assessed. The paper is a vision paper that does not evaluate any specific commercial product."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate any pre-trained model's capabilities on a benchmark. Models and benchmarks are discussed conceptually only."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is performed; train/test overlap is not applicable."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No model is evaluated on any benchmark in this paper."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this paper."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this paper."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this paper."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this paper."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this paper."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this paper."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved in this paper."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a vision/roadmap paper with no implemented system or experiments. Cost reporting is not applicable."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a vision/roadmap paper with no experiments requiring computational resources."
    290       }
    291     }
    292   }
    293 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs