calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (17407B)
      1 {
      2   "paper_slug": "adoption-generative-artificial-2026",
      3   "calibration_date": "2026-02-28",
      4   "total_questions": 50,
      5   "agreement_count": 49,
      6   "disagreement_count": 1,
      7   "agreement_rate": 0.98,
      8   "disagreements": [
      9     {
     10       "category": "artifacts",
     11       "question": "environment_specified",
     12       "sonnet": {"applies": false, "answer": false},
     13       "opus": {"applies": true, "answer": false},
     14       "direction": "applies_boundary",
     15       "explanation": "Sonnet marked applies=false, reasoning that a survey/interview study has no software environment to specify. Opus marked applies=true because the study performed computational analyses (k-means clustering, chi-squared tests, Spearman correlations) that require statistical software, and per the scan-agent instructions for survey papers, all artifacts questions should have applies=true. A survey study CAN provide environment specifications for its analysis code. Since the authors did not, the correct answer is applies=true, answer=false."
     16     }
     17   ],
     18   "opus_checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No GitHub link, Zenodo archive, or repository URL is provided anywhere in the paper. No analysis scripts or survey instruments are released as downloadable artifacts."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The survey data (109 responses from LimeSurvey) and 18 interview transcripts are not publicly released. No data repository link is provided."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The study performed computational analyses (k-means clustering, chi-squared tests, Spearman correlations) but does not specify the statistical software, package versions, or environment used. A survey study with quantitative analysis can and should provide environment specifications for reproducibility."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level (LimeSurvey, semi-structured interviews, open coding), but no replication package, protocol document, or detailed procedures are linked."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper reports means on 5-point scales (e.g., Mean=4.0, Mean=3.9, Mean=3.4) and percentages without any confidence intervals, error bars, or uncertainty estimates."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Chi-squared tests are reported with full statistics: chi2(2, N=109)=10.84, p_adj=0.022 for experience-prompting; chi2(5, N=109)=13.91, p_adj=0.016 and chi2(4, N=109)=11.30, p_adj=0.023 for cluster demographics. Spearman correlations with p-values are reported throughout Section 5."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Spearman correlation coefficients (rho values such as rho=0.33, rho=0.39, rho=-0.33, rho=0.48) are reported throughout Section 5, providing effect sizes. Chi-squared statistics with N are provided. Percentage breakdowns with counts give practical magnitude context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The sample size of n=109 is not justified by power analysis or formal reasoning. No explanation is given for why 109 is sufficient for the subgroup analyses performed, particularly with small strata (e.g., n=22 junior developers)."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper reports means on 5-point scales without standard deviations or any spread measures. Percentage distributions are shown in figures but no variance, standard deviation, or IQR is provided for the reported mean values."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 4 ('Global Context') explicitly compares results to the 2025 Stack Overflow Developer Survey, providing an external reference point for the German sample's adoption and challenge patterns."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The Stack Overflow Developer Survey 2025 used for comparison is contemporary with the study's 2025 data collection period."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "This is a survey/interview study, not a system with components to ablate. Ablation is structurally inapplicable."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The study uses multiple measurement approaches across six survey blocks: adoption rates, usage frequency across 7 tasks, perceived effectiveness of 10 prompting strategies, severity of 14 challenges, and perceived impact on 5 workflow dimensions."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is itself a human-subjects survey study. There is no system producing outputs that require human evaluation. Human evaluation of system outputs is structurally inapplicable."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "This is a survey study, not an ML or benchmark evaluation. The concept of held-out test sets does not apply."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive breakdowns are provided: usage frequency by task (Figure 3), tool adoption by tool type (Figure 2), prompting effectiveness by experience level (Figure 8), challenge severity by challenge (Figure 5), code generation frequency by company size (Figure 10)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses multiple failure modes: hallucinations (Section 3.4), context wall limitations (Section 5.4), cases where prompting 'ends up taking longer than writing code' (Section 3.3), and the verification tax imposed by unreliable outputs."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Negative findings reported: bug fixing improved for only 40% (39% no change), role prompting showed poor effectiveness (Mean=2.9), pre-made prompts rated least effective (Mean=2.8), senior developers skeptical of AI tools (only 39% finding specific instructions effective), and distrust correlating with reduced workflow speed."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are supported: 'experience level moderates perceived benefits' (chi-squared tests, Section 5.1), 'organizational size affects tool selection and intensity' (Figure 10, Section 5.3), 'limited awareness of project context is most significant barrier' (Figure 6, Section 5.4)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper uses causal language: 'experience seems to have an impact on developers' interaction' (Section 5.1), 'moderates the perceived effectiveness' (abstract). These causal interpretations are drawn from cross-sectional survey data without randomization or causal identification strategy. Section 5.4 disclaims causality for one correlation, but causal language is used elsewhere."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Findings are consistently scoped to 'German software engineers' in title, abstract, and conclusions. Section 7 acknowledges convenience sample limitations and that 'findings were sampled in 2025 and may not generalize to future states.'"
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 5.5 explicitly lists four alternative explanations for the Proficiency Cycle: lower baseline performance, task type differences, self-reinforcing proficiency cycle, and response bias. Section 5.4 states the distrust-speed correlation 'is consistent with two interpretations.'"
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "This is a survey/interview study that does not use LLMs as a methodological tool. The LLMs discussed are tools used by surveyed practitioners, not experimental variables controlled by the researchers."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "This study does not use LLM prompting in its methodology. It asks human participants about their prompting practices but does not itself prompt any LLMs."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No LLMs are used in the study's methodology. The schema description focuses on LLM hyperparameters (temperature, top-p, etc.). The k-means k=2 is stated but LLM hyperparameters are not applicable."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used in the study's methodology."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 2.2 documents filtering: 210 initial responses, 101 excluded for being incomplete (most with no questions answered), leaving n=109 complete submissions. Section 2.1 describes the interview open coding process with two independent coders resolving discrepancies through discussion."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 7 'Threats to Validity' is a dedicated section discussing internal validity, external validity, and construct validity."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 identifies specific threats: fixed thematically grouped question order may cause carryover effects, convenience sampling via personal contacts and LinkedIn may not represent the broader population, self-reported frequency/effectiveness introduces measurement error. These are specific to this study's design."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 states findings may not generalize beyond Germany and 2025 ('findings were sampled in 2025 and may not generalize to future states'). The conclusion explicitly notes the cross-sectional design prevents causal inference, directing future work to longitudinal designs."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw survey response data and interview transcripts are not publicly available. No data repository link is provided. Only summary statistics, figures, and correlation values are presented."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 2.2 describes the data collection procedure: LimeSurvey platform, distribution via personal contacts and LinkedIn, collection period April 15 to August 20, 2025, voluntary participation with no incentives, anonymous data collection, and survey structure with six blocks."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 2.2 states participants were recruited through 'personal contacts and on social media platforms like LinkedIn,' explicitly noting convenience sampling. Section 7 acknowledges this as a threat to external validity."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Pipeline documented: 210 raw responses -> 101 excluded (incomplete, most with no questions answered) -> 109 complete submissions. Interview open coding process described: two researchers independently coded, then met to resolve discrepancies through discussion (Section 2.1)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure, acknowledgments section, or mention of grants or sponsors appears in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed on the title page: four authors at Technical University of Munich (Heilbronn campus) and one at Heilbronn University of Applied Sciences. No commercial affiliations."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": false,
    216         "answer": false,
    217         "justification": "No funding source is disclosed. Appears to be unfunded academic research from two universities. The schema says NA if unfunded."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement, patent disclosure, or financial interests declaration is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This is a survey/interview study that does not evaluate a pre-trained model's capability on any benchmark. Contamination concerns are structurally inapplicable."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey/interview study with no benchmark evaluation. Train/test overlap is not applicable."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey/interview study with no benchmark evaluation. Benchmark contamination is not applicable."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No pre-registration link (OSF, AsPredicted, or similar) is mentioned. The study was not pre-registered."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics board approval is mentioned, even though the study collected data from 18 interview participants and 109 survey respondents."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Section 2.2 reports detailed demographics: role distribution (62% developers, 13% team leads/managers, 8% architects), experience (mean 12.1 years, median 10 years, 26% senior, 54% mid-level, 20% junior), company size breakdown, education levels (51% Master's, 22% Bachelor's, 14% Doctorate), and geographic distribution (88% Germany)."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No formal inclusion/exclusion criteria are stated for survey or interview participation. Participants are described as 'software engineers working mainly in Germany' but no screening process or explicit eligibility criteria are described."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "This is a cross-sectional survey study, not an experimental study with condition assignment. Randomization is structurally inapplicable."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "This is a cross-sectional survey, not an experiment with treatment/control conditions. Blinding is not applicable."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": true,
    276         "justification": "Section 2.2 reports: 210 initial responses recorded, 101 excluded (most with no questions answered at all), leaving n=109 complete submissions. The attrition and primary reason are stated."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "This is a survey/interview study with no LLM inference performed as part of the methodology. Cost is structurally inapplicable."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a survey/interview study requiring no significant computational budget. The schema says NA for survey papers."
    289       }
    290     }
    291   }
    292 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs