calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (17356B)
      1 {
      2   "paper_slug": "agentic-adoption-github-2026",
      3   "calibration_model": "opus",
      4   "scan_model": "sonnet",
      5   "timestamp": "2026-02-28",
      6   "total_questions": 50,
      7   "agreement_count": 49,
      8   "disagreement_count": 1,
      9   "agreement_rate": 0.98,
     10   "disagreements": [
     11     {
     12       "category": "conflicts_of_interest",
     13       "question": "funder_independent_of_outcome",
     14       "sonnet": {
     15         "applies": false,
     16         "answer": false,
     17         "justification": "No funding source is disclosed; applicability cannot be assessed. The authors appear to be academic researchers without commercial ties to the evaluated products."
     18       },
     19       "opus": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The authors are researchers at CNRS, Inria, UFMG, and Institut Polytechnique de Paris — institutions that routinely fund research through government grants. The absence of a funding disclosure does not mean the work is unfunded; it means we cannot verify funder independence. Since the paper should have disclosed funding and did not, applies=true and answer=false is appropriate."
     23       },
     24       "direction": "applies_boundary",
     25       "explanation": "Sonnet treats the absence of a funding disclosure as making the question unassessable and therefore marks it not applicable (applies=false). Opus reasons that academic researchers at major public research institutions almost certainly receive institutional or grant funding, and the absence of disclosure is itself a deficiency — the question is applicable but unanswerable, hence applies=true, answer=false."
     26     }
     27   ],
     28   "opus_checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Footnote 22 states 'we will share all the datasets and analyses that we made to facilitate this' in future tense. No working URL or archive is provided at time of publication."
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper promises future release (footnote 22) but provides no download link, DOI, or archive. The dataset of 129,134 GitHub projects and derived metrics is not released."
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No requirements.txt, Dockerfile, conda environment, or dependency specification is provided. The pipeline is described conceptually but without environment details."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No step-by-step reproduction instructions are provided. Section 4.2 describes the pipeline at a high level but does not constitute a reproducible recipe with commands to run."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5.2 explicitly computes a 99% CI with 1% margin of error for commit-level sampling using Cochran's formula. Section 10.1 uses 95%/96% CI with Bonferroni correction for the qualitative commit sample."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Chi-square goodness-of-fit tests are applied in Sections 6.2 and 6.3 to assess distribution uniformity across deciles, with all p-values < 0.01."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Effect sizes are reported alongside Chi-square tests in Sections 6.2 and 6.3: 'The effect sizes are small for lines of code, contributors, commits, and issues, and medium for pull requests and age.' Contextual effect sizes for commit comparisons (median 34 vs 10 added lines) are also provided."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Sample sizes are justified using Cochran's formula (Section 4.2.5) with explicit parameters: N=120,000, p=0.5, 99% CI, 1% margin yields n=14,575, increased to 16,000. Section 10.1 similarly justifies n=790 using Cochran's formula with finite population correction."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Distributions are reported with quartiles (Q1, median, Q3) in Figure 11. Ridgeline plots and sparklines convey spread throughout. Interquartile ranges are visible for commit size comparisons across human, bot, and AI-assisted categories."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Human-authored commits serve as natural baselines in RQ5 (Section 9) for commit size comparison. Zeng et al.'s human-authored commit type baseline is used in RQ6 (Section 10.3)."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Zeng et al. [43] is from 2025. Human and bot commit comparisons are drawn from the same 2025 time-period dataset."
     88       },
     89       "ablation_study": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is an observational mining study with no system components to ablate. The study evaluates different detection heuristic categories but these are complementary measurement strategies, not ablatable components."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Multiple metrics: file-level adoption rate, commit-level adoption rate, adoption by project characteristic deciles, commit size (lines added, deleted, files changed), commit type distribution, temporal evolution, and co-adoption counts."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 10.2: Two authors manually classified 277 commits into seven categories by inspecting commit messages and diffs, with 90% initial agreement (250/277). The remaining 27 were resolved through discussion."
    103       },
    104       "held_out_test_set": {
    105         "applies": false,
    106         "answer": false,
    107         "justification": "This is an observational study of real-world repositories. There is no machine learning model being evaluated and no train/test split concept applies."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Extensive per-category breakdowns: by LOC, age, contributors, commits, issues, PRs (Tables 3-4), by organization (Table 5), by topic (Figures 4-5), by programming language (Figure 6), and by tool (Figure 8)."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 11 extensively discusses where heuristics fail: CONVENTIONS.md excluded due to false positive risk, AGENTS.md attribution ambiguity, developers disabling traces, squash merge imprecision, and systematic under- and over-estimation reasons."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Several negative/unexpected findings are reported: 41.2% of file-level adopters have zero AI-assisted commits; the correlation between file and commit metrics is very low (r=0.1 at best); and AI/LLM topic projects have lower commit ratios despite higher file-level adoption."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "All abstract claims — 15.85%-22.60% adoption, rapid adoption, breadth across project types/languages/organizations, larger AI commits — are supported by results in Sections 5-10 with appropriate hedging."
    130       },
    131       "causal_claims_justified": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "The paper is explicitly observational and avoids causal claims. It uses language like 'we find that,' 'we observe,' and explicitly states 'How this affects productivity is an open question, which we do not try to answer' (Section 9). Hedged speculations (e.g., 'likely raised the popularity') are not presented as findings."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Claims are explicitly bounded to public GitHub repositories with >=10 stars, >=5,000 LOC, >=100 commits. Section 11.1 acknowledges results 'may not translate well to more general use, and to industry in particular.' Estimates are labeled as conservative vs. high bounds."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Section 11 systematically discusses alternatives: high PR metric may be 'self-fulfilling'; low commit ratios for AI/LLM topics likely reflect deliberate trace-hiding rather than lower use; extensive enumeration of over- and under-estimation reasons including developer behavior, agent configuration, and workflow differences."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "This is a mining study that does not run LLM inference. The authors study coding agents used by other developers in repositories; they do not invoke any LLM themselves."
    152       },
    153       "prompts_provided": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The study does not use LLM prompting. The analysis pipeline mines repository artifacts using GitHub APIs and git commands."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No LLM hyperparameters are involved. The study uses GitHub REST and GraphQL APIs, git log, and heuristic pattern matching. Statistical parameters (confidence levels, margins of error) are reported separately."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used in the analysis methodology. This is a mining study."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.2 documents the pipeline in detail: 130,621 initial projects filtered to 129,134, with 900 pipeline failures and 500 dotfile exclusions explained with counts. Commit extraction is documented in 6 numbered steps (Section 4.2.5). Filtering criteria are explicit."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 11 'Discussion' contains dedicated subsections: 11.1 'Limitations', 11.2 'Reasons for over-estimating coding agent use', 11.3 'Reasons for under-estimating coding agent use', and 11.4 'Putting it all together', spanning multiple pages."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Threats are specific to this study: Peril of Partial Observability (file markers without commit markers), squash merge attribution bias, AGENTS.md attribution uncertainty, AI/LLM topic projects deliberately hiding traces, GitHub Linguist file-extension limitations."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Explicit scope restrictions: GitHub public repositories with >=10 stars, >=5,000 LOC, >=100 commits, active in last 3 months. The paper states results 'may not translate well to more general use, and to industry in particular' (Section 11.1). Conservative and high estimates are explicitly labeled."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw data is available at time of publication. Footnote 22 promises future sharing of datasets and analyses but provides no current download link, DOI, or archive."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Data collection is described in Section 4.2: the sampling tool by Dabic et al. [10] is used with explicit criteria, accessed on 29 August 2025. GitHub REST and GraphQL APIs are described. The analysis was run on 31 October 2025."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "This is a mining study with no human participants. The 'sample' is GitHub repositories satisfying objective inclusion criteria, not recruited participants."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The data pipeline is documented in Section 4.2 with explicit step counts: 130,621 initial -> 129,134 analyzed (900 failures, 500 dotfiles). Commit extraction documented in 6 numbered steps in Section 4.2.5. Each filtering stage has stated criteria and counts."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding or acknowledgments section is present in the paper. No funding sources are disclosed."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly disclosed on the title page: Univ. Bordeaux/CNRS, Univ. Rennes/Inria/CNRS/IRISA, UFMG, and Institut Polytechnique de Paris. None are affiliated with coding agent companies."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The authors are researchers at CNRS, Inria, UFMG, and Institut Polytechnique de Paris — institutions that routinely fund research through government grants. The absence of a funding disclosure does not mean the work is unfunded; it means we cannot verify funder independence. Since the paper should have disclosed funding and did not, the criterion applies but is not satisfied."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement is present. Absence of disclosure is not the same as absence of conflict."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This is a repository mining study that does not evaluate a pre-trained model's capability on any benchmark. No LLM is tested for knowledge or capability."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not applicable — this is a mining study, not a benchmark evaluation of an LLM."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Not applicable — the paper does not use a benchmark to evaluate LLM knowledge or capabilities."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "This is a repository mining study. Mining public GitHub repositories is not a human subjects study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "Mining public GitHub repository data does not require IRB approval. No human subjects are involved."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants. Repository selection criteria are stated but these are not participant inclusion/exclusion criteria."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants. The random sampling of repositories uses statistical principles (Cochran's formula) documented in Section 4.2.5, but this is not human randomization."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants. The two-author manual classification in Section 10.2 is a qualitative coding exercise, not a human subjects experiment."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper mentions storage requirements ('more than half a terabyte in total' in Section 4.2.4) but does not report time, API cost, or per-unit analysis cost for the mining pipeline."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Storage requirements (>0.5 TB) are mentioned but total computational budget, hardware specifications, and wall-clock time are not stated."
    299       }
    300     }
    301   }
    302 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs