scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20165B)
      1 {
      2   "paper": {
      3     "title": "Configuring Agentic AI Coding Tools: An Exploratory Study",
      4     "authors": ["Matthias Galster", "Seyedmoein Mohsenimofidi", "Jai Lal Lulla", "Muhammad Auwal Abubakar", "Christoph Treude", "Sebastian Baltes"],
      5     "year": 2026,
      6     "venue": "Anonymous Conference (under review)",
      7     "arxiv_id": "2602.14690"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper references supplementary material with scripts and data at a Zenodo archive (ref [5], doi:10.5281/zenodo.18625980): 'Our data collection and analysis scripts and the analyzed data are available online [5].'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The Zenodo supplementary material includes the analyzed data: 'Our data collection and analysis scripts and the analyzed data are available online [5].'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No mention of environment specifications, requirements.txt, or dependency details in the paper. The lingua-language-detector and GPT-5.2 are mentioned but no environment setup is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While scripts and data are released, the paper does not describe step-by-step reproduction instructions. The supplementary material may contain them but the paper itself does not provide specific commands or a README for reproduction."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Table 3 reports IQR (interquartile ranges) for all metrics, e.g., 'Age: 6.7 (4.3–9.4)'. This serves as uncertainty quantification for the descriptive statistics."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 3 uses Mann-Whitney U tests with Benjamini-Hochberg FDR correction across 20 comparisons, reporting significance levels (*p<.05, **p<.01, ***p<.001)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 3 reports Cliff's delta effect sizes for each comparison, e.g., 'δ=-0.08', 'δ=0.21'."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why the initial 37,249 repositories were sufficient or why this sample size is adequate. The sample is convenience-based from SEART GitHub search with no power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 3 reports IQR for all metrics. Section 6.2 reports min, max, and median for Skills per repository (min=1, max=28, median=2)."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares across tools (Claude, Copilot, Codex, Cursor, Gemini) and against the full sample baseline in Table 3. Related work in Section 2 positions findings against prior studies."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The comparison is across contemporary tools (all active in 2025-2026). Related work references are from 2025-2026."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a mining/observational study with no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: repository age, contributor count, commit count, size in KB (Table 3), plus adoption rates, co-occurrence correlations, and temporal trends."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation of system outputs is not relevant to this mining study. The paper measures adoption patterns, not output quality."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a mining study, not a predictive modeling study. No train/test split is applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Extensive per-tool breakdowns (Figures 2, 3, Tables 2, 3), per-mechanism breakdowns, per-language breakdowns, and per-artifact type breakdowns are provided throughout."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses limitations of its heuristic detection approach in Section 6.4, and notes gaps like no evidence of persistent memory usage for Subagents (Section 6.3)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that Skills are shallowly adopted (83.3% with no additional resources), Subagent memory is unused, and advanced mechanisms see very low adoption. These are negative/null findings."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The three abstract claims (Context Files dominate, Skills/Subagents shallowly adopted, distinct configuration cultures) are all supported by data in Sections 5-6 with figures and tables."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal-adjacent claims like 'A reason for this could be that Claude Code is the most popular agentic AI coding tool' (Section 6.1) without evidence beyond correlation. However, these are hedged with 'could be' language, making them speculative rather than strong causal claims. The Discussion section offers explanations for adoption patterns without causal evidence."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6.4 (External validity) explicitly states: 'Our study covers only open-source repositories on GitHub; practices in proprietary or enterprise settings may differ.' The paper also notes it is a 'point-in-time snapshot (February 2026) of a rapidly evolving landscape.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 (Discussion) considers alternative explanations for shallow adoption: novelty of mechanisms, effort required, lack of empirical evidence on effectiveness, and developers gravitating to lowest-friction options."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "GPT-5.2 is specified for the repository classification step (Section 3): 'uses the GPT-5.2 model to determine...'. Default parameter values were used."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper states 'In our supplementary material, we share the final prompt and previous iterations of the prompt' (Section 3). The prompt content itself is in the supplementary material [5], not the paper body, but it is released."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3 states 'we kept OpenAI's default values for all parameters,' which is sufficient to reproduce the configuration."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used in the methodology. GPT-5.2 is used as a simple classifier, not in an agentic loop."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figure 1 provides a detailed pipeline with counts at each stage: 37,249 → 36,184 (language filter) → 32,564 (engineered filter) → 2,926 (config detection). Filtering criteria are described at each stage."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6.4 'Threats to Validity' provides a dedicated subsection organized by construct, internal, and external validity."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6.4 discusses specific threats: heuristics detect presence not active use, GPT-5.2 classification used a single labeling run without inter-model agreement, 2,204 'unsure' cases excluded, cannot isolate agentic vs conversational usage for some tools."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.4 explicitly states: only OSS on GitHub, cannot claim representativeness for closed-source, no examination of application domain variation, point-in-time snapshot. Section 7 also notes 'early empirical signals rather than settled findings.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The supplementary material (Zenodo, ref [5]) includes the analyzed data and scripts."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 and Figure 1 describe the data collection in detail: SEART GitHub search tool, repository selection criteria (non-fork, ≥2 contributors, license, created before Jan 2024, commits since June 2025), with specific dates."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data source is GitHub repositories selected via SEART search tool with documented criteria."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Figure 1 shows the complete pipeline with counts at each filtering stage: 37,249 → excluded 24 archived → excluded 60 without README and 994 non-English → 32,564 engineered → 2,926 with config files. Each exclusion count is documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: University of Bamberg, Heidelberg University, Singapore Management University. None are affiliated with the tools being studied."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is not the same as confirming independence."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This is a mining study. GPT-5.2 is used only as a classifier for README files, not evaluated on a benchmark. No model capability is being measured."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No benchmark evaluation of a pre-trained model's capability. GPT-5.2 is a utility tool in the pipeline, not the subject of evaluation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is performed. The study mines repositories for configuration artifacts."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. This is a repository mining study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper uses GPT-5.2 to classify 36,184 repositories but does not report the API cost or token consumption of this classification step."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of computational budget, hardware used, or time required for the data collection and analysis pipeline."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Context Files dominate the configuration landscape and are often the sole mechanism in a repository, with AGENTS.md emerging as an interoperable standard.",
    286       "evidence": "2,634 of 2,926 repositories use Context Files (Section 5.2). AGENTS.md receives 368 incoming references (Section 6.1). CLAUDE.md→AGENTS.md is the most common reference pair (311 cases).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Advanced mechanisms such as Skills and Subagents are only shallowly adopted, with Skills predominantly relying on static instructions rather than executable workflows.",
    291       "evidence": "83.3% of Skills include no additional resources (Section 6.2). Median Skills per repository is 2 (min=1, max=28). No repositories use Subagent persistent memory (Section 6.3).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Distinct configuration cultures are forming around different tools, with Claude Code users employing the broadest range of mechanisms.",
    296       "evidence": "Figure 3 shows Claude appears across all mechanism types. Cursor emphasizes Rules (59.8%). Table 3 shows statistically significant differences in repository characteristics across tools.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Repositories using Cursor are significantly younger than those using other tools.",
    301       "evidence": "Table 3: Cursor median age 5.5 years vs 6.7 overall, p<.001, Cliff's δ=-0.19.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["observational"],
    306   "key_findings": "This study systematically identifies eight configuration mechanisms across five agentic AI coding tools (Claude Code, Copilot, Codex, Cursor, Gemini) and analyzes their adoption in 2,926 GitHub repositories. Context Files (especially CLAUDE.md and AGENTS.md) dominate, appearing in 90% of repositories with configuration artifacts, while advanced mechanisms like Skills and Subagents remain shallowly adopted. Skills primarily serve as static documentation (83.3% have no executable resources), and distinct tool-specific configuration cultures are emerging, with Claude Code users showing the broadest mechanism diversity.",
    307   "red_flags": [
    308     {
    309       "flag": "LLM-based classification with only spot-check validation",
    310       "detail": "GPT-5.2 was used to classify 36,184 repositories as 'engineered' or not, with only spot-checks for validation. No inter-annotator agreement, no comparison with human labels, and no alternative model was used. 2,204 'unsure' cases were excluded without analysis."
    311     },
    312     {
    313       "flag": "Presence as proxy for adoption",
    314       "detail": "The study detects configuration file presence but acknowledges it cannot confirm whether the corresponding tool is actively used. This is noted in threats to validity but could significantly affect adoption counts."
    315     }
    316   ],
    317   "cited_papers": [
    318     {
    319       "title": "Agent READMEs: An Empirical Study of Context Files for Agentic Coding",
    320       "authors": ["Worawalan Chatlatanagulchai", "Hao Li", "Yutaro Kashiwa"],
    321       "year": 2025,
    322       "arxiv_id": "2511.12884",
    323       "relevance": "Directly studies context files for agentic coding tools, finding they are actively maintained and focus on functional development instructions."
    324     },
    325     {
    326       "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects",
    327       "authors": ["Hao He", "Courtney Miller", "Shyam Agarwal"],
    328       "year": 2026,
    329       "relevance": "Empirical study of Cursor AI's impact on code quality in open-source projects."
    330     },
    331     {
    332       "title": "Beyond the Prompt: An Empirical Study of Cursor Rules",
    333       "authors": ["Shaokang Jiang", "Daye Nam"],
    334       "year": 2026,
    335       "relevance": "Empirical study of Cursor rule-based configuration mechanisms for AI coding tools."
    336     },
    337     {
    338       "title": "On the Impact of AGENTS.md Files on the Efficiency of AI Coding Agents",
    339       "authors": ["Jai Lal Lulla", "Seyedmoein Mohsenimofidi", "Matthias Galster"],
    340       "year": 2026,
    341       "arxiv_id": "2601.20404",
    342       "relevance": "Controlled study measuring the impact of AGENTS.md on agent runtime and token consumption."
    343     },
    344     {
    345       "title": "Context Engineering for AI Agents in Open-Source Software",
    346       "authors": ["Seyedmoein Mohsenimofidi", "Matthias Galster", "Christoph Treude"],
    347       "year": 2026,
    348       "relevance": "Studies context engineering practices in open-source projects for AI agent configuration."
    349     },
    350     {
    351       "title": "Decoding the Configuration of AI Coding Agents: Insights from Claude Code Projects",
    352       "authors": ["Hélio Victor F Santos", "Vitor Costa"],
    353       "year": 2025,
    354       "arxiv_id": "2511.09268",
    355       "relevance": "Analyzes configuration artifacts in Claude Code projects specifically."
    356     },
    357     {
    358       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    359       "authors": ["John Yang", "Carlos E Jimenez"],
    360       "year": 2024,
    361       "relevance": "Foundational work on agent-computer interfaces for automated software engineering."
    362     },
    363     {
    364       "title": "Prompts as Software Engineering Artifacts: A Research Agenda and Preliminary Findings",
    365       "authors": ["Hugo Villamizar", "Jannik Fischbach"],
    366       "year": 2025,
    367       "relevance": "Argues prompts should be treated as software engineering artifacts, relevant to configuration-as-code perspective."
    368     },
    369     {
    370       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    371       "authors": ["Agnia Sergeyuk", "Yaroslav Golubev"],
    372       "year": 2025,
    373       "doi": "10.1016/j.infsof.2024.107610",
    374       "relevance": "Survey of AI coding assistant usage in practice, providing context for adoption patterns."
    375     },
    376     {
    377       "title": "Meta Context Engineering via Agentic Skill Evolution",
    378       "authors": ["Haoran Ye", "Xuning He"],
    379       "year": 2026,
    380       "arxiv_id": "2601.21557",
    381       "relevance": "Framework for co-evolving agentic skills and context artifacts via evolutionary search."
    382     }
    383   ]
    384 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs