scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (16731B)
      1 {
      2   "paper": {
      3     "title": "Context Engineering for AI Agents in Open-Source Software",
      4     "authors": ["Seyedmoein Mohsenimofidi", "Matthias Galster", "Christoph Treude", "Sebastian Baltes"],
      5     "year": 2026,
      6     "venue": "MSR '26",
      7     "doi": "10.1145/3793302.3793350"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "Section 3 states 'Our data collection and analysis scripts and the analyzed data are available online [23]' with a Zenodo DOI (10.5281/zenodo.17428770)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Same Zenodo archive includes 'the analyzed data' per Section 3."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependency lists, or setup instructions mentioned in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The Zenodo archive may contain them but the paper does not describe them."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a mining/qualitative study reporting descriptive statistics (counts, means, SDs). No inferential claims requiring CIs."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper makes no comparative claims requiring significance tests; it reports descriptive findings."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative claims are made that would require effect sizes."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample of 10,000 repositories was selected via a ranking approach, but no formal justification is given for why 10,000 (vs. all 48,795) is adequate. The RQ3 sample of 10 files with ≥10 commits is convenience-based and is likewise not justified."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No experimental runs to report variance over. The study reports SDs for file lengths as descriptive statistics, which is appropriate but not what this criterion targets."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is an exploratory mining study, not an evaluation of a system. No baselines are applicable."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No system evaluation; baselines not applicable."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system evaluation with metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate. The manual coding of files is part of data analysis, not evaluation of a system."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No prediction or evaluation task requiring train/test split."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Tables 1 and 2 provide per-category breakdowns of information types and change categories in AGENTS.md files."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No discussion of failure cases or where the analysis approach might break down."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that 50% of AGENTS.md files had no changes, that no established content structure exists, and that adoption is only 5% — these are honest negative/null findings."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about adoption rates (466 projects), variation in structure/style, and commit-level evolution are all supported by the results in Sections 4.1-4.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes no causal claims; it is purely descriptive/exploratory."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly labels itself as a 'preliminary study' and notes limitations such as focus on four tools, mature popular repositories, and plans for future extension."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the observed patterns (e.g., whether low adoption reflects unawareness vs. deliberate choice, or whether patterns reflect template copying vs. organic development)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No LLM models used in the methodology."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting used in the methodology."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models or hyperparameters involved."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3 describes the full filtering pipeline: 228,890 → OSI license filter → language/commit/watcher thresholds → 48,795 → top 10,000 by ranking. Figure 1 outlines the process. Criteria at each stage are stated."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section. Some limitations are mentioned inline (e.g., in RQ1 answer and conclusion) but there is no substantive dedicated discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed; the paper offers only brief inline mentions such as 'One limitation is our focus on four selected tools.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states it is a 'preliminary study,' notes the focus on mature popular repositories, and identifies multiple directions for future work that bound the current scope."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Data available via Zenodo (reference [23], DOI 10.5281/zenodo.17428770)."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes the full data collection procedure: SEART tool, filtering criteria, cloning, scanning for context files, manual checks."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data source is GitHub repositories using a standard benchmark-style sampling approach."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3 and Figure 1 document the pipeline from initial repository selection through filtering to final sample, with counts at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Heidelberg University, University of Bamberg, Singapore Management University. None evaluate their own products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model evaluated on any benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model evaluated on any benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model evaluated on any benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants; this is a repository mining study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Mining study with no model inference; cost not applicable."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Mining study; compute budget not a meaningful concern."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Only 5% (466/10,000) of mature OSS repositories have adopted AI context files.",
    286       "evidence": "Section 4.1 reports scanning 10,000 repositories and finding 466 with at least one context file.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "There is no established content structure for AGENTS.md files, with wide variation in information and presentation style.",
    291       "evidence": "Section 4.2 shows 14 content categories (Table 1) and five stylistic dimensions, with high variation in file length (M=142, SD=231 for AGENTS.md).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "50% of AGENTS.md files have never been changed after initial creation.",
    296       "evidence": "Section 4.3 reports 77 of 155 AGENTS.md files had no changes.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The most frequent changes to AGENTS.md files are adding and modifying instructions.",
    301       "evidence": "Table 2 shows 'Add instruction(s)' (78) and 'Modify instruction(s)' (59) as top categories across 169 annotated commits.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["observational", "qualitative"],
    306   "key_findings": "This mining study of 10,000 GitHub repositories finds that only 5% have adopted AI context files (AGENTS.md, CLAUDE.md, etc.). AGENTS.md files vary widely in content and style, with no established structure — common topics include coding conventions, contribution guidelines, and architecture. The authors identify five writing styles (descriptive, prescriptive, prohibitive, explanatory, conditional). Half of AGENTS.md files were never modified after creation, and most changes involve adding or modifying instructions.",
    307   "red_flags": [
    308     {
    309       "flag": "No limitations section",
    310       "detail": "Even for a short (5-page) MSR paper, a dedicated limitations section is expected. Threats to validity are only mentioned in passing, and key threats (e.g., selection bias from focusing on mature/popular repos, reliability of manual coding) are not discussed."
    311     },
    312     {
    313       "flag": "Small qualitative sample for RQ3",
    314       "detail": "RQ3 evolution analysis is based on only 10 files with ≥10 commits (169 commits), which is a convenience sample. The paper acknowledges this is preliminary but does not discuss how representative these files are."
    315     }
    316   ],
    317   "cited_papers": [
    318     {
    319       "title": "Guidelines for Empirical Studies in Software Engineering involving Large Language Models",
    320       "authors": ["Sebastian Baltes", "Florian Angermeir"],
    321       "year": 2025,
    322       "arxiv_id": "2508.15503",
    323       "relevance": "Directly addresses methodological guidelines for LLM-based SE research."
    324     },
    325     {
    326       "title": "On the Use of Agentic Coding Manifests: An Empirical Study of Claude Code",
    327       "authors": ["Worawalan Chatlatanagulchai", "Kundjanasith Thonglek"],
    328       "year": 2025,
    329       "relevance": "Empirical study of AI context files specific to Claude Code."
    330     },
    331     {
    332       "title": "Large Language Models for Software Engineering: Survey and Open Problems",
    333       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    334       "year": 2023,
    335       "doi": "10.1109/ICSE-FOSE59343.2023.00008",
    336       "relevance": "Major survey on LLMs for SE covering code generation and open problems."
    337     },
    338     {
    339       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    340       "authors": ["Xinyi Hou", "Yanjie Zhao"],
    341       "year": 2024,
    342       "doi": "10.1145/3695988",
    343       "relevance": "Comprehensive SLR on LLMs for software engineering."
    344     },
    345     {
    346       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    347       "authors": ["John Yang", "Carlos E. Jimenez"],
    348       "year": 2024,
    349       "relevance": "Foundational agentic SE tool that enables LLM agents to interact with repositories."
    350     },
    351     {
    352       "title": "AutoCodeRover: Autonomous Program Improvement",
    353       "authors": ["Yuntong Zhang", "Haifeng Ruan"],
    354       "year": 2024,
    355       "doi": "10.1145/3650212.3680384",
    356       "relevance": "Autonomous agent for program improvement using code search APIs."
    357     },
    358     {
    359       "title": "Software Engineering Using Autonomous Agents: Are We There Yet?",
    360       "authors": ["Samdyuti Suri", "Sankar Narayan Das"],
    361       "year": 2023,
    362       "doi": "10.1109/ASE56229.2023.00174",
    363       "relevance": "Evaluates autonomous agents (Auto-GPT) for SE tasks, highlighting context importance."
    364     },
    365     {
    366       "title": "A Survey of Context Engineering for Large Language Models",
    367       "authors": ["Lingrui Mei", "Jiayu Yao"],
    368       "year": 2025,
    369       "arxiv_id": "2507.13334",
    370       "relevance": "Survey defining context engineering as a discipline for LLMs."
    371     },
    372     {
    373       "title": "The rise and potential of large language model based agents: a survey",
    374       "authors": ["Zhiheng Xi"],
    375       "year": 2025,
    376       "doi": "10.1007/S11432-024-4222-0",
    377       "relevance": "Survey on LLM-based agents covering frameworks and capabilities."
    378     },
    379     {
    380       "title": "Prompting in the Wild: An Empirical Study of Prompt Evolution in Software Repositories",
    381       "authors": ["Mahan Tafreshipour", "Aaron Imani"],
    382       "year": 2025,
    383       "doi": "10.1109/MSR66628.2025.00106",
    384       "relevance": "Studies how prompts evolve in repositories, directly related to context engineering practices."
    385     }
    386   ]
    387 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs