scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28551B)
      1 {
      2   "paper": {
      3     "title": "Codified Context: Infrastructure for AI Agents in a Complex Codebase",
      4     "authors": ["Aristidis Vasilopoulos"],
      5     "year": 2026,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2602.20478"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The companion repository is published at https://github.com/arisvas4/codified-context-infrastructure, including representative agent specifications, the MCP retrieval server, example documents, factory agents, and analysis scripts (Section 6, final paragraph)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper mentions 'Raw milestone data is available in the companion repository' (Section 4.2) and 'extraction methodology, scripts, and dataset schema are described in the companion repository' (Section 4.3), but the actual conversation history dataset (1,457 JSONL files) and the full project codebase (108,000 lines of C#) are not released. Only analysis scripts and representative examples are provided."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned in the paper. The MCP retrieval server is described as '~1,600 lines Python' but no Python version or dependencies are specified."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The companion repository includes factory agents for bootstrapping the architecture on new projects, but no step-by-step reproduction instructions for replicating the quantitative metrics or case study results are described. The underlying C# project is not available for reproduction."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper reports point estimates only (e.g., '2,801 human prompts', '1,197 agent invocations', '~9.9 human prompts per session') with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper explicitly states 'No causal relationships are claimed' (Section 4.1) and makes no comparative claims requiring significance tests. It is a systems paper and experience report."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "The paper does not make comparative claims between conditions. It reports descriptive metrics about a single system and explicitly disclaims causal or comparative claims (Section 4.1)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is provided for why 283 sessions, 4 case studies, or 1,457 conversation files represent an adequate sample. The paper acknowledges some data was lost ('some early files were lost during a cache cleanup') but does not discuss implications for coverage."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported for any of the interaction metrics. Only totals and averages are given (e.g., '~9.9 human prompts per session', '~6 autonomous agent turns per human prompt')."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No baselines are included. There is no comparison against development without the codified context infrastructure, against single-file manifests, or against alternative knowledge organization approaches. The paper acknowledges this: 'It is not possible to quantify with statistical rigor how much the architecture improved development speed or code quality' (Section 5.3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included at all, so contemporaneity is moot. The paper mentions Google Conductor as a concurrent approach but does not compare against it."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is presented. The paper does not test what happens when individual tiers (constitution, agents, knowledge base) are removed. The paper acknowledges 'Controlled benchmarking—measuring task completion rates and error rates with and without each architecture tier—is the most immediate priority' (Section 5.3)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple descriptive metrics are reported: infrastructure growth (lines, files by tier), interaction metrics (human prompts, agent invocations, agent turns, sessions), retrieval metrics (1,478 MCP calls across 218 sessions), and per-agent invocation counts (Section 4.2, 4.3, Table 3)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the system's outputs is conducted. The case studies are the author's own observations of their own system. No independent human evaluators assessed the quality of agent outputs with vs. without the infrastructure."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "The paper does not evaluate model performance on a benchmark or test set. It is a systems paper with descriptive metrics and case studies."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Breakdowns are provided by tier (T1/T2/T3 in Table 3), by agent capability class (8 higher-capability vs. 11 standard-capability agents in Section 3.2), by session type (structured ~13% vs. ad-hoc ~87% in Section 4.3), and per-agent invocation counts (code reviewer 154, network-protocol-designer 85)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.2 discusses specification staleness as the primary failure mode, citing two incidents: one in which a combat specification referenced legacy stat fields, and a second that the paper does not describe. Case Study 4 describes a bug that required 'five context window exhaustions and 84 code edits' before resolution."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that specification staleness caused agents to generate incorrect code on at least two occasions (Section 5.2), that some early conversation files were lost during a cache cleanup reducing data coverage (Section 4.3), and that keyword matching retrieval has precision limitations (Section 5.3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims are modest: it presents a three-component architecture, reports quantitative metrics across 283 sessions, describes four case studies, and states the framework is open-source. All of these are supported in the paper body (Sections 3-4, companion repository link in Section 6)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper explicitly disclaims causal claims: 'No causal relationships are claimed; confounding factors including developer experience growth cannot be isolated' (Section 4.1). The case studies use language like 'illustrate' rather than 'prove'. This appropriate hedging means the study design is adequate for the claims actually made."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly bounds generalization: 'single developer on a single project; its effectiveness in team settings, other project types, or larger scales has not been evaluated' (Section 5.3). It also notes that the project domain 'demands more extensive documentation than many application types, which may limit generalizability to simpler projects.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5.3 discusses that developer experience growth cannot be isolated as a confound, that the observational methodology prevents quantifying the architecture's contribution, and that the tool-specific implementation limits transferability claims. The paper explicitly acknowledges 'confounding factors including developer experience growth cannot be isolated' (Section 4.1)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper states 'Claude Code' was used as the sole code-generation tool but does not specify which Claude model version, API version, or snapshot date. No version identifier beyond 'Claude Code' is provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper provides representative prompt/specification content: the constitution structure (Section 3.1), agent specification format with an abbreviated example in Appendix B (328-line coordinate-wizard agent), knowledge base document format with a specification example (Section 3.3), and orchestration trigger tables (Table 1). Full specifications are available in the companion repository."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the Claude Code sessions used during development."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding is described in substantial detail: the three-tier architecture (Section 3), orchestration protocols with trigger tables (Section 3.1.1, Table 1), MCP retrieval service with five search tools (Section 3.3.1), workflow diagrams (Figures 1 and 3), and structured vs. ad-hoc session modes (Section 4.3)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The interaction data extraction methodology is described only at a high level: 'Interaction data was extracted from 1,457 JSONL conversation history files' (Section 4.3). The paper acknowledges 'some early files were lost during a cache cleanup, and agent chain data is available only for a 31-day window' but does not document filtering criteria, classification methodology, or transformation steps in the paper itself. It references the companion repository for details."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.3 'Threats to Validity and Future Work' provides a substantive discussion of limitations spanning multiple paragraphs, covering single-developer evaluation, observational methodology, and tool-specific implementation."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats are specific to this study: 'single developer on a single project' (not generic), 'The project domain (real-time distributed simulation) demands more extensive documentation than many application types' (not generic), 'some early files were lost during a cache cleanup' (specific data loss), and confounding with developer experience growth (Section 5.3)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Explicit scope boundaries are stated: 'its effectiveness in team settings, other project types, or larger scales has not been evaluated' (Section 5.3), 'Transferability has not been evaluated', and 'No causal relationships are claimed' (Section 4.1). The evaluation methodology section also explicitly states 'This is a systems paper and experience report, where the primary contribution is the architecture rather than statistical evidence of effectiveness' (Section 4.1)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The raw conversation history data (1,457 JSONL files) is not released. The paper references analysis scripts and dataset schema in the companion repository but not the underlying data itself."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.3 describes data collection: interaction data from 1,457 JSONL conversation history files, with the caveat that 'some early files were lost during a cache cleanup, and agent chain data is available only for a 31-day window.' Section 4.2 describes infrastructure growth metrics reconstructed from Git history."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. This is a single-developer experience report using data from the author's own development sessions."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper states extraction methodology and scripts are in the companion repository (Section 4.3) but does not document in the paper itself how raw JSONL files were processed into the reported metrics. There is no description of filtering steps, classification criteria for structured vs. ad-hoc sessions, or how the 757 'classifiable' agent invocations were selected from the total 1,197."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgments section states: 'This work was conducted independently with no external funding.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author is listed as 'Independent Researcher, USA' with no corporate affiliation. The paper evaluates Claude Code, and the author has no disclosed connection to Anthropic."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "The work is explicitly stated as independently conducted with no external funding (Acknowledgments). There is no funder to evaluate for independence."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper. The companion repository is open-source, but there is no explicit declaration about absence of financial interests."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It is a systems paper and experience report describing an architecture for organizing project knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate a pre-trained model on any benchmark. No train/test overlap concern applies."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate model performance on any benchmark. No contamination concern applies."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. This is a single-developer experience report using the author's own development sessions."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. Single-developer experience report."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. The author does describe their own background ('primary background is in chemistry rather than software engineering') but this is self-description, not participant demographics."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants and not an experimental study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants and not an experimental study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token consumption totals, or per-session costs are reported despite using Claude Code extensively across 283 sessions with 16,522 agent turns. Only maintenance overhead is described in time terms ('approximately 1-2 hours per week', Section 5.2)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, API spend, or hardware specifications are stated. The paper reports development time (70 days part-time) but not computational costs."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The three-tier codified context infrastructure supported a single developer in constructing a 108,000-line distributed system in under 70 days of part-time development using AI agents as the sole code-generation tool.",
    286       "evidence": "Table 3 reports project scale: 108,256 C# LOC, 405 source files, 148 commits over 70 days. Section 4.1 states Claude Code was the sole code-generation tool. Context infrastructure totaled ~26,200 lines across 54 files (Section 4.2, Table 3).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The codified context infrastructure reduced the need for in-prompt explanation, with over 80% of human prompts being 100 words or fewer.",
    291       "evidence": "Section 4.3 reports 'Over 80% of human prompts were 100 words or fewer, consistent with the hypothesis that pre-loaded context reduces the need for in-prompt explanation.' However, no baseline comparison without context infrastructure is provided.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Meta-infrastructure prompts (building the knowledge architecture itself) accounted for only 4.3% of substantive prompts.",
    296       "evidence": "Section 4.3 states: 'Meta-infrastructure prompts—building the knowledge architecture itself—accounted for 4.3% of substantive prompts, representing the direct overhead of the approach.'",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Specialist agents were the dominant invocation type, with 57% of classifiable agent invocations being project-specific specialists.",
    301       "evidence": "Section 4.3 reports: 'Of 757 classifiable agent invocations, 432 (57%) were project-specific specialists defined in the context infrastructure and 325 were built-in tool agents.'",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Coordination across 74 independent sessions over four weeks produced no save-related bugs when agents had access to the save-system specification.",
    306       "evidence": "Case Study 1 (Section 4.4.1) describes the save-system.md specification appearing in 74 sessions and 12 agent conversations. Five subsequent features touching persistence were implemented correctly. However, this is observational with no controlled comparison.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "The context infrastructure's maintenance overhead averaged approximately 1-2 hours per week.",
    311       "evidence": "Section 5.2 states: 'Total maintenance overhead averaged approximately 1-2 hours per week,' comprising ~5 minutes per session when specifications were affected plus biweekly ~30-45 minute review passes.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["case-study", "observational"],
    316   "key_findings": "The paper presents a three-tier codified context infrastructure (constitution, specialized agents, knowledge base) for maintaining AI coding agent consistency across sessions in a large codebase. Developed iteratively during construction of a 108,000-line C# distributed system across 283 sessions, the architecture treats documentation as machine-readable infrastructure rather than human-oriented artifacts. Quantitative metrics show 57% of classifiable agent invocations (432 of 757) used project-specific specialists, over 80% of human prompts were 100 words or fewer, and meta-infrastructure overhead was 4.3% of substantive prompts. Four observational case studies illustrate distinct mechanisms (coordination, experience capture, gap detection, domain-expert debugging) but no controlled comparisons are provided.",
    317   "red_flags": [
    318     {
    319       "flag": "Single-developer self-evaluation",
    320       "detail": "The sole author developed, used, and evaluated the system. There is no independent assessment of effectiveness, no external users, and no separation between the developer and evaluator roles. The author acknowledges this limitation."
    321     },
    322     {
    323       "flag": "No controlled comparison",
    324       "detail": "No baseline comparison is provided (e.g., development with single-file manifests, without context infrastructure, or using alternative approaches). The paper cannot quantify whether the architecture actually improved outcomes versus the alternatives. The paper acknowledges this explicitly."
    325     },
    326     {
    327       "flag": "Survivorship bias in case studies",
    328       "detail": "Case studies were 'selected for qualitative diversity' from successful uses of the system. Instances where the context infrastructure failed to prevent errors or was counterproductive may be underrepresented, beyond the two staleness incidents mentioned in Section 5.2."
    329     },
    330     {
    331       "flag": "Missing cost data",
    332       "detail": "Despite 283 sessions with 16,522 agent turns, no API costs or token consumption data are reported. This makes it impossible to assess the practical economics of the approach."
    333     },
    334     {
    335       "flag": "Incomplete interaction data",
    336       "detail": "Some early conversation files were lost during a cache cleanup, and agent chain data is available only for a 31-day window within the 70-day measurement period. The impact of this data loss on reported metrics is not assessed."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Why Do Multi-Agent LLM Systems Fail?",
    342       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    343       "year": 2025,
    344       "arxiv_id": "2503.13657",
    345       "relevance": "Empirical study of failure modes in multi-agent LLM systems, directly relevant to understanding agentic coding system reliability."
    346     },
    347     {
    348       "title": "Agent READMEs: An Empirical Study of Context Files for Agentic Coding",
    349       "authors": ["W. Chatlatanagulchai", "H. Li", "Y. Kashiwa"],
    350       "year": 2025,
    351       "arxiv_id": "2511.12884",
    352       "relevance": "Empirical characterization of agent context files across Claude Code, Codex, and GitHub Copilot—directly studies the manifest pattern this paper extends."
    353     },
    354     {
    355       "title": "On the Use of Agentic Coding Manifests: An Empirical Study of Claude Code",
    356       "authors": ["W. Chatlatanagulchai", "K. Thonglek", "B. Reid"],
    357       "year": 2025,
    358       "arxiv_id": "2509.14744",
    359       "relevance": "Empirical study of CLAUDE.md manifests in Claude Code projects, providing baseline data on how developers configure AI coding agents."
    360     },
    361     {
    362       "title": "Promptware Engineering: Software Engineering for Prompt-Enabled Systems",
    363       "authors": ["Z. Chen", "C. Wang", "W. Sun"],
    364       "year": 2025,
    365       "arxiv_id": "2503.02400",
    366       "relevance": "Proposes treating prompts as engineered software artifacts, a principle extended by the codified context infrastructure approach."
    367     },
    368     {
    369       "title": "Context Engineering for Multi-Agent LLM Code Assistants Using Elicit, NotebookLM, ChatGPT, and Claude Code",
    370       "authors": ["M. Haseeb"],
    371       "year": 2025,
    372       "arxiv_id": "2508.08322",
    373       "relevance": "Studies integrated multi-tool workflows for context engineering in multi-file code generation, showing higher success rates than single-agent systems."
    374     },
    375     {
    376       "title": "Agentic Software Engineering: Foundational Pillars and a Research Roadmap",
    377       "authors": ["A. E. Hassan", "H. Li", "D. Lin"],
    378       "year": 2025,
    379       "arxiv_id": "2509.06216",
    380       "relevance": "Provides foundational framework and roadmap for agentic software engineering research including trust and scaffolding mechanisms."
    381     },
    382     {
    383       "title": "Professional Software Developers Don't Vibe, They Control: AI Agent Use for Coding in 2025",
    384       "authors": ["R. Huang", "A. Reyna", "S. Lerner"],
    385       "year": 2025,
    386       "arxiv_id": "2512.14012",
    387       "relevance": "Empirical study of how professional developers actually use AI coding agents, characterizing knowledge transfer as an open interaction design problem."
    388     },
    389     {
    390       "title": "Beyond the Prompt: An Empirical Study of Cursor Rules",
    391       "authors": ["S. Jiang", "D. Nam"],
    392       "year": 2025,
    393       "arxiv_id": "2512.18925",
    394       "relevance": "Empirical classification of instruction types developers include in Cursor configuration files, providing baseline data on manifest patterns."
    395     },
    396     {
    397       "title": "On the Impact of AGENTS.md Files on the Efficiency of AI Coding Agents",
    398       "authors": ["J. L. Lulla", "S. Mohsenimofidi", "M. Galster"],
    399       "year": 2026,
    400       "arxiv_id": "2601.20404",
    401       "relevance": "Quantitative evidence that AGENTS.md files reduce runtime by 29% and token consumption by 17%, providing effectiveness data for the manifest pattern."
    402     },
    403     {
    404       "title": "Context Engineering for AI Agents in Open-Source Software",
    405       "authors": ["S. Mohsenimofidi", "M. Galster", "C. Treude"],
    406       "year": 2025,
    407       "arxiv_id": "2510.21413",
    408       "relevance": "Studies adoption of context engineering in open-source repositories, finding only ~5% have adopted any context file format."
    409     },
    410     {
    411       "title": "Decoding the Configuration of AI Coding Agents: Insights from Claude Code Projects",
    412       "authors": ["H. V. F. Santos", "V. Costa", "J. E. Montandon"],
    413       "year": 2025,
    414       "arxiv_id": "2511.09268",
    415       "relevance": "Characterizes CLAUDE.md configurations in Claude Code projects, finding 72.6% specify application architecture."
    416     },
    417     {
    418       "title": "Agentic Software Engineers: Programming with Trust",
    419       "authors": ["A. Roychoudhury", "C. Pasareanu", "M. Pradel"],
    420       "year": 2025,
    421       "arxiv_id": "2502.13767",
    422       "relevance": "Calls for formal scaffolding, grounding, and trust mechanisms for agentic AI software engineering."
    423     },
    424     {
    425       "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving Language Models",
    426       "authors": ["Q. Zhang", "C. Hu", "S. Upasani"],
    427       "year": 2026,
    428       "arxiv_id": "2510.04618",
    429       "relevance": "Formalizes Agentic Context Engineering (ACE) as evolving playbooks with a generate-reflect-curate cycle, identifying brevity bias in context optimization."
    430     },
    431     {
    432       "title": "Agentic Much? Adoption of Coding Agents on GitHub",
    433       "authors": ["R. Robbes", "T. Matricon", "T. Degueule"],
    434       "year": 2026,
    435       "arxiv_id": "2601.18341",
    436       "relevance": "Documents adoption of fully agentic systems for coding on GitHub, providing context for the emerging agentic development ecosystem."
    437     }
    438   ]
    439 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs