scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27089B)
      1 {
      2   "paper": {
      3     "title": "AI Agentic Programming: A Survey of Techniques, Challenges, and Opportunities",
      4     "authors": [
      5       "Huanting Wang",
      6       "Jingzhi Gong",
      7       "Huawei Zhang",
      8       "Jie Xu",
      9       "Zheng Wang"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2508.11126"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No repository URL or code archive is provided. The survey could have released analysis scripts, extraction tools, or dataset metadata but did not. No mention of code availability anywhere in the paper."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The corpus of 152 reviewed papers and the extracted taxonomy data are not released. No links to supplementary data files or public dataset are provided."
     26       },
     27       "environment_specified": {
     28         "applies": false,
     29         "answer": false,
     30         "justification": "This is a survey paper with no experiments or software environment; no runtime environment is needed or applicable."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No instructions for reproducing the search, screening, or selection process are provided beyond a general description of the methodology. A researcher could not replicate the corpus without full query strings, database access dates, and screening decisions."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "This is a survey paper with no primary statistical analysis; confidence intervals are not applicable."
     43       },
     44       "significance_tests": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No hypothesis tests are conducted; this is a literature survey with qualitative taxonomy and descriptive counts."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No effect sizes are applicable; the paper does not conduct meta-analytic or comparative statistical analyses."
     53       },
     54       "sample_size_justified": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No participant sample; the paper reviews a corpus of papers using inclusion/exclusion criteria, which is a standard SLR approach, not a statistical sample requiring power analysis."
     58       },
     59       "variance_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No repeated measurements or experiments are run; variance reporting is not applicable to this survey."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The survey does not compare itself against prior surveys on the topic (e.g., the 2024 survey on LLM-based agents for SE by Liu et al., or Hou et al. 2024 on LLMs for SE). No structured comparison of coverage, methodology, or conclusions with related surveys is provided."
     70       },
     71       "baselines_contemporary": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No experimental baselines are used; this criterion is not applicable to a survey paper with no experiments."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No system or model is evaluated; ablation studies are not applicable to a survey paper."
     80       },
     81       "multiple_metrics": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "The survey presents a taxonomy and qualitative analysis without using multiple evaluation metrics."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Human evaluation of the survey's outputs (taxonomy, findings) is not conducted; this is a theoretical/taxonomical survey paper."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No prediction or classification model is trained; held-out test sets are not applicable."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The survey provides per-category breakdowns in several ways: Table 5 compares representative systems across behavioral dimensions, Table 8 breaks down benchmarks by source, language, task, and difficulty, and Figure 6 shows the temporal distribution of papers by year."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 5 (Challenges) discusses failure modes of existing agentic systems, including limitations of benchmarks (Section 5.1), communication protocol deficiencies (5.2), domain-specific weaknesses (5.3), and safety failures (5.4). Figure 9 shows where all top-10 LLMs still fail on software optimization tasks."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5 and the benchmark analysis in Section 6.3 explicitly note negative findings: all models perform poorly on software optimization (GSO benchmark), benchmarks are biased toward Python, and existing frameworks lack multi-turn interactive evaluation support."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims to introduce a taxonomy, review techniques, and discuss challenges. The paper delivers all these: taxonomy in Section 4, techniques in Sections 2 and 4, challenges in Section 5, and opportunities in Section 6. Descriptive claims are appropriately qualified."
    117       },
    118       "causal_claims_justified": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "The survey makes no causal empirical claims; it describes, categorizes, and synthesizes existing work without claiming causal relationships from its own evidence."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper states 'We focus primarily on LLM-driven agentic systems for software development' but the title 'AI Agentic Programming: A Survey of Techniques, Challenges, and Opportunities' is quite broad. No search date cutoff is stated, no explicit boundaries on what was excluded (non-English papers, specific databases, adjacent domains), and no statement of what the survey's results do NOT show. The scoping is a topic statement rather than explicit scope boundaries per the schema's requirement."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": false,
    130         "answer": false,
    131         "justification": "This is a survey/taxonomy paper with no empirical results of its own; alternative explanations for observed patterns are not applicable. The paper presents the state of the field rather than testing hypotheses."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "The survey itself does not use any LLM; it reviews papers that do. No LLM is deployed by the survey authors."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "The survey does not use any prompting; it is a literature review conducted by human researchers."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No models or algorithms are run by the survey authors; hyperparameter reporting is not applicable."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used by the survey authors; this is a human-conducted literature review."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Section 3 describes the filtering pipeline stages with counts (7,700 → 395 → 141 → 152), but does not fully state the actual filtering CRITERIA applied at each stage. The inclusion/exclusion criteria are listed (Section 3.2), but the criteria for moving from 141 full-text reviewed to 152 final (the increase via citation chaining) are not clearly specified. The screening criteria are general and lack operationalized decision rules a second researcher could apply independently."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The paper has a 'Challenges' section (Section 5) and a 'Conclusion' (Section 7), but these address challenges in the field, not limitations of the survey itself."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats-to-validity section exists. The paper does not discuss potential biases in its own paper selection process, publication bias in the literature, or limitations of the inclusion/exclusion criteria. Generic disclaimers about the field moving fast (Introduction) do not substitute for specific methodological threats."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "While the paper states it focuses on LLM-driven agentic systems for software development, it does not explicitly state what it does NOT cover (e.g., which adjacent areas were excluded and why, whether non-English papers were excluded, what date range was searched, or what databases were excluded). The search cutoff date is not mentioned."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The list of 152 reviewed papers is not released as a dataset. There is no appendix or supplementary material listing the full corpus. The bibliography covers only cited papers, not the complete reviewed corpus."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 3 describes the search strategy in detail: databases searched (Google Scholar, ACM DL, IEEE Xplore, SpringerLink, arXiv), the Boolean search string combining agent terms, programming terms, and AI/LLM terms, and the three-stage selection process."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants; this is a literature review. The criterion about recruitment methods applies to human subjects research, which this paper is not."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The pipeline counts (7,700 → 395 → 141 → 152) are given in Section 3.3 and Figure 5, but the criteria applied at each stage are not operationalized. The jump from 141 to 152 (via citation chaining adding 11 papers) lacks explanation of how these 11 were selected or why the original search missed them. Disagreement resolution criteria for the two-researcher screening are not specified."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No acknowledgments section or funding disclosure is present in the paper. There is no mention of grants, institutional support, or funding agencies anywhere in the text."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All five authors list their affiliation as University of Leeds (UK) on the title page, with individual email addresses. Author affiliations are clearly disclosed."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so funder independence cannot be assessed. The paper appears to be academic survey work without declared external funding."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement is present. There is no declaration that authors hold (or do not hold) patents, equity, or financial interests related to the tools and systems reviewed (including GitHub Copilot, Claude, etc.)."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This is a survey paper; the authors do not train or evaluate any pre-trained models on benchmarks. Contamination questions are not applicable."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "This is a survey paper; no models are evaluated. The contamination question is not applicable."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This is a survey paper; no models are evaluated on benchmarks by the authors. Contamination is not applicable to the survey itself, though the paper discusses contamination as a challenge in the reviewed literature (Section 6.3)."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants; this is a literature review of published papers."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants; IRB approval is not applicable to a systematic literature review of published papers."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this survey."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants; the inclusion/exclusion criteria in Section 3.2 refer to papers, not human subjects."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants; this is not an experimental study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants; blinding is not applicable."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants; attrition is not applicable."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a survey paper; the authors do not run any LLM inference. The paper reviews cost considerations of others' systems (Table 6 shows token pricing) but does not report the cost of the survey's own methods."
    281       },
    282       "compute_budget_stated": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a survey paper; no compute budget is required. No models are trained or run by the survey authors."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "The field of AI agentic programming lacks a standard taxonomy, benchmark suite, and evaluation methodology.",
    292       "evidence": "Introduction section: 'There is no standard taxonomy, benchmark suite, or evaluation methodology.' The paper motivates this gap as the reason for the survey.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Existing benchmarks such as HumanEval and SWE-Bench are inadequate for capturing the full complexity of real-world software engineering workflows.",
    297       "evidence": "Section 5.1 and Table 7: SWE-Bench is restricted to Python, most tasks are function/module-level, with no multi-turn feedback, third-party library usage, or build pipeline management. Figure 9 shows all top-10 LLMs still score poorly on software optimization (GSO benchmark).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Current benchmarks are heavily biased toward Python, limiting generalizability of results to other programming languages.",
    302       "evidence": "Section 6.3: 'they are heavily biased toward a small set of programming languages - especially Python' with citations [59, 222]. Table 8 confirms Python dominates the listed benchmarks.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "AI agentic programming represents a distinct paradigm from program synthesis, code completion, and DevOps automation, characterized by multi-step planning, tool integration, and iterative refinement.",
    307       "evidence": "Section 2.5 systematically compares agentic programming to each related paradigm with specific capability distinctions. The taxonomy in Figure 7 and Table 5 operationalizes these distinctions.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Most current mainstream AI coding agents use persistent memory mechanisms, while simple assistants like GitHub Copilot rely on transient context.",
    312       "evidence": "Table 4 compares context management of eight representative agents. GitHub Copilot uses 'Sliding window over active buffer' with no persistent memory, while SWE-agent, Devika, OpenDevin use vector DB or structured stores.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "qualitative",
    318     "meta-analysis"
    319   ],
    320   "key_findings": "This systematic literature review of 152 papers introduces a taxonomy of AI agentic programming systems along behavioral (reactive vs. proactive, single-turn vs. multi-turn, tool-augmented vs. standalone, static vs. adaptive) and architectural (interactive code assistants, autonomous task-oriented agents, planning-centric agents, multi-agent systems) dimensions. The paper identifies key challenges including inadequate benchmarks biased toward Python and single-turn tasks, communication protocol limitations for multi-agent systems, domain-specific model weaknesses, and safety/trust concerns. It also synthesizes opportunities in compiler-agent integration, scalable memory architectures, domain specialization, and safety alignment. The paper covers tools ranging from GitHub Copilot and Cursor to SWE-agent, ChatDev, and AutoCodeRover, providing comparative tables of LLM capabilities and pricing.",
    321   "red_flags": [
    322     {
    323       "flag": "No limitations section on the survey itself",
    324       "detail": "The paper has a 'Challenges' section (Section 5) covering limitations of the field, but includes no discussion of limitations of the survey's own methodology — no acknowledgment of potential publication bias, search string limitations, database coverage gaps, or the possibility that the 152-paper corpus is incomplete or systematically skewed."
    325     },
    326     {
    327       "flag": "Survey does not assess methodological quality of reviewed papers",
    328       "detail": "The survey collects and categorizes papers by architecture and behavior type, but does not evaluate the quality of evidence in those papers. This is the 'laundering' pattern: a survey that aggregates papers without quality assessment inherits and amplifies the methodological weaknesses of its sources. There is no quality appraisal instrument or scoring reported."
    329     },
    330     {
    331       "flag": "No funding disclosure",
    332       "detail": "Five authors at a UK university produced a substantial systematic review with no funding acknowledgment. This is unusual and raises questions about whether institutional or industry support exists but was not declared."
    333     },
    334     {
    335       "flag": "Screening criteria underspecified for replication",
    336       "detail": "Section 3.2 states two-researcher screening was used, but decision rules for borderline cases, disagreement resolution procedures (beyond 'through discussion'), and the criteria for citation chaining additions (11 papers added, bringing 141 to 152) are not specified. Another team could not replicate the corpus selection."
    337     },
    338     {
    339       "flag": "Search date range not stated",
    340       "detail": "The paper does not disclose when the database searches were conducted, making it impossible to assess whether recent papers (e.g., from mid-2025) fell inside or outside the search window. The arXiv paper itself is dated September 2025 and lists tool URLs with the access date 'Accessed: 2025-09-15', suggesting the searches were recent, but no explicit search cutoff date is given."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    346       "authors": [
    347         "Carlos E Jimenez",
    348         "John Yang",
    349         "Alexander Wettig",
    350         "Shunyu Yao",
    351         "Kexin Pei",
    352         "Ofir Press",
    353         "Karthik Narasimhan"
    354       ],
    355       "year": 2023,
    356       "arxiv_id": "2310.06770",
    357       "relevance": "Core benchmark for evaluating agentic code-repair agents on real GitHub issues; heavily cited as the dominant agentic SE benchmark."
    358     },
    359     {
    360       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    361       "authors": [
    362         "John Yang",
    363         "Carlos E Jimenez",
    364         "Alexander Wettig",
    365         "Kilian Lieret",
    366         "Shunyu Yao",
    367         "Karthik Narasimhan",
    368         "Ofir Press"
    369       ],
    370       "year": 2024,
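    371       "relevance": "Key agentic system for automated software engineering in which a single LM agent solves GitHub issues through a purpose-built agent-computer interface (ACI) for navigating, editing, and testing repositories; exemplifies autonomous, tool-augmented agents rather than multi-agent collaboration."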
    372     },
    373     {
    374       "title": "ChatDev: Communicative Agents for Software Development",
    375       "authors": [
    376         "Chen Qian",
    377         "Wei Liu",
    378         "Hongzhang Liu",
    379         "Nuo Chen",
    380         "Yufan Dang",
    381         "Jiahao Li",
    382         "Cheng Yang",
    383         "Weize Chen",
    384         "Yusheng Su",
    385         "Xin Cong",
    386         "Juyuan Xu",
    387         "Dahai Li",
    388         "Zhiyuan Liu",
    389         "Maosong Sun"
    390       ],
    391       "year": 2024,
    392       "arxiv_id": "2307.07924",
    393       "relevance": "Pioneering multi-agent software development simulation with role-playing agents (CEO, CTO, programmers) for end-to-end software development."
    394     },
    395     {
    396       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    397       "authors": [
    398         "Xinyi Hou",
    399         "Yanjie Zhao",
    400         "Yue Liu",
    401         "Zhou Yang",
    402         "Kailong Wang",
    403         "Li Li",
    404         "Xiapu Luo",
    405         "David Lo",
    406         "John Grundy",
    407         "Haoyu Wang"
    408       ],
    409       "year": 2024,
    410       "relevance": "Broad systematic review of LLMs in software engineering; directly related prior survey that the current paper extends toward the agentic paradigm."
    411     },
    412     {
    413       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    414       "authors": [
    415         "Junwei Liu",
    416         "Kaixin Wang",
    417         "Yixuan Chen",
    418         "Xin Peng",
    419         "Zhenpeng Chen",
    420         "Lingming Zhang",
    421         "Yiling Lou"
    422       ],
    423       "year": 2024,
    424       "arxiv_id": "2409.02977",
    425       "relevance": "Close prior survey covering LLM-based agents specifically for SE; the current paper extends and updates this coverage."
    426     },
    427     {
    428       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    429       "authors": [
    430         "Shunyu Yao",
    431         "Jeffrey Zhao",
    432         "Dian Yu",
    433         "Nan Du",
    434         "Izhak Shafran",
    435         "Karthik Narasimhan",
    436         "Yuan Cao"
    437       ],
    438       "year": 2023,
    439       "arxiv_id": "2210.03629",
    440       "relevance": "Foundational prompting paradigm for agentic systems that interleaves reasoning and action; widely adopted in coding agents."
    441     },
    442     {
    443       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    444       "authors": [
    445         "Qingyun Wu",
    446         "Gagan Bansal",
    447         "Jieyu Zhang",
    448         "Yiran Wu",
    449         "Beibin Li",
    450         "Erkang Zhu",
    451         "Li Jiang",
    452         "Xiaoyun Zhang",
    453         "Shaokun Zhang",
    454         "Jiale Liu"
    455       ],
    456       "year": 2024,
    457       "relevance": "Leading multi-agent framework for LLM-based applications; key infrastructure for agentic programming systems."
    458     },
    459     {
    460       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    461       "authors": [
    462         "Guanzhi Wang",
    463         "Yuqi Xie",
    464         "Yunfan Jiang",
    465         "Ajay Mandlekar",
    466         "Chaowei Xiao",
    467         "Yuke Zhu",
    468         "Linxi Fan",
    469         "Anima Anandkumar"
    470       ],
    471       "year": 2023,
    472       "arxiv_id": "2305.16291",
    473       "relevance": "Influential example of a planning-centric agent that generates and accumulates reusable skills; cited as a model for long-horizon agentic planning."
    474     },
    475     {
    476       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    477       "authors": [
    478         "Xingyao Wang",
    479         "Boxuan Li",
    480         "Yufan Song",
    481         "Frank F Xu",
    482         "Xiangru Tang",
    483         "Mingchen Zhuge",
    484         "Jiayi Pan",
    485         "Yueqi Song",
    486         "Bowen Li",
    487         "Jaskirat Singh"
    488       ],
    489       "year": 2024,
    490       "arxiv_id": "2407.16741",
    491       "relevance": "Open-source agentic platform for software development; directly relevant to the survey's coverage of autonomous task-oriented agents."
    492     },
    493     {
    494       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    495       "authors": [
    496         "Naman Jain",
    497         "King Han",
    498         "Alex Gu",
    499         "Wen-Ding Li",
    500         "Fanjia Yan",
    501         "Tianjun Zhang",
    502         "Sida Wang",
    503         "Armando Solar-Lezama",
    504         "Koushik Sen",
    505         "Ion Stoica"
    506       ],
    507       "year": 2024,
    508       "arxiv_id": "2403.07974",
    509       "relevance": "Contamination-free benchmark for LLM code evaluation; addresses the training data leakage problem that affects static benchmarks like HumanEval."
    510     },
    511     {
    512       "title": "AutoCodeRover: Autonomous Program Improvement",
    513       "authors": [
    514         "Yuntong Zhang",
    515         "Haifeng Ruan",
    516         "Zhiyu Fan",
    517         "Abhik Roychoudhury"
    518       ],
    519       "year": 2024,
    520       "relevance": "Multi-agent system for autonomous navigation, editing, and validation of real-world repositories; key example of collaborative agentic SE."
    521     },
    522     {
    523       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    524       "authors": [
    525         "Noah Shinn",
    526         "Federico Cassano",
    527         "Ashwin Gopinath",
    528         "Karthik Narasimhan",
    529         "Shunyu Yao"
    530       ],
    531       "year": 2023,
    532       "relevance": "Foundational technique for self-improving agents using verbal reinforcement; directly enables the feedback-loop patterns central to agentic programming."
    533     }
    534   ]
    535 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs