scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28200B)
      1 {
      2   "paper": {
      3     "title": "Agents4PLC: Automating Closed-loop PLC Code Generation and Verification in Industrial Control Systems using LLM-based Agents",
      4     "authors": [
      5       "Zihan Liu",
      6       "Ruinan Zeng",
      7       "Dongxia Wang",
      8       "Gengyun Peng",
      9       "Jingyi Wang",
     10       "Qiang Liu",
     11       "Peiyu Liu",
     12       "Wenhai Wang"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2410.14209"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides a GitHub link (https://github.com/Luoji-zju/Agents4PLC_release) in Section V-A, and a project site at https://hotbento.github.io/Agent4PLC/. These are working URLs provided in the paper."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper constructs a benchmark of 23 programming tasks with formal verification specifications, but no explicit download link or repository for the benchmark data is provided separately from the code repository. It is unclear whether the benchmark is accessible as a standalone release."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using LangGraph and MetaGPT frameworks, and notes that CodeLlama 34B runs on 'a single NVIDIA A800 80GB PCIe GPU', but no detailed environment specification (requirements.txt, Dockerfile, conda environment, or library versions) is provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper points to a GitHub link and project site for 'more experiment details', but the paper itself contains no step-by-step reproduction instructions. The reader is directed externally without any commands or procedural guidance in the paper."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables I, II, and III are presented as raw counts and pass rates (e.g., '16 16 100.0%') with no confidence intervals, error bars, or uncertainty estimates."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims Agents4PLC 'significantly outperforms' LLM4PLC and other methods based solely on comparing pass rate numbers, with no statistical significance tests (e.g., t-tests, chi-squared, or permutation tests) performed."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "While raw performance differences can be calculated from the tables, the paper reports no formal effect sizes (e.g., Cohen's d, odds ratios). Percentage improvements are implicit but not contextualized as effect size measures."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The benchmark contains only 23 programming tasks (16 easy, 7 medium). No justification is given for why 23 tasks are sufficient to support generalizable claims, and no power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Results are reported as single-run pass rates with no standard deviation, variance across runs, or other spread measures. It is unclear whether experiments were repeated and averaged or run once."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares Agents4PLC against LLM4PLC (prior work) and ChatDev (a general-purpose multi-agent system) across multiple base LLMs (CodeLlama 34B, DeepSeek V2.5, GPT-4o, GPT-4o-mini), as shown in Table III."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "LLM4PLC (2024) and ChatDev (2024) are contemporary baselines from the same year. The base LLMs used (GPT-4o, DeepSeek V2.5) are current models."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section V-C (RQ3) presents an ablation study removing individual components (RAG, syntax hints, one-shot prompting, CoT in the Debugging Agent) and measuring their impact on pass rates, as shown in Table II."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports three metrics: syntax compilation success rate, verifiable rate, and pass rate. Additionally, RQ2 evaluates efficiency by counting generation attempts."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "The evaluation is entirely automated (compiler checks, formal verification). Human evaluation is clearly irrelevant to the claims, which concern automated correctness verification of generated PLC code."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "The paper uses a single benchmark for both developing and evaluating the framework. There is no explicit separation into a development set and a held-out test set. The same 23-task benchmark appears to be used throughout."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by 'Easy' and 'Medium' benchmark levels across all tables, providing some category-level detail. The benchmark tasks span categories including Logical Control, Mathematical Operations, Real-time Monitoring, and Process Control."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses failures, e.g., that ChatDev 'occasionally produces code in unrelated languages, such as Python or C++', and discusses cases where the debugging agent's effectiveness could not be fully captured. However, failure analysis is limited."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The ablation study (Table II) shows configurations that perform worse, e.g., removing CoT 'noticeably reduces the framework's performance' for medium problems. The paper also reports that one-shot prompting does 'not lead to a substantial improvement' and that RAG has 'unintended negative effects' on easy problems."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims Agents4PLC 'significantly outperforms previous methods' and this is supported by Table III showing higher pass rates across nearly all model/level combinations versus LLM4PLC. Claims about the benchmark being 'comprehensive' are supported by the description of 23 tasks with formal specs."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims about component contributions (e.g., 'RAG provides noticeable improvements for medium-level problems', 'CoT is crucial for handling more complex debugging tasks') are supported by the ablation study which systematically removes one component at a time, meeting the standard for controlled single-variable manipulation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims to address 'critical challenges in PLC programming' and highlights the 'potential of our framework to generate verifiable code applicable to real-world industrial applications', yet the supporting evidence is a benchmark of 23 tasks (16 easy, 7 medium) written exclusively in the ST language. The case study covers 4 practical tasks, which is an extremely small sample for industrial applicability claims. The title and framing suggest broad applicability beyond what the evidence supports."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for why Agents4PLC outperforms LLM4PLC. Notably, the paper acknowledges automating LLM4PLC with an 'extra automation program', which could introduce implementation bias. No threats-to-validity section or discussion of confounds is present."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper refers to 'GPT-4o' and 'GPT-4o-mini' with only documentation URL references (e.g., https://platform.openai.com/docs/models/gpt-4o) but no specific API version or snapshot date. 'GPT-4o' without a snapshot date does not constitute a specified version per the schema criteria."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper describes that 'prompts for the Coding Agent play a crucial role' and that they 'include critical elements of PLC coding', but no actual prompt text is provided in the paper or appendix. Figure 3 shows an example interaction but not the full system prompt. The description is in natural language only."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No hyperparameters are reported for any LLM calls (e.g., temperature, top-p, max tokens). The paper does not state the sampling settings used for GPT-4o, DeepSeek V2.5, or CodeLlama 34B."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The paper provides a detailed description of the multi-agent scaffold in Section III, including the workflow among Retrieval Agent, Planning Agent, Coding Agent, Debugging Agent, and Validation Agent. Figure 2 shows the architecture, Figure 3 shows a worked example, and the paper describes the loop threshold and feedback mechanisms."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The benchmark construction section states the dataset 'comprises 23 programming tasks' with 'human-written-verified formal specifications and reference PLC code', but does not describe how tasks were selected, what inclusion/exclusion criteria were applied, or how human verification was performed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions future work but does not substantively discuss limitations of the current approach beyond noting the framework currently focuses on ST language."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats-to-validity section exists. The paper does not discuss threats such as the small benchmark size (23 tasks), potential contamination of LLM training data with ST code, or the fact that LLM4PLC was automated by the authors, which could introduce bias."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. The conclusion briefly notes future plans to 'expand the framework to support additional PLC programming languages', implicitly acknowledging ST-only scope, but does not explicitly bound claims."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The raw benchmark data (23 programming tasks and formal specifications) is not explicitly released as a separate downloadable artifact. The GitHub link is mentioned but no benchmark data download is confirmed in the paper."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The benchmark construction is described only at a high level: 23 programming tasks covering 'Logical Control, Mathematical Operations, Real-time Monitoring, Process Control'. How tasks were identified, what sources were used, and what inclusion criteria were applied is not explained."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "This is a benchmark evaluation study with no human participants. Data source is a newly constructed PLC programming task benchmark, not human subjects."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The paper mentions that the benchmark 'transitions from natural language requirements to human-written-verified formal specifications and reference PLC code', but the pipeline for creating and verifying these specifications (human verification process, tools used, annotation protocol) is not documented."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No acknowledgments section or funding disclosure appears in the paper. One co-author (Wenhai Wang) is affiliated with both 'UWin Tech & Zhejiang University', and UWinTech products are used in the case study, but no formal funding disclosure is made."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed in the header. Wenhai Wang's dual affiliation with UWin Tech (a company whose product is used in RQ4 case studies) and Zhejiang University is disclosed in the author list."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Wenhai Wang is affiliated with UWin Tech, whose software platform (UWinTech Control Engineering Application Software) is used in the RQ4 practical case study, creating a potential conflict. No funding source is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "There is no competing interests statement in the paper. Given Wenhai Wang's affiliation with UWin Tech and the use of their commercial platform in the case study, the absence of a competing interests declaration is a notable omission."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses GPT-4o, GPT-4o-mini, DeepSeek V2.5, and CodeLlama 34B but does not state the training data cutoff dates for any of these models. This is relevant because ST code examples may exist online that could be in the training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper does not discuss whether the 23 benchmark tasks or their solutions could have appeared in the training data of the evaluated LLMs. The benchmark tasks appear to be newly created, but no contamination analysis is performed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper claims the benchmark is newly constructed ('first benchmark dataset focused on the task of generating ST code from natural language specification'), but does not verify that similar tasks or solutions are absent from the LLMs' training data, nor discuss this risk."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved. This is a benchmark evaluation study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved. This is a benchmark evaluation study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved. This is a benchmark evaluation study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved. This is a benchmark evaluation study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved. This is a benchmark evaluation study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved. This is a benchmark evaluation study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved. This is a benchmark evaluation study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper does not report API costs, token usage, or cost per example for calls to GPT-4o and GPT-4o-mini. For an agentic system with multiple rounds of LLM calls per example, cost is practically important but not reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The only compute information given is that CodeLlama 34B runs on 'a single NVIDIA A800 80GB PCIe GPU'. Total compute budget (GPU hours, API spend, total time for experiments) is not stated."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Agents4PLC significantly outperforms previous methods (LLM4PLC, ChatDev) in PLC code generation, achieving superior results across syntax compilation, verifiable rate, and pass rate metrics.",
    295       "evidence": "Table III shows Agents4PLC with GPT-4o achieves 100% syntax compilation, 68.8% verifiable rate, and 50% pass rate on Easy problems vs. LLM4PLC/GPT-4o at 87.5%, 0%, and 12.5% respectively. Section V-A discusses these results in detail.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Agents4PLC is more efficient than LLM4PLC, generating correct code more often in a single attempt.",
    300       "evidence": "Table I shows that Agents4PLC with DeepSeek V2.5, GPT-4o, and GPT-4o-mini achieves 100% first-attempt syntax compilation on both Easy and Medium problems, while LLM4PLC requires multiple attempts across models.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Syntax hints significantly enhance code generation effectiveness, especially for easy problems, while RAG primarily benefits medium-level problems.",
    305       "evidence": "Table II ablation study shows 'One-shot + Syntax Hint' achieves highest easy problem pass rate (68.8%) while 'One-shot + RAG + Syntax Hint' performs best on medium problems (28.6% pass rate).",
    306       "supported": "weak"
    307     },
    308     {
    309       "claim": "Agents4PLC can generate verifiable PLC code that meets real-world industrial control requirements, demonstrated through four practical case studies.",
    310       "evidence": "Section V-D (RQ4) describes four case studies (LED control, motor control, temperature/pressure monitoring, node delay monitoring) using the UWinTech platform, with qualitative descriptions of correct behavior.",
    311       "supported": "weak"
    312     },
    313     {
    314       "claim": "CoT reasoning in the Debugging Agent is crucial for handling complex (medium-level) debugging tasks.",
    315       "evidence": "Table II shows removing CoT reduces medium-level verifiable rate from 42.9% to 14.3%, as described in Section V-C.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "methodology_tags": ["benchmark-eval", "case-study"],
    320   "key_findings": "Agents4PLC is a multi-agent LLM framework for automated PLC code generation and verification using Structured Text, combining a Retrieval Agent, Planning Agent, Coding Agent, Debugging Agent, and Validation Agent with formal verification tools (nuXmv, PLCverif). Evaluated on a new 23-task benchmark, Agents4PLC with GPT-4o achieves 50% pass rate on Easy problems and 28.6% on Medium problems, substantially outperforming the prior LLM4PLC approach. An ablation study shows syntax hints and CoT-based debugging contribute most to performance, particularly for complex tasks. Four practical case studies on industrial hardware demonstrate qualitative functional correctness of generated code.",
    321   "red_flags": [
    322     {
    323       "flag": "Tiny benchmark (23 tasks)",
    324       "detail": "The entire evaluation rests on 23 programming tasks (16 Easy, 7 Medium). Pass rates derived from 7 Medium-level tasks (e.g., 2/7 = 28.6%) carry extreme statistical uncertainty, yet the paper makes broad claims about industrial applicability. No sample size justification is provided."
    325     },
    326     {
    327       "flag": "No statistical uncertainty quantification",
    328       "detail": "All results are reported as single-run pass rates with no confidence intervals, standard deviations, or significance tests. The claim that Agents4PLC 'significantly outperforms' baselines is not supported by any statistical test."
    329     },
    330     {
    331       "flag": "Potential conflict of interest: UWin Tech affiliation",
    332       "detail": "Co-corresponding author Wenhai Wang is affiliated with UWin Tech, whose commercial software platform is used for the RQ4 case study. No competing interests statement or funding disclosure is provided, making it impossible to assess independence of the evaluation."
    333     },
    334     {
    335       "flag": "Fairness of LLM4PLC comparison",
    336       "detail": "The authors state they 'write an extra automation program to drive the components of the LLM4PLC framework' to automate it. This introduces an implementation variable that is not controlled: the comparison may reflect differences in automation quality, not the underlying framework capabilities."
    337     },
    338     {
    339       "flag": "No model version specification",
    340       "detail": "GPT-4o and GPT-4o-mini are referenced only by marketing name without API version or snapshot date. Since model behavior changes across versions, results may not be reproducible."
    341     },
    342     {
    343       "flag": "Benchmark contamination not addressed",
    344       "detail": "The evaluated LLMs (GPT-4o, DeepSeek V2.5) are trained on large code corpora that may include ST code examples similar to the benchmark tasks. No contamination analysis or discussion is provided."
    345     },
    346     {
    347       "flag": "No prompts released",
    348       "detail": "The system heavily relies on prompt engineering (syntax hints, CoT templates, RAG prompts) but no actual prompt text is provided in the paper or appendix, making the core methodology unverifiable and irreproducible from the paper alone."
    349     }
    350   ],
    351   "cited_papers": [
    352     {
    353       "title": "LLM4PLC: Harnessing Large Language Models for Verifiable Programming of PLCs in Industrial Control Systems",
    354       "authors": ["M. Fakih", "R. Dharmaji", "Y. Moghaddas", "G. Quiros", "O. Ogundare", "M. A. Al Faruque"],
    355       "year": 2024,
    356       "relevance": "The primary baseline for Agents4PLC, this paper introduces an LLM-based PLC code generation pipeline with external verification tools, representing the prior state of the art in this domain."
    357     },
    358     {
    359       "title": "ChatDev: Communicative Agents for Software Development",
    360       "authors": ["C. Qian", "W. Liu", "H. Liu", "N. Chen", "Y. Dang", "J. Li", "C. Yang", "W. Chen", "Y. Su", "X. Cong"],
    361       "year": 2024,
    362       "arxiv_id": "2307.07924",
    363       "relevance": "A multi-agent software development system used as a baseline, representing general-purpose LLM-based code generation via agent collaboration."
    364     },
    365     {
    366       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    367       "authors": ["S. Hong", "X. Zheng", "J. Chen", "Y. Cheng", "J. Wang", "C. Zhang"],
    368       "year": 2023,
    369       "arxiv_id": "2308.00352",
    370       "relevance": "Multi-agent framework used as infrastructure for Agents4PLC, relevant to the survey's focus on agentic code generation systems."
    371     },
    372     {
    373       "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving",
    374       "authors": ["M. A. Islam", "M. E. Ali", "M. R. Parvez"],
    375       "year": 2024,
    376       "arxiv_id": "2405.11403",
    377       "relevance": "A four-agent code generation system for competitive programming, cited as a related multi-agent approach, relevant to the survey's focus on LLM-based code generation."
    378     },
    379     {
    380       "title": "AutoSafeCoder: A Multi-Agent Framework for Securing LLM Code Generation through Static Analysis and Fuzz Testing",
    381       "authors": ["A. Nunez", "N. T. Islam", "S. K. Jha", "P. Najafirad"],
    382       "year": 2024,
    383       "arxiv_id": "2409.10737",
    384       "relevance": "A multi-agent framework adding security verification (static analysis and fuzzing) to code generation, related to the verification-augmented agent approach of this paper."
    385     },
    386     {
    387       "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation",
    388       "authors": ["D. Huang", "Q. Bu", "J. M. Zhang", "M. Luck", "H. Cui"],
    389       "year": 2023,
    390       "arxiv_id": "2312.13010",
    391       "relevance": "A three-agent code generation system with iterative testing and feedback, a related approach to the closed-loop verification in Agents4PLC."
    392     },
    393     {
    394       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    395       "authors": ["Q. Wu", "G. Bansal", "J. Zhang", "Y. Wu", "B. Li", "E. Zhu", "L. Jiang", "X. Zhang", "S. Zhang", "J. Liu", "A. H. Awadallah", "R. W. White", "D. Burger", "C. Wang"],
    396       "year": 2024,
    397       "relevance": "A foundational multi-agent framework cited as infrastructure for building LLM-based multi-agent systems, directly relevant to the survey's focus on agentic workflows."
    398     },
    399     {
    400       "title": "Large Language Model-based Agents for Software Engineering: A Survey",
    401       "authors": ["J. Liu", "K. Wang", "Y. Chen", "X. Peng", "Z. Chen", "L. Zhang", "Y. Lou"],
    402       "year": 2024,
    403       "arxiv_id": "2409.02977",
    404       "relevance": "A survey of LLM-based agents for software engineering, directly relevant to the survey scope of agentic AI for programming tasks."
    405     },
    406     {
    407       "title": "Code Llama: Open Foundation Models for Code",
    408       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle", "S. Sootla", "I. Gat", "X. E. Tan", "Y. Adi", "J. Liu", "R. Sauvestre", "T. Remez"],
    409       "year": 2023,
    410       "arxiv_id": "2308.12950",
    411       "relevance": "A code-specialized LLM used as a base model in Agents4PLC, relevant to the survey's coverage of code generation models."
    412     },
    413     {
    414       "title": "Keep the Conversation Going: Fixing 162 out of 337 Bugs for $0.42 each using ChatGPT",
    415       "authors": ["C. S. Xia", "L. Zhang"],
    416       "year": 2023,
    417       "arxiv_id": "2304.00385",
    418       "relevance": "LLM-based automated code repair paper that inspired the CoT-based debugging agent design in Agents4PLC, relevant to the survey's focus on automated debugging with LLMs."
    419     },
    420     {
    421       "title": "A Survey on Large Language Model based Autonomous Agents",
    422       "authors": ["L. Wang", "C. Ma", "X. Feng", "Z. Zhang", "H. Yang", "J. Zhang"],
    423       "year": 2024,
    424       "relevance": "A comprehensive survey on LLM-based autonomous agents, directly relevant to the broader survey scope on agentic AI systems."
    425     },
    426     {
    427       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    428       "authors": ["X. Hou", "Y. Zhao", "Y. Liu", "Z. Yang", "K. Wang", "L. Li", "X. Luo", "D. Lo", "J. Grundy", "H. Wang"],
    429       "year": 2023,
    430       "relevance": "A systematic literature review on LLMs for software engineering, directly relevant to the survey scope and potentially relevant for citation chasing on methodology quality."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs