scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25022B)
      1 {
      2   "paper": {
      3     "title": "LLM-based and Retrieval-Augmented Control Code Generation",
      4     "authors": [
      5       "Heiko Koziolek",
      6       "Sten Grüner",
      7       "Rhaban Hark",
      8       "Virendra Ashiwal",
      9       "Sofia Linsbauer",
     10       "Nafise Eskandani"
     11     ],
     12     "year": 2024,
     13     "venue": "LLM4Code'24 (1st International Workshop on Large Language Models for Code)",
     14     "doi": "10.1145/3643795.3648384"
     15   },
     16   "scan_version": 2,
     17   "active_modules": [],
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository is released: 'LLM-CodeGen-RAG Github Repository' [15] at https://github.com/hkoziolek/LLM-CodeGen-RAG. The paper states: 'We also publish our prompts, raw data, and source code to enable independent replication [15].'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The OSCAT function block library is open-source and publicly available. The paper states 'We also publish our prompts, raw data, and source code to enable independent replication [15],' indicating test prompts and data are in the GitHub repository."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Specific tool versions are mentioned throughout Section 4 (GPT-4-32k version 0613, FAISS-CPU 1.7.4, text-embedding-ada-002, OpenPLC-Editor, PDFPlumberLoader), but there is no requirements.txt, Dockerfile, or dedicated environment setup section. Python version and LangChain version are not specified."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper does not contain step-by-step reproduction instructions. While a GitHub repository is referenced [15], the paper itself lacks a 'Reproducing Results' section or specific commands to replicate the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No confidence intervals or error bars are reported. The evaluation consists of 3 spot-sample tests with qualitative pass/fail assessment and no quantitative metrics with uncertainty."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are used. The paper makes implicit comparisons (retrieval-augmented generation is useful) based on 3 qualitative test cases without any statistical testing."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No effect sizes are reported. The paper claims 'speed up typical programming tasks significantly' (Section 8) without any quantitative measurement of time savings or comparison."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Only 3 spot-sample tests are conducted. The paper acknowledges 'these tests are merely spot samples and not exhaustive' (Section 5) but does not justify why 3 is sufficient for any claims."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance is reported. Each test is run once. Temperature is set to 0 to reduce randomness, but no multi-run analysis is performed to verify reproducibility."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No baselines are included. There is no comparison with non-RAG generation (plain GPT-4 without function block context), manual coding, or other code generation approaches."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No baselines are included at all, so the evaluation contains no contemporary baselines."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The system has multiple components (document chunker, embedder, vector store retriever, prompt template, LLM) but no ablation study is conducted to determine which components contribute to performance."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No formal quantitative metrics are defined. The evaluation checks compilation success and functional correctness qualitatively on 3 examples, but these are not systematically measured across a test suite."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The authors manually inspect generated code for correctness, identify errors (e.g., wrong variable names in Test 1, function vs function block confusion in Test 2), compile the code, and verify functional correctness through OpenPLC simulation with forced input values."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "There is no held-out test set. The prompt template was developed through 'an iterative process' (Section 4), and the same 3 tests are the only evaluation. No separation between development and evaluation data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Only 3 individual test cases are shown. There is no breakdown by category of control logic (e.g., PID control, signal processing, boolean logic) or by complexity level."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Failure cases are discussed in detail: Test 1 had incorrect variable names (OUT_MAX instead of OUT) due to an ambiguous specification document. Test 2 incorrectly instantiated STAIR as a function block instead of a function. Both required manual corrections."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Two of three tests produced code requiring manual corrections. The paper honestly reports these errors: incorrect variable naming from ambiguous specifications (Test 1) and function block vs function confusion (Test 2)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract states 'we successfully generated IEC 61131-3 ST code that integrated the desired function blocks, could be compiled, and validated through simulations.' However, 2 of 3 tests required manual corrections before the code could compile. The abstract omits the manual fix step, overstating the generation quality."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Section 8 claims 'The method can save control engineers significant time' and 'speed up typical programming tasks significantly' without any time measurement or controlled comparison. These are causal claims about productivity improvement with no supporting evidence."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'LLM-based and Retrieval-Augmented Control Code Generation' is broad, but the evaluation uses only GPT-4, one function block library (OSCAT), and 3 hand-crafted prompts. While Section 6 discusses external validity, the claims in the abstract and conclusion are not bounded to the tested configuration."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Section 6 discusses threats to validity but focuses on generic methodological concerns (non-determinism, simple prompts, specific tooling). No specific alternative explanations for the observed results are discussed — e.g., whether GPT-4 already knows OSCAT blocks from training data, making the RAG contribution unclear."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures compilation success and functional correctness on 3 examples, but claims 'significant time savings' (Section 8) without measuring time. Section 5 acknowledges 'comparing the code generation duration with manual implementation are considered future work,' yet Section 8 still claims time savings."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4 specifies 'gpt-4-32k, version 0613' — a specific model version with snapshot date. Also specifies text-embedding-ada-002 for embeddings."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Figure 4 shows the full prompt template structure, and Section 5 provides the actual queries embedded in Tests 1-3. The reader can reconstruct the complete prompts sent to GPT-4."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4 states: 'GPT-4's temperature parameter was set to 0 to produce as deterministic output as possible, other parameters were left on default values.' Embedding model and vector store search type are also specified."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The RAG pipeline is described in detail: Figure 2 shows the full workflow diagram. Section 3 describes each component (Document Loader, Document Chunker, Text Embedder, Vector Store, Retriever, LLM). Section 4 specifies the exact implementations used (LangChain, PDFPlumberLoader, FAISS, RetrievalQA chain)."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4 describes how the OSCAT PDF was loaded (PDFPlumberLoader), chunked (custom regex chunker matching subsection headings), embedded (text-embedding-ada-002), and stored (FAISS-CPU 1.7.4, 4.5 MB database). Page numbers were recovered by text comparison."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 'Threats to Validity' is a dedicated section discussing internal, construct, and external validity threats across several paragraphs."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 discusses threats specific to this study: non-deterministic LLM behavior addressed by temperature=0, prompts being simple compared to practice, OSCAT being open-source and rarely used commercially, OpenPLC being less complex than commercial IDEs, only 3 tests conducted."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6 explicitly states: 'We established internal validity only through three initial tests, but have not yet exhaustively tested the method.' Also: 'control generation prompts are still simple though not artificial, but there is more complex control logic in practice' and 'The OSCAT library is also available as open-source and rarely used in commercial systems.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The paper states 'We also publish our prompts, raw data, and source code to enable independent replication [15]' referencing the GitHub repository. Generated code is also shown in Figures 3, 6, 8, and 10."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4 describes collecting 'over 50 Control Narratives from customers' and manually formulating control code generation prompts. The OSCAT library source is described (open-source, 496-page PDF, 400+ function blocks)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants in this study. The data sources (OSCAT library, customer narratives) are described but do not involve participant recruitment."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Sections 3 and 4 describe the full pipeline: OSCAT PDF → PDFPlumberLoader (page-based strings) → custom regex chunker (per function block) → text-embedding-ada-002 → FAISS vector store → similarity search → prompt augmentation → GPT-4 → code output → OpenPLC import."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source is disclosed. All authors are from ABB Research, implying corporate funding, but no acknowledgments section or funding statement is present."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All authors are listed with 'ABB Research, Germany' affiliation and firstname.lastname@de.abb.com email addresses. The corporate affiliation is clearly disclosed."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "ABB is a major industrial automation company that would directly benefit from automating control code generation. The method is designed for ABB's domain (PLC/DCS control logic with proprietary function block libraries). The funder has a commercial interest in the method succeeding."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement is included. ABB Research employees developing a method for ABB's core business domain (industrial automation) have an undisclosed financial interest in the outcome."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper uses GPT-4-32k (version 0613) to generate code but does not state its training data cutoff. This is relevant because the OSCAT library is publicly available online and could be in GPT-4's training data."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper does not discuss whether GPT-4 has seen OSCAT documentation during training. The paper paradoxically motivates RAG by claiming 'LLMs trained on public Internet data are not aware of such function blocks' while testing with OSCAT, an open-source library available online."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "OSCAT documentation is publicly available at oscat.de and may therefore have been part of GPT-4's training data. The paper does not address whether GPT-4 could already generate OSCAT-based code without RAG, which would undermine the claimed contribution of the retrieval-augmented approach."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The evaluation is purely technical (code generation, compilation, simulation)."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 4 reports: 'Document retrieval including the call of embeddings API was performed in a sub-second time range while the delivery of the complete LLM answer took up to 20 seconds.' Wall-clock latency is provided, though no API cost in dollars or count of tokens consumed is reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated. No GPU hours, total API spend, or hardware specifications are mentioned beyond the per-query latency."
    294       }
    295     }
    296   },
    297   "claims": [
    298     {
    299       "claim": "The retrieval-augmented method can integrate pre-built function blocks into LLM-generated IEC 61131-3 ST control code.",
    300       "evidence": "Three tests (Section 5) show GPT-4 selecting and instantiating OSCAT function blocks (SH_1, FT_AVG, GEN_SIN, STAIR, CTRL_PID, TON) based on RAG-retrieved specifications. All three produced code that compiled and ran correctly, although 2 of the 3 required manual corrections first.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "The generated code could be compiled and validated through simulations.",
    305       "evidence": "Section 5 shows OpenPLC compilation and simulation results (Figures 7, 9, 11). Test 1 required fixing two variable names, Test 2 required changing a function block call to a function call, Test 3 worked without corrections.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "The method can save control engineers significant time in implementing control logic.",
    310       "evidence": "No time measurement or comparison with manual coding is provided. Section 5 states 'Comparing the code generation duration with manual implementation are considered future work.'",
    311       "supported": "unsupported"
    312     },
    313     {
    314       "claim": "GPT-4 correctly identified the required complex function blocks for control code generation queries supported by retrieval-augmented generation.",
    315       "evidence": "In all three tests (Section 5), GPT-4 selected the appropriate OSCAT function blocks (SH_1+FT_AVG, GEN_SIN+STAIR, CTRL_PID+TON) from the vector store retrieval results.",
    316       "supported": "weak"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "case-study"
    321   ],
    322   "key_findings": "The paper demonstrates basic feasibility of using retrieval-augmented generation with GPT-4 to produce IEC 61131-3 Structured Text control code that integrates function blocks from the OSCAT library. In 3 spot-sample tests, GPT-4 correctly identified appropriate function blocks from vector store retrieval, but 2 of 3 tests required manual corrections (variable name errors, function vs function block confusion) before the code could compile. The errors were attributed to ambiguities in the OSCAT specification documents rather than LLM limitations.",
    323   "red_flags": [
    324     {
    325       "flag": "Company evaluating its own potential product",
    326       "detail": "All authors are from ABB Research, a major industrial automation company. The method is designed for ABB's core business domain (control code with proprietary function block libraries). The paper contains no funding or competing-interests disclosure."
    327     },
    328     {
    329       "flag": "Extremely small sample size",
    330       "detail": "Only 3 spot-sample tests are conducted. The paper acknowledges they are 'merely spot samples and not exhaustive' but still makes broad feasibility and time-savings claims based on them."
    331     },
    332     {
    333       "flag": "Abstract overstates results",
    334       "detail": "The abstract states code was 'successfully generated' and 'could be compiled,' but 2 of 3 tests required manual corrections before compilation was possible. The manual correction step is omitted from the abstract."
    335     },
    336     {
    337       "flag": "No baseline comparison",
    338       "detail": "No comparison with plain GPT-4 without RAG, other LLMs, or manual coding. Without a non-RAG baseline, it is impossible to assess whether the retrieval augmentation actually helps or whether GPT-4 already knows OSCAT from training data."
    339     },
    340     {
    341       "flag": "Contamination risk undermines contribution",
    342       "detail": "The paper motivates RAG by claiming LLMs don't know proprietary function blocks, but tests with OSCAT — an open-source library available online. GPT-4 may already know OSCAT blocks from training data, making the RAG contribution unclear without a non-RAG baseline."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "ChatGPT for PLC/DCS Control Logic Generation",
    348       "authors": ["Heiko Koziolek", "Sten Gruener", "Virendra Ashiwal"],
    349       "year": 2023,
    350       "relevance": "Directly evaluates LLM code generation capabilities for IEC 61131-3 control logic with 100 representative prompts."
    351     },
    352     {
    353       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    354       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L Glassman"],
    355       "year": 2022,
    356       "relevance": "User study evaluating LLM-based code generation tool usability, relevant to developer productivity with AI coding tools."
    357     },
    358     {
    359       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    360       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    361       "year": 2023,
    362       "arxiv_id": "2302.06590",
    363       "relevance": "Experimental study with 95 developers showing 55% time savings with GitHub Copilot, key productivity claim in the field."
    364     },
    365     {
    366       "title": "In-IDE code generation from natural language: Promise and challenges",
    367       "authors": ["Frank F Xu", "Bogdan Vasilescu", "Graham Neubig"],
    368       "year": 2022,
    369       "relevance": "Experiments with programmers on IDE-integrated code generation showing mixed results on developer workflow and code quality."
    370     },
    371     {
    372       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    373       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    374       "year": 2023,
    375       "arxiv_id": "2305.01210",
    376       "relevance": "Proposes EvalPlus framework for rigorously evaluating LLM code generation correctness."
    377     },
    378     {
    379       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    380       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    381       "year": 2023,
    382       "arxiv_id": "2308.01861",
    383       "relevance": "Benchmark for evaluating LLM class-level code generation, relevant to code generation evaluation methodology."
    384     },
    385     {
    386       "title": "Retrieval augmented code generation and summarization",
    387       "authors": ["Md Rizwan Parvez", "Wasi Uddin Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"],
    388       "year": 2021,
    389       "arxiv_id": "2108.11601",
    390       "relevance": "REDCODER — retrieval-augmented code generation integrating formerly written code, foundational work for RAG in code generation."
    391     },
    392     {
    393       "title": "ReACC: A retrieval-augmented code completion framework",
    394       "authors": ["Shuai Lu", "Nan Duan", "Hojae Han", "Daya Guo"],
    395       "year": 2022,
    396       "arxiv_id": "2203.07722",
    397       "relevance": "Retrieval-augmented code completion using external context from existing codebases."
    398     },
    399     {
    400       "title": "Impact of code language models on automated program repair",
    401       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    402       "year": 2023,
    403       "arxiv_id": "2302.05020",
    404       "relevance": "Evaluates code language models for automated program repair, relevant to LLM code generation capabilities."
    405     },
    406     {
    407       "title": "Code generation using machine learning: A systematic review",
    408       "authors": ["Enrique Dehaerne", "Bappaditya Dey", "Sandip Halder", "Stefan De Gendt", "Wannes Meert"],
    409       "year": 2022,
    410       "relevance": "Systematic review of 37 publications on ML-based code generation, relevant as a survey of the code generation field."
    411     },
    412     {
    413       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    414       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    415       "year": 2020,
    416       "relevance": "Foundational RAG paper proposing retrieval-augmented generation for knowledge-intensive tasks."
    417     }
    418   ]
    419 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs