scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25595B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM-based and Retrieval-Augmented Control Code Generation",
      6     "authors": [
      7       "Heiko Koziolek",
      8       "Sten Grüner",
      9       "Rhaban Hark",
     10       "Virendra Ashiwal",
     11       "Sofia Linsbauer"
     12     ],
     13     "year": 2024,
     14     "venue": "1st International Workshop on Large Language Models for Code (LLM4Code'24)",
     15     "arxiv_id": null,
     16     "doi": "10.1145/3643795.3648384"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims the method 'speed[s] up typical programming tasks significantly,' but provides no timing data or productivity metrics. The three test cases show successful code generation, but speedup is never quantified.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper implicitly claims RAG improves code generation quality, but provides no comparison group (with/without RAG, other LLMs, or manual implementation). Three spot tests without baselines cannot establish causation.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title and abstract suggest broad applicability, but the evaluation is bounded to one open-source library (OSCAT, rarely used commercially), one notation (ST), one model (GPT-4), and three simple control logic examples. These limitations are discussed in Section 6 but not reflected in the main claims.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss when RAG is necessary versus using an LLM directly, why manual fixes are required (variables not in specs), or when other code generation approaches (model-driven, P&ID-based) might be more suitable.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper conflates 'code compiles successfully and passes simulation' with 'correctly solves the control problem.' For safety-critical industrial automation, compilation is a weak proxy for correctness—semantic and safety properties are not analyzed.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Threats to Validity' provides a dedicated limitations discussion covering internal, construct, and external validity threats.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are noted: temperature=0 to reduce LLM non-determinism (internal), OSCAT library 'rarely used in commercial systems' (construct), and 'only tested on OSCAT library' with external validity uncertain (external).",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Paper explicitly bounds scope to: OSCAT library only, ST notation only (four other IEC 61131-3 notations untested), GPT-4 only, OpenPLC IDE (less complex than commercial), and 'spot samples' not exhaustive testing.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "All authors list 'ABB Research, Germany' but no funding source is explicitly disclosed. No statement indicates whether ABB funded this work or if it was self-directed research.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors' affiliations with ABB Research are clearly listed. The paper evaluates an open-source library (OSCAT), not an ABB product, mitigating some conflict concerns.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "If ABB funded this (implied but not stated), the company has financial interest in demonstrating LLM feasibility for control code generation. Even though the evaluation uses open-source tools, ABB stands to benefit from positive results.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patent disclosures, or financial interest declarations are present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined in context: 'retrieval-augmented generation' (Section 3), 'function blocks' (Section 2 with examples), 'IEC 61131-3' and 'ST' (Section 2), 'control code' as automation-managing software.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contribution is explicitly stated in Section 1: 'We propose an LLM-based and retrieval-augmented control code generation method that can automate many control programming tasks' using vector embeddings and similarity search.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 7 systematically reviews related work (code generation ML surveys, LLM code benchmarks, RAG frameworks for Python/Java, control code generation methods, and prior LLM work on IEC 61131-3) and positions this work as the first to combine RAG with control logic.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Paper states 'we publish our prompts, raw data, and source code to enable independent replication' with GitHub link provided [15].",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "OSCAT library is public, and test prompts are provided, but the 50+ control narratives 'collected from customers' are not indicated as released. Customer data remains proprietary.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Implementation details scattered (Python, LangChain, GPT-4-32k v0613, FAISS-CPU 1.7.4, OpenAI ada-002, OpenPLC) but no formalized specification (requirements.txt, Dockerfile, conda env).",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Paper describes the method pipeline and implementation choices but lacks step-by-step reproduction instructions. GitHub repo may contain setup guidance, but it is not in the paper itself.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Only three test cases reported with qualitative outcomes ('compiled successfully', 'simulation showed expected functionality'). No error bars, confidence intervals, or repeated runs with variance.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": false,
    155           "answer": false,
    156           "justification": "No comparative claims between different methods are made, so significance testing is not applicable.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No metrics on code generation quality, success rate, lines-of-code reduction, or any quantitative effect measure. Success is reported only qualitatively.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Three test cases with no justification for sample size. Paper explicitly states 'these tests are merely spot samples and not exhaustive.'",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Only single runs of three test cases reported. No variance, standard deviation, or range across multiple attempts.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No baselines compared: no GPT-4 without RAG, no other LLMs, no manual implementation timing, no other code generation methods.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "No baselines are included, so this criterion does not apply.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study testing the contribution of RAG by disabling retrieval or comparing against plain LLM prompts.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "Only one implicit metric: code compiles and simulation runs correctly. No metrics on generation time, manual fixes required, code efficiency, or developer productivity.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "This is not a human subjects study. Tests are performed by authors with manual verification, not evaluation of control engineers' preferences or usability.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "Only three test cases shown, all apparently from a small set. No held-out test set for validation. The paper explicitly calls them 'spot samples.'",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Three tests cover different control logic types (sampling, signal generation, PID) but no systematic breakdown of what types pass/fail or error patterns.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Tests 1 and 2 required manual fixes (incorrect variable names), but these are mentioned briefly and not analyzed systematically. No discussion of failure frequency or patterns.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "Two of three tests required manual corrections, indicating the method doesn't always work correctly, but these failures are minimized and framed as 'initial encouraging evidence' rather than limitations.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "GPT-4-32k, version 0613, temperature=0 are all explicitly specified. Specific enough for reproducibility on that model version.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 4 provides a prompt template with placeholders, and Sections 5.1-5.3 provide the exact prompts used for all three test cases.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature=0 is explicitly set to minimize randomness; other parameters 'left on default values' (though defaults could be more explicitly stated).",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "RAG scaffolding is described in detail: vector store retrieval (Section 3, Steps a-c), similarity search, prompt augmentation (Steps 1-4), and Figure 2 diagram showing the full pipeline.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Document loading (PDFPlumber), chunking (custom regex on OSCAT structure), embedding (OpenAI ada-002), and vector storage (FAISS) are all documented in Section 4.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "OSCAT library and code are publicly available, but the 50+ customer control narratives are not clearly indicated as released. Raw input data not fully available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "Paper states 'we collected over 50 Control Narratives from customers' but provides no detail on collection methodology, timeframe, number of organizations, or sampling approach.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "This is not a human subjects study with recruitment; not applicable.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "Pipeline is described (narratives → prompts → queries → code → compilation → simulation) but the critical narrative-to-prompt step was manual: 'we did not implement the proposed Control Narrative Extractor.'",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "GPT-4-32k version 0613 is specified, but OpenAI's training data cutoff date for this model is not stated in the paper.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The three test cases use OSCAT (open-source, likely in training data) and control narratives (proprietary), but potential overlap with GPT-4's training corpus is not discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "This paper evaluates a code generation system, not model capabilities on standard benchmarks; not applicable.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human subjects involved; not applicable.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human subjects involved; not applicable.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human subjects involved; not applicable.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human subjects involved; not applicable.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human subjects involved; not applicable.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human subjects involved; not applicable.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human subjects involved; not applicable.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Latency is reported: 'document retrieval...performed in a sub-second time range while...complete LLM answer took up to 20 seconds.' Monetary cost not reported but latency is relevant.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget or cost estimate provided (API calls, embedding storage, vector search cost).",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Retrieval-augmented generation improves the quality of LLM-based control code generation by integrating proprietary function blocks",
    375       "evidence": "Three test cases where GPT-4 with RAG successfully identified and instantiated OSCAT function blocks into compilable ST code",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "The method can speed up typical control programming tasks significantly",
    380       "evidence": "Abstract claims this but paper provides no timing data, comparison to manual implementation, or productivity metrics",
    381       "supported": "unsupported"
    382     },
    383     {
    384       "claim": "GPT-4 can correctly select appropriate function blocks when provided with augmented prompts containing relevant specifications",
    385       "evidence": "All three test cases show GPT-4 selecting correct function blocks (SH_1, FT_AVG in Test 1; GEN_SIN, STAIR in Test 2; CTRL_PID, TON in Test 3)",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Generated control code produces functionally correct behavior as validated through simulation",
    390       "evidence": "Three tests show generated code compiling successfully and producing correct output in OpenPLC simulations",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "The method requires manual fixes in practice due to specification gaps (e.g., missing variable names in documentation)",
    395       "evidence": "Test 1 and Test 2 both required manual variable corrections (OUT_MAX→OUT, STAIR function vs. block)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "The method generalizes beyond OSCAT to other function block libraries and industrial automation domains",
    400       "evidence": "Paper only tested on OSCAT library with simple control logic. External validity explicitly noted as uncertain",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "case-study"
    406   ],
    407   "key_findings": "A retrieval-augmented generation system can be applied to LLM-based control code generation in IEC 61131-3 Structured Text, successfully identifying and instantiating function blocks from specifications into compilable code. However, evaluation is extremely limited (three 'spot sample' test cases) without baselines, systematic metrics, or comparison to non-augmented LLM generation. Two of three tests required manual corrections to generated code, and the practical speedup benefits claimed in the abstract are not substantiated with evidence.",
    408   "red_flags": [
    409     {
    410       "flag": "No baseline comparisons",
    411       "detail": "Paper provides no comparison between RAG-augmented generation and GPT-4 without RAG, making the contribution of retrieval unclear. No comparison to other LLMs, manual implementation, or alternative code generation methods."
    412     },
    413     {
    414       "flag": "Extremely limited evaluation (3 tests)",
    415       "detail": "Paper explicitly acknowledges 'these tests are merely spot samples and not exhaustive' with no systematic evaluation methodology. Results could be cherry-picked and may not reflect typical performance."
    416     },
    417     {
    418       "flag": "Manual fixes required but minimized",
    419       "detail": "Tests 1 and 2 required manual corrections to generated variable names, indicating the method doesn't always produce correct code. These failures are mentioned briefly but not analyzed for frequency or underlying causes."
    420     },
    421     {
    422       "flag": "No quantitative success metrics",
    423       "detail": "Only qualitative observations reported ('code compiled', 'simulation showed expected behavior'). No metrics on success rate, code quality, efficiency, developer time savings, or other quantifiable outcomes."
    424     },
    425     {
    426       "flag": "Undisclosed or implied funding conflict",
    427       "detail": "All authors from ABB Research (industrial automation company with clear interest in LLM code generation) but no explicit funding disclosure. Potential financial interest in positive results."
    428     },
    429     {
    430       "flag": "Limited generalization evidence",
    431       "detail": "Tested only on OSCAT (rarely used in commercial systems), only ST notation (one of five IEC 61131-3 options), only GPT-4, and only simple control logic. Generalization to production scenarios unclear."
    432     },
    433     {
    434       "flag": "No statistical rigor",
    435       "detail": "No error bars, confidence intervals, significance tests, effect sizes, power analysis, or sample size justification for the three test cases."
    436     },
    437     {
    438       "flag": "Customer data not released",
    439       "detail": "50+ control narratives used for prompt generation are collected from customers but not released, limiting reproducibility and data access."
    440     },
    441     {
    442       "flag": "No ablation study",
    443       "detail": "Paper does not test ablations (e.g., RAG on/off, different retrieval strategies, vector store size impact) to isolate contributions of the proposed method."
    444     },
    445     {
    446       "flag": "Incomplete environment specifications",
    447       "detail": "Implementation details scattered across text without formal environment specification. No requirements.txt, Dockerfile, or conda environment file mentioned in paper."
    448     }
    449   ],
    450   "cited_papers": [
    451     {
    452       "title": "A classification framework for automated control code generation in industrial automation",
    453       "relevance": "Directly related survey of control code generation methods (Koziolek et al. 2020)"
    454     },
    455     {
    456       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    457       "relevance": "Foundational RAG technique applied in this work (Lewis et al. 2020)"
    458     },
    459     {
    460       "title": "Retrieval-augmented generation for code summarization via hybrid GNN",
    461       "relevance": "Prior work applying RAG to code summarization for Python/Java (Liu et al. 2020)"
    462     },
    463     {
    464       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    465       "relevance": "Empirical study of LLM code generation productivity benefits (Peng et al. 2023)"
    466     },
    467     {
    468       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    469       "relevance": "Evaluation framework for LLM-based code generation tools (Vaithilingam et al. 2022)"
    470     },
    471     {
    472       "title": "In-IDE code generation from natural language: Promise and challenges",
    473       "relevance": "Study of LLM code generation usability and developer experience (Xu et al. 2022)"
    474     }
    475   ],
    476   "engagement_factors": {
    477     "practical_relevance": {
    478       "score": 2,
    479       "justification": "Control code generation for industrial automation is a real, high-value problem affecting power plants and chemical processing, but the method remains unvalidated at scale."
    480     },
    481     "surprise_contrarian": {
    482       "score": 0,
    483       "justification": "RAG for code generation is an established technique; applying it to IEC 61131-3 is a minor domain extension without conceptual novelty."
    484     },
    485     "fear_safety": {
    486       "score": 1,
    487       "justification": "Control code manages critical infrastructure (power plants, chemical reactors) but paper does not discuss safety implications, validation rigor, or failure modes in production."
    488     },
    489     "drama_conflict": {
    490       "score": 0,
    491       "justification": "No controversial claims, competing approaches dismissed, or high-stakes debates. A straightforward feasibility study with no controversy."
    492     },
    493     "demo_ability": {
    494       "score": 2,
    495       "justification": "Requires setup of LLMs, vector stores, and function block libraries, but code and OSCAT are publicly available. Reproducible by practitioners with moderate effort."
    496     },
    497     "brand_recognition": {
    498       "score": 1,
    499       "justification": "ABB is a well-known industrial automation company, but this is a workshop paper (LLM4Code'24) with limited venue prestige."
    500     }
    501   },
    502   "hn_data": {
    503     "threads": [],
    504     "top_points": 0,
    505     "total_points": 0,
    506     "total_comments": 0
    507   }
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs