scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22615B)
      1 {
      2   "paper": {
      3     "title": "Automated Code Generation and Validation for Software Components of Microcontrollers",
      4     "authors": ["Sebastian Haug", "Christoph Böhm", "Daniel Mayer"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2502.18905",
      8     "doi": "10.48550/arXiv.2502.18905"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The paper describes a 'prototypical implementation' in Python but does not release it."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No dataset or code corpus is released. The paper uses a specific STM32F407 codebase for its experiments but does not provide it for download."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper mentions using Python, LangChain, pycparser, FAISS, CMake, and Renode, but provides no version numbers, requirements.txt, Dockerfile, or detailed environment specification."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided. The paper describes the approach at a conceptual level with figures but does not include commands, scripts, or a README for reproducing results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No confidence intervals or error bars are reported. The paper states 'more than 100 iterations' were run but provides no quantitative uncertainty measures on any results."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No statistical significance tests are used. The paper makes claims about consistency and robustness without any formal statistical analysis."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No effect sizes are reported. The paper does not quantify success rates, failure rates, or any measurable outcomes from its 100+ iterations beyond qualitative statements like 'consistently produced syntactically correct and functionally accurate code.'"
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper states 'more than 100 iterations' for each test case but provides no justification for why this number was chosen and no power analysis."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance or standard deviation is reported across the 100+ iterations. The paper mentions 'structural variance was observed in the regenerated code' but does not quantify it."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No baselines are compared against. The paper does not compare its AST+RAG approach to any alternative method (e.g., direct prompting without RAG, STM32CubeIDE's built-in generation, or other code generation tools)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No baselines are included at all, so there are no contemporary baselines to evaluate."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The system has multiple components (AST analysis, RAG retrieval, prompt engineering) but no ablation study is performed to show which components contribute to the results. For example, no comparison of generation with vs. without RAG context."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The only evaluation criteria mentioned are compilation success and functional correctness via Renode tests. No quantitative metrics (e.g., success rate, code similarity, token count, latency) are formally reported as numbers."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of the generated code quality is reported. The paper mentions 'expert feedback' in the conclusion/outlook section but does not describe any systematic human evaluation of generated outputs."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper tests on the same STM32F407 GPIO functions throughout. There is no separation of development and test scenarios, and no held-out functions or microcontrollers used for evaluation."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "While two test cases are described (random element deletion vs. complete HAL deletion), no per-function or per-category quantitative breakdown of results is provided. Results are described only qualitatively."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No failure cases are discussed. The paper reports only success ('consistently produced syntactically correct and functionally accurate code') without discussing any instances where generation failed or produced incorrect code."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No negative results are reported. Every experiment is described as successful, with no mention of approaches that were tried and failed or configurations that did not work."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The abstract claims the approach 'enables autonomous code completion for embedded applications,' but the evidence only covers GPIO operations on a single microcontroller (STM32F407). The claim of enabling 'autonomous code completion for embedded applications' broadly is not supported by the narrow evaluation."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes causal claims such as 'our approach enables autonomous code completion' and 'this approach minimizes the need for manual intervention,' but without baselines or ablation studies, there is no evidence that the AST+RAG design specifically causes these improvements rather than, e.g., the LLM alone."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper tests on a single microcontroller (STM32F407) with only GPIO operations but makes broad claims about 'embedded systems' and 'microcontroller platforms.' The title itself ('Software Components of Microcontrollers') overgeneralizes from one microcontroller and one peripheral type."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations are discussed. For instance, the paper does not consider whether GPT-4o Mini could generate correct HAL code without the RAG component, or whether the success is due to STM32F407 code being abundant in the model's training data."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper mentions 'GPT-4o Mini' in Section III.B and 'ChatGPT-4-turbo' in the threats to validity (Section VI), creating confusion about which model was actually used. Neither is specified with a version identifier or snapshot date (e.g., 'gpt-4o-mini-2024-07-18')."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper shows prompt templates with placeholders (e.g., '{function_name}', '{context}', '{sample_parameters}') in Section III.B but does not provide the actual fill values used in experiments. Some constraint text is shown but truncated with '...' — the reader cannot reconstruct the complete prompts sent to the model."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section III.B states: 'we set the model's temperature to 0 during experimentation.' While this is only one hyperparameter, it is the most impactful one for deterministic generation and is explicitly reported."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper describes the scaffolding in reasonable detail: AST analysis via pycparser to detect missing functions (Section III.C), FAISS-based RAG for context retrieval (Section II.C), LangChain for API orchestration (Section III.B), and the overall workflow in Figures 2-3. The pipeline from detection to generation to validation is documented."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper does not describe how the existing codebase was prepared, how the vector store was populated with embeddings, or what documents were indexed for RAG retrieval. The data preprocessing pipeline from raw code to FAISS index is not documented."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section VI 'Threats to Validity' is a dedicated section discussing limitations of the study."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section VI mentions specific threats: the experiment was conducted using a specific model (ChatGPT-4-turbo) and particular code generation tasks involving HAL deletions; results may not transfer to other programming languages or embedded systems; the rapid evolution of language models may affect reproducibility."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "While the threats to validity section mentions some limitations, the paper does not explicitly state what the results do NOT show. It does not clearly bound the scope to GPIO operations on STM32F407, and the abstract and conclusion suggest broad applicability beyond what was tested."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data from the 100+ iterations is available. The generated code, test logs, compilation results, or Renode output logs are not provided."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The data collection procedure is vaguely described. The paper states 'more than 100 iterations' were run for each test case but provides no details about how results were recorded, what constituted a success or failure, or what specific measurements were taken."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were involved. The study is a technical prototype evaluation using automated code generation and validation."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The pipeline from code generation to validation is described at a high level (Figures 3-4), but the data pipeline for collecting and analyzing experimental results is not documented. No details on how outcomes from 100+ iterations were aggregated or analyzed."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source is disclosed. The acknowledgment section mentions 'Technical resources and project guidance were generously provided by AGSOTEC' but does not disclose financial funding."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: Sebastian Haug and Christoph Böhm from Munich University of Applied Sciences, Daniel Mayer from AGSOTEC GmbH."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "AGSOTEC GmbH provided resources and guidance, and one co-author (Daniel Mayer) is from AGSOTEC. Whether AGSOTEC has a commercial interest in the outcome is not discussed. The funder's independence is not established."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is included in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper does not state the training data cutoff date for GPT-4o Mini. Since the approach uses an LLM to generate code for a well-known microcontroller (STM32F407), the model may have seen STM32 HAL code during training, which would confound the evaluation."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether STM32F407 HAL code exists in GPT-4o Mini's training data. Given that STM32 HAL code is widely available on GitHub, this is a significant concern that is not addressed."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper does not address whether the specific GPIO functions and register configurations for STM32F407 could have been memorized from training data. The model may simply be recalling known HAL implementations rather than generating novel code."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference cost, API cost, or latency information is reported. The paper mentions the number of API calls per iteration (1 for random element deletion, 12 for complete HAL deletion) but does not report actual costs or time."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. The total API spend, hardware used for running the experiments, or wall-clock time for the 100+ iterations is not reported."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "The method consistently generates accurate and functional HAL code for the STM32F407 microcontroller across 100+ iterations.",
    287       "evidence": "Section IV.E states 'The process consistently produced syntactically correct and functionally accurate code' across 'more than 100 iterations' for each test case. No quantitative success rate is provided.",
    288       "supported": "weak"
    289     },
    290     {
    291       "claim": "The approach enables autonomous code completion for embedded applications without developer intervention.",
    292       "evidence": "The abstract claims this, and Section V.A discusses 'feasibility of autonomous code integration.' However, the evaluation is limited to GPIO operations on a single microcontroller with no baselines.",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "The code generation process handles both single-element regeneration and complete HAL regeneration.",
    297       "evidence": "Section IV.B describes two test cases: random element deletion (1 API call) and complete HAL deletion (12 API calls). Both are reported as successful across 100+ iterations, but no failure rates or quantitative metrics are given.",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "The validation process via Renode emulation efficiently verifies functional correctness without physical hardware.",
    302       "evidence": "Sections III.D and IV.D describe the Renode-based validation workflow. The claim about efficiency is plausible but not quantified (no comparison to hardware-based testing time or cost).",
    303       "supported": "moderate"
    304     }
    305   ],
    306   "methodology_tags": ["case-study", "benchmark-eval"],
    307   "key_findings": "The paper proposes an AST+RAG approach for automatically generating hardware abstraction layer (HAL) code for the STM32F407 microcontroller's GPIO operations. The approach uses pycparser to detect missing functions in the codebase and GPT-4o Mini with FAISS-based retrieval to generate them. Across more than 100 iterations of each of two test cases (single element deletion and complete HAL deletion), the authors report consistent generation of compilable and functionally correct code, validated via Renode hardware emulation. No quantitative metrics, baselines, or failure analysis are provided.",
    308   "red_flags": [
    309     {
    310       "flag": "No quantitative results reported",
    311       "detail": "Despite running 100+ iterations per test case, the paper reports no success rates, failure rates, compilation success percentages, or any numerical metrics. All results are described qualitatively (e.g., 'consistently produced syntactically correct code')."
    312     },
    313     {
    314       "flag": "No baselines or comparisons",
    315       "detail": "The paper does not compare the AST+RAG approach against any alternative: not against direct prompting without RAG, not against existing code generation tools, and not against STM32CubeIDE's built-in generation capabilities."
    316     },
    317     {
    318       "flag": "Training data contamination risk",
    319       "detail": "STM32F407 HAL code is widely available on GitHub and almost certainly in GPT-4o Mini's training data. The model may be recalling known HAL implementations rather than demonstrating the value of the AST+RAG approach. This is not discussed."
    320     },
    321     {
    322       "flag": "Model version inconsistency",
    323       "detail": "Section III.B mentions 'GPT-4o Mini' as the model used, but Section VI (Threats to Validity) references 'ChatGPT-4-turbo.' It is unclear which model was actually used in experiments."
    324     },
    325     {
    326       "flag": "No failure cases reported",
    327       "detail": "Across 100+ iterations per test case, no failures are mentioned. This is suspicious for an LLM-based code generation system — even with temperature=0, edge cases and failures would be expected. The complete absence of failure discussion suggests selective reporting."
    328     },
    329     {
    330       "flag": "Overgeneralized claims",
    331       "detail": "The paper tests on one microcontroller (STM32F407) with one peripheral type (GPIO) but makes claims about 'embedded applications' broadly and discusses extending to 'other microcontroller families' as if the current results support such generalization."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Outline, Then Details: Syntactically Guided Coarse-To-Fine Code Generation",
    337       "authors": ["W. Zheng", "S. P. Sharan", "A. K. Jaiswal", "K. Wang", "Y. Xi", "D. Xu", "Z. Wang"],
    338       "year": 2023,
    339       "arxiv_id": "2305.00909",
    340       "relevance": "Proposes syntactically guided code generation relevant to structured code generation approaches."
    341     },
    342     {
    343       "title": "Self-Planning Code Generation with Large Language Models",
    344       "authors": ["X. Jiang", "Y. Dong", "L. Wang", "Z. Fang", "Q. Shang", "G. Li", "Z. Jin", "W. Jiao"],
    345       "year": 2024,
    346       "doi": "10.1145/3672456",
    347       "relevance": "Self-planning approach for LLM-based code generation, directly related to structured code generation methodology."
    348     },
    349     {
    350       "title": "Retrieval Augmented Code Generation and Summarization",
    351       "authors": ["M. R. Parvez", "W. Ahmad", "S. Chakraborty", "B. Ray", "K.-W. Chang"],
    352       "year": 2021,
    353       "doi": "10.18653/v1/2021.findings-emnlp.232",
    354       "relevance": "Foundational work on retrieval-augmented generation applied to code, directly relevant to RAG-based code generation evaluation."
    355     },
    356     {
    357       "title": "LLM-based and Retrieval-Augmented Control Code Generation",
    358       "authors": ["H. Koziolek", "S. Grüner", "R. Hark", "V. Ashiwal", "S. Linsbauer", "N. Eskandani"],
    359       "year": 2024,
    360       "relevance": "Applies LLM-based RAG to control code generation, closely related to the survey scope of AI-assisted code generation."
    361     },
    362     {
    363       "title": "ReACC: A Retrieval-Augmented Code Completion Framework",
    364       "authors": ["S. Lu", "N. Duan", "H. Han", "D. Guo", "S.-w. Hwang", "A. Svyatkovskiy"],
    365       "year": 2022,
    366       "arxiv_id": "2203.07722",
    367       "relevance": "Retrieval-augmented code completion framework discussed as closely related work in this paper (no empirical comparison is run against it), relevant to AI code generation evaluation."
    368     },
    369     {
    370       "title": "The Faiss library",
    371       "authors": ["M. Douze", "A. Guzhva", "C. Deng", "J. Johnson", "G. Szilvasy", "P.-E. Mazaré", "M. Lomeli", "L. Hosseini", "H. Jégou"],
    372       "year": 2024,
    373       "arxiv_id": "2401.08281",
    374       "relevance": "Vector similarity search library used as infrastructure for RAG-based code generation approaches."
    375     }
    376   ]
    377 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs