ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21726B)


      1 {
      2   "paper": {
      3     "title": "Feedback Loops and Code Perturbations in LLM-based Software Engineering: A Case Study on a C-to-Rust Translation System",
      4     "authors": ["Martin Weiss", "Jesko Hecking-Harbusch", "Jochen Quante", "Matthias Woehrle"],
      5     "year": 2025,
      6     "venue": "arXiv preprint (submitted to IEEE)",
      7     "arxiv_id": "2512.02567"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper. The internal automotive code cannot be released, and no code for the translation system itself is shared."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The benchmark includes 30 internal automotive files that cannot be released. The 20 external files reference prior work but no dataset download link is provided. The paper states 'we are unable to release the full dataset' (Sec. IV-A)."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions tools (rustc, clippy, clang libFuzzer) but not their versions or configuration."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., pass@5 = 0.79). No confidence intervals or error bars are reported on the main results tables."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., 'GPT-4o mini is ~50% better than Phi-4') but no statistical significance tests are reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage point improvements with baseline context, e.g., 'first feedback loop has a high impact and can boost performance by up to 24%' and 'up to 9% points difference' between models, with baseline values visible in tables and figures."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark uses 50 C files with 76 functions. No justification is given for why 50 files is sufficient, and no power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Standard deviation is reported for benchmark file characteristics (Table I) but not for the experimental results themselves. Pass@k values are point estimates without spread measures across runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The no-feedback-loop condition (iteration count <= 1) serves as the baseline. Multiple models are compared against each other. The Identity (no perturbation) condition serves as the baseline for RQ3."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper uses GPT-4o mini, Phi-4, and reasoning models o3-mini and GPT-5 mini, which are contemporary models. GPT-3.5 Turbo is included explicitly as a historical reference."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study systematically varies three factors (feedback loops/iterations, model choice, perturbations) and measures their individual effects. The iteration count comparison (Table III) shows the contribution of each additional feedback loop."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses pass@k for various k values, compilation success rate, and fuzzing success rate as distinct metrics. It also introduces robust pass@k (pessimistic/optimistic views) for perturbation analysis."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "The system's outputs are validated by automated compilation and differential fuzzing checks. Human evaluation of translation quality is not relevant to the claims about success rates."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning training study. The benchmark is used directly for evaluation; there is no training/test split to hold out."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by internal vs. external files (Fig. 3), by perturbation level (Fig. 8), and by model (Fig. 5). Failure analysis is broken down by failure type (Fig. 4)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section VI-B provides detailed error analysis including fuzzing setup failures, fuzzing exceptions, translation system errors, and LLM API errors (Table IV). Specific failure causes are discussed (type conversion issues, macro translation problems)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that large/complex files remain challenging, that LLMCodeExtraction perturbation decreases performance, and that certain internal files consistently fail. The diminishing returns of additional feedback iterations are also reported."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims ('first feedback loop can boost performance by up to 24%', 'up to 50% with both', 'up to 9% points difference' between models) are all supported by results in Sec. V."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about feedback loops improving performance. The experimental design uses controlled single-variable manipulation (varying iteration count while holding other factors constant), which is adequate for these claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section VI-C explicitly states: 'the concrete results on the translation task may not generalize to other LLM-based tasks. They may not even generalize to translation tasks between other language pairs.' The title specifies it as a case study on C-to-Rust."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section VI discusses that performance differences between internal and external files 'might be due to the differences in length' and that LLMCodeExtraction perturbation's impact may be because it 'adds more complexity to the control flow.' The threats section discusses confounds from LLM randomness."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT-3.5 Turbo', 'GPT-4o mini', 'Phi-4', 'o3-mini', and 'GPT-5 mini' without specifying snapshot dates or API versions. These are marketing names without version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The exact prompts are provided in Sec. III: 'Translate the following C code to Rust. Keep all identifiers exactly as they are. <C code>' and the feedback prompt 'You made the following mistakes: <error messages>'. These are the actual prompts used."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section IV-B states 'All models in this paper are used with their default parameters, and we set the temperature to 0.7.' The number of runs (n=20) and max iterations (i=5) are also specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The translation system's scaffolding is described in detail in Sec. III with a diagram (Fig. 1): compilation check, linting via clippy, differential fuzzing, feedback loop with error messages, and the harness generation process."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-A describes the benchmark construction: 30 internal automotive files, 10 open-source files, 10 competitive programming files, sampled from larger sets. Table I provides file characteristics. The perturbation procedures are documented in Sec. IV-C."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VI-C 'Threats to Validity' provides a dedicated subsection discussing internal and external validity threats."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats section discusses specific issues: the impact of errors on internal validity (referring to Sec. VI-B), the evaluation covering only 50 C files, and the possibility that results do not generalize to other language pairs or LLM tasks."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states results 'may not generalize to other LLM-based tasks' and 'may not even generalize to translation tasks between other language pairs.' It also notes the focus on 'files that capture our target domain (embedded source code).'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw experimental data (individual run outcomes, token counts per run) is not available. The internal benchmark files cannot be released."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section IV-A describes the three sources (30 internal automotive, 10 open-source from prior work, 10 competitive programming), the sampling approach, and Table I characterizes the files."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The study uses code benchmarks only."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from C input through translation, compilation check, linting, fuzzing, and feedback is documented in Sec. III. The procedure for running experiments is described in Sec. IV-D. Error cases and their handling are documented in Table IV."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper. Three of four authors are from Bosch Research."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: one from Otto von Guericke University, three from Bosch Research."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Three authors are from Bosch Research, which has a potential interest in demonstrating the viability of LLM-based code translation for automotive software. No funding disclosure is provided to assess independence."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state training data cutoff dates for any of the models used (GPT-3.5 Turbo, GPT-4o mini, Phi-4, o3-mini, GPT-5 mini)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The paper notes that 'Keeping code internal has the benefit that LLMs should not have seen it during training' (Sec. IV-A), acknowledging the contamination concern for the 30 internal files. However, the 20 external files from prior published work are not similarly addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The 20 external benchmark files are from published prior work and could have been in the training data of the models used. This is only partially addressed by noting internal code is unseen; the external code contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Figure 2 and Figure 7 show token consumption versus performance trade-offs. The paper discusses cost-performance trade-offs explicitly, reporting token sums for different iteration counts and models."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total API spend, wall-clock time for the full experiment suite, or total computational budget is stated. Only relative token counts are shown in figures."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The first feedback loop iteration boosts translation success rate by up to 24%.",
    286       "evidence": "Table III shows pass@5 for final result increasing from 0.69 (<=1 iteration) to 0.78 (<=2 iterations) for GPT-4o mini, i.e., +9 percentage points (about 13% relative) at k=5. Sec. V-A discusses this.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Combining feedback loops and multiple runs can boost performance by up to 50%.",
    291       "evidence": "Sec. V-A and Fig. 2 show the combined effect of iterations and k runs on pass@k improvement.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "LLM selection accounts for a difference of up to 9 percentage points in performance.",
    296       "evidence": "Fig. 5 shows pass@k differences across GPT-3.5 Turbo, GPT-4o mini, and Phi-4. Sec. V-B discusses model differences.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "With feedback loops, Phi-4 becomes the best model for k in [4, 11].",
    301       "evidence": "Fig. 5 shows Phi-4 with 5 iterations outperforming GPT models in the k=4 to k=11 range.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "GPT-4o mini is robust against most code perturbations.",
    306       "evidence": "Fig. 8 shows most perturbations cluster around the Identity baseline. Sec. V-C discusses this finding.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Feedback loops significantly improve success rates for both reasoning and non-reasoning models.",
    311       "evidence": "Fig. 7 shows consistent improvement patterns across GPT-4o mini, o3-mini, and GPT-5 mini with the first feedback providing ~20% improvement.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "case-study"],
    316   "key_findings": "Feedback loops in LLM-based C-to-Rust translation systems significantly improve success rates, with the first iteration providing the largest gain (up to 24%). Model selection matters but its impact is reduced when feedback loops are used; notably, the open-source Phi-4 model outperforms GPT models in the mid-range of k runs when feedback is enabled. Code perturbations have minimal impact on translation success for most perturbation types, and diversity from perturbations can be leveraged as data augmentation to boost performance by up to 9%.",
    317   "red_flags": [
    318     {
    319       "flag": "Small benchmark size",
    320       "detail": "Only 50 C files with 76 functions, with 30 being internal (unreproducible). No justification for sample size adequacy."
    321     },
    322     {
    323       "flag": "No statistical significance tests",
    324       "detail": "Comparative claims between models and conditions are made based on point estimates without any statistical tests to confirm differences are not due to chance."
    325     },
    326     {
    327       "flag": "Unreproducible dataset",
    328       "detail": "60% of the benchmark (30 internal automotive files) cannot be released, making independent verification impossible."
    329     },
    330     {
    331       "flag": "Model versions unspecified",
    332       "detail": "All models are referenced by marketing names without snapshot dates or API versions, making exact reproduction impossible."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Large language models for software engineering: A systematic literature review",
    338       "authors": ["X. Hou", "Y. Zhao", "Y. Liu"],
    339       "year": 2024,
    340       "relevance": "Comprehensive survey of LLMs for software engineering tasks, directly relevant to survey scope."
    341     },
    342     {
    343       "title": "Large language models for software engineering: Survey and open problems",
    344       "authors": ["A. Fan", "B. Gokkaya", "M. Harman"],
    345       "year": 2023,
    346       "relevance": "Major survey on LLMs in software engineering from ICSE-FoSE."
    347     },
    348     {
    349       "title": "Evaluating agent-based program repair at Google",
    350       "authors": ["P. Rondon", "R. Wei", "J. Cambronero"],
    351       "year": 2025,
    352       "relevance": "Industrial evaluation of agentic program repair, relevant to agentic AI in SE."
    353     },
    354     {
    355       "title": "Automated unit test improvement using large language models at Meta",
    356       "authors": ["N. Alshahwan", "J. Chheda", "A. Finogenova"],
    357       "year": 2024,
    358       "relevance": "Industrial-scale LLM-based test generation study."
    359     },
    360     {
    361       "title": "Towards translating real-world code with LLMs: A study of translating to Rust",
    362       "authors": ["H. F. Eniser", "H. Zhang", "C. David"],
    363       "year": 2024,
    364       "arxiv_id": "2405.11514",
    365       "relevance": "Directly related C-to-Rust LLM translation work with differential fuzzing approach."
    366     },
    367     {
    368       "title": "VERT: verified equivalent Rust transpilation with few-shot learning",
    369       "authors": ["A. Z. H. Yang", "Y. Takashima", "B. Paulsen"],
    370       "year": 2024,
    371       "arxiv_id": "2404.18852",
    372       "relevance": "C-to-Rust translation using LLMs with formal verification, related generate-and-check approach."
    373     },
    374     {
    375       "title": "How much does AI impact development speed? An enterprise-based randomized controlled trial",
    376       "authors": ["E. Paradis", "K. Grey", "Q. Madison"],
    377       "year": 2025,
    378       "relevance": "RCT measuring AI coding assistant productivity impact in enterprise setting."
    379     },
    380     {
    381       "title": "Do users write more insecure code with AI assistants?",
    382       "authors": ["N. Perry", "M. Srivastava", "D. Kumar", "D. Boneh"],
    383       "year": 2023,
    384       "relevance": "Study on security implications of AI-assisted coding."
    385     },
    386     {
    387       "title": "ReCode: Robustness evaluation of code generation models",
    388       "authors": ["S. Wang", "Z. Li", "H. Qian"],
    389       "year": 2023,
    390       "relevance": "Robustness evaluation methodology for code generation, directly used in this paper."
    391     },
    392     {
    393       "title": "On the robustness of code generation techniques: An empirical study on GitHub Copilot",
    394       "authors": ["A. Mastropaolo", "L. Pascarella", "E. Guglielmi"],
    395       "year": 2023,
    396       "relevance": "Robustness evaluation of AI code generation tools."
    397     },
    398     {
    399       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    400       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    401       "year": 2024,
    402       "relevance": "Major benchmark for evaluating LLM software engineering capabilities."
    403     },
    404     {
    405       "title": "Evaluating large language models trained on code",
    406       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    407       "year": 2021,
    408       "arxiv_id": "2107.03374",
    409       "relevance": "Introduced pass@k metric and HumanEval benchmark, foundational to code generation evaluation."
    410     }
    411   ]
    412 }

Impressum · Datenschutz