ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25162B)


      1 {
      2   "paper": {
      3     "title": "Bridging LLM-Generated Code and Requirements: Reverse Generation technique and SBC Metric for Developer Insights",
      4     "authors": ["Ahilan Ayyachamy Nadar Ponnusamy"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2502.07835"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'Our code and datasets are available on GitHub: GitHub Repository' and Section 3.2 mentions 'A reference implementation was developed in Python and is available in the associated GitHub repository.' A GitHub link is referenced, though the URL itself is not fully visible in the extracted text."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Our code and datasets are available on GitHub: GitHub Repository.' The 90-requirement dataset is claimed to be available in the GitHub repository."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements.txt, library versions, or dependency details are provided in the paper. The paper mentions PyTorch cos_sim and Sentence Transformers but does not provide version numbers or a reproducible environment specification."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While Section 3.2 describes the high-level steps of the reference implementation, there are no step-by-step reproduction instructions (e.g., commands to run, configuration files to use). The paper leaves the reader to locate the GitHub repository and work out how to run the code on their own."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported for SBC scores or any other results. Results are presented as point estimates in graphs (Figures 2, 4) without uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims models perform 'similarly' and trends are 'closely aligned' (Section 4.2) but provides no statistical tests to support these comparative claims. No p-values, t-tests, or other significance tests are used."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. The paper describes SBC score thresholds qualitatively (e.g., 'above 0.55' for high quality, 'above 0.65' for semantically very close) but provides no systematic effect size analysis or baseline context for interpreting differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The dataset consists of 90 requirements with no justification for why 90 was chosen. No power analysis or discussion of whether 90 requirements is sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Each LLM was run for 3 iterations (Section 4.1), but no standard deviation, variance, or spread measure is reported across these iterations. Section 4.2 states performance was 'consistent with minimal variance' but provides no quantification."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No baseline comparisons are included. The paper does not compare SBC score against existing metrics (BLEU alone, CodeBERTScore, pass@k, etc.) on the same dataset to demonstrate SBC's superiority. It only compares different LLMs using SBC."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included at all, so contemporaneity cannot be assessed. The related work discusses G-EVAL, ICE-Score, and metamorphic prompt testing but never compares SBC against them experimentally."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The SBC score has three components (semantic similarity, BLEU, completeness) with fixed weights (0.7, 0.1, 0.2), but no ablation study is conducted to show that these weights are optimal or that each component contributes meaningfully. The weights are stated as design choices with only verbal justification."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports three component metrics (semantic similarity, BLEU, completeness) along with the composite SBC score, though these are sub-components of the proposed metric rather than independent evaluation dimensions."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is conducted. The paper explicitly acknowledges this gap in Section 5: 'human feedback remains an essential validation mechanism. Future work should incorporate qualitative assessments from software engineers.' This is a significant omission for a metric claiming to align with developer needs."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No held-out test set is used. All 90 requirements are used for evaluation without any train/test split. The SBC weights appear to be set a priori rather than tuned, but there is no validation of this design on held-out data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper provides per-LLM breakdowns (Section 4.1, Figure 2) and the dataset spans multiple technology categories (UI, Data Layer, Business Logic per Section 3.1). The graphs show per-question performance across LLMs."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.1 discusses 'missing elements' (omitted requirements) and 'extra elements' (hallucinations) as failure indicators. Figure 3 shows a specific example with missing and extra elements. Section 5 discusses common failure modes."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results are reported. All models are described as performing similarly and consistently. No failed approaches, sub-optimal configurations, or surprising failures are discussed."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims the metric 'improves the evaluation of AI-generated code' and offers 'a real-time, interpretable scoring system,' but no comparison against existing metrics is provided to demonstrate improvement. The abstract claims the approach benefits 'developers of all experience levels,' but no user study validates this."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims such as 'This integration empowers developers at all levels' (Section 4.3) and 'our approach allows developers to quickly evaluate' (Section 5) without any causal evidence. The claim that SBC 'improves the evaluation of AI-generated code' is causal but unsupported by comparative experiments."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims to bridge 'LLM-Generated Code and Requirements' generally, but the study uses only 90 requirements, 4 specific open models, and a limited set of technologies. No bounding of generalization is stated. Section 5 mentions future expansion but does not bound current claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed. For example, the observation that all LLMs perform similarly could be due to the simplicity of the requirements, the dominance of the semantic similarity weight (0.7), or the small dataset size. None of these alternatives are considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Section 3.3 lists models as 'Codellama 13B', 'Qwen2.5-Coder 14B', 'Deepseek Coder 6.7B', and 'Codestral 22B' with quantization levels (Q4 or Q5) but does not specify exact version identifiers, snapshot dates, or which specific quantization variant was used for each model."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Section 3.2 describes the process as 'passing the generated code back to the LLM with a detailed prompt to reconstruct the requirement' but the actual prompt text is not provided anywhere in the paper or appendix."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4 states 'we conducted all tests with the temperature set to zero.' The SBC weight parameters are also specified (0.7, 0.1, 0.2). However, other LLM parameters (max_tokens, top_p) are not mentioned."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The approach is a straightforward pipeline: generate code, reverse-generate requirements, compute SBC score. There is no agent loop, tool use, or autonomous decision-making."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 3.1 states '90 requirements were curated' across multiple technologies but does not describe the curation process, selection criteria, or how requirements were formulated. The process for converting SBC output to CSV for visualization is mentioned but not documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. Section 5 ('Conclusion and Next Steps') briefly mentions areas for future exploration but does not systematically discuss limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The conclusion mentions that 'human feedback remains an essential validation mechanism' and suggests expanding the dataset, but these are framed as future work rather than current limitations."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show, what types of code generation are excluded, or what limitations exist in the current evaluation."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 4.1 states results were 'recorded in JSON format' and the abstract claims 'Our code and datasets are available on GitHub.' The JSON output includes detailed per-question SBC scores and component breakdowns, suggesting raw data availability."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section 3.1 describes the dataset categories (UI, Data Layer, Business Logic) and states 90 requirements were curated, but does not describe who created the requirements, what criteria were used, or how they were validated as representative of real-world development."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved in this study. The dataset consists of curated programming requirements, not human subjects data."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 documents the pipeline steps: iterate through requirements, invoke LLM for code generation, perform reverse generation, compare using SBC scoring, store results in JSON. Section 3.6 describes the visualization pipeline from JSON to CSV to pivot tables to graphs."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants, sponsors, or funding agencies."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The author lists only a personal email address (ahilanp@gmail.com) with no institutional affiliation disclosed. No university, company, or organizational affiliation is provided."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed. The paper appears to be independent work by a solo author with a personal email, suggesting it may be unfunded."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial disclosure is provided anywhere in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses four LLMs (Codellama 13B, Qwen2.5-Coder 14B, Deepseek Coder 6.7B, Codestral 22B) but does not state training data cutoff dates for any of them."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 3.1 states 'To prevent data contamination, as discussed by [16] and [14], a completely new dataset was created instead of relying on pre-existing benchmark datasets.' This directly addresses train/test overlap by creating novel requirements."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The paper explicitly addresses contamination risk by creating a new dataset rather than using existing benchmarks (Section 3.1). This is a meaningful mitigation strategy, as novel requirements would not appear in training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    273         "justification": "No inference cost, latency, or wall-clock time is reported despite running 4 models x 3 iterations x 90 questions (1,080 generate-and-reverse runs, each requiring at least one code-generation call and one reverse-generation call). The paper claims integration into the development lifecycle but does not quantify the practical cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No compute budget, hardware specifications, or total computation time is stated. The paper uses quantized models locally but does not specify what hardware was used or how long experiments took."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The SBC score combined with reverse generation provides actionable insights for developers by highlighting missing features and hallucinations in LLM-generated code.",
    286       "evidence": "Section 4.1 shows examples of missing and extra elements detected (Figure 3). The paper demonstrates that the method identifies gaps but provides no user study or comparison against alternative approaches to validate 'actionable insights.'",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "All four LLMs perform similarly, with performance trends rising and falling together across different questions.",
    291       "evidence": "Section 4.2 states 'the line graphs for all LLMs are closely aligned' with reference to Figure 4, but no statistical test of similarity is conducted. Only visual inspection of graphs is offered as evidence.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Reverse-generated requirements are of high quality and easy to interpret when SBC scores exceed 0.55.",
    296       "evidence": "Section 4.2 shows sample outputs (Figures 5, 6) for scores above 0.55 and 0.65. This is anecdotal evidence from individual examples, not a systematic validation. No human judges assessed quality or interpretability.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The SBC metric improves the evaluation of AI-generated code compared to traditional metrics like BLEU and ROUGE.",
    301       "evidence": "The abstract and introduction claim improvement over traditional metrics, but no head-to-head comparison is conducted. The paper cites prior work showing BLEU/ROUGE have weak correlations with human judgment but does not demonstrate that SBC has stronger correlation.",
    302       "supported": "unsupported"
    303     },
    304     {
    305       "claim": "Creating a new dataset prevents data contamination issues present in existing benchmarks.",
    306       "evidence": "Section 3.1 states the dataset was created 'to prevent data contamination' rather than using pre-existing benchmarks. This is a reasonable design choice but the claim is only partially supported since no contamination analysis is conducted.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper proposes the SBC (Semantic-BLEU-Completeness) score, a hybrid metric that evaluates LLM-generated code by reverse-generating requirements and comparing them to original specifications. Testing on 4 open-source models (Codellama 13B, Qwen2.5-Coder 14B, Deepseek Coder 6.7B, Codestral 22B) across 90 curated requirements, the paper finds that all models perform similarly with closely aligned trends. The reverse generation approach identifies missing functionality and hallucinations, though no human evaluation or comparison against existing metrics validates SBC's effectiveness.",
    312   "red_flags": [
    313     {
    314       "flag": "No baseline comparison for the proposed metric",
    315       "detail": "The paper proposes SBC as an improvement over BLEU, ROUGE, and CodeBERTScore but never compares SBC against any of these metrics on the same dataset. Without head-to-head comparison, the claim of improvement is unsupported."
    316     },
    317     {
    318       "flag": "No human validation of the metric",
    319       "detail": "The paper claims SBC provides 'actionable insights' and benefits 'developers of all experience levels' but includes no human evaluation whatsoever. The paper acknowledges this gap but proceeds to make strong claims regardless."
    320     },
    321     {
    322       "flag": "Arbitrary fixed weights without empirical justification",
    323       "detail": "The SBC score uses fixed weights (0.7 semantic, 0.1 BLEU, 0.2 completeness) with only verbal justification ('semantic similarity is the most critical factor'). No ablation study, sensitivity analysis, or empirical optimization validates these weights."
    324     },
    325     {
    326       "flag": "Very small dataset with no statistical rigor",
    327       "detail": "90 requirements with 3 iterations per model, but no error bars, standard deviations, confidence intervals, or significance tests. Claims about model similarity and performance trends are based solely on visual inspection of graphs."
    328     },
    329     {
    330       "flag": "Claims substantially outrun the evidence",
    330       "detail": "The paper claims the metric 'improves the evaluation of AI-generated code' and provides 'a real-time, interpretable scoring system' for enterprise integration, but the evidence consists only of running the metric on a small custom dataset with no external validation."
    332     },
    333     {
    334       "flag": "Solo author with no institutional affiliation",
    335       "detail": "The paper lists only a personal email (ahilanp@gmail.com) with no institutional affiliation, no acknowledgments, and no competing interests statement. While this is not inherently problematic, combined with the methodological gaps, it raises questions about peer feedback during development."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Out of the BLEU: How should we assess quality of the code generation models?",
    341       "authors": ["M. Evtikhiev", "A. Pankevich", "V. Zakharov", "D. Chernobrov", "D. Ustalov"],
    342       "year": 2023,
    343       "relevance": "Directly evaluates quality metrics for code generation models, relevant to LLM code evaluation methodology."
    344     },
    345     {
    346       "title": "Evaluating large language models trained on code",
    347       "authors": ["M. Chen"],
    348       "year": 2021,
    349       "arxiv_id": "2107.03374",
    350       "relevance": "Introduces the Codex model and HumanEval benchmark, foundational work for LLM code generation evaluation."
    351     },
    352     {
    353       "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code",
    354       "authors": ["S. Zhou"],
    355       "year": 2023,
    356       "arxiv_id": "2302.05527",
    357       "relevance": "Neural-based evaluation metric for code generation using pretrained models, directly relevant to code quality assessment."
    358     },
    359     {
    360       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    361       "authors": ["P. Vaithilingam", "T. Zhang", "E. L. Glassman"],
    362       "year": 2022,
    363       "relevance": "User study evaluating usability of LLM code generation tools, relevant to developer productivity and AI-assisted coding research."
    364     },
    365     {
    366       "title": "GitHub Copilot AI Pair Programmer: Asset or Liability?",
    367       "authors": ["V. Arghavan", "F. Amin", "Z. Michel"],
    368       "year": 2023,
    369       "arxiv_id": "2206.15331",
    370       "relevance": "Evaluates GitHub Copilot's effectiveness for developers of different experience levels, relevant to AI-assisted coding productivity."
    371     },
    372     {
    373       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    374       "authors": ["W. Tong"],
    375       "year": 2024,
    376       "arxiv_id": "2410.02184",
    377       "relevance": "LLM-based code evaluation framework addressing syntactic variations and alternative solutions in generated code."
    378     },
    379     {
    380       "title": "G-EVAL: NLG Evaluation using GPT-4 with Better Human Alignment",
    381       "authors": ["X. Yang"],
    382       "year": 2023,
    383       "arxiv_id": "2303.16634",
    384       "relevance": "LLM-driven evaluation framework using chain-of-thought, foundational for understanding LLM-based evaluation metrics."
    385     },
    386     {
    387       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    388       "authors": ["Y. Terry"],
    389       "year": 2024,
    390       "relevance": "LLM-based code evaluation metric covering multiple programming languages, directly comparable to SBC approach."
    391     },
    392     {
    393       "title": "Validating LLM-Generated Programs with Metamorphic Prompt Testing",
    394       "authors": ["X. Wang", "D. Zhu"],
    395       "year": 2024,
    396       "arxiv_id": "2406.06864",
    397       "relevance": "Metamorphic testing approach for validating LLM-generated code, relevant to code quality assurance methodology."
    398     },
    399     {
    400       "title": "The Model Openness Framework: Promoting Completeness and Openness for Reproducibility, Transparency, and Usability in Artificial Intelligence",
    401       "authors": ["M. White", "I. Haddad", "C. Osborne", "X.-Y. Liu", "A. Abdelmonsef", "S. Varghese"],
    402       "year": 2024,
    403       "arxiv_id": "2403.13784",
    404       "relevance": "Framework for model openness and reproducibility, relevant to transparency in AI research methodology."
    405     }
    406   ]
    407 }

Impressum · Datenschutz