scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30116B)
      1 {
      2   "paper": {
      3     "title": "Prompt Variability Effects On LLM Code Generation",
      4     "authors": [
      5       "Andrei Paleyes",
      6       "Radzim Sendyka",
      7       "Diana Robinson",
      8       "Christian Cabrera",
      9       "Neil D. Lawrence"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2506.10204",
     14     "doi": "10.48550/arXiv.2506.10204"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval", "qualitative"],
     19   "key_findings": "All four LLMs exhibit similar rapid decay in code similarity under keyboard typos augmentation, but are substantially more robust to semantic augmentations (synonyms, paraphrasing). Data contamination significantly reduces measured sensitivity on older LeetCode problems, motivating the use of newer and original tasks. Persona-based evaluation using LLM-simulated user backgrounds (junior engineer, principal engineer, astrophysicist, English teacher) shows qualitative differences in both prompts and generated code, with the largest gaps between non-technical and technical personas.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "An anonymous repository link is provided: https://anonymous.4open.science/r/code-gen-sensitivity-0D19. The abstract states 'we share our code for the benefit of the community.' The custom dataset is also stated to be in this repository."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The custom dataset of 22 programming tasks is available in the repository. LeetCode (Old) is referenced from a public HuggingFace dataset (NyanDoggo/leetcode). LeetCode (New) consists of publicly available recent LeetCode problems."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. The paper mentions using NLPaug and specific LLM APIs but does not provide a reproducible environment specification."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are described in the paper. The pipeline is described conceptually in Section III.A but there are no explicit commands or scripts referenced for reproducing the experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Figures 2, 3, and 4 show 'shaded regions the 95% intervals, calculated from the set of approximately 3400 observations for each rate step' (Section III.E.1)."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are reported. Claims like 'Gemini 2.0 Flash is the most resilient to this augmentation method' (Section III.E.1) are based on visual inspection of plots without formal tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper reports raw TSED similarity scores at various augmentation rates (e.g., 'drops below 0.5 TSED after only 10%') but no formal effect sizes such as Cohen's d or standardized differences between models or conditions."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for the choice of 5 repetitions per request, 20 LeetCode (New) problems, 22 custom problems, or the overall sample size. No power analysis is discussed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "95% confidence intervals are shown as shaded regions in Figures 2-4, calculated from ~3400 observations per augmentation rate step. Each request was repeated 5 times."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The unaugmented prompt (rate=0) serves as the baseline throughout. Four LLMs are compared against each other. Three augmentation methods provide comparative conditions."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The four evaluated models (GPT-4o mini, Claude 3 Haiku, Gemini 2.0 Flash, Llama 3.3 70B) are all contemporary LLMs at the time of writing."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Three augmentation methods (typos, synonyms, paraphrasing) are compared, and three datasets (LeetCode Old, LeetCode New, Our Dataset) are evaluated separately, effectively ablating the augmentation method and data source components."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The quantitative evaluation uses only TSED (Tree Similarity of Edit Distance) as the output similarity metric. BERT Score and Sacre BLEU are used only to validate the paraphrasing augmentation quality, not to evaluate code outputs."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "The persona evaluation is analyzed observationally by the authors, but there is no formal human evaluation of the LLM-generated code outputs (no independent human raters, no rating criteria, no inter-rater reliability)."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The evaluation uses three separate datasets, none used for any model selection or tuning. LeetCode (New) was specifically chosen from March 2025 to avoid contamination. The custom dataset was created independently."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by dataset (LeetCode Old, LeetCode New, Our Dataset) in Figure 4, by augmentation method (typos, synonyms, paraphrasing) in Figures 2-3, and by model across all experiments."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Data contamination on LeetCode (Old) is discussed as a failure mode (Section III.E.2). In the persona evaluation, the English teacher persona received no code for the accounting task — the LLM provided instructions instead (Section IV.C)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The LeetCode (Old) dataset is explicitly reported as unusable for sensitivity evaluation due to data contamination (Section III.B, III.E.2). The English teacher persona failure case is also a negative result."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims (1) a synthetic evaluation pipeline (Section III), (2) a persona-based evaluation (Section IV), (3) experimental evidence on multiple LLMs (Section III.E), and (4) open code (anonymous repo link). All are substantiated in the paper."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper claims that 'user background and knowledge have an observable effect on both the prompt and the code generated from it' (Section IV.C), but the persona evaluation uses LLM-simulated personas rather than real humans. The causal claim about human behavior is not justified by this study design. The paper acknowledges 'future work should also explore this experiment using human participants' but the claim stands in the results section."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section VI.B acknowledges that 'any study on LLMs is bound to be missing most recent releases.' The paper specifies exact models tested and presents results as illustrative rather than universal. The abstract calls it 'experimental evidence illustrating utility of our methods.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "Section VI discusses internal validity (LLM randomness) and external validity (model updates) but does not discuss specific alternative explanations for the findings, such as whether persona-generated prompt differences reflect length/style rather than background knowledge, or whether TSED differences indicate meaningful functional code differences."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly states 'we do not evaluate correctness of generated code, and specifically focus on deviations between the codes generated from the original and altered prompts' (Section III.D), clearly distinguishing what is measured (syntactic code similarity via TSED) from broader claims about code quality."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are listed as 'GPT-4o mini', 'Claude 3 Haiku', 'Gemini 2.0 Flash', 'Llama 3.3 70B' (Section III.E) — marketing names without specific API versions, snapshot dates, or precise model identifiers."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper describes adding 'prefixes and postfixes' to augmented prompts (Section III.E) but does not provide their actual text. Persona descriptions are referenced as being in the repository, not in the paper. One example original prompt is given ('Write a Calculator class...') but augmented prompts and persona instructions are not provided."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Temperature is set to 0 (Section III.E) and repetition count is 5, but no other hyperparameters are reported (max tokens, top-p, system prompts, etc.). These omissions could affect reproducibility."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The paper makes direct API calls to LLMs for code generation."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The augmentation pipeline is described in detail (Section III.A-C): keyboard typos via NLPaug with QWERTY distance, synonyms via NLPaug with WordNet, paraphrasing via Gemini with Sacre BLEU and BERT Score validation. TSED distance computation is described in Section III.D."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section VI 'Threats to Validity' is a dedicated section with subsections on internal validity (VI.A) and external validity (VI.B)."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section VI.A discusses specific threats: LLM output randomness even at temperature 0 (mitigated by ~3400 datapoints per step), and small-scale persona evaluation (mitigated by linguistic analytics validation and repetition). Section VI.B discusses rapid model release cycles as an external validity threat."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper mentions 'we have only considered one-step interactions' and that 'future work should also explore this experiment using human participants' but does not explicitly state what the results do NOT show. These are framed as future directions rather than explicit scope boundaries."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "While code and the custom task dataset are referenced in the anonymous repo, there is no explicit mention of raw LLM outputs (the ~3400 responses per augmentation step) being available for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data sources are clearly described: LeetCode (Old) from HuggingFace, LeetCode (New) from March 2025 problems, custom dataset of 22 tasks spanning simulations, algorithms, data science, application development, and games (Section III.B). Persona descriptions based on LinkedIn profiles and job descriptions (Section IV.A)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants were recruited. Data consists of programming task benchmarks and LLM-generated persona outputs."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The synthetic pipeline is documented step-by-step in Section III.A: generate n reference outputs, iterate augmentation rates, generate augmented outputs, compute pairwise TSED distances, average. The persona pipeline is documented in Section IV.A-B: define personas, generate prompts (5 iterations), sample one prompt per persona, run in ChatGPT and Claude."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source is mentioned anywhere in the paper. No acknowledgments section is present."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are listed as affiliated with the Department of Computer Science and Technology, University of Cambridge, with email addresses provided."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence cannot be assessed. The authors are academic researchers not affiliated with any LLM provider being evaluated, which is favorable, but the absence of any funding statement means this cannot be confirmed."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the four evaluated models. The paper discusses contamination conceptually but does not specify when each model's training data ends."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Data contamination is discussed extensively in Sections III.B and III.E.2. The paper shows empirically that LeetCode (Old) tasks exhibit lower sensitivity due to likely inclusion in training data, citing [12] for detailed contamination analysis."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "The paper explicitly addresses contamination by using LeetCode (New) tasks from March 2025 (chosen as recent enough to postdate the evaluated models' training data, although no cutoff dates are stated) and a custom-created dataset of 22 original tasks. Figure 4 demonstrates the contamination effect on old vs. new datasets."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. The study uses LLM-simulated personas, not real human subjects."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. All evaluations use LLM APIs and LLM-generated persona outputs."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. Personas are LLM-simulated character descriptions, not real participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants recruited or studied."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants. Not an experimental study with human subjects."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants. Not applicable."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants. Not applicable."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "The pipeline makes thousands of API calls across four models and multiple conditions, but no inference cost, API spend, or per-example cost is reported."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No total computational budget, GPU hours, or hardware details are mentioned despite the significant number of LLM API calls required."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Each request is repeated 5 times at temperature 0, and results are reported with 95% intervals showing variance across runs. The paper notes that 'even setting temperature to 0 does not guarantee fully deterministic results' (Section VI.A) and reports this variability."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Section III.E states 'we repeated each request to a LLM 5 times' and 'approximately 3400 datapoints per single augmentation rate step.' Section IV.A states '5 iterations for each persona for each of the three tasks.'"
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search budget is reported. The choice of temperature=0, 5 repetitions, and augmentation rate steps is not justified through any search or sensitivity analysis."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "The paper does not select a 'best' configuration — it reports results across all augmentation rates, methods, datasets, and models comprehensively rather than cherry-picking a best result."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors propose the evaluation pipeline and evaluate it themselves without discussing author-evaluation bias or seeking independent validation of their methodology."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Models of different sizes and compute costs (Llama 3.3 70B vs GPT-4o mini) are compared without any discussion of compute budget differences or performance-per-compute analysis."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper justifies using TSED over BLEU/BERT Score (Section III.D) but does not discuss whether syntactic tree similarity actually measures meaningful code sensitivity. Two syntactically different programs could be functionally equivalent, a validity gap that is not addressed."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. The paper makes direct API calls to LLMs without any agentic scaffolding."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "Temporal leakage is directly addressed: LeetCode (New) uses problems from March 2025 to avoid training data overlap, and the custom dataset was independently created. The paper empirically demonstrates the temporal leakage effect on LeetCode (Old) in Section III.E.2."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup could leak information through the augmented prompts or whether the prompt engineering prefixes/postfixes provide hints not available in real usage."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the three datasets share structural similarities or whether problems within each dataset are independent. The custom tasks span multiple domains but independence is not formally analyzed."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No formal leakage detection method (canary strings, membership inference, n-gram overlap analysis) is applied. The paper relies on task recency and originality rather than a concrete detection method."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "All four LLMs exhibit similar sensitivity to keyboard typos augmentation, with code similarity dropping rapidly between 0.0 and 0.6 augmentation rate and plateauing around 0.3 TSED.",
    371       "evidence": "Figure 2 (left) shows overlapping sensitivity curves for all four models with 95% confidence intervals, aggregated over ~3400 observations per rate step (Section III.E.1).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Synonyms and paraphrasing are much weaker augmentation methods than typos, with LLMs showing greater robustness to these semantic changes.",
    376       "evidence": "Figure 2 (right) shows synonyms only dropping to ~0.5 TSED, and Figure 3 shows similar trends for paraphrasing, compared to ~0.3 for typos (Section III.E.1).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Gemini 2.0 Flash is the most robust to synonym augmentation, never dropping below 0.6 in code similarity.",
    381       "evidence": "Figure 2 (right) shows Gemini 2.0 Flash maintaining higher TSED values than other models under synonym augmentation (Section III.E.1). However, no statistical significance test is reported.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Data contamination causes LLMs to show lower sensitivity on older LeetCode problems compared to newer or original tasks.",
    386       "evidence": "Figure 4 shows LeetCode (Old) maintaining higher TSED across augmentation rates than LeetCode (New) or the custom dataset, with the custom dataset showing highest sensitivity (Section III.E.2). Supported by reference to [12].",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "User background and knowledge have an observable effect on both the prompt generated and the code produced from it.",
    391       "evidence": "Section IV.C describes qualitative differences in code generated for different personas (Flask app for principal engineer vs Python classes for junior engineer; no code generated for English teacher on one task). However, this is based on LLM-simulated personas, not real human participants, and is qualitatively analyzed without formal metrics.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "GPT-4o mini and Gemini 2.0 Flash produce nearly identical code outputs for the same unaltered prompt at temperature 0, while Llama 3.3 and Claude 3 Haiku show higher instability.",
    396       "evidence": "Section III.E.1 reports ~0.9 similarity for GPT-4o mini and Gemini 2.0 Flash at rate=0, versus higher variance for the other two models. Visible in Figures 2-3.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "LLM personas as proxy for real humans",
    403       "detail": "The persona evaluation uses LLM-simulated personas (instructing an LLM to adopt a persona and generate prompts) rather than real human participants, but draws conclusions about how human background affects prompt development and code generation. The paper acknowledges this limitation but the claims still stand in the results section."
    404     },
    405     {
    406       "flag": "No statistical significance tests",
    407       "detail": "Comparative claims about model robustness (e.g., 'Gemini 2.0 Flash is the most resilient') are based on visual inspection of plots without formal statistical tests, despite having sufficient data (~3400 observations per step) to run them."
    408     },
    409     {
    410       "flag": "Single quantitative output metric",
    411       "detail": "TSED is the only code similarity metric used. No functional correctness evaluation is performed. Syntactically different programs may be functionally equivalent, and the paper does not assess whether TSED differences correspond to meaningful behavioral differences in the code."
    412     },
    413     {
    414       "flag": "Small-scale qualitative persona evaluation",
    415       "detail": "The persona evaluation uses only 3 tasks, 4 personas, and 2 LLMs with purely qualitative (observational) analysis. There is no inter-rater reliability and no formal coding scheme, and because only 1 of the 5 generated prompts per persona is randomly sampled, each persona's result rests on a single prompt and is subject to sampling noise."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Evaluating large language models trained on code",
    421       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    422       "year": 2021,
    423       "relevance": "Introduces HumanEval, a foundational benchmark for LLM code generation evaluation."
    424     },
    425     {
    426       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    427       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    428       "year": 2023,
    429       "relevance": "Major benchmark for evaluating LLM ability to resolve real-world software engineering issues."
    430     },
    431     {
    432       "title": "NLPerturbator: Studying the robustness of code LLMs to natural language variations",
    433       "authors": ["Junkai Chen", "Zhenhao Li", "Xing Hu", "Xin Xia"],
    434       "year": 2024,
    435       "arxiv_id": "2406.19783",
    436       "relevance": "Directly related work on robustness of code LLMs to 18 categories of natural language perturbations."
    437     },
    438     {
    439       "title": "LLM performance for code generation on noisy tasks",
    440       "authors": ["Radzim Sendyka", "Christian Cabrera", "Andrei Paleyes", "Diana Robinson", "Neil Lawrence"],
    441       "year": 2025,
    442       "arxiv_id": "2505.23598",
    443       "relevance": "Companion paper from same group studying data contamination effects on LLM code generation evaluation."
    444     },
    445     {
    446       "title": "Effectiveness of symmetric metamorphic relations on validating the stability of code generation LLM",
    447       "authors": ["Patrick Y. P. Chan", "Jacky Keung", "Zhen Yang"],
    448       "year": 2025,
    449       "relevance": "Complementary approach using metamorphic testing to validate stability of code-generating LLMs."
    450     },
    451     {
    452       "title": "HumanEvalComm: Benchmarking the communication competence of code generation for LLMs and LLM agent",
    453       "authors": ["Jie JW Wu", "Fatemeh H. Fard"],
    454       "year": 2024,
    455       "relevance": "Benchmark for evaluating LLM communication competence in code generation dialogues."
    456     },
    457     {
    458       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    459       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    460       "year": 2023,
    461       "relevance": "Rigorous evaluation methodology for LLM-generated code correctness, published at NeurIPS."
    462     },
    463     {
    464       "title": "Program synthesis with large language models",
    465       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    466       "year": 2021,
    467       "arxiv_id": "2108.07732",
    468       "relevance": "Introduces Mostly Basic Programming Problems (MBPP) benchmark for LLM code generation."
    469     },
    470     {
    471       "title": "Quantifying language models' sensitivity to spurious features in prompt design",
    472       "authors": ["Melanie Sclar", "Yejin Choi", "Yulia Tsvetkov", "Alane Suhr"],
    473       "year": 2023,
    474       "relevance": "Studies LLM sensitivity to prompt formatting, closely related to this paper's investigation of prompt variability effects."
    475     },
    476     {
    477       "title": "A natural experiment on LLM data contamination in code generation",
    478       "authors": ["Manley Roberts", "Himanshu Thakur", "Christine Herlihy", "Colin White", "Samuel Dooley"],
    479       "year": 2023,
    480       "relevance": "Studies data contamination in LLM code generation, directly relevant to this paper's contamination analysis."
    481     },
    482     {
    483       "title": "Revisiting code similarity evaluation with abstract syntax tree edit distance",
    484       "authors": ["Yewei Song", "Cedric Lothritz", "Xunzhu Tang", "Tegawendé Bissyandé", "Jacques Klein"],
    485       "year": 2024,
    486       "relevance": "Introduces the TSED metric used as the primary code similarity measure in this paper."
    487     },
    488     {
    489       "title": "Out of the Bleu: how should we assess quality of the code generation models?",
    490       "authors": ["Mikhail Evtikhiev", "Egor Bogomolov", "Yaroslav Sokolov", "Timofey Bryksin"],
    491       "year": 2023,
    492       "relevance": "Evaluates NLP code similarity metrics for LLM-generated code, informing the metric choice in this paper."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs