scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29951B)
      1 {
      2   "paper": {
      3     "title": "A Multi-Language Perspective on the Robustness of LLM Code Generation",
      4     "authors": ["Fazle Rabbi", "Zishuo Ding", "Jinqiu Yang"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2504.19108",
      8     "doi": "10.48550/arXiv.2504.19108"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "LLM code generation robustness varies significantly across programming languages, with Java being most resilient and C++ most fragile under semantics-preserving perturbations. Semantic perturbations (docstrings, function names) cause larger performance drops than formatting changes across all models. Larger model size does not consistently improve robustness. Automated prompt repair via LLM-based docstring fixing yields only marginal recovery (~6-12% of failed cases), effective for syntactic noise but not semantic perturbations.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "A replication package is released at https://github.com/frabbisw/robustextended, explicitly stated in the abstract ('We have compiled and released a dedicated dataset') and in Section 1 ('The source code and benchmark dataset are publicly available')."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The perturbed datasets for Java, C++, and JavaScript are released as part of the replication package. Section 1 contribution (4) states they 'release robustness evaluation datasets for Java, C++, and JavaScript.'"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using HuggingFace's AutoModelForCausalLM API (Section 4.3) but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper itself does not contain a 'Reproducing Results' section or detailed commands to replicate experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 8-15 are reported as point estimates (e.g., RP5@1, RD5@1, RR5@1 values) without any confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 16 reports p-values from Fisher's Exact Test across models, perturbation types, and languages to test statistical significance of robustness drops."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The Robust Drop metric (RD5@1) directly quantifies the magnitude of performance decrease. Results are reported with baseline context throughout Tables 8-15 (e.g., nominal RP5@1 of 0.63 dropping to 0.42 under DocString perturbation for Magicoder-7B on Java)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for using 164 problems from HumanEval-X, why 5 perturbed versions per strategy, or any power analysis. The choice of s=5 is adopted from ReCode without independent justification."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. The paper uses 5 perturbed prompts per strategy but these are different inputs, not repeated runs, and no variance across these is reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Nominal (unperturbed) performance serves as the baseline for measuring robustness drops. Six models are compared against each other and against prior ReCode results for Python."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The model selection includes recent models (Magicoder-S-DS-6.7B from 2023, Qwen2.5-Coder-7B-Instruct from 2024) alongside older ones (Incoder, CodeGen from 2022), providing a reasonable temporal range for comparison."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is an evaluation study, not a system with components to ablate. The per-perturbation-type breakdown serves a similar purpose but is not an ablation of a proposed system."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Three robustness metrics are used: Robust Pass (RPs@1), Robust Drop (RDs@1), and Robust Relative (RRs@1). Results are also reported under both worst-case (5/5) and partial-case (3/5) scenarios."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.4.3 describes human annotation of 300 prompt pairs by two independent annotators, scoring naturalness and semantic similarity on a 5-point Likert scale with Cohen's kappa inter-annotator agreement reported (Table 7)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "No training or fine-tuning was performed (Section 4.3: 'We did not perform any training or fine-tuning; we directly evaluated them in their pretrained form'). The HumanEval-X and EvalPlus-X test cases are therefore never used for model fitting within this study and serve only for evaluation; whether the pretrained models were exposed to them earlier is a separate contamination question."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Tables 12-15 provide detailed per-perturbation breakdowns for all 29 perturbation types across all models and languages. Results are also broken down by perturbation category (DocString, Function Name, Syntax, Format) and by language (Java, C++, JavaScript)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figures 1, 2, and 8 show concrete failure examples. RQ2 (Section 5.2) provides feature-level analysis of what drives failures. RQ3 discusses when prompt repair fails to recover performance."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "RQ3 reports that automated prompt repair 'yields modest gains limited to trivial noise and often fails on semantic perturbations, making it an unreliable robustness strategy.' They also report cases where larger models are less robust than smaller ones."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims to conduct a comparative analysis of robustness across languages, introduce perturbations in four areas, and investigate LLM-based prompt repair. All three are addressed with supporting results in Sections 5.1-5.3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The primary causal claims (perturbations cause performance drops) are supported by a controlled experimental design where semantics-preserving perturbations are the only variable changed. The perturbation framework ensures single-variable manipulation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper explicitly bounds evaluation to Java, C++, and JavaScript, excludes Python and Go with stated reasons (Section 3.2), and names the specific models tested. The title ('A Multi-Language Perspective') appropriately scopes the work rather than claiming universal LLM behavior."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section 7 (Threats to Validity) discusses internal and external threats but only in generic terms (dataset bias, limited number of models). No specific alternative explanations for the observed results are discussed, such as whether language-specific differences in training data volume could explain the Java vs C++ robustness gap."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures pass@k on test suites and frames results as robustness to perturbations, which directly matches the measurement. No proxy gap exists between what is measured (test pass rates under perturbation) and what is claimed (robustness degradation)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model names with parameter counts are provided: Incoder-1B, Incoder-6B, CodeGen-2B-Multi, CodeGen-6B-Multi, Magicoder-S-DS-6.7B, and Qwen2.5-Coder-7B-Instruct (Section 4.2). These are open-source models with fixed weights, making the names sufficient identifiers."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Figure 9 provides the exact prompt used for docstring repair. Code generation prompts are the publicly available HumanEval-X prompts. Perturbation examples are shown in Tables 1-2 and Figures 1, 3, 5."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.3 reports temperature=0.2, top_p=0.95, and maximum generation length=1536 tokens. Language-specific stopping rules are also described."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. Models are evaluated directly via HuggingFace API for code generation from prompts."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.2 describes dataset preparation in detail: converting EvalPlus test cases to Java/C++/JavaScript, updating canonical solutions, handling type incompatibilities (e.g., mixed lists excluded for Java/C++), and maintaining method signatures. Section 3.3 documents the perturbation pipeline."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Threats to Validity' provides a dedicated discussion of internal and external validity threats."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 7 discusses generic threats: 'generating the same codes from the same prompt each time by the code generation model can introduce biases' and 'we use three code generation models and the human-eval-x dataset, which could introduce bias for a limited number of models.' These are boilerplate threats, not specific alternative explanations for the observed language-dependent robustness differences."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly excludes Python (covered by prior work) and Go (less commonly used) from evaluation (Section 3.2). Section 4.2 explains that certain large models were excluded due to computational constraints. The scope is bounded to specific models, languages, and perturbation types."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The replication package at https://github.com/frabbisw/robustextended includes the perturbed datasets and code, enabling independent verification of results."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 describes the data collection process: starting from HumanEval-X, converting EvalPlus test cases to multiple languages, updating canonical solutions. Section 4.1 provides dataset statistics (Table 3)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Section 4.4.3 mentions 'two independent annotators' for the human evaluation of perturbation quality but provides no information about who they are, their qualifications, or how they were selected."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from HumanEval-X through EvalPlus integration, test case conversion, canonical solution updates, and perturbation generation is documented step-by-step in Sections 3.2-3.4, with statistics on the resulting dataset in Table 3."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources or acknowledgments section is present in the paper text. University affiliations are listed but funding is not disclosed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Fazle Rabbi and Jinqiu Yang at Concordia University, Zishuo Ding at The Hong Kong University of Science and Technology. No conflict with evaluated products since all models are third-party."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Cannot assess funder independence since funding is not disclosed. The authors are at universities not affiliated with the evaluated models, suggesting likely independence, but without disclosure this cannot be confirmed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the six evaluated models. This is critical since HumanEval was published in 2021 and several models were trained after this date."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether HumanEval-X problems appeared in the training data of the evaluated models, despite HumanEval being widely known and potentially included in training corpora."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HumanEval was published in July 2021 and is widely available online. Models like QwenCode-2.5 (2024) were trained well after this date and likely encountered HumanEval solutions during training. This contamination risk is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "This is a benchmark evaluation study, not a human subjects study. The two annotators in Section 4.4.3 validate perturbation quality but are not participants in a human study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the main study. The annotation task in Section 4.4.3 is performed by two annotators who validate perturbation quality; they are not study subjects."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the main study design."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the main study design."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the main study design."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the main study design."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the main study design."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference costs, GPU hours, or wall-clock time per experiment are reported despite running six models across thousands of perturbed prompts in three languages."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper mentions excluding large models due to computational constraints ('Due to the excessive size and our inability to run them on our machine') but never states what hardware was used or the total compute budget."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis is reported. The paper uses temperature=0.2 (not deterministic) but does not assess how results vary across random seeds. The 5 perturbed prompts test input variation, not seed sensitivity."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper states k=1 (single generation per prompt) and s=5 (five perturbed prompts per strategy), but does not state whether the full experiment was repeated across multiple independent runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Section 4.3 mentions 'chosen through a small grid search on a held-out subset' but does not report the number of configurations tried or the search budget. The values are said to be 'taken from the ReCode Wang et al. (2022),' creating a contradiction that is not resolved."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper claims hyperparameters were selected via grid search on a held-out subset but provides no details on the validation set, the configurations tried, or the selection criterion."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Table 16 reports 48 Fisher's exact test p-values (4 perturbation types × 4 models × 3 languages) without any correction for multiple comparisons (e.g., Bonferroni, Holm)."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors constructed the EvalPlus-X dataset and perturbation pipeline, then evaluated models on their own benchmark. No discussion of potential bias from evaluating on their own dataset construction."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Models range from 1B to 7B parameters. The paper compares robustness across these scales without discussing or controlling for compute budget differences."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper notes HumanEval test cases are inadequate (motivating EvalPlus-X) but does not discuss whether function-level code completion from docstrings measures real-world code generation robustness, or whether the perturbation types reflect realistic user errors."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are evaluated directly for raw code generation from prompts."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "HumanEval was published in 2021. Models like QwenCode-2.5 (2024) and Magicoder (2023) were trained well after the benchmark was available online. No discussion of whether models may have seen HumanEval solutions during training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides information that would not be available in real usage (e.g., docstring examples that partially reveal the solution)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether HumanEval-X problems are independent from training data or whether similar problems exist in training corpora."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No contamination detection method is applied (no canary strings, membership inference, or n-gram overlap analysis)."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Code generation models exhibit varying robustness across different programming languages, with Java being most resilient, followed by JavaScript, and C++ being most fragile.",
    365       "evidence": "Tables 8-11 show consistent patterns: Java has lowest RD5@1 values across most perturbation types and models, while C++ shows the largest drops (e.g., RD5@1 of 0.80 for C++ vs 0.31 for Java under Syntax perturbation for Incoder-1B). Section 5.1.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Larger model size does not always guarantee better robustness or pass rate across all programming languages.",
    370       "evidence": "Tables 8 and 10 show examples where larger models are less robust (e.g., CodeGen-6B-Multi has RD5@1 of 0.64 on Java Function perturbation vs 0.40 for CodeGen-2B-Multi). Section 5.1.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Stronger test cases from EvalPlus expose substantially more failures than HumanEval-X test cases in both nominal and perturbed settings.",
    375       "evidence": "Comparing Tables 8 (EvalPlus) vs 9 (HumanEval-X) shows lower pass rates with EvalPlus across all models and languages. Section 5.1.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Automated prompt repair via LLM-based docstring fixing yields only partial recovery, averaging ~6-12% of failed cases depending on language.",
    380       "evidence": "Section 5.3 and Figure 10: JavaScript recovers ~11.7% (peak 16.5%), Java ~7.4% (peak 12.2%), C++ ~6.4% (peak 11.6%). New failures are rare (4-6% of cases).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "DocString and function name perturbations cause more severe robustness drops than syntax and format perturbations.",
    385       "evidence": "Tables 8-9 show higher RD5@1 values for DocString (up to 0.90) and Function Name perturbations compared to Format (up to 0.53) and Syntax perturbations. Figure 6 visualizes these differences.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Each programming language has distinct patterns of feature-level sensitivity under perturbations.",
    390       "evidence": "Figure 7 heatmaps show language-specific impact scores: Java is feature-sensitive under function name and docstring perturbations but not syntax; C++ shows moderate steady correlations; JavaScript shows consistently low correlations. Section 5.2.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No contamination analysis",
    397       "detail": "HumanEval was published in 2021 and is widely available online. Models trained after 2021 (especially QwenCode-2.5 from 2024) very likely encountered HumanEval solutions during training. This could inflate nominal pass rates and affect robustness measurements, but the paper never discusses this contamination risk."
    398     },
    399     {
    400       "flag": "No error bars or variance across runs",
    401       "detail": "All results are single-run point estimates. With temperature=0.2 (not deterministic), results could vary across runs, but no variance or confidence intervals are reported for any of the thousands of measurements."
    402     },
    403     {
    404       "flag": "Multiple comparisons without correction",
    405       "detail": "Table 16 reports 48 Fisher's exact test p-values without any correction for multiple comparisons. Several p-values that clear the 0.05 threshold only modestly (e.g., Java DocString for Incoder-6B at 0.01) may not survive correction; a Bonferroni adjustment over 48 tests would require p < 0.05/48, i.e. roughly 0.001."
    406     },
    407     {
    408       "flag": "Limited and partially outdated model selection",
    409       "detail": "Four of six models (Incoder-1B/6B, CodeGen-2B/6B) are from 2022 and small by current standards. No state-of-the-art models (GPT-4, Claude, Llama-3) are evaluated, limiting the contemporary relevance of findings. The exclusion of larger models due to hardware constraints is acknowledged but not mitigated."
    410     },
    411     {
    412       "flag": "RQ3 mitigation tested with single model only",
    413       "detail": "The prompt repair experiment (RQ3) uses only Magicoder-7B for both repair and code generation, making it impossible to determine whether the limited recovery is due to the repair strategy or the model. Testing with multiple models would strengthen the finding."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "ReCode: Robustness Evaluation of Code Generation Models",
    419       "authors": ["Shiqi Wang", "Zheng Li", "Haifeng Qian"],
    420       "year": 2023,
    421       "arxiv_id": "2212.10264",
    422       "relevance": "Primary baseline work: first robustness evaluation benchmark for code generation models, focused on Python only."
    423     },
    424     {
    425       "title": "Evaluating Large Language Models Trained on Code",
    426       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    427       "year": 2021,
    428       "arxiv_id": "2107.03374",
    429       "relevance": "Introduces HumanEval benchmark and Codex model, foundational for code generation evaluation."
    430     },
    431     {
    432       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    433       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    434       "year": 2023,
    435       "arxiv_id": "2305.01210",
    436       "relevance": "Introduces EvalPlus, showing HumanEval test cases are inadequate and stronger tests reveal more failures in LLM-generated code."
    437     },
    438     {
    439       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    440       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"],
    441       "year": 2022,
    442       "arxiv_id": "2203.13474",
    443       "relevance": "Introduces CodeGen family of code generation models evaluated in this study."
    444     },
    445     {
    446       "title": "CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Evaluations on HumanEval-X",
    447       "authors": ["Qinkai Zheng", "Xiao Xia", "Xu Zou"],
    448       "year": 2023,
    449       "arxiv_id": "2303.17568",
    450       "relevance": "Introduces HumanEval-X multilingual benchmark used as the base dataset for this study."
    451     },
    452     {
    453       "title": "Magicoder: Empowering Code Generation with OSS-Instruct",
    454       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    455       "year": 2023,
    456       "arxiv_id": "2312.02120",
    457       "relevance": "Introduces Magicoder model used both for code generation evaluation and as the prompt repair model in RQ3."
    458     },
    459     {
    460       "title": "Qwen2.5-Coder Technical Report",
    461       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    462       "year": 2024,
    463       "arxiv_id": "2409.12186",
    464       "relevance": "Technical report for QwenCode-2.5 model evaluated in this study as one of the stronger contemporary code generation models."
    465     },
    466     {
    467       "title": "On the Robustness of Code Generation Techniques: An Empirical Study on GitHub Copilot",
    468       "authors": ["Antonio Mastropaolo", "Luca Pascarella", "Emanuele Guglielmi"],
    469       "year": 2023,
    470       "doi": "10.1109/ICSE48619.2023.00181",
    471       "relevance": "Empirical study on robustness of GitHub Copilot's code generation, finding sensitivity to small prompt perturbations."
    472     },
    473     {
    474       "title": "On Robustness of Prompt-Based Semantic Parsing with Large Pre-Trained Language Model: An Empirical Study on Codex",
    475       "authors": ["Terry Yue Zhuo", "Zhuang Li", "Yujin Huang"],
    476       "year": 2023,
    477       "arxiv_id": "2301.12868",
    478       "relevance": "Studies prompt-based adversarial robustness of Codex, finding vulnerability to crafted perturbations."
    479     },
    480     {
    481       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    482       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    483       "year": 2020,
    484       "arxiv_id": "2002.08155",
    485       "relevance": "Pre-trained code model used for variable renaming perturbation (VarRenamerCB) in the robustness framework."
    486     },
    487     {
    488       "title": "An Exploratory Study on Fine-Tuning Large Language Models for Secure Code Generation",
    489       "authors": ["Junjie Li", "Fazle Rabbi", "Cheng Cheng"],
    490       "year": 2024,
    491       "arxiv_id": "2408.09078",
    492       "relevance": "Studies fine-tuning LLMs for secure code generation, related to code quality and safety of LLM outputs."
    493     },
    494     {
    495       "title": "InCoder: A Generative Model for Code Infilling and Synthesis",
    496       "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"],
    497       "year": 2022,
    498       "arxiv_id": "2204.05999",
    499       "relevance": "Introduces InCoder models for code infilling and synthesis, two variants evaluated in this study."
    500     }
    501   ]
    502 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs