scan.json (28727B)
1 { 2 "paper": { 3 "title": "Across Programming Language Silos: A Study on Cross-Lingual Retrieval-augmented Code Generation", 4 "authors": [ 5 "Qiming Zhu", 6 "Jialun Cao", 7 "Xuanang Chen", 8 "Yaojie Lu", 9 "Hongyu Lin", 10 "Xianpei Han", 11 "Le Sun", 12 "Shing-Chi Cheung" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2506.03535" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Section X (Data Availability) provides a URL: https://anonymous.4open.science/r/Cross-Lingual-RACG-0F3C. The paper states 'We released the artifact and all experiment data' at this link. Note: this is an anonymous submission link, so long-term accessibility is uncertain." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The constructed 13,910-instance Multilingual Code Dataset Expansion is released at the same artifact URL (Section X). HumanEval-X is a publicly available dataset used for the parallel multilingual subset." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions specific model names and greedy decoding settings (Section II.E), but provides no requirements.txt, Dockerfile, or library version list. There is insufficient detail to recreate the environment." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper references the artifact URL but provides no step-by-step reproduction instructions within the paper itself. No README commands or scripted replication pipeline are described in the text." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper reports only point estimates (Pass@K values) across all tables. No confidence intervals, error bars, or standard errors are provided for any main result. 
Standard deviations of per-language scores appear in some tables but represent cross-language spread, not uncertainty over runs." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper makes comparative claims throughout (e.g., 'Java demonstrates superior cross-lingual utility over Python', 'multi-lingual LLMs achieve greater improvements compared to mono-lingual LLMs') but applies no statistical significance tests. All comparisons are based on direct number comparisons." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The schema states that 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper's tables consistently show both baseline and enhanced absolute Pass@K values alongside percentage improvements (e.g., Table II: baseline 55.28 → 95.85, '73% up'). This provides the magnitude of effect with full baseline context. While no formal effect size metric like Cohen's d is used, the schema explicitly allows 'percentage improvement with baseline context' as sufficient." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The dataset contains approximately 1,000 instances per language (Section II.C), but no power analysis or explicit justification for this sample size is provided. The paper states it was an extension of existing resources, not a principled sample size determination." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Experiments use greedy decoding (temperature=0.0) producing deterministic single-run outputs (Section II.E). No variance across runs is reported because only one run is conducted. While averaging across 5 models is performed, no spread measures (std dev, IQR) are provided for the aggregated results." 
66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper consistently compares against 'Baseline (without RAG/injection)' for all experiments across Tables II-VI, IX. Retrieval strategies are compared against each other (BM25, BGE-large-en-v1.5, CodeRankEmbed) in Table X." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The baselines include the no-RAG baseline (which represents current LLM capability), and CodeRankEmbed (2024 arXiv) is used as the state-of-the-art retriever. The models tested (Qwen2.5-Coder-7B, DeepSeek-Coder-6.7B) are from 2024, which is contemporary." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper conducts ablation-style experiments comparing Doc vs. Doc w/o NL settings (removing NL comments from corpus), comparing knowledge injection vs. full RACG pipeline, and comparing mono-lingual vs. multi-lingual LLMs. These systematically isolate components." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses Pass@K for generation evaluation and Precision@K and Recall@K for retrieval evaluation (Section II.D, Table X). Multiple metrics are reported across different experimental aspects." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is a benchmark evaluation using automated test case execution (Pass@K). Human evaluation of the generated code outputs is not applicable given the use of executable unit tests as ground truth." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The study uses a retrieval corpus of reference solutions and evaluates generation on separate test queries with verified unit tests. 
The dataset construction (Section II.C) describes that each instance has test cases for evaluation, distinct from the retrieval corpus documents." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "All result tables provide per-programming-language breakdowns (e.g., Tables III, V, VI show results for all 13 PLs separately). The paper explicitly analyzes language-specific performance variations." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses failure modes including: PHP and Scala corpus degrading performance (Table V), Python mono-lingual LLMs suffering native language degradation with cross-lingual RACG, and Perl corpus causing widespread performance drops for mono-lingual LLMs (Table V). Section VIII also acknowledges limitations." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports multiple negative results: cross-lingual RACG can degrade performance for mono-lingual LLMs on their native language (e.g., -4.57% for Python in Table V), some corpora (PHP, Scala) cause net-negative effects, and Go corpus produces widespread degradation for mono-lingual LLMs (Table V)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The four key insights in the abstract (effectiveness, inequality, robustness, specialization) are all supported by empirical results in the paper. Tables II-X provide quantitative evidence for each claim. The abstract does not make claims beyond what the results show." 
120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper makes causal claims primarily through controlled experiments: the knowledge injection setting (Section II.B.1) explicitly isolates the retrieval variable using an oracle retriever, and adversarial attacks (RQ3) directly manipulate corpus documents to measure causal effects on generation. These are adequate for the causal inferences drawn." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper tests only models up to 7B parameters but draws broader conclusions about 'multi-lingual RACG systems' and 'reliable multi-lingual code assistants.' Section VIII acknowledges model size as a threat but the abstract and conclusion still make broad claims. The conclusions about Java vs. Python utility may not generalize to larger models." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper presents one interpretation for most findings without systematically considering alternatives. For example, the claim that 'multi-lingual LLMs rely on internal knowledge to resist adversarial perturbations' is asserted without ruling out alternatives (e.g., distributional shift). Section VIII focuses on experimental limitations rather than alternative explanations for results." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section II.E specifies models by exact names: CodeLlama-7B-Instruct, Deepseek-Coder-6.7B-Instruct, Qwen2.5-Coder-7B-Instruct, Phi-1, Phi-1.5, and CodeRankEmbed for retrieval. These are sufficiently specific model identifiers (matching HuggingFace/arxiv model names)." 
142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "Section II.E states 'we use a unified prompt template for LLMs, following the design in [19] and [43]' but no actual prompt text is provided in the paper or appendix. Readers cannot reconstruct the exact prompts sent to the models without reading the referenced papers." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section II.E states 'we implement greedy decoding with the temperature 0.0' and fixes the retrieval window size to 3 (top-K=3). A fixed random seed of 42 is also reported for perturbation experiments. Key inference hyperparameters are reported." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The RACG pipeline is described in Section II: a retriever (CodeRankEmbed) retrieves top-K documents from a corpus, which are injected into a prompt for the generation LLM. Figure 1 illustrates the pipeline construction and four experimental settings. The scaffold is sufficiently described." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section II.C describes the dataset construction process: using HumanEval-X (164 problems), extending from existing sources [43][44], generating reference solutions using Qwen2-72B-Instruct-GPTQ-Int4 with RACG, verifying through unit testing across 5 iterations. The pipeline and per-language counts are documented." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section VIII (Threats to Validity) is a dedicated section discussing methodological limitations, including model representativeness bias and deterministic perturbation outcomes from fixed random seeds." 
169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section VIII identifies two specific threats: (1) the models tested (none exceeding 7B parameters) may not represent broader LLM landscape, and (2) fixed random seed (42) could limit perturbation diversity. Both are specific to this study and include mitigation measures attempted." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "While Section VIII acknowledges model size limitations, the paper does not explicitly state what the results do NOT show (e.g., it does not state that conclusions may not apply to larger models or commercial APIs). The conclusion broadly claims to 'establish foundational insights for designing more powerful and safer code intelligence' without bounding these claims." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The artifact URL (Section X) includes 'all experiment data' including the constructed 13,910-instance dataset. Raw dataset instances can be independently verified." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section II.C describes how the dataset was constructed: starting from HumanEval-X and extensions of existing datasets [43][44], generating reference solutions using Qwen2-72B-Instruct-GPTQ-Int4 via multi-lingual RACG, and verifying through unit testing over 5 iterations. Collection procedure is clear." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants or crowd-sourced recruitment. The data source is existing benchmark datasets (HumanEval-X, McEval, MultiPL-E) plus LLM-generated solutions verified by unit tests." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section II.C documents the full pipeline: starting datasets → LLM generation of missing reference solutions → unit test verification over 5 iterations → final 13,910 instances. Per-language counts are provided (e.g., Python: 1181, Kotlin: 1071, etc.)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No acknowledgments section or funding disclosure appears in the paper. There is no mention of grants, corporate sponsors, or funding agencies." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: Chinese Information Processing Laboratory (Institute of Software, CAS), University of Chinese Academy of Sciences, and Hong Kong University of Science and Technology. No obvious conflict with any evaluated model vendor." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "The schema says 'NA if unfunded.' An earlier annotation pass (Sonnet) marked applies=false, reasoning that because no funding is disclosed the criterion cannot be assessed. However, the authors are at major academic institutions (Chinese Academy of Sciences, HKUST) where research is typically funded. The absence of a funding disclosure does not mean the work is unfunded — it means we cannot verify funder independence. Since it is not clearly unfunded solo independent work, applies=true, answer=false is more appropriate." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement appears in the paper. The absence of disclosure means this criterion is not satisfied." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper uses HumanEval-X (based on HumanEval, published 2021) as a benchmark and evaluates models like CodeLlama and Qwen2.5-Coder without stating their training data cutoff dates. The training cutoff is not mentioned for any model." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "HumanEval-X is used as a benchmark (164 problems for the parallel dataset, Section II.C). All evaluated models were likely trained on data post-2021, which may include HumanEval problems. No discussion of potential train/test overlap appears in the paper." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "HumanEval was published in 2021 and the models evaluated (CodeLlama-2023, DeepSeek-Coder-2024, Qwen2.5-Coder-2024) have training cutoffs after HumanEval's release. The paper does not address whether HumanEval problems appear in model training data." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants. This is a benchmark evaluation of LLM-based code generation systems." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants. This is a benchmark evaluation study with no human subjects." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants. This is not an experimental study with human subjects." 
267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants. Blinding is not applicable to automated benchmark evaluation." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper does not report inference costs, API costs, tokens consumed, or wall-clock time for running the experiments. Given the scale (13,910 instances × multiple models × multiple settings), this is a significant omission." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No GPU hours, hardware specifications, or total compute budget are stated. The paper used Qwen2-72B-Instruct-GPTQ-Int4 for dataset generation and multiple 7B models for evaluation, but no compute costs are quantified." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "Multi-lingual RACG significantly enhances multi-lingual code LLM generation, achieving Pass@K mean scores of 95.85% (+73%) for multi-lingual LLMs and 89.45% (+345%) for mono-lingual LLMs in same-language knowledge injection.", 295 "evidence": "Table II shows knowledge injection results on Parallel Multilingual Code Dataset for both multi-lingual and mono-lingual LLMs across 5 programming languages.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "Java demonstrates superior cross-lingual utility over Python as a retrieval corpus in RACG, with Java→others average improvement of +16.08%/+13.35% compared to Python→others: +8.29%/+6.30%.", 300 "evidence": "Tables IV, V, VI show cross-lingual RACG performance across all language pairs for both multi-lingual and mono-lingual LLMs. 
Section IV explicitly states the Java vs Python comparison.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Adversarial attacks significantly degrade mono-lingual RACG (Pass@K dropping by up to 87% for Python and 88% for Java), but cross-lingual RACG demonstrates inherent robustness with mitigated impacts.", 305 "evidence": "Tables VIII and IX show adversarial attack performance degradation for multi-lingual and mono-lingual LLMs across all perturbation types and language pairs.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "Domain-specific code retrievers (CodeRankEmbed) achieve approximately 91.60% Precision@5 and 88.04% Recall@20, substantially outperforming general text embeddings (BGE-large: ~56%, ~46%) and sparse retrieval (BM25: ~6.57%, ~4.79%).", 310 "evidence": "Table X directly compares three retrieval strategies on the Multilingual Code Dataset Expansion in Doc w/o NL setting.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "Perturbed code documentation can paradoxically improve generation quality in cross-lingual scenarios, with 3,520 cases where syntax perturbations enabled correct code generation that correct documentation or internal knowledge could not.", 315 "evidence": "Section V and Figure 2 present a Venn diagram analysis of 3,520 cases where perturbed documentation helped but correct documentation did not. The analysis is descriptive without statistical testing.", 316 "supported": "moderate" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval", 321 "observational" 322 ], 323 "key_findings": "This study constructs a 13,910-instance multilingual code dataset spanning 13 programming languages and evaluates cross-lingual retrieval-augmented code generation (RACG). 
Key findings are: (1) same-language corpus significantly enhances generation, with oracle injection reaching 95%+ Pass@1 for multi-lingual LLMs; (2) Java outperforms Python as a cross-lingual retrieval corpus despite Python's dominance in training data; (3) adversarial attacks severely degrade mono-lingual RACG but have mitigated effects in cross-lingual settings; and (4) domain-specific code retrievers vastly outperform general-purpose and sparse retrievers (approximately 91.60% vs. 56% vs. 6.57% Precision@5).", 324 "red_flags": [ 325 { 326 "flag": "No statistical significance testing", 327 "detail": "All comparative claims (e.g., Java outperforms Python, multi-lingual LLMs are more robust than mono-lingual LLMs) are made based solely on comparing point estimates without any statistical significance tests. Given the relatively small parallel dataset (164 HumanEval-X problems), the differences may not be statistically meaningful." 328 }, 329 { 330 "flag": "Benchmark contamination not addressed", 331 "detail": "The paper uses HumanEval-X (derived from HumanEval, published 2021) as a core evaluation benchmark, but all tested models (CodeLlama-2023, DeepSeek-Coder-2024, Qwen2.5-Coder-2024) have training cutoffs after HumanEval's release. The paper does not discuss the risk that test problems appeared in training data, which could inflate performance estimates." 332 }, 333 { 334 "flag": "Single-run deterministic results, no variance", 335 "detail": "All experiments use greedy decoding (temperature=0.0) producing single deterministic outputs. While this is reproducible, reporting no variance across any experimental condition makes it impossible to assess result stability or whether differences are within noise thresholds." 336 }, 337 { 338 "flag": "Dataset generated by LLM rather than human-verified", 339 "detail": "The Multilingual Code Dataset Expansion (13,910 instances) uses Qwen2-72B-Instruct-GPTQ-Int4 to generate reference solutions, verified only by unit tests. 
LLM-generated reference solutions may introduce systematic biases, and unit test passage does not guarantee correctness for all edge cases." 340 }, 341 { 342 "flag": "Generalization beyond 7B models not bounded", 343 "detail": "None of the generation models tested exceeds 7B parameters (CodeLlama-7B-Instruct, Deepseek-Coder-6.7B-Instruct, Qwen2.5-Coder-7B-Instruct, Phi-1, Phi-1.5). Although Section VIII lists model size as a threat to validity, the abstract and conclusion still draw broad conclusions about 'multi-lingual RACG systems' and 'code assistants' without bounding them, so the findings may not transfer to larger models (GPT-4, Claude, Gemini) or API-based commercial systems." 344 }, 345 { 346 "flag": "No funding disclosed", 347 "detail": "The paper contains no acknowledgments section and no funding disclosure. This makes it impossible to assess potential conflicts of interest, though the academic affiliations suggest no obvious commercial conflict." 348 } 349 ], 350 "cited_papers": [ 351 { 352 "title": "Evaluating large language models trained on code", 353 "authors": [ 354 "M. Chen", 355 "J. Tworek", 356 "H. Jun" 357 ], 358 "year": 2021, 359 "arxiv_id": "2107.03374", 360 "relevance": "Foundational HumanEval benchmark paper introducing Pass@K metric used throughout this study." 361 }, 362 { 363 "title": "Code llama: Open foundation models for code", 364 "authors": [ 365 "B. Roziere", 366 "J. Gehring", 367 "F. Gloeckle" 368 ], 369 "year": 2023, 370 "arxiv_id": "2308.12950", 371 "relevance": "One of the evaluated multi-lingual code LLMs (CodeLlama-7B-Instruct) used in the benchmark evaluation." 372 }, 373 { 374 "title": "CodeRAG-bench: Can retrieval augment code generation?", 375 "authors": [ 376 "Z. Z. Wang", 377 "A. Asai", 378 "X. V. Yu" 379 ], 380 "year": 2025, 381 "relevance": "Most closely related prior work on RACG benchmarking, limited to Python; this paper extends to 13 languages." 382 }, 383 { 384 "title": "Deepseek-coder: When the large language model meets programming – the rise of code intelligence", 385 "authors": [ 386 "D. Guo", 387 "Q. Zhu", 388 "D. 
Yang" 389 ], 390 "year": 2024, 391 "arxiv_id": "2401.14196", 392 "relevance": "Evaluated as one of the multi-lingual code LLMs (DeepSeek-Coder-6.7B-Instruct) in the benchmark study." 393 }, 394 { 395 "title": "Qwen2.5-coder technical report", 396 "authors": [ 397 "B. Hui", 398 "J. Yang", 399 "Z. Cui" 400 ], 401 "year": 2024, 402 "arxiv_id": "2409.12186", 403 "relevance": "Evaluated as one of the multi-lingual code LLMs (Qwen2.5-Coder-7B-Instruct). Note that dataset generation used a different model, Qwen2-72B-Instruct-GPTQ-Int4 (Section II.C)." 404 }, 405 { 406 "title": "Codegeex: A pre-trained model for code generation with multilingual benchmarking on humaneval-x", 407 "authors": [ 408 "Q. Zheng", 409 "X. Xia", 410 "X. Zou" 411 ], 412 "year": 2023, 413 "relevance": "Introduces HumanEval-X, the parallel multilingual benchmark used for the controlled knowledge injection experiments in this paper." 414 }, 415 { 416 "title": "Multipl-e: a scalable and polyglot approach to benchmarking neural code generation", 417 "authors": [ 418 "F. Cassano", 419 "J. Gouwar", 420 "D. Nguyen" 421 ], 422 "year": 2023, 423 "relevance": "Multi-lingual code generation benchmark relevant to evaluating cross-lingual code generation capabilities." 424 }, 425 { 426 "title": "Mceval: Massively multilingual code evaluation", 427 "authors": [ 428 "L. Chai", 429 "S. Liu", 430 "J. Yang" 431 ], 432 "year": 2024, 433 "arxiv_id": "2406.07436", 434 "relevance": "One of the source datasets used to construct the Multilingual Code Dataset Expansion spanning 13 programming languages." 435 }, 436 { 437 "title": "PoisonedRAG: Knowledge corruption attacks to retrieval-augmented generation of large language models", 438 "authors": [ 439 "W. Zou", 440 "R. Geng", 441 "B. Wang" 442 ], 443 "year": 2024, 444 "arxiv_id": "2402.07867", 445 "relevance": "Prior work on adversarial attacks against RAG systems that motivates the adversarial attack experiments in this paper." 446 }, 447 { 448 "title": "Retrieval augmented code generation and summarization", 449 "authors": [ 450 "M. R. 
Parvez", 451 "W. Ahmad", 452 "S. Chakraborty" 453 ], 454 "year": 2021, 455 "relevance": "Classic RACG paper (REDCODER) whose document construction methodology is followed in this study." 456 }, 457 { 458 "title": "An empirical study of retrieval-augmented code generation: Challenges and opportunities", 459 "authors": [ 460 "Z. Yang", 461 "S. Chen", 462 "C. Gao" 463 ], 464 "year": 2025, 465 "relevance": "Related empirical study of RACG relevant to the survey scope on LLM-based software engineering." 466 }, 467 { 468 "title": "How should we build a benchmark? Revisiting 274 code-related benchmarks for LLMs", 469 "authors": [ 470 "J. Cao", 471 "Y.-K. Chan", 472 "Z. Ling" 473 ], 474 "year": 2025, 475 "arxiv_id": "2501.10711", 476 "relevance": "Systematic review of code benchmarks for LLMs, directly relevant to evaluating the quality of code generation benchmarks in the survey." 477 }, 478 { 479 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 480 "authors": [ 481 "C. E. Jimenez", 482 "J. Yang", 483 "A. Wettig" 484 ], 485 "year": 2024, 486 "relevance": "Major benchmark for LLM code generation on real-world software engineering tasks, relevant context for the survey." 487 } 488 ] 489 }