scan.json (32767B)
1 { 2 "paper": { 3 "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code", 4 "authors": [ 5 "Rangeet Pan", 6 "Ali Reza Ibrahimzada", 7 "Rahul Krishna", 8 "Divya Sankar", 9 "Lambert Pouguem Wassi", 10 "Michele Merler", 11 "Boris Sobolev", 12 "Raju Pavuluri", 13 "Saurabh Sinha", 14 "Reyhaneh Jabbarvand" 15 ], 16 "year": 2024, 17 "venue": "ICSE 2024", 18 "arxiv_id": "2308.03109", 19 "doi": "10.1145/3597503.3639226" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Artifacts including manual labeling and automation scripts are publicly available at GitHub (reference [7]: https://github.com/Intelligent-CAT-Lab/PLTranslationEmpirical). Explicitly stated: 'our artifacts, including manual labeling and automation scripts for evaluating LLMs, are publicly available.'" 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper states the dataset consists of '1,700 code samples in five PLs with 10K+ tests, 43K+ translated code, 1,748 manually labeled bugs, and 1,365 bug-fix pairs' and is publicly available via the artifact website [7]. Additionally, all source benchmarks (CodeNet, Avatar, EvalPlus) are public." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 2 specifies compute resources (16 A100 80GB GPUs) and exact compiler/runtime versions: Python 3.10, g++ 11, GCC Clang 14.0, Java 11, Go 1.20, Rust 1.73, .Net 7.0.14. Model sizes and context windows are in Table 1." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": true, 43 "justification": "The paper provides a public artifact website [7] with 'automation scripts for evaluating LLMs' and states all prompts used are in the replication package. 
Combined with the detailed methodology in §2-§6, this is sufficient for reproduction." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Tables 2-5 and Figures 2, 6-7 are reported as point estimates (percentages) with no confidence intervals or error bars." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims GPT-4 outperforms other models and that prompt crafting improves results, but no significance tests are applied. The only statistical measure mentioned is a correlation coefficient r (0.64-0.85) in §3.1 without a significance test." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Results are reported with baseline context: e.g., 'correct translations ranging from 2.1% to 47.3%' (Table 2), 'improves the success rate of GPT-4, StarCoder, Codegen, and Llama 2 by 12.33%, 3.55%, 2.65%, 1.97%, respectively' (§6). Absolute percentages with baselines provide effect size context." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification for why 200 samples per language in CodeNet, 249/250 in Avatar, or 164 in EvalPlus. No power analysis or sample size rationale is discussed." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "Section 8 explicitly acknowledges: 'for each translation task, we performed the translation once.' Single-run results with no variance, standard deviation, or spread reported." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Section 5 compares LLM-based translation against non-LLM-based approaches: CxGo (C→Go), C2Rust (C→Rust), and JavaToCSharp (Java→C#). Table 5 presents the comparison." 
78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "The LLMs selected include the then-recent GPT-4 (Mar 2023), StarCoder (May 2023), Llama 2 (Jul 2023). Non-LLM baselines (CxGo, C2Rust, JavaToCSharp) are actively maintained open-source tools. Selection rationale is explained in §2." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": false, 87 "justification": "The prompt crafting approach (§6) has identifiable components (source code, incorrect translation, error details, expected behavior) but these are not ablated individually. No experiments isolate the contribution of each prompt component." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Results are broken down by outcome type: compilation error, runtime error, functional error, non-terminating execution, and successful translation (§3.2, Table 3, Figure 2). Bug categories provide additional evaluation dimensions." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": true, 97 "justification": "Eight human labelers manually investigated 1,748 buggy translations over 630 person-hours to create the bug taxonomy (§4.1.1). This is direct human evaluation of the LLM translation outputs." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Evaluation uses established benchmarks (CodeNet, Avatar, EvalPlus) with their provided test suites. No training or tuning is performed — LLMs are evaluated in zero-shot fashion, so the test data is fully independent." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Extensive breakdowns provided: per source/target language (Table 2, Table 3), per model (Figure 2), per bug category (Table 4), per dataset (Table 2), per iteration of prompt crafting (Figure 7)." 
108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Failure analysis is the core contribution. Section 4 presents 15 categories of translation bugs with concrete code examples. Section 4.3 discusses specific failure modes in real-world projects." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Multiple negative results: most LLMs perform poorly (2.1-5.3% success for 4/7 models), all models fail on real-world projects (0% except GPT-4 at 8.1-13.6%), prompt crafting has diminishing returns (iter2 only +1.7%), and some prompt crafting attempts degrade outcomes (§6)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims are supported: 'correct translations ranging from 2.1% to 47.3%' matches Table 2; '15 categories of translation bugs' matches §4; 'prompt-crafting approach... improves performance by 5.5% on average' matches §6; '1,700 code samples from three benchmarks and two real-world projects' matches §2." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The main causal claim is that prompt crafting with error context improves translation (§6). This is supported by a controlled before/after comparison using the same models and data, with the only variable being the prompt content. The design (vanilla → iter1 → iter2) is adequate for this claim." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section 8 explicitly discusses generalization boundaries: results are tied to five specific PLs, seven specific LLMs, and specific datasets. The paper notes 'our selection is drawn from the ranking at the time of our experimentation' and acknowledges these choices may limit generalizability."
135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 8 discusses several alternatives: LLM non-determinism could change success rates, sensitivity to prompt templates could affect results (they followed best practices per model), lack of inter-rater reliability metric could affect taxonomy, and weak test suites could overcount successes." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper measures test-case passing as a proxy for translation correctness and explicitly discusses this limitation in §8 (Construct Validity): 'inadequate or weak test suites can cause buggy translations that pass the test suites to be considered correct.' They note CodeNet has only one test per sample, making it more susceptible." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Table 1 lists model names with parameter counts and release dates, but no exact version identifiers. GPT-4 is listed without a version/snapshot (e.g., 'gpt-4-0613'). Open-source models lack commit hashes or version tags. Only generic names like 'Llama 2 13B' are given." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Figure 1 shows the three vanilla prompting templates for different models (CodeGeeX, GPT-4, other models). Figure 5 shows the complete prompt crafting template. The paper also states 'All the prompts that we used are in the replication package [7].' The fill values ($SOURCE_CODE, $SOURCE_LANG, $TARGET_LANG) are mechanical substitutions from the benchmarks." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": false, 161 "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the seven LLMs. 
Context window sizes are listed in Table 1, but no inference-time hyperparameters are stated." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The approach is direct prompting of LLMs with source code, with no tool use, retry logic, or agent workflows." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 2 describes dataset selection criteria (PL popularity, paradigm coverage, dataset quality), how real-world projects were broken into classes/files, and that all comments were removed to fit context windows. The evaluation setup (compilers, test execution) is also described." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 8 'Threats to Validity' is a dedicated section with three subsections: External Validity, Internal Validity, and Construct Validity, providing substantive discussion." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Specific threats discussed: single translation per task and its impact on success rates (Internal), lack of inter-rater reliability metric for manual labeling (Internal), CodeNet having only one test case per sample (Construct), prompt sensitivity mitigated by following model-specific best practices (Internal), and potential bugs in automation scripts (Internal)." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "External validity section explicitly states scope boundaries: limited to five PLs selected by specific criteria, limited LLMs constrained by 20B parameter budget, specific datasets chosen. Internal validity notes single-run limitation. Construct validity notes test suite adequacy concerns." 
189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The artifact [7] includes 1,700 code samples, 10K+ tests, 43K+ translated code, 1,748 manually labeled bugs, and 1,365 bug-fix pairs. All raw translations and labels are publicly available for verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 2 describes the dataset collection: three benchmarks (CodeNet, Avatar, EvalPlus) selected by stated criteria, two real-world projects (Commons CLI, Click) selected as well-maintained Java/Python projects providing command-line processing APIs. Table 2 provides detailed statistics." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants in the study. The eight labelers who built the bug taxonomy are members of the research team, not recruited participants. Data sources are standard public benchmarks and open-source projects." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The full pipeline is described: dataset collection and selection (§2) → LLM prompting with specific templates (§3) → execution and outcome classification (§3.2) → manual bug labeling in two phases with 8 labelers (§4.1.1) → prompt crafting iteration (§6). Real-world project preprocessing (breaking into files, removing comments) is also documented." 
211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Acknowledgments section states: 'This work is supported by IBM-Illinois Discovery Accelerator Institute and NSF CCF 22-38045 CAR grants.'" 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All author affiliations are prominently listed: 8 authors from IBM Research Yorktown Heights and 2 from University of Illinois Urbana-Champaign. One author was an IBM Research intern." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "IBM funds the research via the IBM-Illinois Discovery Accelerator Institute, and 8/10 authors are IBM Research employees. IBM has commercial interests in code translation and LLM-based developer tools (e.g., watsonx Code Assistant), making the funder not fully independent of the outcome." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial disclosure statement is present in the paper. Given 8/10 authors are IBM employees and IBM has commercial products in the code translation space, this is a notable omission." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for any of the seven LLMs. Table 1 lists release dates but not training data cutoffs. This is critical since CodeNet (2021), Avatar (2021), and EvalPlus (2023) could overlap with training data." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether benchmark examples (CodeNet, Avatar, EvalPlus code samples) appeared in any model's training data. Given these are public datasets and the models were released in 2023, overlap is plausible." 
245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "CodeNet was published in 2021 and Avatar in 2021, before the 2023 release of all seven models (whose training cutoffs are not stated). No discussion of contamination risk despite the high likelihood that these benchmarks are in training data." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in the study. The labelers building the bug taxonomy are researchers/co-authors, not study participants." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants. The study evaluates LLMs on benchmarks and involves researcher-performed manual analysis, not human subjects research." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants. The eight labelers are described as 'researchers or software engineers in the industry' but they are part of the research team, not study participants." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants or experimental conditions involving human subjects." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants or experimental conditions requiring blinding." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in the study."
287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No API costs, tokens consumed, wall-clock time per translation, or cost per example are reported despite performing 43,379 translations including GPT-4 API calls." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "Hardware is mentioned (16 A100 80GB GPUs) but total GPU hours, training time, or total API spend are not quantified. The 630 person-hours for manual labeling is stated but computational budget is not." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Section 8 acknowledges: 'for each translation task, we performed the translation once. Performing a translation task multiple times may change the success rates in translation as LLMs are inherently non-deterministic.' Single-run results only." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 8 explicitly states: 'for each translation task, we performed the translation once.' The number of runs (1) is clearly stated, even though it is a limitation." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget reported. Temperature, sampling parameters, and alternative prompt templates were not systematically explored. The paper follows 'best practices described in the respective artifacts/papers/reports' but reports no search process." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "Prompt templates are justified by following each model's recommended format: 'we followed the templates similar to those we found in the artifacts, papers, or technical reports associated with each model' (§3.1). 
No configuration search was performed, making cherry-picking unlikely." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "The paper makes dozens of comparisons across 7 models, 31 language pairs, and 5 datasets without any statistical testing, let alone multiple comparison correction." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The prompt crafting technique (§6) is the authors' own contribution evaluated against vanilla prompting, but no discussion of author-evaluation bias. The authors also re-implement the evaluation for non-LLM baselines without acknowledging this bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Models range from 13B parameters to GPT-4 (unknown size) with vastly different computational costs. GPT-4 (47.3%) is compared directly to 13B models (2.1-5.3%) without discussing the compute difference. Performance is not reported as a function of compute budget." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "Section 8 (Construct Validity) discusses whether the evaluation actually measures translation quality: test-based assessment vs. static metrics, risk of weak test suites (especially CodeNet with 1 test per sample), and why they chose execution-based over static metrics. Section 3.1 also explains why they rejected static metrics like CodeBLEU." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is used. All models receive direct prompts without agentic workflows, tool use, or scaffolding layers." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed. 
CodeNet (2021), Avatar (2021), and EvalPlus (2023) contain code that could have been in the training data of models released in 2023. No temporal analysis is performed." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Not discussed. The evaluation provides source code in one language and expects translation to another — it is not analyzed whether the models might have seen the target code paired with the source code during training." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "Not discussed. CodeNet problems come from online judge systems (AIZU, AtCoder) where solutions in multiple languages exist publicly. Models could have trained on both source and target language versions of the same problems." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination is mentioned." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "LLMs achieve correct translations ranging from 2.1% to 47.3% across studied models", 374 "evidence": "Table 2 shows total/average success rates: GPT-4 47.3%, StarCoder 14.5%, CodeGen 8.1%, TB-Airoboros 5.3%, Llama 2 3.5%, CodeGeeX 2.8%, TB-Vicuna 2.1% (§3.1)", 375 "supported": "strong" 376 }, 377 { 378 "claim": "77.8% of unsuccessful translations result in compilation errors, indicating LLMs struggle with code syntax", 379 "evidence": "Table 3 and Figure 2 break down unsuccessful translations: 77.8% compilation errors, 8.4% runtime errors, 13.4% functional errors, 0.4% non-terminating (§3.2)", 380 "supported": "strong" 381 }, 382 { 383 "claim": "15 categories of translation bugs organized into 5 groups were identified through manual analysis", 384 "evidence": "630 person-hours of manual labeling by 8 human labelers examining 1,748 buggy GPT-4 translations across 31 language pairs (§4.1). Taxonomy presented in Figure 3 and Table 4.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "More than one-third (33.5%) of translation bugs are data-related bugs", 389 "evidence": "Table 4 shows data-related bugs (C1+C2+C3) account for 33.5% of all bugs, with incorrect input parsing (18.1%) and incorrect data type (11.5%) being the most common (§4.2)", 390 "supported": "strong" 391 }, 392 { 393 "claim": "LLMs fail to translate real-world projects effectively", 394 "evidence": "Table 2: Commons CLI Java→Python success rate 13.6% for GPT-4, 0% for all others. Click Python→Java: 0% for all models including GPT-4 (§3.1, §4.3)", 395 "supported": "strong" 396 }, 397 { 398 "claim": "C2Rust achieves 95% success rate on C-to-Rust translation, 34% better than GPT-4", 399 "evidence": "Table 5 shows C2Rust 95% vs GPT-4 61% for C→Rust on CodeNet. 
However, C2Rust generates unsafe, non-idiomatic code (§5, Figure 4)", 400 "supported": "strong" 401 }, 402 { 403 "claim": "Prompt crafting with error context improves translation success rate by 5.5% on average across LLMs", 404 "evidence": "Figure 6 shows iter1 improvements: GPT-4 +12.33%, StarCoder +3.55%, CodeGen +2.65%, Llama 2 +1.97%. GPT-4 iter2 adds +1.7%. Average ~5.5% (§6)", 405 "supported": "moderate" 406 }, 407 { 408 "claim": "LLM-based and non-LLM-based translation techniques have complementary strengths", 409 "evidence": "Table 5 and §5: CxGo achieves 62.3% on C→Go (vs GPT-4 72.5%), C2Rust 95% on C→Rust (vs GPT-4 61%), but JavaToCSharp 0% on Java→C# (vs GPT-4 49-59.2%). Table 6 summarizes: non-LLM better at broader context and determinism; LLMs better at target-language idioms.", 410 "supported": "moderate" 411 }, 412 { 413 "claim": "Strong correlation exists between test suite rigor and unsuccessful translation detection", 414 "evidence": "§3.1 reports correlation coefficients r ranging from 0.64 to 0.85 across all models between average tests per sample and unsuccessful translation rate.", 415 "supported": "moderate" 416 } 417 ], 418 "methodology_tags": ["benchmark-eval", "qualitative"], 419 "key_findings": "LLMs are not yet reliable for automated code translation, with success rates ranging from 2.1% (TheBloke-Vicuna) to 47.3% (GPT-4) across five programming languages. Manual analysis of 1,748 buggy GPT-4 translations revealed 15 bug categories in 5 groups, with data-related bugs (33.5%) and syntactic/semantic differences (30.5%) being the most prevalent. Real-world project translation is essentially infeasible for current LLMs. 
An iterative prompt-crafting technique providing error context improves success by 5.5% on average, with GPT-4 seeing the largest gain (12.3%), but considerable room for improvement remains.", 420 "red_flags": [ 421 { 422 "flag": "Single-run evaluation", 423 "detail": "Each translation task was performed only once despite LLMs being inherently non-deterministic. The paper acknowledges this in §8 but argues it does not affect bug characterization. However, reported success rates could vary significantly across runs." 424 }, 425 { 426 "flag": "No contamination analysis", 427 "detail": "CodeNet (2021) and Avatar (2021) are public datasets that likely appeared in the training data of models released in 2023. No contamination analysis is performed, which could inflate success rates (models may have memorized solutions) or distort the bug distribution." 428 }, 429 { 430 "flag": "No statistical significance tests", 431 "detail": "All comparative claims (GPT-4 vs other models, prompt crafting improvements, LLM vs non-LLM) are based on comparing point estimates without statistical testing. With single-run evaluations, the observed differences may not be reliable." 432 }, 433 { 434 "flag": "No inter-rater reliability metric", 435 "detail": "The bug taxonomy was constructed by 8 labelers over 630 hours, but no Cohen's kappa, Fleiss' kappa, or other inter-rater reliability metric is reported. The paper acknowledges this in §8 and describes a discussion-based resolution process, but quantitative agreement is not measured." 436 }, 437 { 438 "flag": "Hyperparameters not reported", 439 "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 7 LLMs. These significantly affect output quality and reproducibility." 440 }, 441 { 442 "flag": "Potential funder conflict", 443 "detail": "8/10 authors are IBM Research employees and the work is partially funded by IBM. 
IBM has commercial interests in code translation tools, though the paper's findings (LLMs are unreliable) do not obviously promote IBM products." 444 } 445 ], 446 "cited_papers": [ 447 { 448 "title": "Evaluating large language models trained on code", 449 "authors": ["Mark Chen", "Jerry Tworek"], 450 "year": 2021, 451 "arxiv_id": "2107.03374", 452 "relevance": "Introduces Codex and HumanEval benchmark for evaluating LLM code generation capabilities." 453 }, 454 { 455 "title": "StarCoder: may the source be with you!", 456 "authors": ["Raymond Li", "Loubna Ben Allal"], 457 "year": 2023, 458 "arxiv_id": "2305.06161", 459 "relevance": "One of the best-performing open-source code LLMs evaluated in the study (14.5% success rate)." 460 }, 461 { 462 "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis", 463 "authors": ["Erik Nijkamp", "Bo Pang"], 464 "year": 2022, 465 "relevance": "Open-source code LLM evaluated in the study for code translation capability." 466 }, 467 { 468 "title": "CodeGeeX: A Pre-trained Model for Code Generation with Multilingual Evaluations on HumanEval-X", 469 "authors": ["Qinkai Zheng", "Xiao Xia"], 470 "year": 2023, 471 "arxiv_id": "2303.17568", 472 "relevance": "Code LLM specifically trained for multilingual code generation and translation, evaluated in the study." 473 }, 474 { 475 "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation", 476 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 477 "year": 2023, 478 "arxiv_id": "2305.01210", 479 "relevance": "EvalPlus benchmark used in this study; addresses rigorous evaluation of LLM-generated code." 480 }, 481 { 482 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 483 "authors": ["Meta AI"], 484 "year": 2023, 485 "relevance": "Open-source general LLM evaluated in the study for code translation." 
486 }, 487 { 488 "title": "GPT-4 Technical Report", 489 "authors": ["OpenAI"], 490 "year": 2023, 491 "arxiv_id": "2303.08774", 492 "relevance": "Best-performing LLM in the study (47.3% success rate); the primary model for the bug taxonomy analysis." 493 }, 494 { 495 "title": "Avatar: A parallel corpus for Java-Python program translation", 496 "authors": ["Wasi Uddin Ahmad", "Md Golam Rahman Tushar"], 497 "year": 2021, 498 "arxiv_id": "2108.11590", 499 "relevance": "Parallel corpus benchmark used in the study for evaluating code translation between Java and Python." 500 }, 501 { 502 "title": "CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks", 503 "authors": ["Ruchir Puri", "David S Kung"], 504 "year": 2021, 505 "relevance": "Primary benchmark dataset with 1,000 code samples in 5 PLs used for the translation evaluation." 506 }, 507 { 508 "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning", 509 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 510 "year": 2022, 511 "relevance": "LLM-based program repair via zero-shot prompting; inspired the prompt crafting approach for fixing translation bugs." 512 }, 513 { 514 "title": "Repair is nearly generation: Multilingual program repair with LLMs", 515 "authors": ["Harshit Joshi", "José Cambronero Sanchez"], 516 "year": 2023, 517 "relevance": "Demonstrates LLMs can repair code bugs, motivating the iterative prompt-crafting approach for translation bugs." 518 }, 519 { 520 "title": "Unsupervised translation of programming languages", 521 "authors": ["Baptiste Roziere", "Marie-Anne Lachaux"], 522 "year": 2020, 523 "relevance": "Neural approach to code translation between C++, Java, and Python using unsupervised learning." 
524 }, 525 { 526 "title": "A systematic evaluation of large language models of code", 527 "authors": ["Frank F Xu", "Uri Alon"], 528 "year": 2022, 529 "relevance": "Systematic evaluation of code LLMs including PolyCoder; relevant to understanding LLM code capabilities." 530 }, 531 { 532 "title": "CodeBERT: A pre-trained model for programming and natural languages", 533 "authors": ["Zhangyin Feng", "Daya Guo"], 534 "year": 2020, 535 "arxiv_id": "2002.08155", 536 "relevance": "Pre-trained model for code understanding; foundational work in code LLMs." 537 } 538 ] 539 }