scan.json (31799B)
1 { 2 "paper": { 3 "title": "RepoTransBench: A Real-World Multilingual Benchmark for Repository-Level Code Translation", 4 "authors": [ 5 "Yanli Wang", 6 "Yanlin Wang", 7 "Suiquan Wang", 8 "Daya Guo", 9 "Jiachi Chen", 10 "John Grundy", 11 "Xilin Liu", 12 "Yuchi Ma", 13 "Mingzhi Mao", 14 "Hongyu Zhang", 15 "Zibin Zheng" 16 ], 17 "year": 2024, 18 "venue": "IEEE Transactions on Software Engineering", 19 "arxiv_id": "2412.17744", 20 "doi": "10.1109/TSE.2025.3645056" 21 }, 22 "scan_version": 3, 23 "active_modules": ["experimental_rigor", "data_leakage"], 24 "methodology_tags": ["benchmark-eval"], 25 "key_findings": "Repository-level code translation remains highly challenging, with the best method (RepoTransAgent with Claude or GPT-4.1) achieving only 32.8% success rate across 1,897 samples and 13 language pairs. Translation difficulty exhibits a strong directional asymmetry: static-to-dynamic translations achieve 45-63% success while dynamic-to-static translations typically score below 10%. Repository complexity across all dimensions (cross-file dependencies, code length, structural complexity) inversely correlates with translation success. Different LLMs show unexpected specialization patterns for specific language pairs, likely reflecting training data composition.", 26 "checklist": { 27 "artifacts": { 28 "code_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The abstract states 'We provide the code and data at https://github.com/DeepSoftwareAnalytics/RepoTransBench' and reference [90] links to the same repository." 32 }, 33 "data_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "The benchmark data (1,897 repository samples with test suites) is released at the same GitHub repository mentioned in the abstract." 
37 }, 38 "environment_specified": { 39 "applies": true, 40 "answer": false, 41 "justification": "Section V.D mentions using Docker as a sandboxed execution environment and bridging network for dependency installation, but no requirements.txt, Dockerfile, or detailed library version specifications are provided in the paper for reproducing the experimental setup." 42 }, 43 "reproduction_instructions": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper provides no step-by-step instructions for reproducing the experiments. A GitHub link is provided but the paper itself contains no README-style reproduction guide or commands to run." 47 } 48 }, 49 "statistical_methodology": { 50 "confidence_intervals_or_error_bars": { 51 "applies": true, 52 "answer": false, 53 "justification": "Tables IV and V report only point estimates (e.g., 32.8% SR) with no confidence intervals, error bars, or ± notation." 54 }, 55 "significance_tests": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper claims various methods and models outperform others (e.g., 'RepoTransAgent consistently outperforms baseline approaches') based solely on comparing raw percentages without any statistical significance tests." 59 }, 60 "effect_sizes_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper reports absolute performance numbers with baseline context throughout, e.g., 'RepoTransAgent achieves 32.8% SR compared to 0.0% for TranslationOnly and 11.3% for ErrorFeedback' and 'improvements of up to 21.5% over the error feedback method,' providing sufficient context to assess effect magnitude." 
64 }, 65 "sample_size_justified": { 66 "applies": true, 67 "answer": false, 68 "justification": "The benchmark contains 1,897 samples across 13 language pairs, but there is no justification for why this number was chosen, no power analysis, and no discussion of whether the per-pair sample sizes (some as small as 64 for Matlab) are sufficient for the claims made." 69 }, 70 "variance_reported": { 71 "applies": true, 72 "answer": false, 73 "justification": "All results appear to be from single experimental runs. No standard deviations, variance across runs, or spread measures are reported anywhere in the paper." 74 } 75 }, 76 "evaluation_design": { 77 "baselines_included": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper compares RepoTransAgent against two baselines: TranslationOnly (direct translation without feedback) and ErrorFeedback (translation with error feedback iteration), evaluated across all 8 LLMs (Table IV)." 81 }, 82 "baselines_contemporary": { 83 "applies": true, 84 "answer": true, 85 "justification": "The 8 backbone LLMs are all from 2025 (Table III) including Claude-Sonnet-4, GPT-4.1, DeepSeek-Chat, and Qwen3-235B, representing state-of-the-art models at the time of evaluation." 86 }, 87 "ablation_study": { 88 "applies": true, 89 "answer": false, 90 "justification": "RepoTransAgent has 5 core capabilities (ReadFile, CreateFile, ExecuteCommand, SearchContent, Finished) but no ablation study is conducted to show which components contribute most to performance. The progressive comparison (TranslationOnly → ErrorFeedback → RepoTransAgent) does not constitute a component-level ablation." 91 }, 92 "multiple_metrics": { 93 "applies": true, 94 "answer": true, 95 "justification": "Four evaluation metrics are defined and used throughout: SR (Success Rate), CR (Compilation Rate), APR (Average Pass Rate), and AMPR (Average Module Pass Rate), as described in Section V.C." 
96 }, 97 "human_evaluation": { 98 "applies": true, 99 "answer": false, 100 "justification": "Evaluation is entirely automated through test suite execution. No human evaluation of translation quality is conducted, though this could provide insights into partial successes and code readability." 101 }, 102 "held_out_test_set": { 103 "applies": true, 104 "answer": false, 105 "justification": "No discussion of whether any portion of the benchmark was used during agent development or prompt engineering. The paper does not describe a dev/test split or whether the reported results are on data that was used for any tuning decisions." 106 }, 107 "per_category_breakdown": { 108 "applies": true, 109 "answer": true, 110 "justification": "Table V provides detailed breakdowns across all 13 translation pairs for each of the 8 LLMs. Figure 4 also breaks down performance by repository complexity dimensions." 111 }, 112 "failure_cases_discussed": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section VI.D provides a detailed error analysis with 5 error categories (E1-E5): Configuration File Issues, Limited Understanding Ability Issues, Incomplete Generation Issues, Language Feature Issues, and Encoding Issues, each with concrete examples." 116 }, 117 "negative_results_reported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Several negative results are reported: TranslationOnly achieves 0% success across most models; DeepSeek-Reasoner performs 'dramatically worse' than DeepSeek-Chat (1.2% vs 22.5%); reasoning-focused models show 'no clear advantages'; dynamic-to-static translation scores below 10%." 
121 } 122 }, 123 "claims_and_evidence": { 124 "abstract_claims_supported": { 125 "applies": true, 126 "answer": true, 127 "justification": "The abstract claims best-performing method achieves 32.8% (supported by Table IV), dynamic-to-static below 10% vs static-to-dynamic at 45-63% (supported by Table V), and that RepoTransAgent outperforms baselines (supported by Table IV showing improvements across all models)." 128 }, 129 "causal_claims_justified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper claims RepoTransAgent improves translation via its reasoning-action framework, but the comparison is confounded by compute budget—RepoTransAgent likely uses more API calls and iterations than TranslationOnly or ErrorFeedback. No ablation isolates which agent components drive improvement, and compute is not controlled across methods." 133 }, 134 "generalization_bounded": { 135 "applies": true, 136 "answer": true, 137 "justification": "Claims are generally bounded: the paper specifies 13 translation pairs, 7 languages, and 8 specific LLMs. Section VII explicitly notes results may not generalize to less common or domain-specific languages, and that language selection was based on TIOBE rankings." 138 }, 139 "alternative_explanations_discussed": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section VII discusses several alternative explanations: LLM generation variability could affect reproducibility, test suites may not capture all edge cases, repository filtering may bias toward well-maintained projects, and the lack of fine-tuning may impact performance." 
143 }, 144 "proxy_outcome_distinction": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper measures test passage rates as a proxy for functional correctness and explicitly acknowledges the gap: 'passing all test cases provides strong evidence of functional correctness, it may not capture all edge cases or guarantee complete semantic equivalence' (Section VII)." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": false, 154 "justification": "Table III lists model names like 'Claude-Sonnet-4', 'GPT-4.1', 'Gemini-2.5-Flash-Lite', and 'o3-mini' with release dates and context windows, but no specific API versions or snapshot dates are provided. These are marketing names that can change behavior across API updates." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": false, 159 "justification": "Figure 3 shows only a prompt template with placeholders: 'You are a code translator. Translate {source_language} to {target_language} for project {project_name}...' The actual fill values and complete prompts are not provided in the paper." 160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": false, 164 "justification": "No temperature, top-p, max tokens, or other sampling/generation hyperparameters are reported for any of the 8 LLMs used. These settings significantly affect output quality." 165 }, 166 "scaffolding_described": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section IV describes the RepoTransAgent architecture in detail: 5 core actions (ReadFile, CreateFile, ExecuteCommand, SearchContent, Finished), the ReAct-based reasoning-action workflow, and the iterative translation/validation cycle with concrete execution examples." 
170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section III documents the full pipeline: data collection from GitHub with star ranking, rule-based filtering (Algorithm 1 specifying language dominance, popularity threshold, and package exclusion criteria), and multi-agent test suite construction with four specialized agents." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section VII 'Threats to Validity' provides a dedicated section covering both internal and external threats with substantive discussion." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section VII discusses specific threats: the rapidly evolving LLM landscape, no fine-tuning applied to models, specific language pair selection based on TIOBE rankings, LLM generation variability affecting reproducibility, and that test suites may not capture all edge cases." 187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper explicitly bounds scope: results cover 13 translation pairs across 7 languages, 'may not generalize to less common languages or domain-specific languages,' and the filtering criteria favor well-maintained repositories, which 'is intentional to ensure benchmark quality' but may exclude experimental projects." 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": true, 198 "justification": "The benchmark data including repository samples and test suites is released at https://github.com/DeepSoftwareAnalytics/RepoTransBench, enabling independent verification." 
199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section III.A describes collecting repositories from GitHub, ranking by star count, filtering by the top 7 TIOBE languages plus Rust and Matlab, and conducting a developer questionnaire to identify practical translation demands." 204 }, 205 "recruitment_methods_described": { 206 "applies": true, 207 "answer": false, 208 "justification": "21 professional developers filled a questionnaire to determine language pair demands, but the paper does not describe how these developers were recruited, from which organizations, or whether this introduces selection bias." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": true, 213 "justification": "The full pipeline is documented: GitHub collection → star-based ranking → rule-based filtering (Algorithm 1 with 3 specific rules) → multi-agent test suite construction (4 agents described in Section III.C). Table II reports final statistics: 970 projects yielding 1,897 samples." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "The Acknowledgements section states: 'This work is supported by CCF-Huawei Populus Grove Fund CCF-HuaweiSE202403.'" 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Author affiliations are clearly listed: Sun Yat-sen University, Monash University, Huawei Cloud Computing Technologies Co., Ltd, and Chongqing University." 226 }, 227 "funder_independent_of_outcome": { 228 "applies": true, 229 "answer": true, 230 "justification": "The CCF-Huawei fund supports the research, but the paper evaluates general-purpose LLMs (from OpenAI, Anthropic, Google, DeepSeek, Alibaba) rather than any Huawei product specifically. Huawei does not have a direct financial stake in which model performs best on the benchmark." 
231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests or financial interests statement is present in the paper despite Huawei co-authorship and Huawei funding." 236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": true, 241 "answer": false, 242 "justification": "Table III provides model release dates but not training data cutoff dates. Since the benchmark uses existing GitHub repositories, the training cutoff is critical for assessing whether models may have seen benchmark code during training." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": true, 246 "answer": false, 247 "justification": "The benchmark consists of existing public GitHub repositories (with >50 stars), which are highly likely to be included in LLM training corpora. No analysis of potential train/test overlap is conducted or discussed." 248 }, 249 "benchmark_contamination_addressed": { 250 "applies": true, 251 "answer": false, 252 "justification": "The benchmark uses popular GitHub repositories that were almost certainly available online before the models' training cutoffs. No contamination analysis, canary strings, or temporal analysis is performed despite this being a critical validity concern." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": false, 258 "answer": false, 259 "justification": "The paper's evaluation is an automated benchmark evaluation with no human participants. The 21-person developer questionnaire was for benchmark design only." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the main evaluation. The developer survey for language pair selection is a minor design input, not a human subjects study." 265 }, 266 "demographics_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the main evaluation." 
270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the main evaluation." 275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in the main evaluation." 280 }, 281 "blinding_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the main evaluation." 285 }, 286 "attrition_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants in the main evaluation." 290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No API costs, tokens consumed, wall-clock time, or per-example costs are reported despite evaluating 8 LLMs across 1,897 translation tasks using potentially expensive API calls." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "No total computational budget, GPU hours, or total API spend is reported. The experiments involve extensive LLM API usage across 8 models and 1,897 tasks but the cost is entirely unquantified." 302 } 303 }, 304 "experimental_rigor": { 305 "seed_sensitivity_reported": { 306 "applies": true, 307 "answer": false, 308 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs." 309 }, 310 "number_of_runs_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "The paper does not state how many experimental runs were conducted per configuration. Section VII acknowledges 'inherent randomness in their outputs' as a threat but does not report multiple runs." 314 }, 315 "hyperparameter_search_budget": { 316 "applies": true, 317 "answer": false, 318 "justification": "No hyperparameter search budget is reported. 
The agent's configuration and any tuning decisions for prompts or parameters are not documented." 319 }, 320 "best_config_selection_justified": { 321 "applies": true, 322 "answer": false, 323 "justification": "The paper does not explain how the agent's configuration or prompts were selected. No validation set or selection process is described for the agent design choices." 324 }, 325 "multiple_comparison_correction": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper makes many comparisons across 8 models, 3 methods, and 13 translation pairs without any multiple comparison correction or statistical testing." 329 }, 330 "self_comparison_bias_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "The authors evaluate their own RepoTransAgent framework against baselines they implemented (TranslationOnly, ErrorFeedback) without acknowledging the inherent bias of evaluating one's own system or having independent evaluation." 334 }, 335 "compute_budget_vs_performance": { 336 "applies": true, 337 "answer": false, 338 "justification": "RepoTransAgent uses an iterative multi-step reasoning process that almost certainly requires more API calls and compute than the simpler TranslationOnly and ErrorFeedback baselines. This compute difference is never quantified or controlled for." 339 }, 340 "benchmark_construct_validity": { 341 "applies": true, 342 "answer": true, 343 "justification": "Section VII explicitly discusses construct validity: 'passing all test cases provides strong evidence of functional correctness, it may not capture all edge cases or guarantee complete semantic equivalence.' They also argue execution-based metrics are superior to similarity-based metrics used in prior work." 
344 }, 345 "scaffold_confound_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "When comparing different backbone LLMs, the same RepoTransAgent framework is used consistently (Table V), controlling for the scaffolding variable. The paper also evaluates each model under all three methods (TranslationOnly, ErrorFeedback, RepoTransAgent)." 349 } 350 }, 351 "data_leakage": { 352 "temporal_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "The benchmark uses existing GitHub repositories that were publicly available before the models' training periods. No analysis of whether models may have seen the source repositories' code during training is conducted." 356 }, 357 "feature_leakage_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether the evaluation setup leaks information. The agent can read source files and execute commands, but there is no analysis of whether this provides advantages beyond what a real translation scenario would offer." 361 }, 362 "non_independence_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "No discussion of whether the GitHub repositories in the benchmark (or similar code) appeared in the LLMs' training data. Popular repositories with >50 stars are especially likely to be in training corpora." 366 }, 367 "leakage_detection_method": { 368 "applies": true, 369 "answer": false, 370 "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap, decontamination) is applied despite using public GitHub repositories as the benchmark." 
371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "The best-performing method achieves only 32.8% success rate on repository-level code translation.", 377 "evidence": "Table IV shows RepoTransAgent with Claude and GPT-4.1 both achieving 32.8% SR, the highest across all method-model combinations.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "RepoTransAgent consistently outperforms baseline approaches across all evaluated backbone models.", 382 "evidence": "Table IV shows RepoTransAgent outperforming TranslationOnly and ErrorFeedback for every LLM tested. For Claude: 32.8% vs 0.0% and 11.3%. For GPT-4.1: 32.8% vs 0.0% and 15.6%.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Translation from statically-typed to dynamically-typed languages achieves substantially higher success rates (45-63%) compared to the reverse direction (below 10%).", 387 "evidence": "Table V shows C→Python at 61.5%, C++→Python at 63.0%, Java→Python at 45.9% for Claude, while Python→Java is 5.8%, Python→C++ is 3.5%, and Python→Rust is 11.7% (slightly above the stated 10% bound).", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Repository complexity inversely correlates with translation success across all dimensions.", 392 "evidence": "Figure 4 shows successful translations consistently have lower median values for cross-file dependencies, intra-file dependencies, lines of code, functions, and classes compared to failed translations.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Reasoning-focused models do not show clear advantages over standard counterparts for repository-level translation.", 397 "evidence": "Table IV shows Qwen3-think at 19.1% vs Qwen3 at 16.9% (modest improvement), and DeepSeek-Reasoner dramatically worse at 1.2% vs DeepSeek-Chat at 22.5%.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Different LLM backbones demonstrate specialized advantages for specific translation pairs.", 402 "evidence": "Table V shows o3-mini achieving 67.2% on C→Rust, outperforming Claude (47.5%) 
and GPT-4.1 (47.5%) on the same pair, while Claude excels at translations to Python.", 403 "supported": "moderate" 404 } 405 ], 406 "red_flags": [ 407 { 408 "flag": "No error bars or variance reporting", 409 "detail": "All results across Tables IV and V are single-point estimates with no uncertainty quantification. LLM outputs are inherently stochastic, and the paper acknowledges this as a threat but does not mitigate it with multiple runs." 410 }, 411 { 412 "flag": "Critical contamination risk unaddressed", 413 "detail": "The benchmark uses popular GitHub repositories (>50 stars) that are almost certainly in the training data of the LLMs being evaluated. No contamination analysis or decontamination is performed. Models may have seen the source code during training, inflating results for some language pairs." 414 }, 415 { 416 "flag": "Compute confound between methods", 417 "detail": "RepoTransAgent uses an iterative multi-step reasoning process with multiple API calls, while TranslationOnly uses a single call. The improvement could partly be attributed to more compute/iterations rather than the agent architecture, but compute usage is never reported or controlled." 418 }, 419 { 420 "flag": "Self-evaluation bias", 421 "detail": "The authors evaluate their own RepoTransAgent against baselines they themselves implemented (TranslationOnly, ErrorFeedback). The baseline implementations may be suboptimal, and no independent evaluation or comparison against other agent frameworks (e.g., TransAgent [80]) on the same benchmark is provided." 422 }, 423 { 424 "flag": "Small developer survey for language pair justification", 425 "detail": "Only 21 professional developers were surveyed to justify the choice of 13 translation pairs, with no description of how these developers were recruited. 
This thin justification is presented as evidence of 'real-world developer needs.'" 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Lost in translation: A study of bugs introduced by large language models while translating code", 431 "authors": ["R. Pan", "A. R. Ibrahimzada", "R. Krishna"], 432 "year": 2024, 433 "relevance": "Directly related prior work showing LLMs struggle with repository-level code translation, achieving only 8.1% success rate with GPT-4." 434 }, 435 { 436 "title": "Evaluating large language models trained on code", 437 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 438 "year": 2021, 439 "arxiv_id": "2107.03374", 440 "relevance": "Introduced HumanEval benchmark and Codex model, foundational for evaluating LLM code capabilities." 441 }, 442 { 443 "title": "Exploring and unleashing the power of large language models in automated code translation", 444 "authors": ["Z. Yang", "F. Liu", "Z. Yu"], 445 "year": 2024, 446 "relevance": "UniTrans framework for LLM-based code translation at function level, a key baseline approach." 447 }, 448 { 449 "title": "Unsupervised translation of programming languages", 450 "authors": ["M.-A. Lachaux", "B. Roziere", "L. Chanussot"], 451 "year": 2020, 452 "arxiv_id": "2006.03511", 453 "relevance": "TransCoder: seminal work on unsupervised neural code translation with the TransCoder-test benchmark." 454 }, 455 { 456 "title": "TransAgent: An LLM-based multi-agent system for code translation", 457 "authors": ["Z. Yuan", "W. Chen", "H. Wang"], 458 "year": 2024, 459 "arxiv_id": "2409.19894", 460 "relevance": "Multi-agent system for code translation, directly comparable agent approach to RepoTransAgent." 461 }, 462 { 463 "title": "AlphaTrans: A neuro-symbolic compositional approach for repository-level code translation and validation", 464 "authors": ["A. R. Ibrahimzada", "K. Ke", "M. 
Pawagi"], 465 "year": 2025, 466 "relevance": "Neuro-symbolic approach for repository-level translation that decomposes repos into fragments, a directly competing method." 467 }, 468 { 469 "title": "Rectifier: Code translation with corrector via LLMs", 470 "authors": ["X. Yin", "C. Ni", "T. N. Nguyen"], 471 "year": 2024, 472 "arxiv_id": "2407.07472", 473 "relevance": "LLM-based error correction approach for code translation, relevant to iterative refinement methods." 474 }, 475 { 476 "title": "CodeNet: A large-scale AI for code dataset for learning a diversity of coding tasks", 477 "authors": ["R. Puri", "D. S. Kung", "G. Janssen"], 478 "year": 2021, 479 "arxiv_id": "2105.12655", 480 "relevance": "Large-scale code dataset including file-level translation benchmarks, a predecessor benchmark." 481 }, 482 { 483 "title": "SpecTra: Enhancing the code translation ability of language models by generating multi-modal specifications", 484 "authors": ["V. Nitin", "B. Ray"], 485 "year": 2024, 486 "arxiv_id": "2405.18574", 487 "relevance": "Specification-enhanced LLM code translation approach, relevant technique for improving translation quality." 488 }, 489 { 490 "title": "TRACY: Benchmarking execution efficiency of LLM-based code translation", 491 "authors": ["Z. Gong", "Z. Sun", "D. Huang"], 492 "year": 2025, 493 "arxiv_id": "2508.11468", 494 "relevance": "Benchmark for evaluating execution efficiency of code translations, complementary benchmark perspective." 495 }, 496 { 497 "title": "CodeTransOcean: A comprehensive multilingual benchmark for code translation", 498 "authors": ["W. Yan", "Y. Tian", "Y. Li"], 499 "year": 2023, 500 "arxiv_id": "2310.04951", 501 "relevance": "Multilingual file-level code translation benchmark, a direct predecessor to RepoTransBench at finer granularity." 502 }, 503 { 504 "title": "Vert: Verified equivalent Rust transpilation with few-shot learning", 505 "authors": ["A. Z. Yang", "Y. Takashima", "B. 
Paulsen"], 506 "year": 2024, 507 "arxiv_id": "2404.18852", 508 "relevance": "LLM-based code translation to Rust with formal verification guarantees, relevant to code translation safety." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "Benchmark and agent framework are directly usable for evaluating and performing repository-level code translation, a real developer need." 515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "Confirms expected difficulty of repository-level translation; the static-to-dynamic asymmetry finding is interesting but not paradigm-shifting." 519 }, 520 "fear_safety": { 521 "score": 0, 522 "justification": "No safety or security concerns raised; the paper is about code translation quality." 523 }, 524 "drama_conflict": { 525 "score": 0, 526 "justification": "No controversy or conflict angle; straightforward benchmark evaluation." 527 }, 528 "demo_ability": { 529 "score": 2, 530 "justification": "Code and data released on GitHub; practitioners could run the benchmark or agent framework on their own repositories." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "Academic authors from Sun Yat-sen University and Monash; evaluates well-known LLMs (GPT-4.1, Claude) but the paper itself is not from a famous lab." 535 } 536 } 537 }