scan.json (29210B)
1 { 2 "paper": { 3 "title": "Reasoning Runtime Behavior of a Program with LLM: How Far Are We?", 4 "authors": ["Junkai Chen", "Zhiyuan Pan", "Xing Hu", "Zhenhao Li", "Ge Li", "Xin Xia"], 5 "year": 2024, 6 "venue": "International Conference on Software Engineering (ICSE)", 7 "arxiv_id": "2403.16437", 8 "doi": "10.1109/ICSE55347.2025.00012" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "REval evaluates 15 LLMs on four code reasoning tasks (code coverage, program state, execution path, output prediction) and a novel Incremental Consistency metric. Most models show poor performance (average 44.4% accuracy, 10.3 IC score). GPT-4-Turbo dominates with 75.0% accuracy and 42.5 IC score, more than doubling the second-best IC score. Execution Path Prediction is the hardest task, with most models below 10% accuracy. Chain-of-Thought prompting helps intermediate reasoning tasks but hurts output prediction.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a project page at https://r-eval.github.io and a replication package at figshare (https://figshare.com/s/e5de95bd79ab5ddea76c). The abstract states 'Our code, data and REval leaderboard are available.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The adapted benchmark data is available via the project page and figshare replication package. The base benchmarks (HumanEval, ClassEval) are publicly available on HuggingFace." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions 'a Linux server with 8 NVIDIA A800 GPUs' and deployment via vLLM, but does not provide a requirements.txt, Dockerfile, or specific library versions needed to recreate the environment." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper mentions a replication package with 'detailed information, such as model IDs and URLs' but does not include step-by-step reproduction instructions within the paper itself. No 'Reproducing Results' section or runnable scripts are described." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Results for all open-source models are reported as 'mean±standard deviation' across 5 repetitions (Table III). For example, CodeLlama-7B-Base CCP: 54.3±0.5." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are used despite numerous comparative claims (e.g., 'outperforms the second best by a large margin'). All comparisons are based on raw numerical differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Absolute differences are reported with baseline context throughout, e.g., 'an absolute improvement of 19.3%' (GPT-4-Turbo vs GPT-3.5), 'a relative improve by over 100% (i.e., 14.4% → 29.2%)', providing enough context to assess effect magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The benchmark has 3,152 problems (Table I). No justification is given for why this size is sufficient, and no power analysis is discussed." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Standard deviations are reported across 5 experimental repetitions for all open-source models in Table III (e.g., '54.3±0.5'). GPT models lack variance due to budget constraints, which is explicitly acknowledged." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "15 models are compared against each other across multiple tasks. Both general-purpose and code-specialized LLMs serve as mutual baselines. The paper also compares against prior work CRUXEval's scope." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models include GPT-4-Turbo, StarCoder2 (02/2024), Gemma (02/2024), and Mistral-7B (01/2024), all contemporary at time of study. Some older models like CodeLlama (08/2023) are included for comparison." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study isolating contributions of individual framework components. The CoT vs few-shot comparison (last row of Table III) tests prompting strategy on a single model but does not ablate framework design choices." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: Accuracy and F1 for CCP, Accuracy for PSP/EPP/OP, Average Accuracy across tasks, and the novel IC Score. Table III reports all of these." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is performed. All evaluation is automated using ground truth from program execution traces. No human assessment of benchmark quality or model output quality is reported." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The entire adapted benchmark serves as the test set. Models are evaluated with few-shot prompting where demonstrations are separate from the evaluated problems. No training or tuning is done on this data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by task (CCP, PSP, EPP, OP) for each model in Table III, revealing that EPP is much harder than OP. 
Per-model breakdowns across all dimensions are provided." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section VI-A presents a case study from EPP (HumanEval/59) showing where GPT-3.5 and CodeLlama-34B fail but GPT-4-Turbo succeeds, with analysis of why models choose incorrect answers." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results: CoT prompting decreases OP accuracy by 6.8% (Section V-A); Magicoder fine-tuning improves code generation but not reasoning; code LLMs don't significantly outperform general LLMs of the same size." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of 'average accuracy of 44.4%' and 'average IC score of 10.3' are directly supported by Table III, which shows these exact averages across all 15 models." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims like 'additional training with Python corpora leads to an improvement' and 'instruction tuning techniques brings gains' by comparing model variants, but confounds (different training data, architectures) are not controlled for. These are observational comparisons presented with causal language." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title 'How Far Are We?' implies a general assessment of the field, but results are limited to 2 Python benchmarks (HumanEval and ClassEval) with 15 specific models. The threats-to-validity section acknowledges benchmark limitations but the title and framing overgeneralize." 
126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper speculates about causes (e.g., 'may demonstrate that apart from parameter size, the model architecture and training strategy also play an important role') but does not systematically consider confounds or alternative explanations. The threats section discusses methodological limitations but not alternative interpretations of the observed results." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper equates performance on four synthetic tasks (CCP, PSP, EPP, OP) with 'code reasoning ability' without discussing whether these tasks fully capture what code reasoning entails. No discussion of the gap between benchmark performance and actual program comprehension capability." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Model names are given (e.g., 'GPT-3.5-Turbo', 'GPT-4-Turbo') but without specific API versions or snapshot dates. Table II lists release dates (e.g., '01/2024') but not API version strings like 'gpt-4-0125-preview'. Full details are deferred to the replication package." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Fig. 3 shows a prompt template with placeholders (e.g., '[PYTHON]...', '[QUESTION]...', '[ANSWER]...'). The actual few-shot demonstration content and system message text are not provided in the paper. Full prompts are deferred to the replication package." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section IV-D reports temperature=0.8, max tokens=256 (direct)/1024 (CoT), 5 repetitions for open-source models, and 'default settings in vLLM for the rest of the parameters.'" 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. Models are directly prompted via API/vLLM with standard few-shot or CoT prompting." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section III-D describes the full benchmark construction pipeline: runtime behavior extraction via custom program tracer, then problem construction with specific filtering rules for each task (CCP/EPP: last statement in control flow blocks; PSP: assignment/return/changed variables with priority rules; OP: assertion masking)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section VI-D 'Threats to Validity' provides a substantive discussion covering internal threats (selection criteria, task coverage) and external threats (model and benchmark generalizability)." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are discussed: selection criteria for statements/variables may not represent runtime state; dynamic features like memory allocation and exception handling are not covered; results restricted to specific models and benchmarks. Mitigations are described for each." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states 'there are still some dynamic features such as memory allocation and exception handling which may help measure code models, and we have not explored yet' and 'the results are restricted to the specific collection of code models and base benchmarks.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "A replication package is provided at figshare with code and data. The base benchmarks are publicly available on HuggingFace. Model outputs appear to be included based on the replication package reference." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section III-D describes the data collection process in detail: execution of canonical solutions from HumanEval and ClassEval using provided test cases with a customized program tracer, recording statement execution and local variable states at each step." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from standard public benchmarks (HumanEval and ClassEval)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "While the filtering rules are described in detail (Section III-D2), no intermediate counts are provided. The paper goes from base benchmarks to 3,152 final problems (Table I) without documenting how many candidates existed at each filtering stage." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgments section states: 'This research is supported by the Ningbo Natural Science Foundation (No. 
2023J292)' and computing resources from 'the Supercomputing Center of Hangzhou City University.'" 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All author affiliations are listed: Zhejiang University, York University, and Peking University. None of the authors are affiliated with companies whose products are evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The Ningbo Natural Science Foundation is a public funding body with no financial interest in the relative performance of any evaluated LLM." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "Table II lists model release dates (e.g., '08/2023', '02/2024') but not training data cutoff dates. Release date is not the same as training data cutoff." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether HumanEval or ClassEval solutions appeared in any model's training data. HumanEval was published in 2021 and is widely available; all evaluated models were trained after 2021." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "HumanEval was published in 2021 and has been publicly available on GitHub since then. All 15 evaluated models were released after 2021 and could have trained on HumanEval solutions. This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "The paper mentions 'limited budget' as the reason for not repeating GPT experiments, but does not report actual inference costs, API costs, or per-example costs for any model." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware is mentioned (8 NVIDIA A800 GPUs) but total GPU hours, wall-clock time for experiments, or total API spend are not stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Experiments for all open-source models are repeated 5 times with temperature=0.8, and mean±std are reported in Table III, capturing seed sensitivity. However, GPT models are not repeated due to budget." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section IV-D explicitly states: 'experiments for all open-source models with few-shot prompting are repeated five times.' 
GPT models are run once, with the reason given." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is reported. Temperature=0.8 is adopted from prior work (CRUXEval) without exploring alternatives. No search budget is mentioned." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper uses a fixed configuration following prior work (CRUXEval) rather than selecting a best configuration. All models use the same settings, and all results are reported — no cherry-picking of configurations." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper makes many model-to-model comparisons across 4 tasks with 15 models but performs no statistical tests at all, let alone corrections for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors designed the REval benchmark and evaluate all models on it. No discussion of potential bias from designing both the evaluation framework and conducting the evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Model sizes range from 2B to unknown (GPT-4-Turbo), but performance is not analyzed as a function of compute. Closed-source models like GPT-4-Turbo likely use vastly more compute than 3B models but this difference is not discussed." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether CCP, PSP, EPP, and OP actually measure 'code reasoning ability' as claimed. No analysis of construct validity or comparison with alternative code reasoning definitions." 
332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. Models are prompted directly via API/vLLM." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "HumanEval was published in 2021. All models were released 2023-2024 and could have been trained on HumanEval solutions. The paper does not discuss this temporal leakage risk." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the prompts or few-shot examples leak information about correct answers. The prompt structure (e.g., providing the full program and asking about specific lines) is not analyzed for information leakage." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Multiple problems are generated from the same program (different statements/variables), creating non-independent test items. This structural non-independence is not discussed." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or temporal splits are used." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Most LLMs show unsatisfactory performance on Runtime Behavior Reasoning with an average accuracy of 44.4%.", 365 "evidence": "Table III shows average accuracy across all 15 models on 4 tasks, computed as the mean of CCP (61.0%), PSP (35.6%), EPP (19.4%), and OP (61.8%).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "GPT-4-Turbo demonstrates clear superiority over all other models, achieving 75.0% average accuracy and 42.5 IC score.", 370 "evidence": "Table III shows GPT-4-Turbo at 88.4% CCP, 71.4% PSP, 57.7% EPP, 82.6% OP, and IC score 42.5 — more than double the second-best IC score of 20.6 (GPT-3.5).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Execution Path Prediction is the most challenging task, with most models below 10% accuracy.", 375 "evidence": "Table III shows EPP average of 19.4%, with 7 of 15 models below 10% accuracy. Only GPT-4-Turbo (57.7%) and Mistral-7B (35.8%) exceed 30%.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Chain-of-Thought prompting improves intermediate reasoning tasks but hurts Output Prediction.", 380 "evidence": "Last row of Table III: CodeLlama-7B-Instruct with CoT shows EPP improvement from 10.8% to 21.4% but OP decrease from 62.6% to 55.8% (absolute decrease of 6.8%).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Code LLMs do not exhibit an obviously leading advantage over general LLMs of the same size in code reasoning.", 385 "evidence": "Table III shows general LLMs like Mistral-7B (48.0% avg) outperforming code LLMs of similar size like StarCoder2-7B (39.6%) and CodeLlama-7B-Instruct (38.5%).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "There is a strong positive correlation between code reasoning and code generation (Pearson > 0.7).", 390 "evidence": "Table IV reports Pearson correlation: RBR-HE = 0.772, IC-HE = 0.724. 
Both exceed the 0.7 threshold for strong positive correlation.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Larger model variants within the same family show better code reasoning performance.", 395 "evidence": "CodeLlama-Instruct series: 7B→13B→34B shows EPP accuracy from 10.8%→14.4%→29.2%; StarCoder2: 3B→7B→15B shows average accuracy from 37.3%→39.6%→50.5%.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No contamination analysis on well-known benchmarks", 402 "detail": "HumanEval was published in 2021 and is widely available on GitHub. All 15 evaluated models were trained after 2021 and could have memorized solutions. The paper's adapted benchmark derives from HumanEval, so contamination could inflate code reasoning scores — yet this is never discussed." 403 }, 404 { 405 "flag": "No statistical significance tests for comparative claims", 406 "detail": "The paper makes numerous claims like 'outperforms by a large margin' and 'does not exhibit an obviously leading advantage' based purely on numerical comparisons without any statistical testing. With 15 models, 4 tasks, and 5 repetitions, proper statistical analysis is feasible but absent." 407 }, 408 { 409 "flag": "GPT models not repeated", 410 "detail": "Experiments for GPT-3.5-Turbo and GPT-4-Turbo are run only once due to 'limited budget,' while all comparative claims heavily feature these models. The two strongest-performing models lack any uncertainty quantification." 411 }, 412 { 413 "flag": "Non-independence of test problems", 414 "detail": "Multiple problems are generated from the same program (different statements, variables), creating correlated test items. Accuracy metrics treat each problem as independent, potentially inflating confidence in results." 
415 } 416 ], 417 "cited_papers": [ 418 { 419 "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", 420 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather", "Armando Solar-Lezama", "Gabriel Synnaeve", "Sida I. Wang"], 421 "year": 2024, 422 "arxiv_id": "2401.03065", 423 "relevance": "Directly related benchmark for evaluating LLM code reasoning via input/output prediction; REval extends this approach to runtime behavior." 424 }, 425 { 426 "title": "CodeMind: A Framework to Challenge Large Language Models for Code Reasoning", 427 "authors": ["Changshu Liu", "Shizhuo Dylan Zhang", "Reyhaneh Jabbarvand"], 428 "year": 2024, 429 "arxiv_id": "2402.09664", 430 "relevance": "Concurrent framework for evaluating code reasoning with LLMs including specification reasoning and execution prediction." 431 }, 432 { 433 "title": "Evaluating Large Language Models Trained on Code", 434 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 435 "year": 2021, 436 "arxiv_id": "2107.03374", 437 "relevance": "Introduces HumanEval, the foundational code generation benchmark used as a base benchmark in REval." 438 }, 439 { 440 "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation", 441 "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"], 442 "year": 2024, 443 "relevance": "Class-level code generation benchmark used as the second base benchmark for REval, representing context-aware programming scenarios." 444 }, 445 { 446 "title": "Code Llama: Open Foundation Models for Code", 447 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 448 "year": 2023, 449 "arxiv_id": "2308.12950", 450 "relevance": "Major open-source code LLM family evaluated in the study; demonstrates model variant effects on code reasoning." 
451 }, 452 { 453 "title": "Magicoder: Source Code Is All You Need", 454 "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu", "Yifeng Ding", "Lingming Zhang"], 455 "year": 2023, 456 "arxiv_id": "2312.02120", 457 "relevance": "Fine-tuned code LLM showing that code generation improvements don't necessarily transfer to code reasoning tasks." 458 }, 459 { 460 "title": "TRACED: Execution-Aware Pre-Training for Source Code", 461 "authors": ["Yangruibo Ding", "Benjamin Steenhoek", "Kexin Pei", "Gail Kaiser", "Wei Le", "Baishakhi Ray"], 462 "year": 2024, 463 "relevance": "Execution-aware pre-training technique combining static and dynamic code characteristics, directly relevant to understanding runtime behavior learning." 464 }, 465 { 466 "title": "Beyond Accuracy: Evaluating Self-Consistency of Code LLMs", 467 "authors": ["Marcus J. Min", "Yangruibo Ding", "Luca Buratti", "Saurabh Pujar", "Gail Kaiser", "Suman Jana", "Baishakhi Ray"], 468 "year": 2023, 469 "relevance": "Evaluates self-consistency of code LLMs via back-translation, directly related to REval's incremental consistency metric." 470 }, 471 { 472 "title": "Unsupervised Evaluation of Code LLMs with Round-Trip Correctness", 473 "authors": ["Miltiadis Allamanis", "Sheena Panthaplackel", "Pengcheng Yin"], 474 "year": 2024, 475 "arxiv_id": "2402.08699", 476 "relevance": "Proposes round-trip correctness for unsupervised code LLM evaluation, a complementary approach to REval's supervised consistency evaluation." 477 }, 478 { 479 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 480 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 481 "year": 2022, 482 "relevance": "Foundational prompting technique evaluated in REval, showing mixed results for code reasoning tasks." 
483 }, 484 { 485 "title": "StarCoder 2 and The Stack v2: The Next Generation", 486 "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal", "Federico Cassano"], 487 "year": 2024, 488 "arxiv_id": "2402.19173", 489 "relevance": "Open-source code LLM series evaluated in the study, trained on The Stack v2 dataset with varied architectures." 490 }, 491 { 492 "title": "Code Simulation Challenges for Large Language Models", 493 "authors": ["Emanuele La Malfa", "Christoph Weinhuber", "Orazio Torre", "Fangru Lin", "Anthony Cohn", "Nigel Shadbolt", "Michael Wooldridge"], 494 "year": 2024, 495 "arxiv_id": "2401.09074", 496 "relevance": "Concurrent work on simulating code execution with LLMs, focusing on algorithm complexity; related but more limited than REval's runtime behavior approach." 497 } 498 ] 499 }