scan.json (29448B)
1 { 2 "paper": { 3 "title": "ReMind: Understanding Deductive Code Reasoning in LLMs", 4 "authors": ["Jun Gao", "Yun Peng", "Xiaoxue Ren"], 5 "year": 2025, 6 "venue": "arXiv (under review)", 7 "arxiv_id": "2511.00488", 8 "doi": "10.48550/arXiv.2511.00488" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "LLMs exhibit a fundamental gap (up to 44.7%) between code generation and deductive reasoning abilities, even for code they generate themselves. A self-reasoning bias causes up to 43% relative performance drop when reasoning about code from other LLMs. The proposed ReMind multi-agent framework (Mutator, Executor, Inspector) achieves up to 23.2 absolute accuracy improvement over baselines and substantially reduces cross-source performance variance.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "An anonymous repository link is provided in Section 8: 'The code and data are available at https://anonymous.4open.science/r/remind-71F0/'. This is an anonymized URL for review, but it is a working link rather than a promise of future release." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The same anonymous repository is stated to contain both code and data. Additionally, the benchmarks used (HumanEval, LiveCodeBench) are publicly available." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper. The only tool mentioned is Python's AST module (footnote 1)." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The anonymous link may contain a README, but the paper itself does not include reproduction steps." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 2-7 are point estimates with no confidence intervals or error bars. Table 5 reports standard deviation across code sources (not across experimental runs), which measures cross-source stability rather than experimental uncertainty." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are used anywhere. Claims like 'ReMind consistently and significantly outperforms all baselines' (Section 4.2.1) are based solely on comparing raw accuracy numbers without any formal test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports absolute accuracy differences with baseline context throughout, e.g., 'maximum average improvement of 23.2 in absolute accuracy over the best performing baselines' (abstract), and provides full accuracy tables (Tabs 2-7) showing both proposed and baseline numbers." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is given for the 152 HumanEval and 328 LiveCodeBench problems. These sizes are artifacts of the filtering protocol (problems where all 5 LLMs produce correct code) rather than a deliberate sample size choice." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Table 5 reports standard deviation of accuracy across different code sources, not across experimental runs. There is no mention of running experiments multiple times or reporting variance across seeds/runs." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Three baselines are compared: CoT (Chain-of-Thought), CoC (Chain-of-Code), and RHDA (Reflective Hypothesis Decomposition and Amendment). Described in Section 4.1." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "CoC (Li et al., ICML) and RHDA (Zhao et al., ICLR 2025) are recent and represent current approaches to code reasoning. The baselines are appropriate and competitive." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 7 presents an ablation study removing the Mutator (M) and Inspector (I) individually and together, showing each component's contribution across 5 LLMs on both benchmarks." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Only one metric is used: accuracy as defined in Equation 1 (product of per-test-case correctness indicators). Table 5 adds standard deviation across sources as a stability measure, but this is derived from the same accuracy metric." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of reasoning traces or outputs. All evaluation is automated via the accuracy metric comparing predicted outputs to ground truth execution results." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "LiveCodeBench is explicitly used as a zero-shot evaluation set with problems 'later than the knowledge cutoff of OpenAI o1' (Section 2.2). HumanEval and LiveCodeBench serve as separate evaluation benchmarks." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by executor LLM (rows), code source/generator LLM (columns), benchmark (HumanEval vs LiveCodeBench), and method (CoT, CoC, RHDA, ReMind) in Tables 2-7." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 2.4 presents two detailed motivating examples (Fig. 3) showing specific reasoning errors. Section 5.1 provides a case study (Fig. 5) demonstrating how ReMind detects and corrects errors that other methods miss." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that CoC and RHDA sometimes hurt performance compared to native CoT for reasoning models like DeepSeek-R1 and o1 (Section 4.2.1: 'Both CoC and RHDA fluctuate around 75% accuracy, significantly underperforming native CoT prompting'). The empirical study itself documents LLM failures." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of 'intrinsic gap,' 'consistent bias towards code sources,' and '23.2 in absolute accuracy' improvement are all supported by the experimental results in Tables 2-7. The empirical findings in RQ1-RQ3 back the three stated challenges." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The ablation study (Table 7) provides controlled single-variable removal of Mutator and Inspector to support claims about component contributions. The mutation experiment (Table 2, 'Mutation' rows) isolates the effect of style bias by having the executor rewrite code before reasoning." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims to address 'Deductive Code Reasoning in LLMs' broadly, but experiments are conducted only on Python code (HumanEval and LiveCodeBench are Python-only), with 5 specific LLMs, and a single accuracy metric. The abstract says 'five LLMs' and 'two benchmarks' without bounding claims to Python or these specific models." 
126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "Section 5.3 (Threats to Validity) lists limitations (weaker models, sequential programs, computational overhead) but does not discuss alternative explanations for the observed improvements. No consideration of whether ReMind's gains stem from test-time compute scaling alone rather than the specific architectural design." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures accuracy of output prediction (Eq. 1) and frames it as 'deductive code reasoning' ability. The metric directly measures output prediction correctness, which is the claimed capability. No proxy gap exists between the measurement and the claim." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed as 'GPT-4o-mini,' 'DeepSeek-V3,' 'DeepSeek-R1,' and 'o1' without specific version dates, snapshot IDs, or API versions. No model version identifiers like 'gpt-4o-mini-2024-07-18' are provided." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper states models are 'prompted with simple while widely used Chain-of-Thought Prompting (CoT)' (Section 2.2) but does not provide the actual prompt text used for CoT, CoC, RHDA, or any ReMind agent prompts." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the 5 LLMs used. Test cases are capped at 15 (Section 2.2), but LLM inference settings are not stated." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The ReMind multi-agent framework is described in detail in Section 3 with a workflow diagram (Fig. 4). 
The roles of Mutator (Section 3.1.1), Executor (Section 3.1.2), and Inspector (Section 3.1.3) are explained, including the CFG-based validation mechanism." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 2.2 describes the validation protocol in detail: LLMs generate code, solutions are validated against official test cases, only instances where all LLMs produce correct code are retained. Resulting sizes (152 HumanEval, 328 LiveCodeBench) and test case cap (15) are stated." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5.3 'Threats to Validity' provides a dedicated subsection discussing limitations with substantive content across three paragraphs." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5.3 identifies specific threats: (1) weaker models may not benefit from mutation/inspection, (2) the approach is limited to 'sequential, deterministic programs' and struggles with 'concurrent, asynchronous, or side-effect-heavy code,' (3) computational overhead of the multi-agent workflow." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.3 explicitly states the approach is 'primarily designed for sequential, deterministic programs' and 'may face challenges when applied to concurrent, asynchronous, or side-effect-heavy code (e.g., I/O operations, network calls).'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 8 states 'The code and data are available at https://anonymous.4open.science/r/remind-71F0/'. The underlying benchmarks (HumanEval, LiveCodeBench) are also publicly available." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 2.2 describes the data collection: LLMs generate code for generation benchmarks, code is validated against test cases, filtering retains only problems where all models produce correct solutions. Source benchmarks and their properties are described in Table 1." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks (HumanEval, LiveCodeBench)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The filtering pipeline is documented: start with full benchmarks (164 HumanEval, 880 LiveCodeBench) → select time period (Oct 2023-Jan 2025 for LiveCodeBench) → generate code from 5 LLMs → validate → retain only problems for which all 5 LLMs produce correct solutions → cap test cases at 15 → final sets (152, 328)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources are mentioned anywhere in the paper. No acknowledgments section discusses grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Jun Gao and Xiaoxue Ren at Zhejiang University, Yun Peng at The Chinese University of Hong Kong. Authors are from academic institutions and do not evaluate products from their employers." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence of funding cannot be assessed. The absence of any funding disclosure means this criterion is not satisfied." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 
224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper mentions LiveCodeBench problems are 'later than the knowledge cutoff of OpenAI o1' (Section 2.2) but does not state specific training cutoff dates for any of the 5 models (GPT-4o-mini, DeepSeek-V3, DeepSeek-R1, o1-Low, o1-High)." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Section 2.3 (RQ3) explicitly discusses contamination: 'HumanEval has been publicly available for several years, raising concerns that potential data leakage may compromise the authenticity of reported findings.' LiveCodeBench is used specifically to address this concern." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "The paper acknowledges HumanEval contamination risk and uses LiveCodeBench (problems released Oct 2023-Jan 2025, which the paper describes as later than the knowledge cutoff of OpenAI o1; cutoffs for the other evaluated models are not stated) as an evaluation intended to be contamination-free. This temporal split is a concrete mitigation strategy." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All experiments evaluate LLM performance on code benchmarks." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section 5.2 reports 'ReMind requires 5.0 and 5.2 API calls for HumanEval and LiveCodeBench on average' and compares this to baselines (CoT/CoC: fewer, RHDA: 5.3/5.5). While not dollar costs, the API call count provides a concrete cost comparison." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated — no GPU hours, total API spend, or wall-clock time for the full experimental evaluation across 5 LLMs × 2 benchmarks × 4 methods." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seeds, seed sensitivity, or results across multiple seeds anywhere in the paper. All results appear to be from single runs." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. Results appear to be single-run without explicit confirmation." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. The mutation strategies and ReMind configuration appear fixed with no description of how design choices were selected." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of how the ReMind configuration (number of mutations, iteration count, etc.) was selected. 
The paper does not explain whether configurations were tuned on a validation set or chosen a priori." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable. The absence of statistical tests is captured in the significance_tests item." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement all baselines (CoT, CoC, RHDA) and evaluate their own system (ReMind) without acknowledging the bias of evaluating their own implementation of baselines vs. their own proposed system." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Section 5.2 acknowledges ReMind uses more API calls than CoT/CoC but does not report performance as a function of compute budget. No performance-compute curves or matched-compute comparisons are provided." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper repurposes code generation benchmarks (HumanEval, LiveCodeBench) for evaluating deductive reasoning but does not discuss whether these benchmarks actually measure the claimed 'deductive code reasoning' capability or whether the construct is valid." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "ReMind IS the scaffold being tested. The comparison is between different methods (ReMind vs CoT/CoC/RHDA), not between different models using the same scaffold. The scaffold is the independent variable." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "The paper explicitly uses LiveCodeBench with problems from 'Oct 2023 to Jan 2025 (later than the knowledge cutoff of OpenAI o1)' to mitigate temporal leakage (Section 2.2)." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The code and test inputs are provided to LLMs, but there is no analysis of whether context formatting could provide unintended hints." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of potential non-independence between HumanEval and LiveCodeBench problems, or between training data and test problems beyond temporal separation." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "A temporal split is used as a concrete prevention method: LiveCodeBench problems are selected from Oct 2023 to Jan 2025, a window the paper describes as later than the knowledge cutoff of OpenAI o1 (cutoffs for the other evaluated models are not stated). This is a deliberate leakage prevention strategy, not just conceptual discussion." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "LLMs demonstrate up to a 44.7% gap between code generation and deductive code reasoning abilities", 365 "evidence": "Table 2 shows GPT-4o-mini achieves only 55.3% accuracy on self-execution in HumanEval despite producing functionally correct code (100% pass rate), yielding a 44.7% gap. All models show non-zero gaps (Section 2.3, RQ1).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "LLMs exhibit a consistent self-reasoning bias, with up to 43% relative performance drop in cross-execution settings", 370 "evidence": "Table 2 and Figure 1(b) show DeepSeek-V3 drops from 85.5% (self-execution) to 48.7% when reasoning about o1-Low's code, a 43% relative decline. 
All models show diagonal dominance in the performance matrix (Section 2.3, RQ2).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "ReMind achieves a maximum average improvement of 23.2 in absolute accuracy over best baselines", 375 "evidence": "Comparing Tables 4 and 6: on LiveCodeBench with DeepSeek-R1, ReMind averages 77.7% vs RHDA at 53.7% (~24pp) and CoC at 54.5% (~23.2pp). Multiple LLM×benchmark combinations show large improvements.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "ReMind eliminates self-reasoning bias and sustains high robustness regardless of code source", 380 "evidence": "Table 5 shows ReMind achieves the lowest standard deviation across code sources (2.9 on HumanEval, 1.3 on LiveCodeBench) compared to CoT (8.7, 3.9). Table 4 color coding also shows uniform performance.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "ReMind enables robust zero-shot generalization on complex benchmarks", 385 "evidence": "Table 6 shows ReMind with o1-High achieves 78.4% average on LiveCodeBench vs CoT (59.1%), CoC (57.6%), RHDA (61.1%). Performance gap between HumanEval and LiveCodeBench is smaller for ReMind than baselines.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Both Mutator and Inspector are indispensable components, and their collaboration yields the best performance", 390 "evidence": "Table 7 ablation study: removing Inspector causes larger drops than removing Mutator (e.g., DS-V3 LiveCodeBench: 70.1 → 54.9 w/o I vs 70.1 → 58.0 w/o M). Removing both degrades to CoT performance.", 391 "supported": "strong" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No error bars or statistical tests", 397 "detail": "All results across Tables 2-7 are point estimates without confidence intervals, error bars, or significance tests. Claims of superiority ('significantly outperforms') are made by comparing raw accuracy numbers. With no indication of how many runs were conducted, results could be due to sampling noise." 
398 }, 399 { 400 "flag": "Single-run experiments without seed sensitivity", 401 "detail": "The paper does not state how many runs produced the results or whether seed sensitivity was tested. LLM outputs are stochastic, and single-run results on 152-328 problems may not be stable. This is especially concerning given the paper's own finding about reasoning inconsistency." 402 }, 403 { 404 "flag": "Model versions not specified", 405 "detail": "No exact API versions or snapshot dates are given for any of the 5 LLMs. Since LLM behavior changes across versions, results may not be reproducible with current model versions." 406 }, 407 { 408 "flag": "Single metric evaluation", 409 "detail": "Only accuracy (Eq. 1) is reported. The strict all-or-nothing metric (product across test cases) could obscure partial improvements. No complementary metrics like per-test-case accuracy or reasoning trace quality are provided." 410 }, 411 { 412 "flag": "Prompts not provided", 413 "detail": "None of the actual prompts used for CoT, CoC, RHDA, or the three ReMind agents (Mutator, Executor, Inspector) are provided in the paper. This makes the comparison between methods unreproducible." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Evaluating large language models trained on code", 419 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 420 "year": 2021, 421 "arxiv_id": "2107.03374", 422 "relevance": "Introduces HumanEval, the primary benchmark used in this paper's evaluation of LLM code reasoning." 423 }, 424 { 425 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 426 "authors": ["Naman Jain", "King Han", "Alex Gu"], 427 "relevance": "Contamination-free code evaluation benchmark used as the zero-shot test set in this paper." 
428 }, 429 { 430 "title": "CruxEval: A benchmark for code reasoning, understanding and execution", 431 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather"], 432 "year": 2024, 433 "arxiv_id": "2401.03065", 434 "relevance": "Code reasoning benchmark discussed as insufficient for studying deductive reasoning challenges." 435 }, 436 { 437 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 438 "authors": ["Daya Guo", "Dejian Yang"], 439 "year": 2025, 440 "arxiv_id": "2501.12948", 441 "relevance": "One of the five evaluated LLMs, a reasoning-focused model tested for deductive code reasoning." 442 }, 443 { 444 "title": "DeepSeek-V3 technical report", 445 "authors": ["Aixin Liu", "Bei Feng"], 446 "year": 2024, 447 "arxiv_id": "2412.19437", 448 "relevance": "One of the five evaluated LLMs, showing the strongest self-reasoning bias in cross-execution settings." 449 }, 450 { 451 "title": "Chain-of-thought prompting elicits reasoning in large language models", 452 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 453 "year": 2022, 454 "relevance": "Foundational prompting method used as a baseline and the default reasoning strategy in the empirical study." 455 }, 456 { 457 "title": "Chain of Code: Reasoning with a Language Model-Augmented Code Emulator", 458 "authors": ["Chengshu Li", "Jacky Liang", "Andy Zeng"], 459 "relevance": "Code-specific reasoning baseline that prompts LLMs to simulate execution, compared against ReMind." 460 }, 461 { 462 "title": "Unveiling the Magic of Code Reasoning through Hypothesis Decomposition and Amendment", 463 "authors": ["Yuze Zhao", "Tianyun Ji", "Wenjun Feng"], 464 "year": 2025, 465 "relevance": "RHDA framework baseline that decomposes execution into sub-procedures, the strongest non-ReMind baseline." 
466 }, 467 { 468 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 469 "authors": ["John Yang", "Carlos E Jimenez"], 470 "year": 2024, 471 "relevance": "Multi-agent code reasoning system relevant to understanding LLM-based software engineering agents." 472 }, 473 { 474 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 475 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 476 "year": 2024, 477 "relevance": "LLM-driven multi-agent framework using structured interaction and role-based constraints for collaborative problem solving." 478 }, 479 { 480 "title": "CodeHalu: Investigating code hallucinations in LLMs via execution-based verification", 481 "authors": ["Yuchen Tian", "Weixiang Yan"], 482 "year": 2025, 483 "relevance": "Studies code hallucination in LLMs through execution-based verification, directly related to code reasoning reliability." 484 }, 485 { 486 "title": "How Accurately Do Large Language Models Understand Code?", 487 "authors": ["Sabaat Haroon", "Ahmad Faraz Khan"], 488 "year": 2025, 489 "arxiv_id": "2504.04372", 490 "relevance": "Empirical study of LLM code understanding accuracy, using mutation-based evaluation related to ReMind's Mutator." 491 }, 492 { 493 "title": "Mutation-guided llm-based test generation at Meta", 494 "authors": ["Christopher Foster", "Abhishek Gulati", "Mark Harman"], 495 "year": 2025, 496 "arxiv_id": "2501.12862", 497 "relevance": "Industry-scale LLM mutation testing at Meta, relevant to understanding LLM-driven code mutation approaches." 498 } 499 ] 500 }