scan.json (29984B)
1 { 2 "paper": { 3 "title": "Monte Carlo Tree Search for Execution-Guided Program Repair with Large Language Models", 4 "authors": ["Yixuan Liang"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.00129" 8 }, 9 "scan_version": 3, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "CodePilot combines Qwen3-8B with Monte Carlo Tree Search for automated GitHub issue resolution, achieving 24.67% resolve rate on SWE-bench Lite (300 instances). Ablation studies show MCTS contributes the largest improvement (+4.34pp), followed by thinking mode (+3.67pp) and self-refinement (+3.00pp). However, all results appear to be single-run with no statistical tests or uncertainty quantification, and the paper provides no code, no limitations section, and claims contamination-free evaluation without any contamination analysis.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, code archive, or download link is provided anywhere in the paper. The framework 'CodePilot' is described but no source code is released." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper evaluates on SWE-bench Lite, a publicly available benchmark. The fine-tuning dataset is described as 'historical GitHub issues with verified patches' but is not released separately." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions 'NVIDIA A100 GPUs' and 'vLLM inference' (Section VII.A) but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. 
The methodology sections describe the approach conceptually but do not include commands, scripts, or a reproduction guide." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Tables I and II are reported as point estimates (e.g., '24.67%') with no confidence intervals, error bars, or ± notation." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The abstract claims CodePilot 'significantly outperforming existing open-weight baselines' but no statistical significance test (p-value, t-test, bootstrap, etc.) is reported. Differences are assessed by comparing raw percentages only." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Table I provides baseline context (e.g., Agentless Qwen3-8B at 19.33% vs CodePilot at 24.67%, a +5.34pp improvement). Table II reports deltas for each ablation (e.g., -4.34% for w/o MCTS). These provide sufficient context to assess effect magnitude." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper uses SWE-bench Lite (300 instances) without justifying this sample size or discussing whether 300 instances provide sufficient statistical power for the claimed comparisons." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measure is reported. All results appear to be from single runs with no indication of result stability across seeds or repetitions." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Table I compares against Direct Generation (CodeLlama-7B), Agentless (CodeLlama-7B, DeepSeek-7B, Qwen2.5-7B, Qwen3-8B), and SWE-agent (Qwen3-8B)." 
68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include recent models (Qwen3-8B, Qwen2.5-7B) and recent frameworks (Agentless, SWE-agent). However, only open-weight baselines of similar scale are compared — no proprietary or larger-scale baselines are included." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table II presents an ablation study removing MCTS (-4.34%), thinking mode (-3.67%), self-refinement (-3.00%), and hybrid retrieval (-2.34%), plus a direct generation baseline (-10.67%)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper reports both Resolve Rate and Apply Rate (Tables I and II). Section VI also defines CodeBLEU and Localization Accuracy metrics, though these are not reported in the results tables." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation is included. All evaluation is automated via test suite pass/fail on SWE-bench Lite. For a program repair system, human review of patch quality (correctness, readability, side effects) would be relevant." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "SWE-bench Lite is a standard evaluation benchmark separate from the fine-tuning data, which is described as 'historical GitHub issues with verified patches' (Section IV.A). The benchmark serves as a held-out test set." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": false, 97 "justification": "Only aggregate resolve rate and apply rate are reported. No per-repository, per-difficulty, or per-category breakdown is provided. SWE-bench Lite spans multiple repositories with varying difficulty, but this variation is hidden behind a single number." 
98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section III.B states 'our analysis shows incorrect localization accounts for one-third of failures,' providing a quantitative failure analysis. However, no qualitative examples of specific failures are shown." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": false, 107 "justification": "Every component in the ablation study (Table II) shows a positive contribution. No approaches that were tried and abandoned, configurations that failed, or unexpected negative outcomes are discussed." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": false, 114 "justification": "The abstract claims CodePilot 'significantly outperforming existing open-weight baselines' but no statistical significance test is performed — the comparison is based on raw percentage differences only. The word 'significantly' implies a tested statistical claim that is not substantiated." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The ablation study (Table II) makes causal claims by removing individual components (MCTS, thinking mode, self-refinement, hybrid retrieval) and measuring the impact. This controlled single-variable manipulation is an adequate design for causal attribution, though single-run results weaken confidence." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Program Repair with Large Language Models' broadly, and the abstract discusses 'automated GitHub issue resolution' in general terms. However, results are only on SWE-bench Lite (300 Python instances) with a single model (Qwen3-8B). No bounding to this specific setting is stated." 
125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No alternative explanations for the results are discussed. The improvement could be partly due to the fine-tuning data, the specific Qwen3 model capabilities, or SWE-bench Lite characteristics rather than MCTS alone. None of these confounds are considered." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures resolve rate on SWE-bench Lite but frames CodePilot as solving 'automated software engineering' and 'GitHub issue resolution' broadly. The gap between benchmark performance and real-world issue resolution capability is not acknowledged." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "The main model is specified as 'Qwen3-8B-Instruct' (Section VII.A). Baseline models include 'CodeLlama-7B', 'DeepSeek-7B', 'Qwen2.5-7B'. These include model family, size, and variant (Instruct). Note: Section IV mentions 'Qwen3-32B-Instruct' inconsistently." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "No actual prompt text is provided. The paper describes prompt functionality conceptually (e.g., presenting functions in dependency order, using thinking mode) but never shows the actual prompts sent to the model." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section VII.A reports: 'T = 0.6, top-p = 0.95. MCTS parameters: c = 1.4, K = 3, N = 16. LoRA uses rank r = 16.' Key inference and search parameters are specified." 
152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The paper describes the full pipeline in detail: hierarchical fault localization (Section III.B), MCTS-guided patch synthesis with UCT selection (Section III.C), execution-driven self-refinement loop (Section III.D), and confidence calibration (Section III.E). Workflow diagrams are provided in Figs. 1-3." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section V describes repository parsing via AST, semantic-aware code chunking with overlap strategy, issue text normalization including HTML cleaning and stack trace extraction, and context window packing with importance weighting." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "No dedicated limitations, threats to validity, or discussion section is present. The paper goes directly from results (Section VII) to conclusion (Section VIII) without discussing limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed anywhere in the paper. The conclusion mentions 'future work on multi-file modifications' but does not discuss specific threats to the current study's validity." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show, what settings were excluded, or what claims the authors are not making. The conclusion mentions future work but not current scope limitations." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw experimental data (per-instance results, generated patches, execution logs) is released. 
Only aggregate statistics in Tables I and II are provided." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": false, 190 "justification": "The fine-tuning dataset is described as 'historical GitHub issues with verified patches' (Section IV.A) but no details are provided about which repositories, time period, size, or selection criteria were used for this dataset." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. The evaluation uses SWE-bench Lite, a standard benchmark." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "While Section V documents the preprocessing pipeline for inference inputs, the fine-tuning data pipeline is undocumented — no information about how many training instances were collected, what filtering was applied, or how ground-truth patches were verified." 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding source or acknowledgments section is present in the paper. Whether the work is funded or unfunded is not stated." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "The author's affiliation is listed as 'Illinois Institute of Technology, Chicago, USA.' No conflict with the evaluated model vendor (Alibaba/Qwen) is apparent." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": false, 215 "answer": false, 216 "justification": "No funding is disclosed, so funder independence cannot be assessed. The work may be unfunded single-author university research, but the paper does not state this." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The training data cutoff for Qwen3-8B-Instruct is not stated. The paper additionally fine-tunes the model but does not specify the base model's training data coverage." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No analysis of whether SWE-bench Lite instances appear in Qwen3's pre-training data. No canary strings, membership inference, or temporal analysis is performed." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "SWE-bench was published in 2023 (arXiv:2310.06770) and is widely discussed online. Qwen3 could have been trained on its data. The paper twice claims 'contamination-free' (abstract, conclusion) but provides zero evidence to support this claim." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference cost, latency, or tokens consumed are reported. The MCTS approach with multiple expansion-simulation cycles per instance likely has significant compute cost, but this is not quantified." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The paper mentions 'NVIDIA A100 GPUs' (Section VII.A) but does not state total GPU hours, wall-clock time, or compute budget for either fine-tuning or inference." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No mention of multiple random seeds. All results appear to be from a single run, despite MCTS having stochastic elements that would make results seed-sensitive." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never stated. Results are presented as single point estimates without indicating how many repetitions produced them." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "MCTS parameters (c=1.4, K=3, N=16) and LoRA rank (r=16) are reported but no search budget, search method, or number of configurations tried is stated. It is unclear how these values were selected." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "No explanation of how the reported configuration was selected. The paper does not describe whether a validation set was used for tuning or how many alternatives were tried." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 
316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors implement their own baselines (Direct Generation, and presumably re-run Agentless and SWE-agent) without acknowledging that author-implemented baselines can systematically underperform." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "CodePilot uses MCTS with multiple expansion-simulation cycles, LoRA fine-tuning, and PPO — likely using substantially more compute than simple baselines like Direct Generation or Agentless. This compute difference is not discussed or controlled for." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether SWE-bench Lite adequately measures the claimed capability of 'automated software engineering' or 'GitHub issue resolution.' The benchmark's limitations and construct validity are not examined." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "Table I compares CodePilot (MCTS + hierarchical localization + self-refinement) against Agentless and SWE-agent, which use entirely different scaffolds. The performance difference is attributed to the approach without controlling for the scaffold confound." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether SWE-bench Lite issues and their solutions existed before Qwen3's training cutoff. SWE-bench issues come from real GitHub repositories and may have been part of training data." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information. 
The paper's retrieval-augmented approach provides the model with selected code context, but whether this mirrors realistic deployment conditions is not analyzed." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the fine-tuning data ('historical GitHub issues with verified patches') shares repositories or structural similarities with SWE-bench Lite instances." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No concrete leakage detection or prevention method is used despite the paper twice claiming 'contamination-free' evaluation. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline is described." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "CodePilot achieves 24.67% resolve rate on SWE-bench Lite, outperforming existing open-weight baselines.", 364 "evidence": "Table I shows CodePilot at 24.67% resolve rate vs Agentless (Qwen3-8B) at 19.33% and SWE-agent (Qwen3-8B) at 16.67%.", 365 "supported": "moderate" 366 }, 367 { 368 "claim": "MCTS contributes the largest individual improvement (+4.34pp) among all components.", 369 "evidence": "Table II ablation shows removing MCTS drops resolve rate from 24.67% to 20.33%.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Thinking mode contributes +3.67pp improvement.", 374 "evidence": "Table II shows removing thinking mode drops resolve rate from 24.67% to 21.00%.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Self-refinement contributes +3.00pp improvement.", 379 "evidence": "Table II shows removing self-refinement drops resolve rate from 24.67% to 21.67%.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Incorrect localization accounts for one-third of failures.", 384 "evidence": "Section III.B states 'our analysis shows incorrect localization accounts for one-third of failures' but no supporting data or 
breakdown is provided.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "CodePilot 'significantly outperforms' existing open-weight baselines.", 389 "evidence": "Table I shows percentage improvements but no statistical significance tests are performed. The word 'significantly' is used in the statistical sense without any test.", 390 "supported": "weak" 391 }, 392 { 393 "claim": "The evaluation is on 'contamination-free software engineering benchmarks.'", 394 "evidence": "Abstract and Section VIII assert contamination-free evaluation but provide no evidence — no training cutoff analysis, no overlap detection, no contamination testing.", 395 "supported": "unsupported" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "No error bars or multiple runs", 401 "detail": "All results in Tables I and II appear to be single-run point estimates with no uncertainty quantification. MCTS is inherently stochastic, making seed sensitivity analysis especially important, yet none is reported." 402 }, 403 { 404 "flag": "'Significantly outperforms' without statistical test", 405 "detail": "The abstract uses the word 'significantly' — a term with specific statistical meaning — but no significance test is performed. The claimed improvements are assessed only by comparing raw percentages." 406 }, 407 { 408 "flag": "Unsubstantiated contamination-free claim", 409 "detail": "The paper twice claims 'contamination-free' evaluation (abstract, conclusion) without any contamination analysis, training cutoff discussion, or leakage detection. SWE-bench has been public since 2023 and Qwen3 could have trained on it." 410 }, 411 { 412 "flag": "No code or reproducibility artifacts released", 413 "detail": "Despite proposing a complete framework (CodePilot), no source code, scripts, or reproduction instructions are provided. The results cannot be independently verified." 
414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper has no limitations, threats to validity, or discussion section. It goes directly from results to a short conclusion, providing no self-critical assessment." 418 }, 419 { 420 "flag": "Model size inconsistency", 421 "detail": "Section III.A describes using 'Qwen3-8B-Instruct' while Section IV opens with 'While Qwen3-32B-Instruct provides strong zero-shot capabilities.' It is unclear whether 8B or 32B was actually used for fine-tuning." 422 }, 423 { 424 "flag": "Textbook math presented as contribution", 425 "detail": "Sections III.A and IV present standard formulations (RoPE, GQA, SwiGLU, LoRA, PPO) as if they are contributions of the paper. These are established techniques reproduced from their original papers." 426 }, 427 { 428 "flag": "Ablation components may not be independent", 429 "detail": "The ablation study removes one component at a time, but component improvements sum to 13.35pp while the full system improvement over direct generation is 10.67pp. This suggests component interactions or that the ablation baseline is not truly 'direct generation' as labeled." 430 }, 431 { 432 "flag": "Fine-tuning data undocumented", 433 "detail": "The LoRA fine-tuning dataset is described only as 'historical GitHub issues with verified patches' with no information about size, source repositories, time period, or potential overlap with SWE-bench Lite." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 439 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"], 440 "year": 2023, 441 "arxiv_id": "2310.06770", 442 "relevance": "Defines the SWE-bench benchmark used to evaluate CodePilot; foundational benchmark for LLM-based program repair evaluation." 443 }, 444 { 445 "title": "CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning", 446 "authors": ["H. 
Le", "Y. Wang", "A. D. Gotmare", "S. Savarese", "S. C. H. Hoi"], 447 "year": 2022, 448 "relevance": "Pioneered reinforcement learning for code generation, directly related to the PPO-based fine-tuning strategy used in CodePilot." 449 }, 450 { 451 "title": "Large Language Models Cannot Self-Correct Reasoning Yet", 452 "authors": ["J. Huang", "X. Chen", "S. Mishra", "H. S. Zheng", "A. W. Yu", "X. Song", "D. Zhou"], 453 "year": 2023, 454 "arxiv_id": "2310.01798", 455 "relevance": "Challenges the assumption that LLMs can self-correct, motivating CodePilot's use of external execution feedback rather than self-critique alone." 456 }, 457 { 458 "title": "StarCoder: May the Source Be with You!", 459 "authors": ["R. Li", "L. B. Allal", "Y. Zi", "N. Muennighoff"], 460 "year": 2023, 461 "arxiv_id": "2305.06161", 462 "relevance": "Major open-weight code LLM; relevant to the paper's coverage of code generation models and their capabilities." 463 }, 464 { 465 "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct", 466 "authors": ["Z. Luo", "C. Xu", "P. Zhao", "Q. Sun"], 467 "year": 2023, 468 "arxiv_id": "2306.08568", 469 "relevance": "Instruction-tuning for code LLMs, directly relevant to understanding how fine-tuning affects code generation performance." 470 }, 471 { 472 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 473 "authors": ["P. Lewis", "E. Perez", "A. Piktus", "F. Petroni", "V. Karpukhin"], 474 "year": 2020, 475 "relevance": "Foundational RAG paper; CodePilot's hybrid retrieval for fault localization builds on this approach." 476 }, 477 { 478 "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow", 479 "authors": ["D. Guo", "S. Ren", "S. Lu", "Z. Feng"], 480 "year": 2020, 481 "arxiv_id": "2009.08366", 482 "relevance": "Structural code representations using data flow; relevant to CodePilot's AST-based code analysis." 
483 }, 484 { 485 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 486 "authors": ["S. Yao", "D. Yu", "J. Zhao", "I. Shafran", "T. Griffiths", "Y. Cao", "K. Narasimhan"], 487 "year": 2023, 488 "relevance": "Tree-search approach for LLM reasoning; directly related to CodePilot's MCTS-guided generation strategy." 489 }, 490 { 491 "title": "Training Language Models to Follow Instructions with Human Feedback", 492 "authors": ["L. Ouyang", "J. Wu", "X. Jiang", "D. Almeida", "C. Wainwright", "P. Mishkin"], 493 "year": 2022, 494 "relevance": "InstructGPT/RLHF paper; foundational to the PPO-based reinforcement learning from execution feedback used in CodePilot." 495 }, 496 { 497 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 498 "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"], 499 "year": 2023, 500 "relevance": "Self-reflective agents that improve through verbal feedback; directly related to CodePilot's execution-driven self-refinement mechanism." 501 } 502 ], 503 "engagement_factors": { 504 "practical_relevance": { 505 "score": 1, 506 "justification": "Describes a potentially useful program repair framework but releases no code, making it impractical for adoption." 507 }, 508 "surprise_contrarian": { 509 "score": 0, 510 "justification": "Confirms the expected finding that MCTS improves LLM code generation; no conventional wisdom is challenged." 511 }, 512 "fear_safety": { 513 "score": 0, 514 "justification": "No safety, security, or risk concerns are raised by this work." 515 }, 516 "drama_conflict": { 517 "score": 0, 518 "justification": "No controversy or conflict angle; a straightforward benchmark evaluation paper." 519 }, 520 "demo_ability": { 521 "score": 0, 522 "justification": "No code, demo, or installable tool is provided." 
523 }, 524 "brand_recognition": { 525 "score": 1, 526 "justification": "Uses Qwen3 (moderately recognized) but the author and institution are not well-known in this space." 527 } 528 } 529 }