scan.json (30372B)
1 { 2 "paper": { 3 "title": "Are They All Good? Evaluating the Quality of CoTs in LLM-based Code Generation", 4 "authors": [ 5 "Binquan Zhang", 6 "Li Zhang", 7 "Zhiwen Luo", 8 "Yuxin Du", 9 "Fang Liu", 10 "Song Wang", 11 "Lin Shi" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2507.06980", 16 "doi": "10.48550/arXiv.2507.06980" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval", "qualitative"], 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "Section VIII provides a GitHub link: https://github.com/binquanzzz/CoT-Eval with 'data and scripts online to facilitate replication or future work.'" 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "Section VIII states data is provided at the GitHub repository. The benchmarks used (CoderEval, SWE-bench) are also publicly available." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No requirements.txt, Dockerfile, or environment setup details are provided in the paper. Only models are named without specifying the runtime environment." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub link is provided but the paper itself contains no instructions for replicating the experiments." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All main results (pass@1 scores, factor percentages, recall/precision/F1) are reported as point estimates without confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "No statistical significance tests are used. 
Comparisons between models and between MAD vs single-LLM methods are made by comparing raw percentages without any formal tests." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Results are presented as raw percentages and counts without standardized measures of effect magnitude." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": true, 63 "justification": "Section III.A.3 explicitly justifies the pilot sample: 'we randomly selected 262 samples from 813 failed CoT samples (with a confidence level of 95% and a margin of error of ±5%).'" 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance, standard deviation, or spread measures are reported. Pass@1 values are single-run results. The taxonomy distribution percentages have no uncertainty measures." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The MAD framework is compared against single-LLM detection methods (DeepSeek-R1, Gemini-2.0, o1 alone). The repair experiment compares three feedback granularities against a 0% baseline." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "All three models used (DeepSeek-R1, Gemini-2.0-Flash-Thinking-Exp-01-21, o1-2024-12-17) are contemporary reasoning models released in late 2024 / early 2025." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The repair experiment (Section V) varies feedback granularity across three levels (simple, error type, detailed error feedback), functioning as an ablation of feedback information. The MAD framework tests different role assignments (DGO, GDO, OGD)." 
86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Detection is evaluated with Recall, Precision, and F1-Score (Table III). Code generation uses pass@1. Annotation quality uses Cohen's Kappa." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Four experts with 10+ years of experience manually analyzed CoT quality, supplemented by four additional annotators. Each sample was independently annotated by two participants, with Cohen's Kappa scores above 0.7." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Code correctness is evaluated using the original benchmark test suites (CoderEval unit tests, SWE-bench patches). The pilot set (262 samples) was used to build the taxonomy, with the remaining 551 annotated separately." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Extensive breakdowns are provided: by factor type (external vs internal), by sub-factor, by model (Table II), by dataset (Figure 7, CoderEval vs SWE-bench-NF), and by pass/fail status (Table I)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Sections III.B and III.C provide detailed failure case examples with figures: Unclear Implementation Details (Figure 2), Missing Dependencies (Figure 3), CoT Inconsistent with Prompt (Figure 4), Implicit Requirement Misinterpretation (Figure 5), Incorrect Planning (Figure 6)." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Self-repair results are critically low (0.0% pass@1 for all models on SWE-bench-NF with simple feedback, Table IV). MAD framework shows high recall but very low precision (e.g., MAD OGD: 43.9% recall but 5.49% precision). These limitations are honestly reported." 
116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims (53.60% external factors, 40.10% internal factors, 18.5% correct CoT → wrong code, 11.90% wrong CoT → correct code) are all directly supported by results in Section III.D and Tables I-II." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Most claims are descriptive (factor distributions) rather than strongly causal. The repair experiment's claim that 'more detailed feedback improves repair' is supported by controlled comparison across three feedback levels. The study design is adequate for the claims made." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title claims 'LLM-based Code Generation' broadly, but results are only for Python tasks on CoderEval and SWE-bench-NF with 3 specific reasoning models. The threats-to-validity section acknowledges Python focus but the title and abstract do not bound the claims to Python." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The threats-to-validity section discusses methodological concerns (subjectivity, prompt sensitivity) but does not consider alternative explanations for the observed findings, such as whether the factor distribution would differ with different prompt engineering strategies or whether the CoT quality issues are artifacts of the benchmarks rather than the models." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures pass@1 on test suites and frames it as code correctness — the measurement matches the claim granularity. CoT quality is assessed by expert annotators against explicit criteria. The paper does not overclaim beyond its measurements." 
143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific versions are provided: 'Gemini-2.0-Flash-Thinking-Exp-01-21', 'DeepSeek-R1', and 'OpenAI o1-2024-12-17'. These include snapshot dates or version identifiers." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": false, 154 "justification": "The repair feedback templates are provided verbatim in Section V.A. However, the MAD framework prompts (verifier, defender, arbiter roles) are only described in natural language without the actual system prompts. The main experiment uses benchmark inputs directly but the MAD prompts are not provided." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "The paper states 'we strictly adhere to the preset parameter configurations' but never specifies what those configurations are. No temperature, top-p, max tokens, or other sampling parameters are reported." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding (tool use, memory, retry logic) is used. The MAD framework is a multi-agent debate protocol, not agentic scaffolding in the typical sense." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section III.A.2 describes dataset selection: 230 Python tasks from CoderEval, 111 New Features instances from SWE-bench Full. The filtering criterion for SWE-bench-NF (New Features type) is clearly stated. The annotation pipeline (pilot → full annotation) is documented." 
170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section VI.B 'Threats to Validity' provides substantive discussion covering external validity (Python-only, generalizability), internal validity (manual analysis subjectivity), and construct validity (prompt format sensitivity)." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific threats are discussed: Python-only focus ('specifically focuses on Python code generation tasks'), subjectivity of CoT quality labeling ('different labelers may have varying determinations'), and prompt format sensitivity ('evaluation results may be sensitive to the prompt format'). These are specific to this study." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section VI.B states: 'Our empirical study specifically focuses on Python code generation tasks' and notes 'it would be valuable to extend this analysis to other programming languages.' The SWE-bench subset is bounded to New Features only." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "Section VIII provides a GitHub link (https://github.com/binquanzzz/CoT-Eval) with 'data and scripts online to facilitate replication or future work,' implying raw CoT-code pairs and annotations are available." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section III.A describes the collection process: three LLMs given benchmark tasks, producing 1,023 CoT-code pairs (690 from CoderEval, 333 from SWE-bench-NF). Code correctness evaluated per benchmark protocols." 
199 }, 200 "recruitment_methods_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section III.A.3 describes annotators: 'four experts, each with over ten years of experience in Java and Python development' for the pilot; 'one PhD student, two graduate students, and one undergraduate student' with SE research or open-source experience for remaining samples." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented: 1,023 total pairs → 813 failed (analyzed for taxonomy) + 210 passing (analyzed for CoT-code relationship). 262 pilot samples → codebook → 551 remaining samples. Each step with counts and criteria." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding or acknowledgments section is present in the paper. Funding status is not disclosed." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: Beihang University, China and York University, Canada. These are academic institutions with no direct conflict with the evaluated models." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "No funding is disclosed, so independence cannot be assessed. The authors evaluate third-party models (DeepSeek, Google, OpenAI), and no funding relationship with these companies is mentioned or denied." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement is present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the three models (DeepSeek-R1, Gemini-2.0-Flash-Thinking, o1). 
This is relevant because CoderEval and SWE-bench are public benchmarks." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether CoderEval or SWE-bench tasks appeared in the training data of the three reasoning models." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "CoderEval and SWE-bench are public benchmarks published before the models' training. No contamination analysis or discussion is provided, despite the risk that models may have seen benchmark solutions during training." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human subjects are studied. The annotators are part of the methodology (labeling CoT quality), not research participants." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human subjects are studied. Expert annotation of LLM outputs does not constitute human subjects research." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human subjects are studied. Annotator qualifications are described as part of methodology, not as participant demographics." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human subjects are studied." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human subjects study with experimental conditions." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human subjects study requiring blinding." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human subjects study with participants who could drop out." 
285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference costs reported despite using three commercial/expensive reasoning models (o1, DeepSeek-R1, Gemini) across 1,023+ generations plus MAD experiments (3 models per detection) plus repair experiments." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No compute budget or API costs are stated. The MAD framework is acknowledged to have 'significant computational overhead' (Section IV.B) but this is not quantified." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of random seeds or multiple runs. All results appear to be from single runs per model." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": true, 308 "justification": "Section V.A states 'For each problem requiring repair, we generated one code solution.' The pass@1 metric implies single-attempt evaluation. The number of runs is implicitly clear." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Section VI.B mentions 'small-scale preliminary testing, experimenting with different prompts' but does not report how many configurations were tried or the search budget." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "The paper mentions selecting prompts that 'yield the most consistent and optimal performance across various models' but provides no details on the selection process or what configurations were considered." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 
324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The MAD framework is the authors' proposed approach, compared against single-LLM methods. No acknowledgment of potential bias in evaluating their own method, such as whether the MAD prompt design was tuned to perform well." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "The MAD framework uses 3 models per detection (vs 1 for baselines) — roughly 3x compute. Section IV.B mentions 'significant computational overhead' but does not quantify the cost-performance tradeoff." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of whether CoderEval and SWE-bench-NF actually measure what the paper claims to evaluate (CoT quality in code generation). The paper uses pass@1 as the ultimate criterion without questioning whether test-case passing equates to code quality." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No agentic scaffolding is used. Models are directly prompted with benchmark tasks." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of temporal leakage. CoderEval (2024) and SWE-bench (2023) were published before the models used, and solutions may appear in training data." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information. The benchmarks provide docstrings and context that may partially overlap with training data." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether benchmark examples share structural similarities with training data or with each other." 
361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention methods applied. No canary strings, membership inference, or decontamination." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Among the factors affecting CoT quality, external factors account for 53.60% and internal factors make up 40.10%. External factors are primarily unclear requirements and lack of contextual information; internal factors are mainly misunderstanding of instructions.", 372 "evidence": "Section III.D reports distributions based on manual annotation of 813 failed CoT samples by expert annotators with Cohen's Kappa ≥0.7. Table II provides per-model breakdowns.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Despite CoT being correct, 18.5% of the generated code still contains errors, primarily due to LLMs failing to follow their own CoT steps.", 377 "evidence": "Section III.D: analysis of 210 passing code samples and their associated CoTs. Also Finding 3 states 'even with a correct CoT, there remains an 18.5% probability of generating erroneous code.'", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Even when the code is correct, there is an 11.9% chance that the CoT contains errors.", 382 "evidence": "Section III.D and Table I: among 210 passing code samples, 25 had incorrect CoTs (25/210 = 11.9%).", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "The MAD framework significantly outperforms single-model approaches for detecting low-quality CoTs, particularly in Recall and F1-score.", 387 "evidence": "Table III: MAD DGO achieves 42.5% recall vs 8.75% for DeepSeek-R1 alone on CoderEval, and 46.36% vs 23.64% on SWE-bench-NF. 
However, precision is notably low (12.50% for MAD DGO on CoderEval).", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "The level of detail in feedback positively impacts LLMs' self-repair performance for faulty CoTs.", 392 "evidence": "Table IV: Detailed error feedback yields the highest pass@1 across models (e.g., Gemini-2.0: 5.2% → 5.7% → 7.4% on CoderEval). However, absolute improvements are very small and no statistical tests confirm significance.", 393 "supported": "weak" 394 }, 395 { 396 "claim": "76.4% of CoTs generated by LLMs have quality issues.", 397 "evidence": "Section III.D and Table I: 782/1023 CoT-code pairs had quality issues. The proportion is higher for SWE-bench-NF (82.3%) than CoderEval (73.6%).", 398 "supported": "moderate" 399 } 400 ], 401 "key_findings": "This empirical study of 1,023 CoT-code pairs from three reasoning models (DeepSeek-R1, o1, Gemini-2.0-Flash-Thinking) finds that 76.4% of generated CoTs have quality issues, with external factors (unclear requirements, missing context) accounting for 53.6% and internal factors (requirement misunderstanding, incorrect planning) 40.1%. A key finding is the disconnect between CoT and code quality: correct CoTs still produce erroneous code 18.5% of the time, while 11.9% of correct code comes from flawed CoTs, indicating LLMs don't strictly follow their reasoning chains. A Multi-Agent Debate framework improves faulty CoT detection recall (up to 48.5%) but suffers from very low precision (as low as 4.85%), and self-repair capabilities remain critically limited.", 402 "red_flags": [ 403 { 404 "flag": "No uncertainty quantification", 405 "detail": "All main results (pass@1, factor distributions, recall/precision/F1) are reported as point estimates without confidence intervals, error bars, or statistical significance tests. Differences between models and methods are presented as meaningful without any formal testing." 
406 }, 407 { 408 "flag": "Complete absence of contamination analysis", 409 "detail": "CoderEval (2024) and SWE-bench (2023) are public benchmarks that may appear in the training data of the three reasoning models. No training cutoff dates are stated and no contamination analysis is performed, yet pass@1 scores are interpreted at face value." 410 }, 411 { 412 "flag": "Single-run results without replication", 413 "detail": "Pass@1 is computed from single-run generation with no repeated trials or seed sensitivity analysis. Given the stochastic nature of LLM generation, single-run results may not be stable." 414 }, 415 { 416 "flag": "Cohen's Kappa overclaimed", 417 "detail": "Section III.A.3 reports Cohen's Kappa of 82% as 'indicating perfect agreement.' By the standard Landis-Koch interpretation, 0.82 is at the bottom of 'almost perfect' (0.81-1.00), not 'perfect.'" 418 }, 419 { 420 "flag": "Hyperparameters not specified", 421 "detail": "The paper uses three reasoning models with 'preset parameter configurations' that are never specified. Temperature, top-p, and other sampling parameters significantly affect output and are not reported." 422 }, 423 { 424 "flag": "MAD precision critically low", 425 "detail": "While the MAD framework's recall is emphasized, its precision is extremely low (as low as 4.85% for MAD OGD on SWE-bench-NF), meaning it produces overwhelming false positives. The paper's framing emphasizes the recall improvement without adequately addressing the practical uselessness of such low precision." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "SWE-bench: Can language models resolve real-world github issues?", 431 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 432 "year": 2023, 433 "arxiv_id": "2310.06770", 434 "relevance": "Foundational benchmark for evaluating LLMs on real-world software engineering tasks, used as one of two evaluation datasets in this study." 
435 }, 436 { 437 "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation", 438 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 439 "year": 2023, 440 "arxiv_id": "2305.01210", 441 "relevance": "Proposes EvalPlus framework for rigorous LLM code generation evaluation, addressing test adequacy in benchmarks." 442 }, 443 { 444 "title": "Chain-of-thought prompting elicits reasoning in large language models", 445 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma"], 446 "year": 2022, 447 "relevance": "Foundational paper introducing CoT prompting that this study directly evaluates for quality in code generation." 448 }, 449 { 450 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 451 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 452 "year": 2025, 453 "arxiv_id": "2501.12948", 454 "relevance": "Open-source reasoning model evaluated in this study, relevant to LLM capability assessment." 455 }, 456 { 457 "title": "Evaluating large language models trained on code", 458 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 459 "year": 2021, 460 "arxiv_id": "2107.03374", 461 "relevance": "Introduces Codex and code generation evaluation methodology including pass@k metrics widely adopted in the field." 462 }, 463 { 464 "title": "Structured chain-of-thought prompting for code generation", 465 "authors": ["Jia Li", "Ge Li", "Yongmin Li", "Zhi Jin"], 466 "year": 2023, 467 "relevance": "Proposes structured CoT for code generation, directly related to the CoT quality evaluation in this study." 
468 }, 469 { 470 "title": "Bugs in large language models generated code: An empirical study", 471 "authors": ["Florian Tambon", "Arghavan Moradi Dakhel", "Amin Nikanjam", "Foutse Khomh"], 472 "year": 2024, 473 "arxiv_id": "2403.08937", 474 "relevance": "Classifies defective code generated by LLMs using CoderEval, providing an error taxonomy for LLM code quality." 475 }, 476 { 477 "title": "Chain-of-thought reasoning in the wild is not always faithful", 478 "authors": ["Iván Arcuschin", "Jett Janiak", "Robert Krzyzanowski"], 479 "year": 2025, 480 "arxiv_id": "2503.08679", 481 "relevance": "Directly relevant to the CoT-code disconnect finding: studies whether CoT faithfully represents model reasoning." 482 }, 483 { 484 "title": "Refining chatgpt-generated code: Characterizing and mitigating code quality issues", 485 "authors": ["Yue Liu", "Thanh Le-Cong", "Ratnadira Widyasari"], 486 "year": 2024, 487 "relevance": "Systematic evaluation of ChatGPT code reliability including quality attributes beyond correctness." 488 }, 489 { 490 "title": "ClarifyGPT: A framework for enhancing LLM-based code generation via requirements clarification", 491 "authors": ["Fangwen Mu", "Lin Shi", "Song Wang"], 492 "year": 2024, 493 "doi": "10.1145/3660810", 494 "relevance": "Addresses requirement ambiguity in LLM code generation — directly related to the external factors (unclear requirements) identified in this study." 495 }, 496 { 497 "title": "Tree of thoughts: Deliberate problem solving with large language models", 498 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 499 "year": 2023, 500 "relevance": "Alternative reasoning framework to CoT, relevant to understanding LLM reasoning strategies for code generation." 
501 }, 502 { 503 "title": "Agentless: Demystifying LLM-based software engineering agents", 504 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 505 "year": 2024, 506 "arxiv_id": "2407.01489", 507 "relevance": "Evaluates LLM-based approaches to software engineering tasks including code generation, relevant to agentic AI evaluation." 508 }, 509 { 510 "title": "Code repair with LLMs gives an exploration-exploitation tradeoff", 511 "authors": ["Hao Tang", "Keya Hu", "Jin Zhou"], 512 "year": 2024, 513 "relevance": "Studies LLM self-repair capabilities for code, directly relevant to the self-repair investigation in this study." 514 }, 515 { 516 "title": "Encouraging divergent thinking in large language models through multi-agent debate", 517 "authors": ["Tian Liang", "Zhiwei He", "Wenxiang Jiao"], 518 "year": 2023, 519 "arxiv_id": "2305.19118", 520 "relevance": "Introduces the Multi-Agent Debate framework adopted in this study for detecting low-quality CoTs." 521 } 522 ] 523 }