scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19832B)
      1 {
      2   "paper": {
      3     "title": "A comparative study of large language models with chain-of-thought prompting for automated program repair",
      4     "authors": ["Eko Darwiyanto", "Rizky Akbar Gusnaen", "Rio Nurtantyana"],
      5     "year": 2025,
      6     "venue": "IAES International Journal of Artificial Intelligence (IJ-AI)",
      7     "doi": "10.11591/ijai.v14.i6.pp4579-4589"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code repository URL is provided. The DATA AVAILABILITY section states 'Derived data supporting the findings of this study are available from the corresponding author, [RN], on request,' which does not count as released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The study uses the QuixBugs dataset, which is a publicly available benchmark (reference [13])."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions Python and API endpoints but provides no requirements.txt, library versions, or environment specification details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions or README are provided. The workflow is described at a high level (Figures 2 and 3) but without specific commands or scripts."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as averages over 5 runs but no confidence intervals or error bars are provided."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CoT prompting 'improves performance' and that DeepSeek-V3 is best, but no statistical significance tests are used. Comparisons are based solely on comparing average numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute improvements with baseline context, e.g., 'GPT-4o records an increase from 31.4 to 35.8 plausible patches' (Section 3.1)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Each model was tested 5 times with no justification for why 5 runs were chosen. No power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Only averages are reported across the 5 runs. No standard deviation, variance, or spread measures are provided."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Standard (zero-shot) prompting is used as the baseline comparison for CoT prompting across all models."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The 10 LLM models tested were all released in mid-to-late 2024, which is contemporary for a 2025 paper."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed on the CoT prompt components (e.g., removing the example section, varying the number of steps). The CoT prompt has multiple components but their individual contributions are not tested."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper evaluates both plausible patches (correctness) and token usage cost (efficiency)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. Evaluation is entirely automated via Pytest test cases. Human evaluation could have assessed patch quality beyond test-passing."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — the study uses a fixed benchmark (QuixBugs) with no training/tuning phase. The prompts are not tuned on a subset of the data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 (Appendix) provides per-program results for all 40 QuixBugs programs across all 10 models, and Figure 6 breaks down failures by bug type."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3.3 identifies failure patterns, discusses the most challenging programs (e.g., lcs_length with 9/10 failures), and analyzes bug types that cause the most failures."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that o1-mini and o1-preview show decreased performance with CoT prompting, and provides an explanation for why (Section 3.1)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about DeepSeek-V3 (36.6 patches, $0.006) and GPT-4o (35.8 patches, $0.226) are supported by the results in Section 3. The claim that 'CoT prompting improves performance in most models' is supported, with caveats noted."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims CoT prompting 'improves' LLM reasoning ability and 'successfully guides models' (causal language), but the study design is a simple before/after comparison without controlling for confounds like prompt length differences between standard and CoT prompts."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract claims CoT is 'an effective technique to improve LLM reasoning ability in APR tasks' but the study only tests on the Python portion of QuixBugs (40 small programs). The title says 'automated program repair' generally without bounding to this small benchmark."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the performance improvements, such as the effect of additional prompt length/context in CoT vs standard prompts, or whether the few-shot examples (not CoT steps themselves) drive the improvement."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are listed by marketing names (GPT-4o, Claude-3.5-Sonnet, etc.) with release dates but without specific API version strings or snapshot dates. 'GPT-4o' released '06 Aug' does not specify which version (e.g., gpt-4o-2024-08-06)."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Figure 1 shows the prompt structure as a diagram but does not provide the actual prompt text. The exact wording of the CoT steps, examples, and task instructions is not given."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 2.2 states 'the temperature is set to 0.0 and top_p to 0.01' using greedy sampling."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The system sends single API calls per bug."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Sections 2.4-2.5 describe the APR workflow from reading buggy files to prompt construction to response extraction to validation via Pytest, with flowcharts in Figures 2 and 3."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations section. The conclusion briefly mentions limitations in one sentence: 'The limitations of this study include the use of a dataset restricted to only the Python version of the QuixBugs dataset, as well as the use of LLM models that are only publicly accessible through APIs.'"
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed; the only limitations statement (a single sentence in the conclusion, covering the Python-only dataset and API-only model access) is generic."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While the conclusion mentions the dataset is restricted to Python QuixBugs, the paper does not explicitly state what the results do NOT show or what claims cannot be made."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw data is 'available from the corresponding author on request,' which does not count as publicly available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection procedure is described: QuixBugs benchmark used, each model tested 5 times, responses extracted and validated via Pytest (Sections 2.1, 2.4, 2.5)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data source is a standard benchmark (QuixBugs)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from reading buggy files to prompt construction to API call to response extraction to Pytest validation is documented with flowcharts (Figures 2 and 3)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "FUNDING INFORMATION section states: 'This research was supported by Directorate of Research and Community Service, Telkom University.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Telkom University and BRIN. No authors are affiliated with the LLM companies being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funder (Telkom University) has no financial interest in the outcome of comparing LLM models for APR."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "A CONFLICT OF INTEREST STATEMENT section is present stating: 'Authors state no conflict of interest.'"
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Table 1 lists knowledge cutoff dates for some models (e.g., GPT-4o: Oct. 23, Claude-3-5-Sonnet: Apr. 24), though several are marked as unknown ('-')."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "QuixBugs was published in 2017 and is publicly available. All models tested were trained well after 2017, yet no discussion of potential contamination is provided."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "QuixBugs (2017) predates all models' training cutoffs. The benchmark programs are publicly available and likely in training data, but this contamination risk is never discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Token usage costs are reported per model for both prompting methods (Figure 5, Section 3.2), e.g., DeepSeek-V3 at $0.006 and o1-preview at $6.775."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated. Individual per-run costs are reported but total API spend for the entire experiment (5 runs x 10 models x 2 methods x 40 programs) is not given."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoT prompting improves performance in most LLM models for APR tasks compared to standard prompting.",
    286       "evidence": "Figure 4 and Section 3.1 show improvements for GPT-4o (31.4 to 35.8), Grok-2 (26.2 to 29.8), DeepSeek-V3, and Llama-3.3-70B, but declines for o1-mini and o1-preview.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "DeepSeek-V3 achieves the highest performance with an average of 36.6 plausible patches and the lowest cost of $0.006.",
    291       "evidence": "Sections 3.1 and 3.2 report these numbers directly. Table 2 confirms 37 patches in the best run.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "GPT-4o shows competitive results with an average of 35.8 plausible patches at a cost of $0.226.",
    296       "evidence": "Reported in Sections 3.1 and 3.2.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "o1-mini and o1-preview show decreased performance with CoT prompting due to internal CoT mechanisms conflicting with external CoT prompts.",
    301       "evidence": "Section 3.1 discusses this with reference to OpenAI documentation [28]. The performance decrease is shown in Figure 4, but the causal explanation is speculative.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "This paper compares 10 LLMs on the QuixBugs benchmark using standard vs. chain-of-thought prompting for automated program repair. CoT prompting improved plausible patch counts for most models, with DeepSeek-V3 achieving the best performance (36.6/40 average) at the lowest cost ($0.006). However, models with built-in reasoning (o1-mini, o1-preview) showed decreased performance with external CoT prompting. Failure analysis identified multi-location bugs and missing function calls as the most challenging bug types.",
    307   "red_flags": [
    308     {
    309       "flag": "Benchmark contamination risk",
    310       "detail": "QuixBugs was published in 2017 and is freely available online. All 10 tested models were trained after 2017, meaning the benchmark programs are very likely in their training data. This is never discussed and could mean the models are 'remembering' fixes rather than reasoning about them."
    311     },
    312     {
    313       "flag": "No statistical rigor",
    314       "detail": "With only 5 runs and no error bars, confidence intervals, or significance tests, the reported differences between models and prompting methods cannot be distinguished from noise. The difference between 35.8 and 36.6 average patches may not be statistically significant."
    315     },
    316     {
    317       "flag": "Prompts not provided",
    318       "detail": "The actual prompt text is not given — only a structural diagram (Figure 1). Without the exact prompts, the experiment cannot be reproduced, and the quality of the CoT examples cannot be assessed."
    319     },
    320     {
    321       "flag": "Confound between CoT and few-shot",
    322       "detail": "The CoT prompt includes two worked examples (few-shot), while the standard prompt is zero-shot. Performance improvements could be due to few-shot examples rather than chain-of-thought reasoning, but this confound is never discussed."
    323     },
    324     {
    325       "flag": "Tiny benchmark",
    326       "detail": "QuixBugs contains only 40 small single-function programs with simple bugs. Claims about APR effectiveness based on this benchmark have very limited generalizability."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "A survey on automated program repair techniques",
    332       "authors": ["K. Huang"],
    333       "year": 2023,
    334       "arxiv_id": "2303.18184",
    335       "relevance": "Comprehensive survey of APR techniques including LLM-based approaches."
    336     },
    337     {
    338       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    339       "authors": ["J. Wei"],
    340       "year": 2022,
    341       "relevance": "Foundational work on CoT prompting used as the basis for this study's approach."
    342     },
    343     {
    344       "title": "Can OpenAI's codex fix bugs?: an evaluation on QuixBugs",
    345       "authors": ["J. A. Prenner", "H. Babii", "R. Robbes"],
    346       "year": 2022,
    347       "doi": "10.1145/3524459.3527351",
    348       "relevance": "Prior evaluation of LLM (Codex) on QuixBugs benchmark for APR."
    349     },
    350     {
    351       "title": "A survey of learning-based automated program repair",
    352       "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"],
    353       "year": 2023,
    354       "doi": "10.1145/3631974",
    355       "relevance": "Survey of learning-based APR methods published in TOSEM."
    356     },
    357     {
    358       "title": "A comprehensive study of automatic program repair on the QuixBugs benchmark",
    359       "authors": ["H. Ye", "M. Martinez", "T. Durieux", "M. Monperrus"],
    360       "year": 2021,
    361       "doi": "10.1016/j.jss.2020.110825",
    362       "relevance": "Comprehensive prior study on APR using the same QuixBugs benchmark."
    363     },
    364     {
    365       "title": "Large language models are zero-shot reasoners",
    366       "authors": ["T. Kojima", "S. S. Gu", "M. Reid", "Y. Matsuo", "Y. Iwasawa"],
    367       "year": 2023,
    368       "doi": "10.5555/3600270.3601883",
    369       "relevance": "Zero-shot CoT prompting technique ('Let's think step by step') used in this study's prompt design."
    370     },
    371     {
    372       "title": "Empirical evaluation of large language models in automated program repair",
    373       "authors": ["J. Sun", "F. Li", "X. Qi", "H. Zhang", "J. Jiang"],
    374       "year": 2025,
    375       "arxiv_id": "2506.13186",
    376       "relevance": "Recent empirical evaluation of LLMs for APR, directly related to survey scope."
    377     },
    378     {
    379       "title": "DeepSeek-V3 technical report",
    380       "authors": ["DeepSeek-AI"],
    381       "year": 2025,
    382       "arxiv_id": "2412.19437",
    383       "relevance": "Technical report for the best-performing model in this study."
    384     },
    385     {
    386       "title": "Towards better chain-of-thought prompting strategies: a survey",
    387       "authors": ["Z. Yu", "L. He", "Z. Wu", "X. Dai", "J. Chen"],
    388       "year": 2023,
    389       "arxiv_id": "2310.04959",
    390       "relevance": "Survey of CoT prompting strategies referenced for explaining model size effects on CoT performance."
    391     }
    392   ]
    393 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs