scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20752B)
      1 {
      2   "paper": {
      3     "title": "A Critical Review of Large Language Model on Software Engineering: An Example from ChatGPT and Automated Program Repair",
      4     "authors": ["Quanjun Zhang", "Tongke Zhang", "Juan Zhai", "Chunrong Fang", "Bowen Yu", "Weisong Sun", "Zhenyu Chen"],
      5     "year": 2024,
      6     "venue": "ACM",
      7     "arxiv_id": "2310.08879"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'we release the studied dataset, scripts (i.e., data processing, model training, and model evaluation), and related models' with a GitHub link: https://github.com/iSEngLab/EvalGPTFix."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The EvalGPTFix benchmark dataset is released at the GitHub repository referenced in the paper."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specification (requirements.txt, Dockerfile, library versions) is described in the paper. Only the model name 'gpt-3.5-turbo' and fine-tuning dataset for baselines are mentioned."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repo is referenced but the paper itself does not include a reproduction guide."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as raw counts and percentages (e.g., 109/151 bugs fixed) without confidence intervals or error bars, despite the stochastic nature of ChatGPT's outputs."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims ChatGPT outperforms CodeT5 and PLBART but provides no statistical significance tests — only raw count comparisons (109 vs 79 vs 41)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context: '27.5% and 62.4% less than what is achieved by ChatGPT' and per-bug-type fixing rates (96%, 100%, 50%, 71%)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark contains 151 bugs from only 2 AtCoder contests. No justification is given for why this sample size is sufficient, nor is there any power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "ChatGPT is queried for 35 rounds and the paper shows per-round counts in Figure 2, but no variance, standard deviation, or spread measure across runs is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "CodeT5 and PLBART are used as baselines in RQ1, with a Venn diagram (Figure 3) showing overlap of fixed bugs."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "CodeT5 (2021) and PLBART (2021) are the baselines. By 2023, newer code LLMs existed (e.g., StarCoder, Code Llama). The paper acknowledges not including other LLMs but does not justify why newer models were excluded."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ2 systematically varies prompt types (problem description, error information, bug localization) to measure their individual contribution. RQ3 adds dialogue. These function as ablation-like studies of prompt components."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only one metric is used: number of bugs correctly fixed (pass all test cases). No other metrics (e.g., patch quality, token efficiency, time-to-fix) are reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of patch quality is performed. Evaluation is entirely automated via test suite pass/fail."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "EvalGPTFix is constructed from post-2023 AtCoder submissions specifically to avoid training data contamination. The test cases come from AtCoder's backend, averaging 38 per problem."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by bug type (CE: 96%, TLE: 100%, RE: 50%, WA: 71%) and by prompt type in RQ2."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The discussion section analyzes ChatGPT's failure in self-repair (only 2/13 bugs fixed) and case studies illustrate both successes and the limitations of different prompt strategies."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The self-repair experiment in Section 5 is a notable negative result: ChatGPT could only fix 2 out of 13 bugs in its own generated code despite 30 dialogue rounds."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (109/151 fixed with basic prompt, 34 additional with advanced prompts, 9 from dialogues) are all supported by results in Sections 4.1-4.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The prompt variation experiments (RQ2) are controlled single-variable manipulations: each prompt type is tested independently on the same unfixed bugs. The ablation design is adequate for the causal claims made."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Large Language Model on Software Engineering' broadly, but experiments test only ChatGPT (gpt-3.5-turbo) on competitive programming bugs in Java. The threats-to-validity section acknowledges this but the title and framing significantly overstate scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for ChatGPT's superior performance over CodeT5/PLBART (e.g., model size differences, training data volume, fine-tuning data quality). The threats section discusses benchmark and baseline selection but not confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 3.3 specifies 'the API of ChatGPT with the model gpt-3.5-turbo released by OpenAI.' This is a specific model identifier, though no snapshot date is given. Given the era (early 2023), gpt-3.5-turbo was unambiguous."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt texts are provided for all prompt types: basic prompt (Section 4.1), problem description prompt, error information prompts (CE/TLE/RE/WA), bug localization prompt (Section 4.2), and dialogue prompt (Section 4.3)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No API hyperparameters (temperature, top-p, max tokens) are reported for ChatGPT. For baselines, beam size=50 is stated, but ChatGPT sampling parameters are missing."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. ChatGPT is queried directly via API with prompts."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2 documents a detailed 5-step pipeline: raw data collection, bug-fixing pairs construction (with 6-token diff threshold), test case mining, static-based filtering (dedup, 500-token limit, comment removal), and dynamic-based filtering (with explicit removal criteria). Final count: 151 pairs from 2 contests."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 'Threats to Validity' discusses three specific threats across approximately one full page."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats are specific to this study: (1) EvalGPTFix contains small competitive programming problems that may not reflect real-world repair, (2) only CodeT5 and PLBART as baselines, (3) only ChatGPT and APR as LLM/task representatives."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While the threats section notes limitations, it does not explicitly state what the results do NOT show. The paper frames findings broadly ('indicating the potential of ChatGPT in repairing real-world buggy programs') despite testing only on competitive programming."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The dataset and scripts are released at the GitHub repository (https://github.com/iSEngLab/EvalGPTFix), enabling independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.2 describes crawling Java submissions from AtCoder programming contests in 2023, with specific contest identifiers (Beginner Contest 297 and 298)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from public AtCoder submissions."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The 5-step pipeline in Section 3.2 documents each transformation with criteria: raw collection → pair construction (6-token threshold) → test case mining → static filtering (dedup, length, comments) → dynamic filtering (3 removal conditions). Final count of 151 pairs is stated."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Nanjing University (State Key Laboratory for Novel Software Technology) and University of Massachusetts Amherst."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. Authors are academic researchers with no apparent commercial stake in ChatGPT's performance."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The paper states 'ChatGPT states its knowledge cutoff is in September 2021' and constructs EvalGPTFix from 2023 data specifically to address this."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "This is a central contribution of the paper. Section 1 extensively discusses data leakage concerns, demonstrates ChatGPT's knowledge of Defects4J (Figure 1), and constructs a post-cutoff benchmark to avoid overlap."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The entire paper is motivated by benchmark contamination. EvalGPTFix uses 2023 AtCoder data, after ChatGPT's September 2021 training cutoff, specifically to create a clean benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token counts, or latency are reported despite querying ChatGPT for 35 rounds across 151 bugs plus additional prompt experiments."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget (API spend, hardware for baseline fine-tuning, total wall-clock time) is stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ChatGPT fixes 109 out of 151 bugs in EvalGPTFix using a basic prompt within 35 independent rounds.",
    286       "evidence": "Section 4.1, Figure 2 shows per-round fix counts converging after 35 rounds.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "ChatGPT outperforms CodeT5 and PLBART by 27.5% and 62.4% in prediction accuracy on EvalGPTFix.",
    291       "evidence": "Section 4.1: CodeT5 fixes 79 bugs and PLBART fixes 41, versus ChatGPT's 109 (i.e., 27.5% and 62.4% fewer than ChatGPT, respectively). Figure 3 shows overlap.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Advanced prompts (error info, problem description, bug localization) enable ChatGPT to fix 25, 18, and 10 additional bugs respectively.",
    296       "evidence": "Section 4.2, Figure 4 shows per-round results for each prompt type. Figure 5 shows overlap.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Dialogue-based repair enables ChatGPT to fix 9 additional bugs not fixed by single prompts.",
    301       "evidence": "Section 4.3 reports 9/17 unfixed bugs repaired through multi-round dialogue.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "ChatGPT has limited ability in self-repair of its own generated code.",
    306       "evidence": "Section 5: only 2 out of 13 bugs in ChatGPT-generated code fixed despite 30 dialogue rounds.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "ChatGPT (gpt-3.5-turbo) fixes 109/151 bugs on a contamination-free benchmark (EvalGPTFix) constructed from post-training-cutoff AtCoder contests, outperforming CodeT5 (79) and PLBART (41). Advanced prompts with error feedback, problem descriptions, and bug localization collectively enable 34 additional fixes. However, ChatGPT shows limited self-repair capability, fixing only 2/13 bugs in its own generated code. The paper highlights benchmark contamination as an overlooked concern in LLM-based SE evaluations.",
    312   "red_flags": [
    313     {
    314       "flag": "Unfair baseline comparison",
    315       "detail": "ChatGPT (gpt-3.5-turbo, often assumed to be ~175B parameters although OpenAI has not disclosed its size, 35 sampling rounds) is compared against much smaller fine-tuned models CodeT5 and PLBART (beam size 50). The comparison does not account for massive differences in model scale, training data, and inference budget."
    316     },
    317     {
    318       "flag": "No uncertainty quantification",
    319       "detail": "Despite acknowledging ChatGPT's stochastic nature (35 rounds needed for stability), no confidence intervals, variance, or statistical tests are reported for any results."
    320     },
    321     {
    322       "flag": "Overly broad title and framing",
    323       "detail": "Title claims 'Large Language Model on Software Engineering' but experiments test only one model (gpt-3.5-turbo) on one task (APR) with one language (Java) on competitive programming problems."
    324     },
    325     {
    326       "flag": "Missing API parameters",
    327       "detail": "Temperature and other sampling parameters for ChatGPT are not reported, making reproduction difficult given that these significantly affect output quality and consistency."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    333       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    334       "year": 2023,
    335       "arxiv_id": "2304.00385",
    336       "relevance": "Directly evaluates ChatGPT for conversational APR on Defects4J, relevant to LLM capability assessment."
    337     },
    338     {
    339       "title": "An Analysis of the Automatic Bug Fixing Performance of ChatGPT",
    340       "authors": ["Dominik Sobania", "Martin Briesch", "Carol Hanna", "Justyna Petke"],
    341       "year": 2023,
    342       "relevance": "Early evaluation of ChatGPT's bug-fixing on QuixBugs, relevant to LLM code repair benchmarking."
    343     },
    344     {
    345       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    346       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    347       "year": 2023,
    348       "relevance": "Systematic evaluation of LLMs for APR at ICSE, directly relevant to LLM capability in SE."
    349     },
    350     {
    351       "title": "Codex Hacks HackerRank: Memorization Issues and a Framework for Code Synthesis Evaluation",
    352       "authors": ["Anjan Karmakar", "Julian Aron Prenner", "Marco D'Ambros", "Romain Robbes"],
    353       "year": 2022,
    354       "arxiv_id": "2212.02684",
    355       "relevance": "Highlights memorization/contamination issues in code LLM evaluation, directly relevant to benchmark integrity."
    356     },
    357     {
    358       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair Via Zero-shot Learning",
    359       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    360       "year": 2022,
    361       "relevance": "Evaluates zero-shot LLM-based APR (AlphaRepair), relevant to understanding LLM repair capabilities."
    362     },
    363     {
    364       "title": "Can OpenAI's Codex Fix Bugs? An Evaluation on QuixBugs",
    365       "authors": ["Julian Aron Prenner", "Hlib Babii", "Romain Robbes"],
    366       "year": 2022,
    367       "relevance": "Early evaluation of Codex for program repair, relevant to LLM capability benchmarking."
    368     },
    369     {
    370       "title": "A study on Prompt Design, Advantages and Limitations of ChatGPT for Deep Learning Program Repair",
    371       "authors": ["Jialun Cao", "Meiziniu Li", "Ming Wen", "Shing-chi Cheung"],
    372       "year": 2023,
    373       "arxiv_id": "2304.08191",
    374       "relevance": "Studies prompt engineering for ChatGPT in DL program repair, relevant to LLM-based APR methodology."
    375     },
    376     {
    377       "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation",
    378       "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"],
    379       "year": 2023,
    380       "arxiv_id": "2305.04207",
    381       "relevance": "Evaluates ChatGPT for test generation, relevant to LLM capability assessment in SE tasks."
    382     },
    383     {
    384       "title": "Improving ChatGPT Prompt for Code Generation",
    385       "authors": ["Chao Liu", "Xuanlin Bao", "Hongyu Zhang"],
    386       "year": 2023,
    387       "arxiv_id": "2305.08360",
    388       "relevance": "Studies prompt engineering for ChatGPT code generation, relevant to LLM programming methodology."
    389     },
    390     {
    391       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-decoder Models for Code Understanding and Generation",
    392       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven CH Hoi"],
    393       "year": 2021,
    394       "relevance": "Baseline model used in this study, foundational code LLM relevant to the survey."
    395     }
    396   ]
    397 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs