scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25220B)
      1 {
      2   "paper": {
      3     "title": "Towards Practical and Useful Automated Program Repair for Debugging",
      4     "authors": [
      5       "Qi Xin",
      6       "Haojun Wu",
      7       "Steven P. Reiss",
      8       "Jifeng Xuan"
      9     ],
     10     "year": 2024,
     11     "venue": "SE 2030 (International Workshop on Software Engineering in 2030)",
     12     "arxiv_id": "2407.08958",
     13     "doi": "10.48550/arXiv.2407.08958"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [],
     17   "methodology_tags": ["theoretical", "case-study"],
     18   "key_findings": "This vision paper proposes PracAPR, an IDE-integrated repair system that eliminates the need for test suites and program re-execution by using flow-analysis-based fault localization, LLM-based local repair, strategy-driven global repair, and simulated trace comparison for validation. The authors report that their existing ROSE framework repaired 36/40 QuixBugs and 37/60 Defects4J bugs in seconds and improved user debugging success by 44%. Their analysis of Defects4J v1.2 found 118 single-fault multi-location bugs, of which current APR techniques repair at most 8, motivating the need for better global repair strategies.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, code archive, or download link is provided anywhere in the paper. The ROSE framework and PracAPR vision have no publicly released implementation."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper references standard benchmarks (Defects4J, QuixBugs) but does not release its own analysis data, such as the classification of 118 single-fault multi-location bugs or the 75-bug patch relationship analysis."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No reproduction instructions are provided. The paper is a vision paper with no experimental methodology that could be independently replicated from the information given."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper reports point estimates (36/40, 37/60, 44% improvement, 16.5% time reduction) without any confidence intervals, error bars, or uncertainty measures."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims ROSE 'helped 44% more participants succeed' and 'reduce the debugging time by about 16.5%' without any statistical significance tests. No p-values, t-tests, or other tests are reported."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "While relative improvements are stated (44% more participants, 16.5% time reduction), the baseline values are not given, making it impossible to judge the practical significance. For example, 44% more participants than what base rate?"
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for sample sizes. The user study sample size is not even stated in this paper. The choice of 75 bugs for patch relationship analysis (from 118 total) is described as 'about one third' but not justified."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported for any results."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper mentions existing techniques' results (e.g., 'current techniques repaired at most 8' multi-location bugs, Repilot failed on 57.3% of single-location bugs) but does not perform a controlled baseline comparison in its own evaluation."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No formal baseline comparison is conducted. References to existing tools are descriptive, not comparative evaluations."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "PracAPR is a vision/proposal, not a fully implemented and evaluated system. ROSE results are reported from prior work. No ablation is feasible in this paper."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The ROSE evaluation reports multiple metrics: number of bugs repaired (36/40 QuixBugs, 37/60 Defects4J), fault localization accuracy (89%), user study success rate (44% improvement), and debugging time reduction (16.5%)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 3.1 reports summary results of a user study of ROSE conducted in prior work [33, 34]: 'ROSE helped 44% more participants succeed in a debugging task and helped reduce the debugging time by about 16.5%.'"
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No discussion of held-out test sets or separation of development and evaluation data."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Only aggregate numbers are reported (36/40, 37/60). No per-bug-category, per-project, or per-type breakdowns are provided."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 3.2 provides a detailed failure case analysis of the Defects4J Chart_3 bug, showing how ChatRepair's insufficient prompt leads to incorrect diagnosis and wrong patches. Section 1 also discusses limitations of existing approaches."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper extensively reports negative results: existing non-learning approaches repair less than 23% of single-hunk bugs, Repilot fails on 57.3% of single-location bugs, and current multi-location techniques repair at most 8 of 118 single-fault bugs."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract accurately describes PracAPR as an 'envisioned' system and uses hedged language ('we hope to', 'we envision'). It does not overclaim implementation or evaluation results."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims 'ROSE helped 44% more participants succeed' and 'helped reduce the debugging time by about 16.5%' — these are causal claims. No experimental design details (randomization, controls, confound handling) are provided in this paper to justify the causal inference."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Practical and Useful Automated Program Repair for Debugging' broadly, but all evaluation is on Java benchmarks (Defects4J, QuixBugs) only. ROSE works specifically in the Eclipse-based Code Bubbles IDE. No bounds on generalization to other languages, IDEs, or bug types are stated."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations are discussed for any of the reported results. For example, the user study improvement could be due to the novelty effect or the specific task selection."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures benchmark bug repair counts and a user study of unstated size, then frames these as evidence of 'practicality' and 'usefulness' for general debugging — a significant proxy gap. No discussion of what 'practical' and 'useful' actually require beyond these metrics."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper references 'ChatGPT' and 'LLM' throughout without specifying any model version, snapshot date, or API version. Section 3.2 discusses 'ChatGPT-based approaches' without version details."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 3.2 describes what information the prompts contain in natural language ('we plan to use an augmented prompt that includes...') but no actual prompt text is provided."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for the ChatGPT-based approach or any other component."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The PracAPR architecture is described in detail: Figure 1 shows the workflow, and Sections 2-3 describe fault localization via backward slicing, patch generation (local LLM + global strategy), and patch validation via SEEDE-based simulated trace comparison. The ROSE framework's integration with Code Bubbles IDE is also described."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The multi-location bug classification methodology is not described in detail. The paper states 'we proposed an approach to detect such bugs' and 'we did a study' without documenting the detection criteria or filtering steps."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed anywhere in the paper."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what PracAPR cannot do, what the evaluation does not show, or what types of bugs or settings are excluded from its claims."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw data is available. The multi-location bug classifications, patch relationship analysis data, and user study data are not released."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper states 'we did a study and found that' 118 single-fault multi-location bugs exist and analyzed 75 patches, but the methodology for classifying bugs as single-fault vs multi-fault and the patch analysis procedure are not described in detail."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper references a user study ('a repair experiment and a user study') but provides no information about how participants were recruited, how many participated, or their characteristics."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No data pipeline is documented. The path from Defects4J bugs to the 118 single-fault multi-location classification to the 75-bug sample is not described with filtering criteria or intermediate counts."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding information or acknowledgments section is present in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly stated: Qi Xin, Haojun Wu, and Jifeng Xuan at Wuhan University (Xin also at Hubei Luojia Laboratory), Steven P. Reiss at Brown University."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence of the funder cannot be assessed."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The ChatGPT discussion in Section 3.2 analyzes failures of other approaches (ChatRepair) and describes planned future work, not a benchmark evaluation conducted by the authors."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
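    240         "justification": "No pre-trained model benchmark evaluation is conducted in this paper; the ChatGPT discussion in Section 3.2 analyzes failures of other approaches (ChatRepair) and describes planned future work."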
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
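    245         "justification": "No pre-trained model benchmark evaluation is conducted in this paper; the ChatGPT discussion in Section 3.2 analyzes failures of other approaches (ChatRepair) and describes planned future work."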
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The user study referenced was conducted and reported in prior work [33, 34]. This vision paper only cites summary results and does not itself conduct a human subjects study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "The user study was conducted in prior work [33, 34], not in this paper."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "The user study was conducted in prior work [33, 34], not in this paper."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "The user study was conducted in prior work [33, 34], not in this paper."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "The user study was conducted in prior work [33, 34], not in this paper."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "The user study was conducted in prior work [33, 34], not in this paper."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "The user study was conducted in prior work [33, 34], not in this paper."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper states ROSE repairs bugs 'in only seconds' but provides no specific timing data, latency measurements, or cost figures. 'In only seconds' is too vague for proper cost/latency reporting."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No computational budget (GPU hours, API costs, hardware specifications) is stated anywhere in the paper."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "Current APR techniques rely on unrealistic assumptions requiring test suites and frequent program re-execution.",
    301       "evidence": "Section 1 cites studies showing developers don't write sufficient tests [4, 19], bug-revealing tests for over 90% of Defects4J bugs were introduced after the bug was found [15], and re-execution makes APR take minutes [14] or hours [25].",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Traditional non-learning-based APR approaches can repair fewer than 23% of the 150 single-hunk bugs in Defects4J v1.2.",
    306       "evidence": "Section 1 states this 'based on the previous evaluation of existing tools' but provides no specific source or analysis details.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Repilot (state-of-the-art LLM approach) fails to repair 86 (57.3%) of single-location bugs in Defects4J.",
    311       "evidence": "Section 1 cites Repilot [42] with this specific number.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "The Defects4J v1.2 dataset has 118 single-fault multi-location bugs, and current techniques repair at most 8 of them.",
    316       "evidence": "Section 1 states they 'did a study' and found these numbers. Current techniques cited include [23, 37, 44, 46, 50, 51, 55]. Methodology for determining single-fault vs multi-fault is not described in this paper.",
    317       "supported": "weak"
    318     },
    319     {
    320       "claim": "ROSE's test-free fault localization included the correct repair location for 89% of bugs tested.",
    321       "evidence": "Section 3.1 reports this result but details are in external papers [33, 34].",
    322       "supported": "weak"
    323     },
    324     {
    325       "claim": "A ROSE-based tool can repair 36/40 QuixBugs and 37/60 Defects4J bugs in only seconds.",
    326       "evidence": "Section 3.1 reports these numbers but full evaluation details are in [33, 34].",
    327       "supported": "weak"
    328     },
    329     {
    330       "claim": "ROSE helped 44% more participants succeed in debugging and reduced debugging time by about 16.5%.",
    331       "evidence": "Section 3.1 reports these user study results. No statistical details, sample size, or methodology are given in this paper; details are in [33, 34].",
    332       "supported": "weak"
    333     },
    334     {
    335       "claim": "Multi-location bug patches can be characterized into 8 types of partial patch relationships (DU, OA, RIF, DIF, EOH, SU, ONPF, FU).",
    336       "evidence": "Section 3.3 describes these 8 relationship types derived from analysis of 75 single-fault multi-location bug patches from Defects4J v1.2.",
    337       "supported": "moderate"
    338     }
    339   ],
    340   "red_flags": [
    341     {
    342       "flag": "Claims significantly outrun evidence",
    343       "detail": "The paper makes specific quantitative claims about ROSE (89% fault localization, 36/40 QuixBugs, 37/60 Defects4J, 44% user improvement, 16.5% time reduction) but all evaluation details are deferred to external papers [33, 34]. The reader cannot assess these claims from the information provided."
    344     },
    345     {
    346       "flag": "No uncertainty quantification",
    347       "detail": "All reported numbers are point estimates without confidence intervals, error bars, significance tests, or variance measures. The user study results (44%, 16.5%) lack basic statistical information."
    348     },
    349     {
    350       "flag": "Vision conflated with demonstrated capability",
    351       "detail": "PracAPR is described as an 'envisioned' system, yet the paper uses language that sometimes suggests it exists ('PracAPR uses an LLM-based approach', 'PracAPR also allows conversational repair'). The boundary between what has been built (ROSE) and what is proposed (PracAPR) is blurred."
    352     },
    353     {
    354       "flag": "Undocumented classification methodology",
    355       "detail": "The classification of Defects4J bugs into single-fault vs multi-fault (yielding the 118 count) and the patch relationship taxonomy (8 types from 75 bugs) are central claims, but the detection methodology is described only as 'we proposed an approach' without details."
    356     },
    357     {
    358       "flag": "No limitations discussion",
    359       "detail": "A 6-page vision paper making multiple empirical claims has no limitations section, no threats to validity, and no explicit scope boundaries."
    360     }
    361   ],
    362   "cited_papers": [
    363     {
    364       "title": "Automated program repair in the era of large pre-trained language models",
    365       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    366       "year": 2023,
    367       "relevance": "Major study on LLM-based APR evaluating pre-trained language models for automatic bug fixing at ICSE 2023."
    368     },
    369     {
    370       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    371       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    372       "year": 2023,
    373       "arxiv_id": "2304.00385",
    374       "relevance": "ChatRepair — conversational ChatGPT-based APR approach, key baseline discussed in this paper's Section 3.2."
    375     },
    376     {
    377       "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair",
    378       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"],
    379       "year": 2023,
    380       "arxiv_id": "2309.00608",
    381       "relevance": "Repilot — state-of-the-art LLM+completion engine APR approach, cited as failing on 57.3% of single-location bugs."
    382     },
    383     {
    384       "title": "An empirical study on fine-tuning large language models of code for automated program repair",
    385       "authors": ["Kai Huang", "Xiangxin Meng", "Jian Zhang", "Yang Liu", "Wenjie Wang", "Shuhao Li", "Yuqing Zhang"],
    386       "year": 2023,
    387       "relevance": "Empirical evaluation of fine-tuning LLMs for APR at ASE 2023."
    388     },
    389     {
    390       "title": "Impact of code language models on automated program repair",
    391       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    392       "year": 2023,
    393       "arxiv_id": "2302.05020",
    394       "relevance": "Study on how code language models affect automated program repair effectiveness."
    395     },
    396     {
    397       "title": "A Survey of Learning-based Automated Program Repair",
    398       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yuxiang Ma", "Weisong Sun", "Zhenyu Chen"],
    399       "year": 2023,
    400       "arxiv_id": "2301.03270",
    401       "relevance": "Comprehensive survey of learning-based APR techniques."
    402     },
    403     {
    404       "title": "An analysis of the automatic bug fixing performance of chatgpt",
    405       "authors": ["Dominik Sobania", "Martin Briesch", "Carol Hanna", "Justyna Petke"],
    406       "year": 2023,
    407       "arxiv_id": "2301.08653",
    408       "relevance": "Empirical analysis of ChatGPT's bug-fixing capabilities."
    409     },
    410     {
    411       "title": "CURE: Code-aware neural machine translation for automatic program repair",
    412       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    413       "year": 2021,
    414       "relevance": "Neural machine translation approach to APR at ICSE 2021, cited for repair taking minutes."
    415     },
    416     {
    417       "title": "ITER: Iterative Neural Repair for Multi-Location Patches",
    418       "authors": ["He Ye", "Martin Monperrus"],
    419       "year": 2023,
    420       "arxiv_id": "2304.12015",
    421       "relevance": "Neural approach to multi-location bug repair via iterative self-supervised training."
    422     },
    423     {
    424       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    425       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    426       "year": 2022,
    427       "relevance": "Zero-shot LLM-based APR approach, relevant baseline for multi-location repair evaluation."
    428     },
    429     {
    430       "title": "Practical Program Repair via Preference-based Ensemble Strategy",
    431       "authors": ["Wenkang Zhong", "Chuanyi Li", "Kui Liu", "Tongtong Xu", "Tegawendé F Bissyandé", "Jidong Ge", "Bin Luo", "Vincent Ng"],
    432       "year": 2023,
    433       "arxiv_id": "2309.08211",
    434       "relevance": "Ensemble-based APR approach combining multiple repair strategies, cited as relevant for generator prioritization."
    435     },
    436     {
    437       "title": "Trust enhancement issues in program repair",
    438       "authors": ["Yannic Noller", "Ridwan Shariffdeen", "Xiang Gao", "Abhik Roychoudhury"],
    439       "year": 2022,
    440       "relevance": "Study on developer trust in APR tools, directly relevant to the paper's motivation about practicality."
    441     }
    442   ],
    443   "engagement_factors": {
    444     "practical_relevance": {
    445       "score": 2,
    446       "justification": "Proposes an IDE-integrated debugging tool that eliminates test suite requirements, but PracAPR is not yet built — only the ROSE prototype exists."
    447     },
    448     "surprise_contrarian": {
    449       "score": 1,
    450       "justification": "Challenges the common APR assumption of requiring test suites, but this critique is well-established in the APR literature."
    451     },
    452     "fear_safety": {
    453       "score": 0,
    454       "justification": "No AI safety or security concerns raised."
    455     },
    456     "drama_conflict": {
    457       "score": 0,
    458       "justification": "No controversy or conflict with established claims or organizations."
    459     },
    460     "demo_ability": {
    461       "score": 0,
    462       "justification": "No code, demo, or tool is publicly available — PracAPR is envisioned and ROSE has no public release."
    463     },
    464     "brand_recognition": {
    465       "score": 0,
    466       "justification": "Wuhan University and Brown University are respected institutions but not high-profile AI labs."
    467     }
    468   }
    469 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs