scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22017B)
      1 {
      2   "paper": {
      3     "title": "Towards Extending the Range of Bugs That Automated Program Repair Can Handle",
      4     "authors": [
      5       "Omar I. Al-Bataineh",
      6       "Leon Moonen"
      7     ],
      8     "year": 2022,
      9     "venue": "22nd IEEE International Conference on Software Quality, Reliability and Security (QRS 2022)",
     10     "arxiv_id": "2211.03911",
     11     "doi": "10.1109/QRS57517.2022.00031"
     12   },
     13   "scan_version": 3,
     14   "active_modules": [],
     15   "methodology_tags": ["theoretical"],
     16   "key_findings": "The paper proposes a bug classification system based on three properties — observability, reproducibility, and tractability — to systematically compare APR approaches. It identifies four APR categories (dynamic, static, dynamic-static, formal) and maps them to bug classes. For termination bugs, the paper sketches hybrid repair algorithms: dynamic-static APR using termination provers for sequential programs, and formal APR combining termination provers with software model checkers for concurrent programs. The paper briefly states that termination provers (AProVE, 2LS) prove termination of around 85% of the examined programs from the SNU real-time and PowerStone benchmarks, but no detailed experimental results are presented.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No source code, prototype implementation, or repository URL is provided. The algorithms in Figures 1 and 2 are pseudocode sketches only."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No datasets or benchmark data are released. The SNU real-time and PowerStone benchmarks are mentioned but not provided or linked."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No environment specifications are provided. The paper is primarily theoretical with no implementation."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No reproduction instructions are provided. The hybrid algorithms are described at a pseudocode level without implementation details sufficient for reproduction."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "Theoretical paper with no experimental results requiring statistical analysis."
     45       },
     46       "significance_tests": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "Theoretical paper; no comparative experimental claims requiring significance tests."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "Theoretical paper with no experiments reporting effect sizes."
     55       },
     56       "sample_size_justified": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "Theoretical paper; no experimental sample to justify."
     60       },
     61       "variance_reported": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "Theoretical paper with no multi-run experimental results."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Section II discusses three existing bug classification systems (cause-impact, severity-priority, bug complexity) as baselines for comparison with the proposed classification."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Prior classification systems referenced include Tan et al. (2006/2014), Cotroneo et al. (2016), and Asadollah et al. (2015), which are the relevant prior work in this space."
     77       },
     78       "ablation_study": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No system with components to ablate; this is a theoretical classification and algorithmic sketch."
     82       },
     83       "multiple_metrics": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No experiments are conducted, so no metrics are reported."
     87       },
     88       "human_evaluation": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Theoretical paper with no system outputs to evaluate."
     92       },
     93       "held_out_test_set": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No experiments requiring train/test splits."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper systematically breaks down analysis by bug type: arithmetic bugs (Section IV-A), non-functional bugs (Section IV-B), and liveness/termination bugs (Sections IV-C, V), analyzing each class separately."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly discusses what each technique cannot handle: dynamic analysis cannot handle non-observable or liveness bugs (Observation 1), static analysis suffers from false positives (Section III-B), and model checking faces the state explosion problem (Section III-C)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper identifies classes of bugs that existing APR approaches cannot handle and explicitly discusses the limitations of dynamic, static, and model checking approaches in Table II and throughout Sections III-IV."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims a novel bug classification system (delivered in Section II), analysis of termination bugs (delivered in Section V), and that integration reduces complexity and improves reliability (argued in Section V, though informally). The 'towards' framing appropriately hedges the scope."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract claims 'integrating dynamic APR with formal analysis techniques...reduces complexity and improves the overall reliability of these repairs.' This is a causal claim supported only by informal theoretical argument, not by formal proof of complexity reduction or empirical demonstration of improved reliability."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The title uses 'Towards' to hedge scope. The paper explicitly states the termination bug analysis is a 'demonstrating example' (Section I), and Section VII identifies the work as preliminary with four specific future research directions needed."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not consider alternative explanations for why the proposed classification is preferable, nor does it discuss potential counterarguments to the hybrid approach. The comparison with prior classification systems (Section II) describes them but does not systematically evaluate trade-offs."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper claims the hybrid approach 'reduces complexity' and 'improves reliability' but does not define or measure either concept. No formal complexity analysis is provided, and reliability improvement is asserted without operationalization."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No computational models (ML or otherwise) are used in experiments. The paper references termination provers (AProVE, 2LS, T2) but does not specify versions."
    146       },
    147       "prompts_provided": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No prompting is involved; this is a theoretical paper about program repair."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "Theoretical paper with no experiments requiring hyperparameter settings."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "Theoretical paper with no data collection or preprocessing."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No dedicated limitations section. Section VII 'Concluding Remarks' lists future research directions, which implicitly acknowledge limitations (e.g., 'we are in the process of empirically validating the ideas'), but this does not constitute a substantive limitations discussion."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to validity are discussed. The paper does not address potential weaknesses of the proposed classification system or the algorithmic sketches."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The paper explicitly bounds scope: 'To demonstrate the benefits of our method, we study termination bugs in sequential and concurrent programs' (Section I). Section VII identifies four specific future directions, explicitly acknowledging what the current work does not cover (empirical validation, fault localization for liveness bugs, CEGIS integration, information flow bugs)."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "Theoretical paper with no data collected or analyzed."
    190       },
    191       "data_collection_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "Theoretical paper; no data collection process."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants and no data collection from human sources."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "Theoretical paper with no data pipeline."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Funding is disclosed: 'This work has been financially supported by the Research Council of Norway through the secureIT project (RCN contract #288787).'"
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Both authors are listed as affiliated with Simula Research Laboratory, Oslo, Norway. The paper does not evaluate any product from their institution."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The Research Council of Norway is a public funding agency with no commercial stake in APR tool outcomes."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is included in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper does not evaluate any pre-trained model's capability on a benchmark. It is a theoretical paper about bug classification and hybrid APR algorithms."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No pre-trained model evaluation on any benchmark."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No pre-trained model evaluation on any benchmark."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this theoretical paper."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Theoretical paper with no implemented method whose cost could be reported."
    288       },
    289       "compute_budget_stated": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Theoretical paper with no computational experiments requiring a budget."
    293       }
    294     }
    295   },
    296   "claims": [
    297     {
    298       "claim": "A bug classification based on observability, reproducibility, and tractability enables methodical analysis of APR detection power and repair capabilities.",
    299       "evidence": "The classification system is defined in Section II with formal definitions (Definitions 1-7) and Table I. Sections III-IV use the properties to analyze dynamic analysis, static analysis, and model checking techniques.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Termination bugs in sequential programs can be effectively addressed using dynamic-static APR by combining test cases with termination provers.",
    304       "evidence": "Section V-A presents an algorithm (Figure 1) and formal validity specification (formula 1). The argument is theoretical; no empirical evaluation of the algorithm is provided.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "Termination bugs in concurrent programs are best addressed with formal APR combining termination provers and software model checkers.",
    309       "evidence": "Section V-B presents an algorithm (Figure 2) and three formal validity specifications (formulas 2-4). The argument is theoretical; no empirical evaluation is provided.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Termination provers AProVE and 2LS can prove termination of around 85% of examined programs from SNU real-time and PowerStone benchmarks.",
    314       "evidence": "Stated in one sentence at the end of Section V: 'the tools are able to successfully prove termination of around 85% of the examined programs using very little computational time (a few seconds).' No methodology, detailed results, or source citation for this specific experiment is provided.",
    315       "supported": "weak"
    316     },
    317     {
    318       "claim": "Integrating formal methods in APR avoids the patch overfitting problem by generating verified repairs.",
    319       "evidence": "Section V-B argues that formula (2) 'entirely avoids the patch overfitting problem' because it checks both termination and semantic preservation. This is a theoretical argument assuming the formal specifications are complete and the tools are sound.",
    320       "supported": "moderate"
    321     }
    322   ],
    323   "red_flags": [
    324     {
    325       "flag": "No empirical validation",
    326       "detail": "The paper proposes classification schemes and repair algorithms but provides no implementation or empirical evaluation. The authors acknowledge this in Section VII: 'we are in the process of empirically validating the ideas described in this work.' The entire contribution is theoretical."
    327     },
    328     {
    329       "flag": "Unsupported quantitative claim",
    330       "detail": "The claim that termination provers prove termination of 'around 85% of the examined programs' (end of Section V) is stated without methodology, detailed results, number of programs, or a citation for this specific experiment. It is unclear whether this was the authors' own experiment or drawn from prior work."
    331     },
    332     {
    333       "flag": "Claims outrun evidence",
    334       "detail": "The abstract claims integration 'reduces complexity and improves the overall reliability of these repairs' but no complexity analysis is provided and reliability improvement is not measured. These are informal assertions, not demonstrated results."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "Automated Program Repair",
    340       "authors": ["C. Le Goues", "M. Pradel", "A. Roychoudhury"],
    341       "year": 2019,
    342       "doi": "10/gkgf29",
    343       "relevance": "Major APR survey in Communications of the ACM providing an overview of the field."
    344     },
    345     {
    346       "title": "Automatic Software Repair: A Bibliography",
    347       "authors": ["M. Monperrus"],
    348       "year": 2018,
    349       "doi": "10/ggssbj",
    350       "relevance": "Comprehensive bibliography of automatic software repair techniques."
    351     },
    352     {
    353       "title": "SapFix: Automated End-to-End Repair at Scale",
    354       "authors": ["A. Marginean", "J. Bader", "S. Chandra", "M. Harman", "Y. Jia", "K. Mao", "A. Mols", "A. Scott"],
    355       "year": 2019,
    356       "doi": "10/gkgf2c",
    357       "relevance": "Facebook's production-scale APR tool demonstrating real-world APR deployment."
    358     },
    359     {
    360       "title": "GenProg: A Generic Method for Automatic Software Repair",
    361       "authors": ["C. Le Goues", "T. Nguyen", "S. Forrest", "W. Weimer"],
    362       "year": 2012,
    363       "doi": "10/cfztf3",
    364       "relevance": "Foundational search-based APR tool using genetic programming."
    365     },
    366     {
    367       "title": "SemFix: Program Repair via Semantic Analysis",
    368       "authors": ["H. D. T. Nguyen", "D. Qi", "A. Roychoudhury", "S. Chandra"],
    369       "year": 2013,
    370       "doi": "10/gg82z6",
    371       "relevance": "Pioneering semantic-based APR approach using symbolic execution."
    372     },
    373     {
    374       "title": "Angelix: Scalable Multiline Program Patch Synthesis via Symbolic Analysis",
    375       "authors": ["S. Mechtaev", "J. Yi", "A. Roychoudhury"],
    376       "year": 2016,
    377       "doi": "10/ggsskp",
    378       "relevance": "Symbolic analysis-based APR tool advancing multiline patch synthesis."
    379     },
    380     {
    381       "title": "ASTOR: A Program Repair Library for Java",
    382       "authors": ["M. Martinez", "M. Monperrus"],
    383       "year": 2016,
    384       "doi": "10/gndn55",
    385       "relevance": "Java APR library used as infrastructure in many repair experiments."
    386     },
    387     {
    388       "title": "Smart Contract Repair",
    389       "authors": ["X. L. Yu", "O. Al-Bataineh", "D. Lo", "A. Roychoudhury"],
    390       "year": 2020,
    391       "doi": "10/gpd4hr",
    392       "relevance": "Extends APR to smart contracts, relevant to automated code repair for blockchain."
    393     },
    394     {
    395       "title": "PROPR: Property-Based Automatic Program Repair",
    396       "authors": ["M. P. Gissurarson", "L. Applis", "A. Panichella", "A. van Deursen", "D. Sands"],
    397       "year": 2022,
    398       "doi": "10/gqhgs7",
    399       "relevance": "Introduces property-based testing for APR patch validation to address overfitting."
    400     },
    401     {
    402       "title": "The ManyBugs and IntroClass Benchmarks for Automated Repair of C Programs",
    403       "authors": ["C. Le Goues", "N. Holtschulte", "E. K. Smith", "Y. Brun", "P. Devanbu", "S. Forrest", "W. Weimer"],
    404       "year": 2015,
    405       "doi": "10/gpd4jv",
    406       "relevance": "Foundational APR benchmarks widely used for evaluating repair tools."
    407     },
    408     {
    409       "title": "Automatic Repair of Infinite Loops",
    410       "authors": ["S. R. L. Marcote", "M. Monperrus"],
    411       "year": 2015,
    412       "arxiv_id": "1504.05078",
    413       "doi": "10/jb2f",
    414       "relevance": "Directly relevant prior work on repairing termination bugs using SMT solvers."
    415     },
    416     {
    417       "title": "Towards More Reliable Automated Program Repair by Integrating Static Analysis Techniques",
    418       "authors": ["O. I. Al-Bataineh", "A. Grishina", "L. Moonen"],
    419       "year": 2021,
    420       "doi": "10/gp6kq6",
    421       "relevance": "Authors' prior work on integrating static analysis with APR to improve patch quality."
    422     },
    423     {
    424       "title": "IntRepair: Informed Repairing of Integer Overflows",
    425       "authors": ["P. Muntean", "M. Monperrus", "H. Sun", "J. Grossklags", "C. Eckert"],
    426       "year": 2021,
    427       "doi": "10/gh97rm",
    428       "relevance": "APR tool targeting integer overflow bugs, a specific bug class discussed in this paper."
    429     }
    430   ],
    431   "engagement_factors": {
    432     "practical_relevance": {
    433       "score": 1,
    434       "justification": "Proposes hybrid APR ideas and algorithms at pseudocode level but provides no implementation, tools, or immediately usable techniques."
    435     },
    436     "surprise_contrarian": {
    437       "score": 1,
    438       "justification": "The observation that current APR cannot handle liveness bugs is already fairly well known; the classification framework adds structure, but its conclusions are not surprising."
    439     },
    440     "fear_safety": {
    441       "score": 0,
    442       "justification": "No AI risk, security, or safety concerns are raised."
    443     },
    444     "drama_conflict": {
    445       "score": 0,
    446       "justification": "No controversy or conflict; a constructive theoretical contribution."
    447     },
    448     "demo_ability": {
    449       "score": 0,
    450       "justification": "No code, demo, or tool is available."
    451     },
    452     "brand_recognition": {
    453       "score": 0,
    454       "justification": "Simula Research Laboratory is respected but not widely known outside formal methods circles."
    455     }
    456   }
    457 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs