scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (14893B)
      1 {
      2   "paper": {
      3     "title": "Analysis of Research Status in the Field of Automated Program Repair",
      4     "authors": ["Jishang Han", "Dereck Huang"],
      5     "year": 2025,
      6     "venue": "Dean&Francis"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No code or analysis scripts released. No repository URLs mentioned."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No dataset or corpus of reviewed papers released."
     19       },
     20       "environment_specified": {
     21         "applies": false,
     22         "answer": false,
     23         "justification": "This is a narrative survey with no computational experiments requiring an environment."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No search methodology, inclusion/exclusion criteria, or reproduction instructions provided for the survey process."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": false,
     34         "answer": false,
     35         "justification": "Survey paper with no original experiments or statistical aggregation."
     36       },
     37       "significance_tests": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "Survey paper; no comparative experiments performed by the authors."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No original experiments; numbers cited are from other papers."
     46       },
     47       "sample_size_justified": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No original experiments or meta-analysis conducted."
     51       },
     52       "variance_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No original experiments conducted."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The survey does not compare itself against prior surveys or establish how it differs from existing reviews like [2] (Dikici & Bilgin, 2025)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No baselines included, so contemporaneity is not assessable."
     68       },
     69       "ablation_study": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "Survey paper; no system with components to ablate."
     73       },
     74       "multiple_metrics": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "Survey paper; no original evaluation conducted."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Survey paper; no system outputs to evaluate."
     83       },
     84       "held_out_test_set": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Survey paper; no test set used."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper breaks down APR methods into three categories (template-based, LLM-based, hybrid agent) and discusses each separately in Section 2."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 5.1 discusses limitations and failure modes of each method category (LLM context window limits, hybrid model integration issues, dataset defects)."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses shortcomings of each method type, e.g., template methods failing on complex code, LLMs generating compilable but logically incorrect patches (Section 5.1.1)."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims APR has made progress especially with LLMs and hybrid agents, and identifies challenges (dataset limitations, code understanding). These are discussed in the body, though with limited depth."
    110       },
    111       "causal_claims_justified": {
    112         "applies": false,
    113         "answer": false,
    114         "justification": "The paper makes no causal claims; it summarizes existing work narratively."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes broad claims about APR progress without bounding them to specific benchmarks or languages. Statements like 'code accuracy rate can rise from 30%-40% to 60%-70%' (Section 4) are presented without specifying which benchmarks or conditions."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No discussion of alternative explanations for the performance improvements cited. The narrative accepts reported improvements at face value."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": false,
    130         "answer": false,
    131         "justification": "Survey paper; no models run by the authors."
    132       },
    133       "prompts_provided": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "Survey paper; no prompting conducted."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey paper; no experiments conducted."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey paper; no agentic scaffolding used."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No survey methodology described: no search queries, databases searched, inclusion/exclusion criteria, or filtering pipeline. The paper does not explain how the reviewed papers were selected."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Section 5.1 discusses limitations of APR methods, but there is no discussion of the limitations of this survey itself (e.g., coverage, selection bias)."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No threats to validity of the survey are discussed."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The scope is vaguely implied ('past three years') but no explicit boundaries are stated about what types of APR work are included/excluded or what the survey does not cover."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No list of reviewed papers, search results, or extracted data provided."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No description of how papers were found or selected for review."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants; data source is published literature, but selection method is still undocumented (covered by data_collection_described)."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No documentation of any pipeline from literature search to final review. The paper references only 10 works with no explanation of how these were selected."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding information or acknowledgments section present."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: Liaocheng No.1 Middle School and Zhixin Middle School. These are high school affiliations."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No funding disclosed; likely unfunded high school work."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial disclosure statement present."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "Survey paper; does not evaluate any model on a benchmark."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "Survey paper; no model evaluation conducted."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Survey paper; no model evaluation conducted."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "Survey paper; no method of its own to cost."
    274       },
    275       "compute_budget_stated": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "Survey paper; no computation performed."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "LLM-based APR methods raise code accuracy to 60%-70%, up from 30%-40% for template-based methods",
    285       "evidence": "Section 4 states this range but does not cite a specific source or benchmark for these numbers.",
    286       "supported": "weak"
    287     },
    288     {
    289       "claim": "AAIS achieves 113.32% maximum function-level location accuracy improvement over Agentless-1.5 on SWE-Bench",
    290       "evidence": "Section 2.3 and Section 4 cite these numbers but the source paper for AAIS is not clearly referenced.",
    291       "supported": "weak"
    292     },
    293     {
    294       "claim": "Copilot Autofix fixes vulnerabilities 3-12x faster than manual repair",
    295       "evidence": "Section 4 cites reference [6] (Ahmed, 2025) for these numbers.",
    296       "supported": "moderate"
    297     }
    298   ],
    299   "methodology_tags": ["meta-analysis"],
    300   "key_findings": "This paper provides a narrative overview of Automated Program Repair methods, categorizing them into template-based, LLM-based, and hybrid agent approaches. It identifies four key challenges: dataset limitations, insufficient model understanding of complex code logic, suboptimal multi-model collaboration, and tool dependence. The review covers only 10 references and lacks systematic survey methodology.",
    301   "red_flags": [
    302     {
    303       "flag": "No systematic review methodology",
    304       "detail": "The paper claims to be a survey but has no documented search strategy, inclusion/exclusion criteria, or PRISMA-style pipeline. Only 10 references are cited, making coverage extremely thin for a survey of a large field."
    305     },
    306     {
    307       "flag": "Unsourced quantitative claims",
    308       "detail": "Performance claims like 'code accuracy rate can rise from 30%-40% to 60%-70%' (Section 4) are stated without attribution to specific papers or benchmarks. The AAIS improvement figures lack a clear citation."
    309     },
    310     {
    311       "flag": "Laundering weak evidence",
    312       "detail": "The survey summarizes results from cited papers without assessing their methodological quality, effectively laundering unverified claims into its conclusions."
    313     },
    314     {
    315       "flag": "High school authors, predatory-style venue",
    316       "detail": "Authors are affiliated with middle/high schools. The venue (Dean&Francis, ISSN 2959-6157) appears to be a low-prestige or predatory publisher, raising questions about peer review quality."
    317     }
    318   ],
    319   "cited_papers": [
    320     {
    321       "title": "Less is More: Adaptive Program Repair with Bug Localization and Preference Learning",
    322       "authors": ["Z. Dai", "B. Chen", "Z. Zhao", "X. Tang", "S. Wu", "C. Yao", "J. Chen"],
    323       "year": 2025,
    324       "relevance": "LLM-based APR method (AdaPatcher) combining bug localization with preference learning."
    325     },
    326     {
    327       "title": "Advancements in automated program repair: a comprehensive review",
    328       "authors": ["S. Dikici", "T. T. Bilgin"],
    329       "year": 2025,
    330       "relevance": "Comprehensive review of APR advancements, directly relevant to survey methodology comparison."
    331     },
    332     {
    333       "title": "UTBoost: Rigorous Evaluation of Coding Agents on SWE-Bench",
    334       "authors": ["B. X. Yu", "Y. X. Zhu", "P. J. He", "D. Kang"],
    335       "year": 2025,
    336       "relevance": "Evaluation methodology for coding agents on SWE-Bench benchmark."
    337     },
    338     {
    339       "title": "AI Copilots: Boosting Software Engineers' Productivity or Replacing Them?",
    340       "authors": ["E. Ahmed"],
    341       "year": 2025,
    342       "doi": "10.1145/3743095.3743098",
    343       "relevance": "Empirical data on AI copilot productivity impact including Copilot Autofix vulnerability repair speed."
    344     },
    345     {
    346       "title": "CodeHalu: Investigating Code Hallucinations in LLMs via Execution-based Verification",
    347       "authors": ["Y. Tian", "W. Yan", "Q. Yang", "X. Zhao", "Q. Chen", "W. Wang", "Z. Luo", "L. Ma", "D. Song"],
    348       "year": 2025,
    349       "relevance": "Directly relevant to LLM code generation quality and hallucination detection."
    350     },
    351     {
    352       "title": "CodeNet: A large-scale AI for code dataset for learning a diversity of coding tasks",
    353       "authors": ["R. Puri", "D. S. Kung", "G. Janssen"],
    354       "year": 2021,
    355       "arxiv_id": "2105.12655",
    356       "relevance": "Major APR dataset with 14M+ code samples used for training and evaluation."
    357     }
    358   ]
    359 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs