scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (14898B)
      1 {
      2   "paper": {
      3     "title": "Automated Bug Detection and Correction in Software Development using Machine Learning",
      4     "authors": ["Isabella Hoffman", "Nathaniel Brooks"],
      5     "year": 2023,
      6     "venue": "International Journal on Advanced Computer Theory and Engineering",
      7     "doi": null
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code or repository is mentioned or released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or analysis corpus is released. The paper is a survey but provides no supplementary data."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a narrative literature review with no computational experiments requiring an environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions are provided. The survey methodology (search terms, databases, inclusion criteria) is not documented in a reproducible way."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a narrative literature review with no original statistical analysis."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No original experiments or statistical comparisons are conducted."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No original experiments; no effect sizes to report."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No original experiments or sampling."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No original experiments."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a narrative survey, not a comparative evaluation."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No experimental baselines; narrative survey."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No original evaluation conducted."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No experiments conducted."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides a breakdown of reviewed work by year, contribution type, and dataset used."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses challenges and limitations of ML-based bug detection including false positives, generalization issues, and explainability concerns."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results are reported. The paper presents a uniformly positive narrative about ML for bug detection without reporting any approaches that failed."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims ML models have 'demonstrated high accuracy in identifying and resolving software defects' but the paper provides no original evidence for this claim. The results table with efficiency/accuracy percentages for JIRA/Bugzilla/GitHub Issues appears unsourced."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The conclusion states 'these systems improve bug detection accuracy by 85-95% and reduce debugging time by up to 70%' — causal claims with no cited source or methodology to support them."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes sweeping claims about ML transforming software development without bounding to specific languages, domains, or contexts."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for any of the claims made."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No models are run; this is a literature review."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments conducted."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No search methodology, inclusion/exclusion criteria, or paper selection pipeline is described. The survey does not document how papers were found or selected."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no limitations or threats-to-validity section."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The paper does not clarify what it does not cover."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data or corpus of reviewed papers is available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No description of how the literature was collected — no databases, search terms, or time periods are specified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; the data source is published literature, so recruitment methods do not apply."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No paper selection pipeline is documented. It is unclear how many papers were initially found, screened, or excluded."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information is mentioned anywhere in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Zenith Technical Academy and Orion School of Engineering."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate any pre-trained model on a benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model evaluation on benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No model evaluation on benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey paper with no original method to cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey paper with no computation."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ML-based bug detection systems improve bug detection accuracy by 85-95% and reduce debugging time by up to 70%",
    286       "evidence": "Stated in the Conclusion section without any citation or original analysis to support these specific numbers.",
    287       "supported": "unsupported"
    288     },
    289     {
    290       "claim": "JIRA has 85% efficiency and 90% accuracy; GitHub Issues has 88% efficiency and 87% accuracy; Bugzilla has 78% efficiency and 82% accuracy",
    291       "evidence": "Presented in the Results table with no source, methodology, or experimental setup explaining how these numbers were obtained.",
    292       "supported": "unsupported"
    293     },
    294     {
    295       "claim": "ML-driven techniques lead to a 30-50% reduction in post-release defects",
    296       "evidence": "Stated in the Conclusion section with no citation or supporting evidence.",
    297       "supported": "unsupported"
    298     }
    299   ],
    300   "methodology_tags": ["meta-analysis"],
    301   "key_findings": "This is a narrative literature review summarizing ML techniques for automated bug detection and correction, covering supervised learning, deep learning (CodeBERT, GNNs), reinforcement learning, and self-supervised approaches. The paper presents unsourced quantitative claims about tool efficiency/accuracy without any documented methodology. It provides no systematic review protocol, no inclusion/exclusion criteria, and no original analysis.",
    302   "red_flags": [
    303     {
    304       "flag": "Unsourced quantitative claims",
    305       "detail": "The Results table presents specific efficiency and accuracy percentages for JIRA, Bugzilla, and GitHub Issues with no methodology, citation, or explanation of how these numbers were obtained. The Conclusion claims '85-95% accuracy improvement' and '30-50% reduction in post-release defects' without any source."
    306     },
    307     {
    308       "flag": "No systematic review methodology",
    309       "detail": "For a survey/review paper, there is no documented search strategy, no database selection, no inclusion/exclusion criteria, and no PRISMA-style flow diagram. The selection of reviewed papers appears arbitrary."
    310     },
    311     {
    312       "flag": "Laundering weak evidence",
    313       "detail": "The paper summarizes existing work without assessing the quality of the cited studies, so weak or unverified results are repeated without qualification. Claims from individual papers are presented as established facts."
    314     },
    315     {
    316       "flag": "Suspicious venue and affiliations",
    317       "detail": "The journal (International Journal on Advanced Computer Theory and Engineering, MRI INDIA) and author affiliations (Zenith Technical Academy, Orion School of Engineering) are not well-known institutions or venues, and the paper quality is consistent with a predatory journal publication."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "Self-Supervised Bug Detection and Repair",
    323       "authors": ["M. Allamanis", "H. Jackson-Flux", "M. Brockschmidt"],
    324       "year": 2021,
    325       "arxiv_id": "2105.12787",
    326       "relevance": "Self-supervised approach to bug detection relevant to automated program repair research."
    327     },
    328     {
    329       "title": "InferFix: End-to-End Program Repair with LLMs",
    330       "authors": ["M. Jin", "S. Shahriar", "M. Tufano"],
    331       "year": 2023,
    332       "arxiv_id": "2303.07263",
    333       "relevance": "LLM-based automated program repair system."
    334     },
    335     {
    336       "title": "A Survey of Learning-based Automated Program Repair",
    337       "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"],
    338       "year": 2023,
    339       "arxiv_id": "2301.03270",
    340       "relevance": "Survey of ML approaches to automated program repair."
    341     },
    342     {
    343       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    344       "authors": ["Z. Feng", "D. Guo", "D. Tang", "N. Duan", "X. Feng", "M. Gong", "M. Zhou"],
    345       "year": 2020,
    346       "arxiv_id": "2002.08155",
    347       "relevance": "Foundational pre-trained model for code understanding used in bug detection."
    348     },
    349     {
    350       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    351       "authors": ["S. Lu", "Y. Li", "H. Jin", "N. Duan", "Y. Qu", "M. Zhou", "C. Zhai"],
    352       "year": 2021,
    353       "arxiv_id": "2102.04664",
    354       "relevance": "Benchmark dataset for code ML tasks including bug detection."
    355     },
    356     {
    357       "title": "Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks",
    358       "authors": ["Y. Zhou", "S. Liu", "J. Siow", "X. Du", "Y. Liu"],
    359       "year": 2019,
    360       "arxiv_id": "1909.03496",
    361       "relevance": "GNN-based vulnerability detection approach."
    362     }
    363   ]
    364 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs