scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17410B)
      1 {
      2   "paper": {
      3     "title": "AI in Software Engineering: Case Studies and Prospects",
      4     "authors": ["Lei Wang"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2309.15768",
      8     "doi": "10.48550/arXiv.2309.15768"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": false,
     14         "answer": false,
     15         "justification": "This is a narrative literature review / position paper with no original code, experiments, or computational artifacts. There is nothing to release."
     16       },
     17       "data_released": {
     18         "applies": false,
     19         "answer": false,
     20         "justification": "No original data was collected or generated. The paper is a qualitative discussion of two existing AI systems (Watson and AlphaGo) with no dataset."
     21       },
     22       "environment_specified": {
     23         "applies": false,
     24         "answer": false,
     25         "justification": "No computational experiments were run, so no environment specification is applicable."
     26       },
     27       "reproduction_instructions": {
     28         "applies": false,
     29         "answer": false,
     30         "justification": "No experiments or analysis to reproduce. The paper is a narrative discussion of existing systems."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "No quantitative experiments were conducted. The paper presents no original numerical results requiring confidence intervals."
     38       },
     39       "significance_tests": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "No comparative experiments or statistical claims are made. The paper is a qualitative narrative."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No experimental results are reported. The paper presents no effect sizes."
     48       },
     49       "sample_size_justified": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No samples were collected. This is a narrative literature review."
     53       },
     54       "variance_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experimental runs were conducted. No variance to report."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "No evaluation or benchmarking was conducted. The paper discusses existing systems qualitatively without running any experiments."
     65       },
     66       "baselines_contemporary": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No experimental evaluation was performed, so baseline currency is not applicable."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No system or method was proposed that could be ablated. This is a narrative review."
     75       },
     76       "multiple_metrics": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No evaluation was conducted, so metrics are not applicable."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No system outputs were produced that could be evaluated by humans. The paper is a qualitative discussion."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No datasets or test sets were used. No evaluation was performed."
     90       },
     91       "per_category_breakdown": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No quantitative results are reported. No categories to break down."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses Watson's failure with the 'urban dictionary' incident (Section 3.1) and Watson's reduced efficiency with expanding databases, as well as limitations in cross-disciplinary accuracy. These constitute failure case discussion for the reviewed systems."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses limitations and negative aspects of both Watson (inability to understand meaning, reduced efficiency with larger databases, medical field errors) and AlphaGo (dependence on computing power) in Sections 3.1, 3.2, and 4."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims are qualitative and modest: that Watson uses 'decision making support' while AlphaGo uses 'self-decision making,' and that AI techniques contribute to intelligent systems. These claims are discussed in Sections 3 and 4 with supporting narrative."
    112       },
    113       "causal_claims_justified": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "The paper makes no empirical causal claims. Statements like 'using AI techniques contributes to intelligent systems' are presented as general observations from literature, not testable causal claims."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes sweeping generalizations from two case studies (Watson and AlphaGo) to the entire field of AI in software engineering. Section 4 states 'artificial intelligence can really help us detect the bottleneck of our software process and find the global optimum solution' based only on the AlphaGo example. The title 'AI in Software Engineering: Case Studies and Prospects' is broad relative to the narrow evidence base."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "The paper presents no empirical results for which alternative explanations would be relevant. It is a qualitative narrative review."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "No models were run or tested. The paper discusses Watson and AlphaGo conceptually based on published descriptions."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "No prompting was used. This is a narrative review, not an LLM-based study."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No models were trained or run. Hyperparameters are not applicable."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding was used. The paper is a literature review."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No data was collected or preprocessed. The paper is a narrative review without a systematic search methodology."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The paper ends with a brief conclusion (Section 5) that does not discuss methodological limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity are discussed. The paper does not acknowledge the limitations of drawing conclusions from only two case studies."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No explicit scope boundaries are stated. The paper does not clarify what it does NOT claim or what settings its observations do NOT apply to."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": false,
    176         "answer": false,
    177         "justification": "No original data was collected. The paper is a narrative review of published information about Watson and AlphaGo."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper reviews literature but provides no systematic search methodology. There is no description of how sources were identified, what databases were searched, or what selection criteria were used."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No participants were recruited. This is a narrative literature review."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No data pipeline is documented. The paper does not describe how the reviewed sources were selected, filtered, or analyzed."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding disclosure is provided. The paper was written as coursework for the CITS5502 Software Processes unit at UWA (noted in footnote on page 1), but no funding statement is included."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The author's affiliation with the University of Western Australia is stated on page 1. The footnote also clarifies this was a master's coursework submission for CITS5502."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "This appears to be unfunded student coursework, so funder independence is not applicable."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is provided. While financial conflicts are unlikely for student coursework, absence of disclosure is still noted."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a narrative review discussing Watson and AlphaGo conceptually."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No benchmark evaluation was conducted. Contamination is not applicable."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark evaluation was conducted. Contamination is not applicable."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants were involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants were involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants were involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants were involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "This is a narrative review/position paper. No method was proposed or evaluated, so cost reporting is not applicable."
    276       },
    277       "compute_budget_stated": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a narrative review/position paper. No computation was performed, so compute budget is not applicable."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Watson uses 'decision making support' to help humans make decisions, whereas AlphaGo uses 'self-decision making' to choose operations contributing to the best outcome.",
    287       "evidence": "Discussed in Section 4 (Comparison and Discussions) with reference [31]. This is a conceptual distinction, not an empirical finding.",
    288       "supported": "weak"
    289     },
    290     {
    291       "claim": "AlphaGo Zero defeated the previous AlphaGo version 100-0 after training from scratch for about 3 days.",
    292       "evidence": "Section 3.2, citing Silver et al. [5] and DeepMind [26]. This is a factual claim from the original AlphaGo Zero paper, not an original finding.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "AI techniques such as deep learning and machine learning in software systems contribute to intelligent systems.",
    297       "evidence": "Abstract and Section 5 (Conclusion). This is stated as a general conclusion from the two case studies, but no systematic evidence or evaluation framework supports the claim beyond narrative description.",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "AlphaGo's techniques might be very useful in medical research for diagnosis and treatment.",
    302       "evidence": "Section 4, with brief mention of deep learning analyzing complex information. No evidence or examples specific to medical applications are provided beyond speculation.",
    303       "supported": "unsupported"
    304     }
    305   ],
    306   "methodology_tags": ["qualitative", "case-study"],
    307   "key_findings": "This paper provides a narrative comparison of IBM Watson and Google AlphaGo as case studies of AI in software engineering. The main observation is that Watson adopts 'decision making support' while AlphaGo uses 'self-decision making,' and that both demonstrate the potential of deep learning and machine learning in building intelligent systems. The paper was written as master's coursework in 2017 and uploaded to arXiv in 2023. It contains no original experiments, no systematic methodology, and draws broad conclusions from qualitative discussion of two well-known AI systems.",
    308   "red_flags": [
    309     {
    310       "flag": "Student coursework published as research",
    311       "detail": "The footnote on page 1 states this was 'conducted while enrolled as a master's student at UWA, specifically for the CITS5502 Software Processes unit in 2017.' This is a course assignment, not peer-reviewed research. The date on the title page is 'Sunday 15th October, 2017' despite the 2023 arXiv submission."
    312     },
    313     {
    314       "flag": "No systematic methodology",
    315       "detail": "The paper presents no systematic literature search methodology, no inclusion/exclusion criteria, no quality assessment framework for the case studies, and no structured comparison criteria. Sources appear to be selected informally."
    316     },
    317     {
    318       "flag": "Sweeping generalizations from two examples",
    319       "detail": "The paper draws broad conclusions about AI in software engineering from narrative discussion of only two systems (Watson and AlphaGo), neither of which is a software engineering tool. Claims like 'artificial intelligence can really help us detect the bottleneck of our software process' are unsupported extrapolations."
    320     },
    321     {
    322       "flag": "Outdated and low-quality references",
    323       "detail": "Multiple references are Wikipedia pages and blog posts (references 21-26, 31). Several academic references are from low-impact venues. No references are newer than 2017, and many key claims rest on non-scholarly sources."
    324     },
    325     {
    326       "flag": "Marginal relevance to survey scope",
    327       "detail": "Despite the title mentioning 'Software Engineering,' the paper discusses general AI systems (Watson for Q&A and AlphaGo for Go) rather than AI applied to software development processes. The connection to SE is speculative and brief."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Mastering the game of Go without human knowledge",
    333       "authors": ["David Silver", "Julian Schrittwieser", "Karen Simonyan"],
    334       "year": 2017,
    335       "relevance": "Foundational paper on AlphaGo Zero's reinforcement learning approach, relevant to understanding AI system capabilities."
    336     },
    337     {
    338       "title": "Building Watson: an overview of the DeepQA project",
    339       "authors": ["David Ferrucci", "Eric Brown", "Jennifer Chu-Carroll"],
    340       "year": 2010,
    341       "relevance": "Core reference on IBM Watson's architecture, relevant to understanding large-scale NLP system design."
    342     },
    343     {
    344       "title": "Artificial Intelligence Applications for Improved Software Engineering Development: New Prospects",
    345       "authors": ["Farid Meziane", "Sunil Vadera"],
    346       "year": 2009,
    347       "relevance": "Covers AI techniques applied to software development processes including project planning, requirements engineering, and testing."
    348     }
    349   ]
    350 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs