scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21307B)
      1 {
      2   "paper": {
      3     "title": "AI Safety Subproblems for Software Engineering Researchers",
      4     "authors": ["David Gros", "Prem Devanbu", "Zhou Yu"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2304.14597"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides GitHub links for the keyword matching regex (https://github.com/DNGros/aisse/blob/main/ai_terms_regex.txt) and the list of foundational safety papers (https://github.com/DNGros/aisse/blob/main/foundation_papers.csv)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The foundational papers CSV is released via GitHub. The underlying citation data comes from the publicly available Semantic Scholar corpus, which is referenced. The analysis inputs are thus reproducible from public sources."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements files, or dependency information are provided. The paper references using the Semantic Scholar API and corpus but does not specify software versions or dependencies needed to reproduce the analysis."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology at a high level (querying Semantic Scholar API, filtering by venue) but does not provide runnable scripts or detailed steps to reproduce the citation counts in Table 1."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The citation counts in Table 1 are presented as point estimates with no confidence intervals or error bars. The paper acknowledges limitations of the proxy measure but provides no quantification of uncertainty."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper does not make comparative claims that would require significance tests. It presents descriptive counts of citations rather than claiming statistical differences between groups."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "The paper is primarily descriptive and theoretical. The citation counts serve as a motivating observation, not as the basis for effect size claims."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The choice of 44 foundational safety works (FSafe) is described as informed by querying references in survey papers, but no formal justification is given for this number or the completeness of the set. The paper acknowledges limitations but does not justify the sample."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "There are no experimental runs or repeated measurements. The citation analysis is a single deterministic query against the Semantic Scholar corpus."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a position/perspective paper with a supporting literature analysis, not an evaluation of a system or method. There is no system to compare against baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "Not applicable — no system or method is being evaluated against baselines."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system with components to ablate. This is a position paper proposing research directions."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system evaluation is performed. The citation count analysis is descriptive, not an evaluation metric."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate. This is a theoretical/position paper."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No ML model or system is evaluated on test data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides a per-venue breakdown of CSafe paper counts across SE, ML, AI, NLP, CV, PL, and other conference/journal categories, rather than just an overall aggregate."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses an 'anti-problem' (Section 3.5) — research directions that might seem safety-related but are not net-beneficial. It also extensively discusses limitations of its citation analysis in Section 1."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper's main finding IS a negative result: only 4 papers at major SE venues reference foundational AI safety work, and 0 at PL venues. The anti-problems section (3.5) also presents research that is not beneficial to safety."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims a 'quantified literature review suggesting that AI Safety discussions are not common at SE venues.' Table 1 supports this with only 4 CSafe papers at SE conferences and 5 at SE journals. The abstract also claims to 'categorize subproblems' which Section 3 delivers."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes no causal claims. It presents descriptive observations and proposes future research directions. Language is appropriately hedged with terms like 'conjectures' and 'suggest.'"
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly states limitations: 'This analysis has many limitations. Large amounts of research (like in AI robustness or interpretability) is safety-motivated, even if it never references the FSafe discussions of long-term AI.' The analysis is clearly bounded to citation-based proxy measures."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations for the low citation counts: safety-motivated research may not cite the specific foundational works selected, and 'AI Safety culturally relies heavily on self-publishing or web forums' which the analysis would miss. Section 2.1 discusses common skepticisms about AI safety."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No AI/ML models are used in the paper's own methodology. The citation analysis uses the Semantic Scholar API, not a language model."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used. The analysis is based on citation graph queries, not LLM prompting."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No model training or inference is performed. The citation analysis has no hyperparameters in the ML sense."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a citation analysis and position paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper describes the methodology at a high level — querying Semantic Scholar, filtering CSafe to 2012-March 2023, filtering by 'Computer Science' field — but does not document specific preprocessing steps, API query parameters, or how venue classifications were resolved (e.g., footnote 7 notes a misclassification issue). The regex for AI/ML/DL terms is released but the full pipeline steps are not documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations section. Limitations are discussed inline in Section 1 ('This analysis has many limitations...') but there is no substantive standalone section addressing limitations of the overall paper or its proposals."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The paper discusses specific threats: safety-motivated research that doesn't cite FSafe works would be missed; the analysis relies on Semantic Scholar corpus classifications (footnotes 7-8 note misclassifications); AI safety culturally tends to rely on self-publishing; and the 33% figure for SE papers mentioning AI is based on keyword matching, which may over- or under-count."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states: 'This set of conjectures and sub-problems is not intended to be comprehensive or definitive. Instead it is a set of example starting points. Solving these problems alone will not solve AI alignment.' It also notes it focuses on technical alignment rather than policy."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The foundational papers CSV is released, but the full dataset of 6565 CSafe works, their citation links, and the venue classification data are not released. An independent researcher cannot verify the counts in Table 1 without re-running the analysis."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper describes using the Semantic Scholar API and corpus to query citations to 44 foundational works, filtering to 2012-March 2023, and using venue classifications. The FSafe selection process is described as 'informed by querying all references in a set of survey papers and examining works that are in multiple surveys.'"
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data comes from automated citation graph queries on a public corpus."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline is only described at a high level. Key details are missing: how FSafe was filtered from survey references to exactly 44 works, how venue classifications were resolved, what API endpoints were used, and how the 33% estimate of AI-related SE papers was computed. The regex is released but not the full pipeline."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The acknowledgements state: 'DG's research work was partially supported by the NSF (CCF-1934568).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: David Gros and Prem Devanbu at University of California, Davis; Zhou Yu at Columbia University. These are academic institutions with no apparent conflict regarding AI safety advocacy."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "NSF is an independent government funding agency with no financial stake in whether AI safety is discussed more at SE venues."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate any pre-trained model's capability on a benchmark. It performs a citation graph analysis using the Semantic Scholar API."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model is evaluated on any benchmark. Contamination is not applicable."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation of a pre-trained model is performed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. This is a citation analysis and position paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a position/theoretical paper with a supporting literature analysis. No system with inference costs is proposed or evaluated."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The paper is a position paper. The citation analysis uses API queries with negligible compute. No compute budget is relevant."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AI Safety discussions are not common at SE venues: only 4 of the 7,744 papers at major SE conferences (ICSE, FSE, ASE, ISSTA) reference foundational AI safety works.",
    286       "evidence": "Table 1 shows 4 CSafe papers at SE conferences out of 7,744 total, and 5 at SE journals out of 6,082. This is based on citation graph analysis using Semantic Scholar corpus (Section 1).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Approximately 33% of papers at large SE venues in 2022 mentioned AI/ML/DL terms in their title or abstract, compared to 4% in 2012.",
    291       "evidence": "Section 1, based on keyword matching in the Semantic Scholar corpus. The regex used is released on GitHub. However, the methodology could over- or under-count.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "AI safety topics are discussed most often at ML-focused conferences (NeurIPS, ICLR, ICML, KDD) with 225 papers, and AI-specific conferences (AAAI, IJCAI) with 87 papers.",
    296       "evidence": "Table 1 provides venue-by-venue counts from the citation graph analysis.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "There are 12 concrete AI safety subproblems that fit into traditional SE research areas.",
    301       "evidence": "Section 3 enumerates 12 problems (P1-P12) across 4 conjectures. These are proposed research directions, not empirically validated claims.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["theoretical", "qualitative"],
    306   "key_findings": "This 4-page position paper argues that AI safety is under-discussed at SE venues, supported by a citation graph analysis showing only 4 papers at major SE conferences reference foundational AI safety works (out of 7,744 total). The authors propose 12 concrete subproblems organized under 4 conjectures about how software engineering will change with rising AI capabilities (machine-written code, increased testing needs, universal software engineering, AI in critical systems). They also identify an 'anti-problem': improving code generation correctness metrics does not necessarily contribute to long-term safety.",
    307   "red_flags": [
    308     {
    309       "flag": "Proxy measure validity unclear",
    310       "detail": "The citation-based proxy for 'considering AI safety' (citing any of 44 foundational works) may significantly undercount safety-relevant research. The paper acknowledges this but the magnitude of undercounting is unknown. Research on robustness, interpretability, or fairness may be safety-motivated without citing the specific FSafe papers."
    311     },
    312     {
    313       "flag": "Selection of foundational works not fully transparent",
    314       "detail": "The 44 FSafe works were selected by 'examining works that are in multiple surveys,' but the threshold (how many surveys?) and exact selection criteria are not documented. Different selections could yield substantially different counts."
    315     },
    316     {
    317       "flag": "Subproblems are speculative conjectures",
    318       "detail": "The 12 proposed subproblems (P1-P12) are framed as conjectures without empirical validation. The paper acknowledges this ('not intended to be comprehensive or definitive') but readers should note these are research direction proposals, not findings."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "Evaluating Large Language Models Trained on Code",
    324       "authors": ["Mark Chen"],
    325       "year": 2021,
    326       "relevance": "Introduces HumanEval benchmark for code generation, which is discussed in the paper as an example of capability metrics that are not safety objectives."
    327     },
    328     {
    329       "title": "Taking Flight with Copilot: Early Insights and Opportunities of AI-Powered Pair-Programming Tools",
    330       "authors": ["Christian Bird"],
    331       "year": 2023,
    332       "relevance": "Early empirical insights on AI-powered pair programming (GitHub Copilot), directly relevant to AI4SE evaluation."
    333     },
    334     {
    335       "title": "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback",
    336       "authors": ["Yuntao Bai"],
    337       "year": 2022,
    338       "arxiv_id": "2204.05862",
    339       "relevance": "Foundational RLHF alignment work relevant to LLM safety methodology."
    340     },
    341     {
    342       "title": "Constitutional AI: Harmlessness from AI Feedback",
    343       "authors": ["Yuntao Bai"],
    344       "year": 2022,
    345       "relevance": "Key alignment technique (Constitutional AI) for language model safety."
    346     },
    347     {
    348       "title": "Unsolved Problems in ML Safety",
    349       "authors": ["Dan Hendrycks"],
    350       "year": 2021,
    351       "arxiv_id": "2109.13916",
    352       "relevance": "Comprehensive categorization of ML safety problems, directly relevant to surveying AI safety methodology."
    353     },
    354     {
    355       "title": "Large Language Models and Simple, Stupid Bugs",
    356       "authors": ["Kevin Jesse"],
    357       "year": 2023,
    358       "relevance": "Studies how LLM code generators produce buggy code, relevant to AI code generation quality evaluation."
    359     },
    360     {
    361       "title": "Language Models (Mostly) Know What They Know",
    362       "authors": ["Saurav Kadavath"],
    363       "year": 2022,
    364       "relevance": "Studies calibration and uncertainty in language models, relevant to evaluating LLM self-knowledge and safety."
    365     },
    366     {
    367       "title": "How to certify machine learning based safety-critical systems? A systematic literature review",
    368       "authors": ["Florian Tambon"],
    369       "year": 2021,
    370       "relevance": "SLR on ML safety certification in SE context, directly relevant to AI safety methodology quality."
    371     },
    372     {
    373       "title": "Software Engineering for AI-Based Systems: A Survey",
    374       "authors": ["Silverio Martínez-Fernández"],
    375       "year": 2021,
    376       "relevance": "Major SE4AI survey published in TOSEM, relevant to understanding SE approaches to AI system quality."
    377     },
    378     {
    379       "title": "Training language models to follow instructions with human feedback",
    380       "authors": ["Long Ouyang"],
    381       "year": 2022,
    382       "arxiv_id": "2203.02155",
    383       "relevance": "InstructGPT paper on RLHF for instruction following, foundational to LLM alignment evaluation."
    384     },
    385     {
    386       "title": "Ethical and social risks of harm from Language Models",
    387       "authors": ["Laura Weidinger"],
    388       "year": 2021,
    389       "arxiv_id": "2112.04359",
    390       "relevance": "Taxonomy of LLM risks and harms, relevant to evaluating AI safety research methodology."
    391     }
    392   ]
    393 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs