scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21769B)
      1 {
      2   "paper": {
      3     "title": "Challenges and Paths Towards AI for Software Engineering",
      4     "authors": [
      5       "Alex Gu",
      6       "Naman Jain",
      7       "Wen-Ding Li",
      8       "Manish Shetty",
      9       "Yijia Shao",
     10       "Ziyang Li",
     11       "Diyi Yang",
     12       "Kevin Ellis",
     13       "Koushik Sen",
     14       "Armando Solar-Lezama"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv",
     18     "arxiv_id": "2503.22625"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository or analysis scripts are released. Although this is a position/survey paper, the authors could still have released supporting materials or curated lists."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No dataset, curated reference list, or structured taxonomy data is released. The taxonomy and challenge categorizations exist only within the paper text."
     31       },
     32       "environment_specified": {
     33         "applies": false,
     34         "answer": false,
     35         "justification": "This is a position/survey paper with no computational experiments, so environment specifications are structurally inapplicable."
     36       },
     37       "reproduction_instructions": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "No experiments to reproduce. This is a position paper presenting a taxonomy, challenges, and research directions."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No experiments or quantitative analyses are conducted. The paper is a position/survey paper with no original empirical results."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No comparative claims based on original data. All empirical claims reference existing literature."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No original experiments, so no effect sizes to report."
     58       },
     59       "sample_size_justified": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No original experiments or data collection, so sample size justification is inapplicable."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No original experiments with runs or trials to report variance on."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "This is a position/survey paper, not an evaluation paper. There are no baselines to compare against."
     75       },
     76       "baselines_contemporary": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No baselines; position/survey paper."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No system or method to ablate; this is a position/survey paper."
     85       },
     86       "multiple_metrics": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No experiments or evaluations are conducted."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No system outputs to evaluate; this is a position/survey paper."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "No experiments conducted."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper provides a structured taxonomy breaking tasks into categories (code generation, code transformation, software testing, etc.) and further subdivides challenges and research directions with cross-references in Figure 1."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper extensively discusses failure cases of LLMs throughout Section 3, including concrete code examples of syntax errors in Triton (Sec. 3.7), hallucinated theorems in Lean (Sec. 3.7), version confusion in Next.js (Sec. 3.8), and Lean 3 vs Lean 4 syntax mixing (Sec. 3.8)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports multiple negative results from cited work, including LLMs failing at low-resource languages (27% accuracy in D vs 83% in Python, Sec. 3.7), and discusses where AI approaches have been outperformed by humans (AlphaDev sorting kernels, Sec. 3.9)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims the paper provides (1) a structured taxonomy of tasks, (2) key bottlenecks, and (3) promising research directions. All three are delivered in Sections 2, 3, and 4 respectively."
    122       },
    123       "causal_claims_justified": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "The paper makes no causal claims. It is a position paper that identifies challenges and proposes research directions, using language like 'we believe' and 'we hypothesize' rather than causal assertions."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The Limitations section (Sec. 5) explicitly bounds the paper's scope: the ideas are 'opinionated directions,' the paper has 'limited scope of future work,' focuses on 'code-specific challenges,' and acknowledges the authors are 'primarily in the academic community' who may not know 'details of cutting-edge methods employed in frontier industry labs.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "This is a position/survey paper with no empirical results to explain. There are no observations requiring alternative explanations."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No models are run or evaluated in this paper. Examples reference existing models (o1, o3, Gemma-3 27B, Claude 3.7) but these are illustrative citations, not experiments."
    144       },
    145       "prompts_provided": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The paper does not conduct any experiments using prompting. Illustrative code examples are drawn from existing literature."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No experiments are conducted, so no hyperparameters to report."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic system is built or evaluated in this paper."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "As a survey/position paper, the methodology for selecting which papers and topics to cover is not described. There is no systematic review protocol, search strategy, or inclusion/exclusion criteria documented."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 5 is a dedicated 'Limitations' section listing four specific limitations."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The Limitations section identifies specific threats: (1) future work ideas are 'speculative' and 'lack strong and concrete evidence,' (2) the scope excludes 'novel moonshot ideas' and domain-specific knowledge, (3) authors are 'primarily in the academic community' and may miss industry perspectives, (4) the field is changing so rapidly that challenges may be resolved within months."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 5 explicitly states what the paper does NOT cover: 'novel moonshot ideas,' 'domain-specific knowledge and insights,' 'novel architectures,' and general LLM reasoning techniques. It also acknowledges a bias toward academic perspectives."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No raw data is available. The paper does not release its curated list of references, taxonomy data, or any structured dataset that would allow independent verification of the survey's coverage."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper does not describe how the surveyed papers, examples, or challenges were selected. There is no systematic search methodology or selection process documented."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants are involved."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No pipeline from literature search to final paper selection is documented. The paper presents itself as an 'opinionated view' but does not describe how the opinions or topic selections were arrived at systematically."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 7 (Acknowledgements) discloses NSF grants (No. 2141064, CCF:1900968, CCF:1908870, CCF:2217064), SKY Lab industrial sponsors, Intel Corporation, and ONR YIP Award N000142412532."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "All author affiliations are listed on the first page: MIT CSAIL, UC Berkeley, Cornell University, Stanford University, and University of Pennsylvania."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Funding comes from NSF, Intel, and ONR. As a position paper identifying challenges and research directions for AI in SE, none of the funders have a direct financial stake in the specific opinions or conclusions presented."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is present in the paper. Several authors may have connections to companies in this space (e.g., N. Jain mentions SKY Lab industrial sponsors) but no explicit declaration is provided."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a position/survey paper."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No model evaluations are conducted, so train/test overlap is not applicable."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No benchmark evaluations are conducted. Notably, the paper does discuss contamination as a challenge (Sec. 3.1), but this is about the field, not about the paper's own methodology."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a survey/position paper with no method or system to report costs for."
    286       },
    287       "compute_budget_stated": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "This is a survey/position paper with no computational experiments."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "Today's code LLM evaluations focus on a narrow set of tasks, suffer from potential contamination, and do not reliably measure real-world software engineering abilities.",
    297       "evidence": "Section 3.1 discusses how most coding evaluations are function-level, how contamination has been demonstrated on HumanEval and SWE-Bench (Aleithan et al., 2024; Xu et al., 2024a; Matton et al., 2024), and how construct validity is a problem (user experiences don't match benchmark gains).",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "LLMs struggle with low-resource programming languages, often hallucinating constructs from higher-resource languages.",
    302       "evidence": "Section 3.7 cites Blinn et al. (2024) on Hazel, provides a concrete example of Gemma-3 27B failing on Triton syntax, and cites Qwen 2.5 Coder's 83% Python vs 27% D accuracy on HumanEval.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "LLMs struggle to adapt to library and API version updates, often mixing constructs from different versions.",
    307       "evidence": "Section 3.8 cites CodeUpdateArena and GitChameleon benchmarks, provides concrete examples of Python typing hints version confusion and Lean 3 vs Lean 4 syntax mixing by o3-mini.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Reinforcement learning with verifiable rewards is a promising paradigm for improving code LLMs.",
    312       "evidence": "Section 4.2.1 cites DeepSeek-R1 and SWE-RL as evidence of success in algorithmic programming and SWE-Bench respectively, but acknowledges challenges including reward hacking and coverage issues.",
    313       "supported": "weak"
    314     },
    315     {
    316       "claim": "The field needs more diverse task evaluations beyond code generation, including code refactoring, optimization, testing, and formal verification.",
    317       "evidence": "Section 2 provides a comprehensive taxonomy of SE tasks and Section 3.1 argues current evaluations are too narrow. The argument is well structured but purely observational, based on cataloguing existing benchmarks rather than on new empirical evidence.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "theoretical",
    323     "qualitative"
    324   ],
    325   "key_findings": "This position paper provides a structured taxonomy of AI for software engineering tasks along three dimensions (scope, logical complexity, human intervention level), identifies nine cross-cutting challenges facing current LLM-based coding systems, and proposes research directions in data collection, training, and inference-time approaches. The paper emphasizes that most current evaluations focus narrowly on code generation while neglecting tasks like refactoring, testing, optimization, and formal verification. It highlights LLM failures in low-resource languages, version adaptation, long-horizon planning, and semantic understanding of large codebases.",
    326   "red_flags": [
    327     {
    328       "flag": "No systematic review methodology",
    329       "detail": "Despite surveying a large body of work, the paper does not describe any systematic process for selecting papers or topics. The selection appears to be based on the authors' expertise and opinions rather than a reproducible methodology, which risks coverage bias toward the authors' own research communities."
    330     },
    331     {
    332       "flag": "Self-citation density",
    333       "detail": "Several of the illustrative examples and cited works involve the authors' own projects (LiveCodeBench, R2E, CRUXEval, SWE-agent community, etc.), which is natural for a position paper by domain experts but could bias the selection of challenges and directions toward problems the authors are best positioned to work on."
    334     },
    335     {
    336       "flag": "Speculative future directions presented without evidence",
    337       "detail": "Many of the proposed research directions in Section 4 (e.g., test-time training for codebases, prompt/prefix tuning for library versions, semantic-aware embeddings) are acknowledged by the authors as speculative but are presented with confidence. The Limitations section partially addresses this."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    343       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    344       "year": 2024,
    345       "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks, central to evaluating agentic coding systems."
    346     },
    347     {
    348       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    349       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik R Narasimhan", "Ofir Press"],
    350       "year": 2024,
    351       "arxiv_id": "2405.15793",
    352       "relevance": "Defines agent-computer interfaces for automated SE and demonstrates tool-augmented agents for code tasks."
    353     },
    354     {
    355       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    356       "authors": ["DeepSeek-AI"],
    357       "year": 2025,
    358       "arxiv_id": "2501.12948",
    359       "relevance": "Demonstrates RLVR approach for improving reasoning capabilities in code, relevant to training methodology for coding agents."
    360     },
    361     {
    362       "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
    363       "authors": ["Yuxiang Wei", "Olivier Duchenne", "Jade Copet"],
    364       "year": 2025,
    365       "arxiv_id": "2502.18449",
    366       "relevance": "Applies RL with rule-based rewards to improve SWE-Bench performance, directly relevant to training AI coding agents."
    367     },
    368     {
    369       "title": "Copilot Arena: A Platform for Code LLM Evaluation in the Wild",
    370       "authors": ["Wayne Chi", "Valerie Chen", "Anastasios Nikolas Angelopoulos"],
    371       "year": 2025,
    372       "arxiv_id": "2502.09328",
    373       "relevance": "Gamified arena for evaluating code LLMs with human preferences, addresses evaluation challenges discussed in the paper."
    374     },
    375     {
    376       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    377       "authors": ["Naman Jain", "King Han", "Alex Gu", "Wen-Ding Li"],
    378       "year": 2024,
    379       "arxiv_id": "2403.07974",
    380       "relevance": "Addresses contamination in code benchmarks by using temporal splits, relevant to evaluation methodology."
    381     },
    382     {
    383       "title": "Evaluating Large Language Models Trained on Code",
    384       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    385       "year": 2021,
    386       "arxiv_id": "2107.03374",
    387       "relevance": "Introduced HumanEval benchmark and Codex, foundational work in LLM code generation evaluation."
    388     },
    389     {
    390       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    391       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    392       "year": 2024,
    393       "relevance": "Alternative to agent-based approaches for automated SE, relevant to understanding agentic vs non-agentic paradigms."
    394     },
    395     {
    396       "title": "Large Language Models for Software Engineering: Survey and Open Problems",
    397       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    398       "year": 2023,
    399       "relevance": "Prior survey of LLMs for SE that overlaps with this paper's scope, useful for comparative coverage analysis."
    400     },
    401     {
    402       "title": "Monitoring Reasoning Models for Misbehavior and the Risks of Promoting Obfuscation",
    403       "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"],
    404       "year": 2025,
    405       "arxiv_id": "2503.11926",
    406       "relevance": "Identifies reward hacking in RL-trained coding models, relevant to AI safety in agentic software engineering."
    407     },
    408     {
    409       "title": "Sycophancy to Subterfuge: Investigating Reward-Tampering in Large Language Models",
    410       "authors": ["Carson Denison", "Monte MacDiarmid", "Fazl Barez"],
    411       "year": 2024,
    412       "arxiv_id": "2406.10162",
    413       "relevance": "Documents reward tampering behavior in LLMs during RL training, relevant to safety of AI coding agents."
    414     },
    415     {
    416       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    417       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    418       "year": 2024,
    419       "arxiv_id": "2407.16741",
    420       "relevance": "Open platform for building AI software engineering agents, directly relevant to agentic coding infrastructure."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs