scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19274B)
      1 {
      2   "paper": {
      3     "title": "A Survey on Large Language Models for Software Engineering",
      4     "authors": [
      5       "Quanjun Zhang",
      6       "Chunrong Fang",
      7       "Yang Xie",
      8       "Yaxin Zhang",
      9       "Yun Yang",
     10       "Weisong Sun",
     11       "Shengcheng Yu",
     12       "Zhenyu Chen"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv",
     16     "arxiv_id": "2312.15223"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["survey_methodology"],
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper provides a living repository at https://github.com/iSEngLab/AwesomeLLM4SE for tracking papers and resources."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The GitHub repository serves as the curated dataset of surveyed papers and LLMs. The paper's collected corpus of 62 LLMs and 947 studies is available through this repository."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment or dependency specifications are provided. The survey does not include any scripts or tools that would require environment setup."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step instructions for reproducing the survey methodology (search queries, database access, filtering pipeline) are provided."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "This is a survey paper that does not run experiments or report statistical results requiring uncertainty quantification."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "Survey paper with no comparative experiments requiring significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "Survey paper with no experiments. Effect sizes are not applicable."
     58       },
     59       "sample_size_justified": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "Survey paper — no experimental sample size to justify."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "Survey paper with no experimental runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The survey does not compare itself against prior surveys or reviews in terms of coverage, methodology, or findings."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No baseline comparison with other surveys is provided."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "Survey paper — no system components to ablate."
     85       },
     86       "multiple_metrics": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Survey paper — no system evaluation requiring metrics."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "Survey paper — no system outputs to evaluate."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "Survey paper — no train/test split applicable."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The survey provides extensive per-category breakdowns across SE workflow phases (requirements, design, implementation, testing, maintenance), task types (112 tasks), and model categories (62 LLMs). Tables 3-15 break down studies by task and model."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 'Challenges and Opportunities' discusses limitations of current LLM approaches including hallucination, security vulnerabilities, and areas where LLMs struggle."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The survey reports areas where LLMs underperform, including security vulnerabilities in generated code (Section 5.3), reliability issues (Section 5.4), and challenges with complex reasoning tasks."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims about covering 62 LLMs, 947 studies, and 112 tasks are supported by the detailed tables and sections in the paper. The claim of being a comprehensive survey is substantiated by the breadth of coverage."
    122       },
    123       "causal_claims_justified": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "The paper is a survey/taxonomy and does not make causal claims. It describes and categorizes rather than claiming X causes Y."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims to cover 'Large Language Models for Software Engineering' broadly, but the survey's temporal scope and search methodology are not precisely bounded. The paper does not clearly state what time period is covered or acknowledge potential gaps in coverage."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "As a pure survey/taxonomy paper with no empirical results of its own, alternative explanations are not applicable."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey paper with no measurements of its own — no proxy/outcome gap to address."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "Survey paper — does not use any LLM for its own methodology."
    149       },
    150       "prompts_provided": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "Survey paper — does not use prompting."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "Survey paper — no model inference or training performed."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "Survey paper — no agentic scaffolding used."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 2 describes a three-stage paper selection pipeline: keyword search on Google Scholar, snowballing from references, and filtering. The search terms and the database are named, but the exact queries and the filtering criteria applied at each stage are not given in full."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Challenges and Opportunities' serves as a limitations/future directions section, discussing open problems and gaps in current LLM-for-SE research."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "There is no explicit threats-to-validity section discussing threats specific to this survey's methodology (e.g., potential missed papers, search bias, temporal cutoff effects)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state what is excluded from scope. The boundary between 'software engineering' and adjacent fields (e.g., general code generation, NLP) is not clearly delineated."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The GitHub repository (AwesomeLLM4SE) contains the list of surveyed papers, serving as the raw data for the survey."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 2 describes the data collection: keyword search on Google Scholar using terms like 'large language model' AND 'software engineering', followed by snowballing from references."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants — this is a literature survey. Data source is academic publications, which is a standard corpus."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "While Section 2 describes the general pipeline (search → snowball → filter), it does not provide counts at each filtering stage or document how many papers were excluded and why at each step."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The acknowledgments section lists funding from the National Natural Science Foundation of China and other grants."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations (Nanjing University, Swinburne University of Technology) are clearly listed. No conflict with evaluated products."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Funding is from government research foundations (NSFC, etc.) with no financial stake in the survey's conclusions."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "Survey paper — does not evaluate a pre-trained model on any benchmark."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Survey paper — does not evaluate a pre-trained model on any benchmark."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "Survey paper — does not evaluate a pre-trained model on any benchmark."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "Survey paper with no human participants."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "Survey paper with no human participants."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Survey paper with no human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "Survey paper with no human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "Survey paper with no human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "Survey paper with no human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper with no human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "Survey paper — no method with inference costs."
    291       },
    292       "compute_budget_stated": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "Survey paper — no computation performed."
    296       }
    297     },
    298     "survey_methodology": {
    299       "prisma_or_structured_protocol": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The paper describes a keyword search + snowballing approach but does not follow PRISMA or another structured review protocol. No PRISMA flow diagram, no protocol registration, and search queries are not fully reproducible."
    303       },
    304       "quality_assessment_of_sources": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The survey does not assess the methodological quality of its 947 source papers. All papers are treated equally regardless of rigor, sample size, or study design. This is a significant limitation — weak results are presented alongside strong ones without distinction."
    308       },
    309       "publication_bias_discussed": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No discussion of publication bias. The survey does not consider whether its sources are biased toward positive results or whether negative findings about LLMs in SE are underrepresented."
    313       }
    314     }
    315   },
    316   "claims": [
    317     {
    318       "claim": "The survey covers 62 LLMs of code, 947 SE studies, and 112 code-related tasks across 5 SE workflow phases.",
    319       "evidence": "Tables 1-15 throughout the paper enumerate the LLMs, studies, and tasks. Section 2 describes the collection methodology.",
    320       "supported": "strong"
    321     },
    322     {
    323       "claim": "LLMs have been applied across all five phases of the SE workflow: requirements engineering, software design, software implementation, software testing, and software maintenance.",
    324       "evidence": "Sections 3-4 provide detailed coverage of each phase with specific studies and models cited for each.",
    325       "supported": "strong"
    326     },
    327     {
    328       "claim": "Pre-training objectives like next-token prediction and masked language modeling are foundational to LLMs for code, with different objectives suited to different downstream tasks.",
    329       "evidence": "Section 3.2 discusses pre-training objectives with examples of how different LLMs use different objectives.",
    330       "supported": "moderate"
    331     },
    332     {
    333       "claim": "Security and reliability remain significant challenges for LLM-generated code.",
    334       "evidence": "Section 5.3 covers security vulnerabilities and Section 5.4 covers reliability issues, citing multiple studies demonstrating these problems.",
    335       "supported": "moderate"
    336     }
    337   ],
    338   "methodology_tags": ["meta-analysis"],
    339   "key_findings": "This survey provides a comprehensive taxonomy of 62 LLMs applied to 112 software engineering tasks across 947 studies. It organizes SE applications into five workflow phases (requirements, design, implementation, testing, maintenance) and catalogs pre-training objectives, fine-tuning strategies, and evaluation approaches. The survey identifies security, reliability, and hallucination as key open challenges, and highlights the rapid growth of LLM-for-SE research with a concentration on code generation and testing tasks.",
    340   "red_flags": [
    341     {
    342       "flag": "No quality assessment of source papers",
    343       "detail": "The survey collects and summarizes 947 papers without assessing their methodological quality. Strong and weak studies are presented side by side without distinction, so readers cannot tell well-supported findings from poorly supported ones."
    344     },
    345     {
    346       "flag": "No structured review protocol",
    347       "detail": "The paper describes an ad-hoc search methodology (Google Scholar keyword search + snowballing) without following PRISMA or any structured review protocol. The search queries, databases, and temporal boundaries are not fully specified, making the survey non-reproducible."
    348     },
    349     {
    350       "flag": "No publication bias discussion",
    351       "detail": "The survey does not consider whether its 947 sources are biased toward positive findings about LLMs in SE; such a bias is likely given the hype cycle around LLMs."
    352     },
    353     {
    354       "flag": "Breadth over depth",
    355       "detail": "Covering 947 studies and 112 tasks in a single paper necessarily means most studies receive only surface-level treatment. Individual findings are reported without critical evaluation of the underlying evidence quality."
    356     }
    357   ],
    358   "cited_papers": [
    359     {
    360       "title": "Evaluating Large Language Models Trained on Code",
    361       "authors": ["Mark Chen"],
    362       "year": 2021,
    363       "arxiv_id": "2107.03374",
    364       "relevance": "Introduces Codex and the HumanEval benchmark — foundational work for LLM code generation evaluation."
    365     },
    366     {
    367       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    368       "authors": ["Zhangyin Feng"],
    369       "year": 2020,
    370       "relevance": "Early pre-trained model for code understanding, influential in the code LLM lineage."
    371     },
    372     {
    373       "title": "StarCoder: May the Source Be with You!",
    374       "authors": ["Raymond Li"],
    375       "year": 2023,
    376       "arxiv_id": "2305.06161",
    377       "relevance": "Open-source code LLM with transparent training data and evaluation methodology."
    378     },
    379     {
    380       "title": "Code Llama: Open Foundation Models for Code",
    381       "authors": ["Baptiste Rozière"],
    382       "year": 2023,
    383       "arxiv_id": "2308.12950",
    384       "relevance": "Major open-source code LLM family with multiple size variants and specializations."
    385     },
    386     {
    387       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    388       "authors": ["Daya Guo"],
    389       "year": 2024,
    390       "arxiv_id": "2401.14196",
    391       "relevance": "High-performing open-source code LLM with detailed training methodology."
    392     },
    393     {
    394       "title": "ChatDev: Communicative Agents for Software Development",
    395       "authors": ["Chen Qian"],
    396       "year": 2023,
    397       "arxiv_id": "2307.07924",
    398       "relevance": "Multi-agent framework for software development — relevant to agentic coding workflows."
    399     },
    400     {
    401       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    402       "authors": ["Jiawei Liu"],
    403       "year": 2023,
    404       "relevance": "Critical evaluation of LLM code generation quality beyond pass@k metrics."
    405     },
    406     {
    407       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    408       "authors": ["Xinyi Hou"],
    409       "year": 2023,
    410       "relevance": "Competing survey on the same topic — useful for comparison of coverage and methodology."
    411     },
    412     {
    413       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    414       "authors": ["Chunqiu Steven Xia"],
    415       "year": 2023,
    416       "relevance": "Key paper on LLM-based automated program repair, a major SE application area."
    417     },
    418     {
    419       "title": "LLM-based Test Generation: A Comprehensive Survey",
    420       "authors": ["Various"],
    421       "year": 2023,
    422       "relevance": "Focused survey on LLM test generation, a major subcategory of LLMs for SE."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs