scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17844B)
      1 {
      2   "paper": {
      3     "title": "The Current Challenges of Software Engineering in the Era of Large Language Models",
      4     "authors": ["Cuiyun Gao", "Xing Hu", "Shan Gao", "Xin Xia", "Zhi Jin"],
      5     "year": 2024,
      6     "arxiv_id": "2412.14554"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No code or analysis scripts are released. A Google Sheets link to seminar topics is provided but no analysis code."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The transcriptions, opinion cards, and coded data from the qualitative analysis are not released. Only a Google Sheets link to seminar topics is provided."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "NVivo is mentioned as the coding tool but no version or environment details are provided."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No instructions for reproducing the qualitative coding or card sorting process beyond the high-level methodology description."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": false,
     34         "answer": false,
     35         "justification": "Qualitative study with no quantitative results requiring confidence intervals."
     36       },
     37       "significance_tests": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "No quantitative comparisons are made; this is a qualitative study."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No quantitative results; qualitative challenge identification only."
     46       },
     47       "sample_size_justified": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Qualitative study; sample size justification (e.g., saturation) is not standard for expert panel discussions, though the number of participants (24) is stated."
     51       },
     52       "variance_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No quantitative experimental runs; qualitative study."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "This is a qualitative challenge-identification paper, not a system or method evaluation. No baselines are applicable."
     63       },
     64       "baselines_contemporary": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No system evaluation; no baselines applicable."
     68       },
     69       "ablation_study": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No system with components to ablate."
     73       },
     74       "multiple_metrics": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No quantitative evaluation performed."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "The paper does not evaluate a system's outputs; it reports challenges derived from expert discussion."
     83       },
     84       "held_out_test_set": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No dataset or evaluation requiring train/test splits."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Challenges are organized into seven categories (requirement & design, coding assistance, testing, code review, maintenance, vulnerability management, data/training/evaluation) with detailed per-category discussion."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No system evaluation; the paper itself identifies challenges/limitations of the LLM4SE field."
     98       },
     99       "negative_results_reported": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "No experiments were run; the paper is a qualitative challenge identification."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims 26 challenges from seven aspects based on discussion among 20+ participants. The paper delivers exactly this in Section 4."
    110       },
    111       "causal_claims_justified": {
    112         "applies": false,
    113         "answer": false,
    114         "justification": "The paper makes no causal claims; it identifies and categorizes challenges."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper presents challenges as broadly applicable to LLM4SE without bounding them to the specific perspectives of the 24 Chinese-institution-affiliated participants. The threats to validity section acknowledges incompleteness but does not bound the scope of the claims."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": false,
    123         "answer": false,
    124         "justification": "No empirical results are presented that would require alternative explanations. This is a qualitative taxonomy paper."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": false,
    130         "answer": false,
    131         "justification": "No LLM experiments are conducted; this is a qualitative study about challenges."
    132       },
    133       "prompts_provided": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "No prompting is used; this is a qualitative study."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No model experiments conducted."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding used."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 3 describes the methodology: transcribing and coding with NVivo, open card sorting by two authors, disagreement resolution, and verification by two additional authors. The process is documented at a reasonable level for qualitative research."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 6 'Threats to Validity' discusses threats to the study."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The threats section mentions completeness of challenges and representativeness of participants, but these are generic. It claims 'we believe we have made this threat have minimal impact' without specific evidence. No specific threats like geographic bias, language bias, or disciplinary blind spots are discussed."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of what types of challenges might be missed due to the specific composition of the panel (all from Chinese institutions, specific SE/AI backgrounds)."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The transcriptions, opinion cards, and coding sheets from the qualitative analysis are not available. Only a link to seminar topics is provided."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 3 describes that 24 participants held face-to-face meetings during Jan 19-21, 2024, with six thematic sessions each lasting ~4 hours, followed by panel discussions."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper states participants were from the '9th CCF Beautiful Lake Seminars' but does not describe how the 24 participants were selected or recruited. No information on selection criteria or potential selection bias."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 documents the pipeline: transcription → open coding with NVivo → opinion cards → card sorting by two authors → disagreement resolution → review by two additional authors → 26 challenges in 7 categories."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: Harbin Institute of Technology, Zhejiang University, Huawei Technologies, Peking University. The Huawei affiliation is disclosed."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed. Two authors are from Huawei, which develops and sells LLM-based SE tools, creating a potential conflict that is not addressed."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests statement is present. Huawei authors may have financial interests in outcomes related to LLM4SE challenges."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "No model evaluation on benchmarks is performed."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "No model evaluation on benchmarks is performed."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "No model evaluation on benchmarks is performed."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The study involves human participants (24 experts in discussion sessions) but there is no mention of pre-registration."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No ethics board approval is mentioned despite involving human participants in structured discussion sessions."
    242       },
    243       "demographics_reported": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Only the split of 17 academic researchers and 7 industry practitioners is given. No information on experience levels, geographic distribution, seniority, or other demographics."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No criteria for participant selection are stated beyond 'specializing in fields such as software engineering and artificial intelligence.'"
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "Not an experimental study with treatment/control conditions; it is a qualitative expert panel discussion."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "Blinding is not feasible for an expert panel discussion."
    262       },
    263       "attrition_reported": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "The abstract mentions 'more than 20 participants' while the methodology says 24. No explanation for this discrepancy or any attrition."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "Qualitative study; no method with inference costs."
    274       },
    275       "compute_budget_stated": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "Qualitative study; no computational experiments."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "26 key challenges in LLM4SE are identified across seven aspects of the software development lifecycle.",
    285       "evidence": "Section 4 presents all 26 challenges organized into requirement & design (6), coding assistance (5), testing (2), code review (4), maintenance (3), vulnerability management (3), and data/training/evaluation (3).",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "The challenges were derived from rigorous discussion among 24 participants from academia and industry.",
    290       "evidence": "Section 3 describes the methodology: 3-day seminar with six thematic sessions, qualitative coding with NVivo, and open card sorting by two authors with disagreement resolution. However, inter-rater reliability metrics are not reported.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "LLMs can potentially enable end-to-end software development, generating and testing code directly from requirements.",
    295       "evidence": "Stated in the introduction (Section 1) as a general claim with references to tools like GitHub Copilot X and GitLab Duo, but no empirical evidence is provided for end-to-end development.",
    296       "supported": "weak"
    297     }
    298   ],
    299   "methodology_tags": ["qualitative"],
    300   "key_findings": "This paper identifies 26 key challenges for LLM-based software engineering (LLM4SE) across seven SDLC aspects, derived from a structured qualitative study involving 24 experts. The challenges span requirements/design (prompt sensitivity, domain knowledge gaps), coding assistance (hallucination, vulnerability introduction, evaluation gaps), testing (syntactic and semantic quality of generated tests), code review (data quality, industry-OSS gap), maintenance (microservice complexity, data scarcity), vulnerability management (insufficient training data, context complexity), and data/training/evaluation (dataset quality, training costs, benchmark limitations).",
    301   "red_flags": [
    302     {
    303       "flag": "No inter-rater reliability metrics",
    304       "detail": "The qualitative coding involved open card sorting by two authors with disagreement resolution, but no inter-rater reliability (e.g., Cohen's kappa) is reported for the coding process."
    305     },
    306     {
    307       "flag": "Participant selection bias unaddressed",
    308       "detail": "All 24 participants appear to be affiliated with Chinese institutions. The paper does not discuss whether this geographic/cultural concentration affects the generalizability of identified challenges."
    309     },
    310     {
    311       "flag": "Participant count discrepancy",
    312       "detail": "The abstract says 'more than 20 participants' while the methodology section says 24. This inconsistency is not explained."
    313     },
    314     {
    315       "flag": "Undisclosed potential conflicts",
    316       "detail": "Two authors are from Huawei Technologies, which develops LLM-based SE tools. No conflict of interest statement or funding disclosure is present."
    317     }
    318   ],
    319   "cited_papers": [
    320     {
    321       "title": "Large Language Models for Software Engineering: Survey and Open Problems",
    322       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    323       "year": 2023,
    324       "relevance": "Major survey on LLMs for SE covering the same domain as this paper."
    325     },
    326     {
    327       "title": "Evaluating large language models trained on code",
    328       "authors": ["Mark Chen", "Jerry Tworek"],
    329       "year": 2021,
    330       "relevance": "Introduces Codex and HumanEval benchmark, foundational to LLM code generation evaluation."
    331     },
    332     {
    333       "title": "Self-collaboration code generation via ChatGPT",
    334       "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"],
    335       "year": 2023,
    336       "relevance": "Multi-agent code generation approach relevant to agentic SE workflows."
    337     },
    338     {
    339       "title": "ChatUniTest: a Framework for LLM-Based Test Generation",
    340       "authors": ["Xie et al."],
    341       "year": 2023,
    342       "relevance": "LLM-based test generation framework with generate-verify-repair pipeline."
    343     },
    344     {
    345       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    346       "authors": ["Daya Guo", "Qihao Zhu"],
    347       "year": 2024,
    348       "relevance": "Major open-source code LLM achieving state-of-the-art on code generation tasks."
    349     },
    350     {
    351       "title": "StarCoder2 and The Stack V2",
    352       "authors": ["Lozhkov et al."],
    353       "year": 2024,
    354       "relevance": "Open-source code LLM trained on large-scale dataset, relevant to code generation evaluation."
    355     },
    356     {
    357       "title": "CodeReviewer: Pre-Training for Automating Code Review Activities",
    358       "authors": ["Li et al."],
    359       "year": 2022,
    360       "relevance": "Pre-trained model for code review automation, central to LLM-based code review research."
    361     },
    362     {
    363       "title": "Fuzz4All: Universal Fuzzing with Large Language Models",
    364       "authors": ["Xia et al."],
    365       "year": 2024,
    366       "relevance": "LLM-based fuzzing tool for test input generation across multiple languages."
    367     },
    368     {
    369       "title": "CodePlan: Repository-level Coding using LLMs and Planning",
    370       "authors": ["Ramakrishna Bairi"],
    371       "year": 2024,
    372       "relevance": "Repository-level code generation with dependency analysis, relevant to agentic coding."
    373     },
    374     {
    375       "title": "SemCoder: Training Code Language Models with Comprehensive Semantics",
    376       "authors": ["Yangruibo Ding"],
    377       "year": 2024,
    378       "relevance": "Code LLM training approach incorporating semantic understanding."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs