scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25833B)
      1 {
      2   "paper": {
      3     "title": "Can ChatGPT Support Developers? An Empirical Evaluation of Large Language Models for Code Generation",
      4     "authors": ["Kailun Jin", "Chung-Yu Wang", "Hung Viet Pham", "Hadi Hemmati"],
      5     "year": 2024,
      6     "venue": "MSR '24 (21st International Conference on Mining Software Repositories)",
      7     "arxiv_id": "2402.11702",
      8     "doi": "10.1145/3643991.3645074"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, GitHub link, or archive reference is provided in the paper. The authors mention a 'publicly available dataset' in their contributions but do not provide a link to their labeled data or analysis code."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The authors use the publicly available DevGPT dataset (Xiao et al. 2024) and state as a contribution 'A publicly available dataset of developers and ChatGPT interactions, labelled with the prompt types and the final use cases of the generated code.' The underlying DevGPT dataset is public, though no explicit link is given for the labeled overlay."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, dependency files, or software version details are provided. The study is primarily manual analysis, but there is automated data cleaning that would benefit from reproducibility details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided. While the study design is described at a high level in Section 2, there are no scripts, commands, or detailed procedures that would allow exact replication of the automated filtering or manual labeling process."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No confidence intervals or error bars are reported. Results are presented as raw counts and percentages (e.g., '65.3%', '32.8%') without any uncertainty quantification."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No statistical significance tests are used. Claims such as 'conversations in the Commit context lasted on average only 2.4 rounds while conversations in the Code file context lasted on average significantly longer at 10.4 rounds' use the word 'significantly' without any statistical test backing it."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No formal effect sizes are reported. The paper presents raw percentages and averages but no measures like Cohen's d or relative effect sizes with baseline context."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The sample sizes are reported (2,299 CG conversations, 138 merged PRs) but never justified. No power analysis or discussion of whether 138 merged PRs is sufficient for generalizable conclusions about LLM code usage."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "While box plots are mentioned for RQ1.2 (the figure reference is broken and renders as 'Figure ??'), the paper does not report standard deviations, IQR values, or other explicit variance measures in the text or tables."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "This is an empirical observational/mining study characterizing developer-ChatGPT interactions, not a system evaluation. There is no system to compare against baselines."
     65       },
     66       "baselines_contemporary": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "Not applicable — this is not a system evaluation with baselines. It is a descriptive empirical study."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "Not applicable — this is a mining/observational study, not a system with components to ablate."
     75       },
     76       "multiple_metrics": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "Not applicable — this is a descriptive study analyzing conversation patterns and code usage, not a system evaluation with performance metrics."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "Not applicable — the paper does not evaluate a system's outputs. The manual labeling is part of the data collection methodology, not an evaluation of a system."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "Not applicable — no model is being trained or evaluated on a test set."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by GitHub category (Pull Request, Issue, Discussion, Commit, Code File) in Table 1, by prompt category in Table 2, and by code usage category in Figure 1 (Exact Match, Modified Code, Document, Supplementary Info)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses cases where generated code was not used (32.8% 'Supplementary Info' — code considered unhelpful) and gives a specific example: 'a commit conversation indicates that the generated code is considered unhelpful, causing a notable slowdown instead of enhancing code execution speed.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that 32.8% of generated code was not directly used and that 'the current practice of using LLM-generated code is typically limited to either demonstrating high-level concepts or providing examples in documentation, rather than to be used as production-ready code.' This is a negative finding about LLM code utility."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims that 'the current practice of using LLM-generated code is typically limited to either demonstrating high-level concepts or providing examples in documentation, rather than to be used as production-ready code.' This is supported by Figure 1 showing only 16.8% Exact Match and 26% Modified Code, with the majority being Document (24.4%) or Supplementary Info (32.8%)."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes causal-sounding claims without adequate justification. For example: 'when developers provide specific code snippets directly to ChatGPT for improvement (in the Commit context), the desirable outcomes are likely achieved quicker with fewer clarifications.' This is a causal inference from observational data without controlling for confounds. The speculation about transformer 'memory' causing developers to avoid 'Request another generation' (Section 3.1) is presented as explanatory without evidence."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper's title asks 'Can ChatGPT Support Developers?' broadly, and the conclusion states 'Much improvement is needed before LLMs become an integral part of modern software development.' However, the data comes only from publicly shared ChatGPT links on GitHub (a self-selected, survivorship-biased sample), covering only GPT-3.5 conversations from mid-2023. These significant scope limitations are not reflected in the title or conclusion."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 3.3 (Discussion) explicitly discusses survivorship bias and filtering bias: 'developers are less likely to include ChatGPT URLs in commit messages if the generated code's quality is too low. This may skew the dataset towards resolved discussions.' These are specific alternative explanations for the observed patterns."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to 'ChatGPT' and 'GPT-3.5' without specifying exact model versions or snapshot dates. No API version or model identifier (e.g., gpt-3.5-turbo-0613) is stated. Since the data comes from DevGPT (shared conversations), the exact model versions used by developers are unknown and this is not acknowledged."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "Not applicable — the authors did not prompt an LLM themselves. They analyzed existing developer conversations with ChatGPT from the DevGPT dataset. The prompts they study are the developers' prompts, which are part of the dataset."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "Not applicable — the authors did not run any LLM inference. They analyzed an existing dataset of conversations."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "Not applicable — no agentic scaffolding is used. This is a mining study analyzing existing conversations."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 2.2 describes the automated data cleaning steps: filtering out Hacker News conversations, extracting code-generation conversations by checking for 'ListofCode' attributes, and manually removing 344 irrelevant conversation rounds from 2,299 conversations. Section 2.3 describes filtering 189 PR conversations down to 138 merged PRs by excluding 51 non-merged ones."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.3 (Discussion) serves as a limitations section, discussing survivorship bias, filtering bias, and dataset limitations. While not titled 'Limitations' explicitly, it provides substantive discussion of threats to validity."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 3.3 discusses specific threats: survivorship bias in the DevGPT dataset ('constructed from retained conversations, potentially overlooking a broader spectrum of developer interactions') and filtering bias ('developers are less likely to include ChatGPT URLs in commit messages if the generated code's quality is too low'). These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges 'dataset limitations based on empirical experience' but does not enumerate specific out-of-scope claims. The generalization from shared GitHub ChatGPT links to 'developers' broadly is never explicitly bounded."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The underlying DevGPT dataset is publicly available (Xiao et al. 2024, reference [24]), which forms the raw data for this study. The conversations can be independently accessed and verified."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 2.1 describes how DevGPT was collected: 'searching with the keyword \"https://chat.openai.com/share/\" in the GitHub GraphQL API to identify mentions of shared links sourced from software development artifacts.' The dataset covers six snapshots from July 24th to August 31st, 2023, containing 17,913 prompts and 11,751 code snippets."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "For the crowd-sourcing labeling, the paper says only 'we distributed the 2,299 valid data entries to volunteers whom we recruited from our local software engineering community.' No details are given on how many volunteers participated, what their qualifications were, how they were recruited, or whether this introduces selection bias."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The data pipeline is documented: DevGPT dataset → filter out Hacker News → parse JSON for code generation conversations → remove 344 irrelevant rounds → 2,299 CG conversations. For RQ2: 189 PR conversations → exclude 51 non-merged → 138 merged PRs. Each stage with counts is described in Sections 2.2 and 2.3."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 6 (Acknowledgement) states: 'This work was partly supported by NSERC and Alberta Innovates.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All four authors are listed as affiliated with York University, Toronto, Canada. No conflict with the evaluated product (ChatGPT/OpenAI) — the authors are independent academic researchers."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "NSERC (Natural Sciences and Engineering Research Council of Canada) and Alberta Innovates are government funding agencies with no financial interest in ChatGPT's code generation performance."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper. While the authors appear to be independent academics, the absence of an explicit declaration is noted."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "Not applicable — this is a mining study analyzing developer conversations. It does not evaluate a pre-trained model's capability on a benchmark."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "Not applicable — no benchmark evaluation of a model is performed. The study analyzes existing conversations."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Not applicable — no benchmark evaluation is performed."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not applicable — this is a repository mining study. The crowd-sourced labelers are annotators, not study participants. No human subjects experiment is conducted."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Not applicable — this is a mining study of publicly shared conversations. No human subjects research is conducted."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Not applicable — no human participants study. The developers whose conversations are analyzed are not study participants."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "Not applicable — no human participants study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "Not applicable — no experimental study with human participants."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Not applicable — no experimental study with human participants."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "Not applicable — no human participants study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "Not applicable — the study does not propose or run a method that incurs inference cost. It is a mining study analyzing existing conversations."
    276       },
    277       "compute_budget_stated": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "Not applicable — this is a mining/manual analysis study, not a compute-intensive method."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "65.3% of ChatGPT conversations on GitHub are related to code generation.",
    287       "evidence": "Table 1 shows 2,299 CG-related conversations out of 3,523 total (Section 3.1).",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Only about half (52.3%) of code file conversations are directly associated with code generation, the lowest among all GitHub categories.",
    292       "evidence": "Table 1 shows 1,052 CG out of 2,010 Code File conversations (Section 3.1, RQ1.1).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Commit-related conversations predominantly involve code generation (98.5%) and require fewer prompt-response rounds (average 2.4 rounds).",
    297       "evidence": "Table 1 shows 660/670 Commit conversations are CG. Section 3.1 RQ1.2 states average 2.4 rounds for Commits.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "32.8% of generated code is not used (Supplementary Info), indicating the current practice is limited to high-level concepts or documentation rather than production-ready code.",
    302       "evidence": "Figure 1 shows usage distribution: 32.8% Supplementary Info, 24.4% Document, 26% Modified, 16.8% Exact Match (Section 3.2).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Much improvement is needed before LLMs become an integral part of modern software development.",
    307       "evidence": "Based on the finding that only 16.8% of generated code is used as-is (Exact Match) and 32.8% is not used at all (Section 3.2, Conclusion Section 5).",
    308       "supported": "weak"
    309     }
    310   ],
    311   "methodology_tags": ["observational", "qualitative"],
    312   "key_findings": "This MSR 2024 paper analyzes 3,523 developer-ChatGPT conversations from the DevGPT dataset. It finds that 65.3% of conversations involve code generation, with Commit-related interactions focused on code improvement (98.5% CG-related, average 2.4 rounds) while Code File conversations are longer (10.4 rounds) and often involve clarification. Analysis of 138 merged pull requests shows that only 16.8% of generated code is used as-is, 26% is used after modification, 24.4% appears in documentation, and 32.8% is not used at all, suggesting LLM-generated code is primarily used for conceptual demonstrations rather than production code.",
    313   "red_flags": [
    314     {
    315       "flag": "Survivorship bias in dataset",
    316       "detail": "The DevGPT dataset only captures conversations that developers chose to share publicly via GitHub. Conversations where ChatGPT was unhelpful are less likely to be shared, systematically biasing the sample toward more successful interactions. The authors acknowledge this but do not quantify or mitigate it."
    317     },
    318     {
    319       "flag": "No statistical tests despite comparative claims",
    320       "detail": "The paper uses the word 'significantly' when comparing conversation lengths across categories (2.4 vs 10.4 rounds) without any statistical test. Multiple comparative claims are made based on raw percentage differences without significance testing."
    321     },
    322     {
    323       "flag": "Small sample for RQ2",
    324       "detail": "RQ2 conclusions about how generated code is used are based on only 138 merged pull requests. This is a small sample from which to draw general conclusions about LLM code utility in software development, and no justification is provided for this sample size."
    325     },
    326     {
    327       "flag": "Overclaimed scope",
    328       "detail": "The title asks 'Can ChatGPT Support Developers?' and the conclusion generalizes to 'LLMs' and 'modern software development,' but the data covers only GPT-3.5 conversations from a 5-week window in mid-2023, captured through a specific sharing mechanism on GitHub. The generalization far exceeds the evidence."
    329     },
    330     {
    331       "flag": "Broken figure references",
    332       "detail": "The paper text contains 'Figure ??' references (Section 3.1, RQ1.2), suggesting a LaTeX compilation issue. Key results about conversation length distributions reference figures that cannot be examined."
    333     },
    334     {
    335       "flag": "Crowd-sourced labeling with minimal inter-rater details",
    336       "detail": "The crowd-sourcing for RQ1.3 uses majority voting among 3 annotators from a 'local software engineering community,' but no inter-rater reliability metrics (e.g., Fleiss' kappa) are reported. The total number of volunteer labelers, their qualifications, and disagreement rates are not provided."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Evaluating large language models trained on code",
    342       "authors": ["Mark Chen", "Jerry Tworek"],
    343       "year": 2021,
    344       "arxiv_id": "2107.03374",
    345       "relevance": "Foundational Codex paper on evaluating LLM code generation capabilities, directly relevant to code generation evaluation methodology."
    346     },
    347     {
    348       "title": "Grounded copilot: How programmers interact with code-generating models",
    349       "authors": ["Shraddha Barke", "Michael B James", "Nadia Polikarpova"],
    350       "year": 2023,
    351       "relevance": "Studies developer interactions with code-generating models, relevant to understanding AI-assisted programming workflows."
    352     },
    353     {
    354       "title": "Taking Flight with Copilot: Early Insights and Opportunities of AI-Powered Pair-Programming Tools",
    355       "authors": ["Christian Bird", "Denae Ford", "Thomas Zimmermann"],
    356       "year": 2023,
    357       "doi": "10.1145/3582083",
    358       "relevance": "Early empirical study of Copilot adoption and developer experience, relevant to AI programming tool evaluation."
    359     },
    360     {
    361       "title": "Investigating Code Generation Performance of Chat-GPT with Crowdsourcing Social Data",
    362       "authors": ["Yunhe Feng"],
    363       "year": 2023,
    364       "relevance": "Evaluates ChatGPT code generation using crowdsourcing, directly relevant to LLM code generation assessment methodology."
    365     },
    366     {
    367       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    368       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    369       "year": 2023,
    370       "arxiv_id": "2305.01210",
    371       "relevance": "Rigorous evaluation of LLM-generated code correctness, central to assessing code generation quality."
    372     },
    373     {
    374       "title": "Understanding the Usability of AI Programming Assistants",
    375       "authors": ["Jenny T Liang", "Chenyang Yang", "Brad A Myers"],
    376       "year": 2023,
    377       "arxiv_id": "2303.17125",
    378       "relevance": "Studies usability of AI programming tools, relevant to developer productivity and tool design."
    379     },
    380     {
    381       "title": "Prompt Engineering or Fine Tuning: An Empirical Assessment of Large Language Models in Automated Software Engineering Tasks",
    382       "authors": ["Jiho Shin", "Clark Tang", "Tahmineh Mohati"],
    383       "year": 2023,
    384       "arxiv_id": "2310.10508",
    385       "relevance": "Empirical comparison of prompt engineering vs fine-tuning for SE tasks; source of the prompt categorization taxonomy used in this paper."
    386     },
    387     {
    388       "title": "Security implications of large language model code assistants: A user study",
    389       "authors": ["Gustavo Sandoval", "Hammond Pearce"],
    390       "year": 2022,
    391       "arxiv_id": "2208.09727",
    392       "relevance": "User study on security implications of LLM code assistants, relevant to AI code safety and quality."
    393     },
    394     {
    395       "title": "DevGPT: Studying Developer-ChatGPT Conversations",
    396       "authors": ["Tao Xiao", "Christoph Treude", "Hideaki Hata", "Kenichi Matsumoto"],
    397       "year": 2024,
    398       "relevance": "Source dataset for this study; characterizes developer-ChatGPT conversations on GitHub."
    399     },
    400     {
    401       "title": "A systematic evaluation of large language models of code",
    402       "authors": ["Frank F Xu", "Uri Alon", "Graham Neubig", "Vincent Josua Hellendoorn"],
    403       "year": 2022,
    404       "relevance": "Systematic evaluation of code LLMs, directly relevant to code generation evaluation methodology."
    405     },
    406     {
    407       "title": "A study on robustness and reliability of large language model code generation",
    408       "authors": ["Li Zhong", "Zilong Wang"],
    409       "year": 2023,
    410       "arxiv_id": "2308.10335",
    411       "relevance": "Studies robustness and reliability of LLM code generation, relevant to code quality assessment."
    412     },
    413     {
    414       "title": "Software testing with large language model: Survey, landscape, and vision",
    415       "authors": ["Junjie Wang", "Yuchao Huang"],
    416       "year": 2023,
    417       "arxiv_id": "2307.07221",
    418       "relevance": "Survey of LLMs for software testing, relevant to the broader landscape of AI-assisted software engineering."
    419     }
    420   ]
    421 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs