scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19783B)
      1 {
      2   "paper": {
      3     "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
      4     "authors": ["Burak Yetiştiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2304.10778"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "Repository URL provided: https://github.com/mirayayerdem/Github-Copilot-Amazon-Whisperer-ChatGPT. Also references a reproduction package with experiment results spreadsheet."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "They use the publicly available HumanEval dataset and provide their experiment results in the GitHub repository. The reproduction package spreadsheet is linked in Section 3.2."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "They mention Python 3.10.10 interpreter and SonarQube but do not provide requirements.txt, Dockerfile, or detailed environment setup with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The experimental workflow is described at a high level (Section 3.2) but no README with commands or scripts to replicate experiments is mentioned."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Only point estimates are reported (e.g., 65.2% correctness for ChatGPT). No confidence intervals or error bars on any results."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares three tools and claims ChatGPT is 'the most successful' but no statistical significance tests are used to support comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Raw percentages and differences are reported but no formal effect sizes (Cohen's d, odds ratios, etc.) are provided."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample is the 164 HumanEval problems. No justification is given for why this sample size is adequate for the claims made, nor is any power analysis discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Single-run results only. The paper acknowledges nondeterministic behavior of the tools (Section 6.2) but does not report variance across multiple runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Three tools are compared against each other (GitHub Copilot, Amazon CodeWhisperer, ChatGPT), and older versions serve as baselines for RQ4."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The three tools evaluated were all current at the time of the study (January 2023 versions)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ2 removes docstrings and RQ3 replaces function names with 'foo', effectively ablating input components to measure their contribution to code generation quality."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Five metrics are used: Code Validity, Code Correctness, Code Security, Code Reliability, and Code Maintainability (Section 3.3)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of code quality. All evaluation is automated via unit tests and SonarQube. Human judgment of readability or usefulness of generated code is absent."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — the paper evaluates code generation tools on a fixed benchmark, not training a model with train/test splits."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per tool, per experiment type (original, only function names, dummy names), and partially correct generations are further categorized into correctness score intervals (Figures 7, 9, 11)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Multiple code listings show specific failure cases (Listings 2-6 for invalid code, Listing 9 for complex prompts where Copilot outputs 'pass'). Section 5.1 discusses causes of invalid code."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "RQ4 reports that 17 of 47 correct answers from the old Copilot version became incorrect in the new version. Also reports regressions in CodeWhisperer's new version (16 of 40 correct answers became partial/incorrect)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (ChatGPT 65.2%, Copilot 46.3%, CodeWhisperer 31.1% correctness; improvement rates; technical debt figures) are all supported by results in Sections 4 and 5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The ablation experiments (RQ2 removing docstrings, RQ3 using dummy names) use controlled single-variable manipulation to support claims about the effect of input quality on code generation. The design is adequate for these causal claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Evaluating the Code Quality of AI-Assisted Code Generation Tools' broadly, but results are limited to Python on HumanEval's 164 problems. While Section 6.4 notes this limitation, the title and conclusions do not sufficiently bound the claims to this narrow setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The threats to validity section (Section 6) discusses methodological issues but does not consider alternative explanations for observed results (e.g., the possibility that HumanEval problems appear in the tools' training data, which could explain ChatGPT's high performance)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "GitHub Copilot versions are given (v1.7.4421 and v1.70.8099) and ChatGPT is identified as '9 Jan 23 Version', but the underlying model versions (which GPT model, which Codex version) are not specified. For CodeWhisperer, only the date is given as AWS does not provide version numbers. No API model identifiers are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The prompts are the HumanEval function signatures and docstrings, which are shown in Figure 1. For ChatGPT, they state the instruction: 'Generate code using the prompts I will provide' (Section 6.2). The full HumanEval dataset is publicly available."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the three tools. The tools were presumably used with default settings, but the paper does not state this explicitly."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper evaluates third-party tools (Copilot, CodeWhisperer, ChatGPT) as black boxes. The authors cannot be expected to describe internal scaffolding they have no access to."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2 and Figure 3 describe the pipeline: extracting problems from HumanEval, saving prompts and tests as separate files, generating solutions, then running assessments. The process is documented step by step."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 'Threats to Validity' is a dedicated section with four subsections (conclusion, internal, construct, external validity)."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Multiple specific threats are discussed: nondeterministic code generation (Section 6.2), different code generation methods (auto vs key combination), unequal time intervals between version comparisons (13 months vs 2 months), and IDE dependency."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.4 explicitly states limitations: dependency on HumanEval dataset, limitation to Python, IDE dependency, and problem coverage limited to 164 problems."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Raw experiment results are available in the reproduction package spreadsheet linked in Section 3.2 and the GitHub repository."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.2 describes code generation was done manually using VS Code for Copilot/CodeWhisperer and browser for ChatGPT. The process for each tool is detailed."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data source is the standard HumanEval benchmark."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Figure 3 provides a step-by-step workflow from problem extraction through code generation to assessment. Each stage is documented in Section 3."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information or acknowledgments section mentioning grants or sponsors is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with Bilkent University, clearly listed. They are not affiliated with any of the companies whose tools are evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of a funding disclosure is not the same as absence of funding."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The paper includes an explicit statement: 'Conflict of Interests: The authors declare no conflicts of interest in relation to this article.'"
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No precise training data cutoff dates are stated for any of the three models. ChatGPT's training is only vaguely described as 'completed its training process early in 2022', and no cutoff date is given for Copilot's or CodeWhisperer's models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "HumanEval was published in 2021. All three tools were likely trained on data including HumanEval problems, but the paper does not discuss potential train/test overlap."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval has been publicly available since July 2021 and the models were trained after that date. The paper does not address benchmark contamination risk at all."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, tokens consumed, or wall-clock time for the code generation process is reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of computational resources, time spent, or total budget for running the experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ChatGPT generates correct code 65.2% of the time, GitHub Copilot 46.3%, and Amazon CodeWhisperer 31.1% on HumanEval.",
    286       "evidence": "Section 4.2, Figures 5, 6, 8, 10. Results based on single-run evaluation of 164 HumanEval problems.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Removing docstrings significantly reduces code correctness for all three tools (drops of 26.2%, 16.5%, and 43.2% respectively).",
    291       "evidence": "Section 4.4, Table 4. Controlled experiment removing docstrings from HumanEval prompts.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Using dummy function names ('foo') causes smaller correctness drops (4.2%, 3.7%, 3.6%) than removing docstrings.",
    296       "evidence": "Section 4.5, Table 4. Controlled experiment replacing function names with 'foo'.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "GitHub Copilot's new version showed 62% improvement in passed unit tests over the old version; CodeWhisperer showed 28% improvement.",
    301       "evidence": "Section 4.6, Figures 23-26. Comparison of Copilot v1.7.4421 vs v1.70.8099, and CodeWhisperer Nov '22 vs Jan '23.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "All code generation tools generate valid code approximately 90% of the time.",
    306       "evidence": "Section 4.1, Figure 4. Copilot 91.5%, CodeWhisperer 90.2%, ChatGPT 93.3%.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "ChatGPT outperforms GitHub Copilot and Amazon CodeWhisperer on HumanEval with 65.2% code correctness vs 46.3% and 31.1% respectively. Providing docstrings in prompts is more important than meaningful function names for code generation quality. Both GitHub Copilot and CodeWhisperer showed improvements across versions. All three tools generate valid (syntactically correct) code approximately 90% of the time, but code maintainability issues (code smells) are common across all generators.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical tests",
    315       "detail": "Comparative claims between three tools are made based solely on comparing raw percentages without any significance tests, despite acknowledged nondeterminism in the tools."
    316     },
    317     {
    318       "flag": "Single-run evaluation with nondeterministic tools",
    319       "detail": "Each tool was run once per problem despite the paper acknowledging nondeterministic behavior (Section 6.2). No variance or confidence intervals are reported, making it impossible to know if observed differences are meaningful."
    320     },
    321     {
    322       "flag": "Benchmark contamination unaddressed",
    323       "detail": "HumanEval was published in July 2021 and all three models were trained on data from after that date. ChatGPT's superior performance could partly be explained by memorization of HumanEval solutions, but this is not discussed."
    324     },
    325     {
    326       "flag": "Unequal comparison intervals for RQ4",
    327       "detail": "GitHub Copilot versions are 13 months apart while CodeWhisperer versions are only 2 months apart, making improvement rate comparisons misleading."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating large language models trained on code",
    333       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    334       "year": 2021,
    335       "doi": "10.48550/ARXIV.2107.03374",
    336       "relevance": "Introduces HumanEval benchmark and Codex model, foundational to this study's evaluation methodology."
    337     },
    338     {
    339       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    340       "authors": ["Hammond Pearce"],
    341       "year": 2021,
    342       "relevance": "Evaluates security vulnerabilities in Copilot-generated code, finding 40% of programs were vulnerable."
    343     },
    344     {
    345       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    346       "authors": ["Priyan Vaithilingam"],
    347       "year": 2022,
    348       "relevance": "Within-subjects user study of GitHub Copilot finding no significant improvement in speed/success rate."
    349     },
    350     {
    351       "title": "An empirical evaluation of GitHub Copilot's code suggestions",
    352       "authors": ["Nguyen", "Nadi"],
    353       "year": 2022,
    354       "relevance": "Evaluates Copilot on LeetCode problems across four programming languages for code correctness and understandability."
    355     },
    356     {
    357       "title": "On the robustness of code generation techniques: An empirical study on GitHub Copilot",
    358       "authors": ["Mastropaolo"],
    359       "year": 2023,
    360       "relevance": "Studies effect of semantic-preserving input changes on Copilot's code generation, finding 46% result in different recommendations."
    361     },
    362     {
    363       "title": "An analysis of the automatic bug fixing performance of ChatGPT",
    364       "authors": ["Sobania"],
    365       "year": 2023,
    366       "relevance": "Evaluates ChatGPT's bug fixing capability on QuixBugs benchmark, comparing to Codex and standard APR approaches."
    367     },
    368     {
    369       "title": "ChatGPT and software testing education: Promises & perils",
    370       "authors": ["Jalil", "Rafi", "LaToza", "Moran", "Lam"],
    371       "year": 2023,
    372       "relevance": "Evaluates ChatGPT on software testing curriculum questions, finding it would fail a software testing course."
    373     },
    374     {
    375       "title": "Choose your programming copilot: A comparison of the program synthesis performance of GitHub Copilot and genetic programming",
    376       "authors": ["Sobania"],
    377       "year": 2022,
    378       "relevance": "Evaluates Copilot code correctness and compares to genetic programming-based code generators."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs