scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30946B)
      1 {
      2   "paper": {
      3     "title": "No Need to Lift a Finger Anymore? Assessing the Quality of Code Generation by ChatGPT",
      4     "authors": [
      5       "Zhijie Liu",
      6       "Yutian Tang",
      7       "Xiapu Luo",
      8       "Yuming Zhou",
      9       "Liang Feng Zhang"
     10     ],
     11     "year": 2023,
     12     "venue": "IEEE Transactions on Software Engineering",
     13     "arxiv_id": "2308.04838",
     14     "doi": "10.1109/TSE.2024.3392499"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "ChatGPT (GPT-3.5-turbo-0301) generates functionally correct code at a 68.41% acceptance rate for pre-2021 LeetCode problems but only 20.27% for post-2021 problems, a 48.14 percentage-point gap attributed to training data exposure. Multi-round fixing is largely ineffective for functional correctness (<32% fix rate) but highly effective for security vulnerabilities (89-100% fix rate). Code complexity tends to increase through multi-round fixing. The paper identifies specific defect categories (WD, MCC, MP) and shows ChatGPT's non-determinism significantly affects all quality dimensions at default temperature.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states 'The experimental scripts, results, and raw data are available at: [28]' pointing to a Zenodo archive (https://zenodo.org/records/10556350)."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Raw data is released via the same Zenodo archive [28]. Additionally, the datasets used (LeetCode problems and CWE scenarios from [23]) are publicly available."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 3.3 specifies: Intel i9-10900X CPU @ 3.70GHz, 128GB RAM, Ubuntu 20.04, Python 3.10.9, CodeQL 2.12.2, and the specific ChatGPT model version gpt-3.5-turbo-0301 with temperature 0.7."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Experimental scripts are provided at the Zenodo artifact [28]. The methodology sections (Sec 3-4) describe the workflow, prompt design, and evaluation process in sufficient detail for reproduction."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Main results (Tables 1-2, 5, 8, 10, 11, 18-21) report only point estimates (percentages) with no confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Wilcoxon rank-sum test is used throughout with p-values reported (e.g., p=0.008 for before/after 2021 comparison). Holm-Bonferroni correction is applied for multiple comparisons (Sec 4.1)."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Cliff's Delta effect size measure is used alongside Wilcoxon tests. Effect size values are reported (e.g., effect size value of 1 for period comparison, 0.3125 for C.E. rate comparison between periods). Percentage differences with context are also provided (e.g., '48.14% advantage')."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "728 algorithm problems are used (354 after 2021, 374 sampled before 2021) but no power analysis or explicit justification for why these specific numbers were chosen. The sampling follows the difficulty distribution ratio (1:2:1) but no statistical justification for total sample size."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Main experiments query each prompt once. The paper acknowledges ChatGPT's non-determinism (RQ5) and runs 10 trials for a subset of 18 problems, but the main results in Tables 1-2 are single-run with no variance measures."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Human-written code from LeetCode is used as a baseline for complexity comparison (Tables 14-16). Human acceptance rates are compared with ChatGPT acceptance rates. Before/after 2021 comparison serves as a contamination baseline."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No contemporary AI code generation models (Codex, CodeGen, Copilot, etc.) are compared. The only baselines are human-written solutions and human acceptance rates. For a 2023 paper evaluating code generation, omitting comparison with other LLM-based tools is a gap."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "The paper evaluates a single pre-existing system (ChatGPT) with no components to ablate. It is not proposing a new multi-component method."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are used: Accepted rate, Wrong Answer rate, Compile Error rate, Runtime Error rate, Time Limit Exceeded rate, test case pass rate, cyclomatic complexity, cognitive complexity, and vulnerability counts across multiple CWE categories."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Two graduate students independently classify W.A. code defects (157 pairs, 0.7325 consistency ratio, with senior analyst for disagreements). Manual analysis is also performed for T.L.E. causes, C.E. classification, vulnerability verification, and Type-2 clone detection."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Post-2021 problems (354) serve as a held-out test set that could not have appeared in ChatGPT's training data (trained on data before 2021). Pre-2021 problems are explicitly acknowledged as potentially in the training set."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Extensive breakdowns by language (C, C++, Java, Python3, JavaScript), difficulty level (easy, medium, hard), time period (before/after 2021), error type (W.A., C.E., R.E., T.L.E.), defect class (Table 4), compile error class (Table 7), runtime error class (Table 9), and CWE vulnerability groups (Table 20)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Extensive failure analysis: three error categories (WD, MCC, MP) with code examples (Figs. 4, 6, 8-19), analysis of unfixed multi-round cases (Table 6), EIL and EBL error categories for compile errors, and specific vulnerability examples."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Multiple negative results: ChatGPT's weak multi-round fixing ability (<32% fix rate for W.A.), poor performance on hard problems (0.66% A. rate for hard Aft. problems), persistent vulnerability patterns, complexity increase through fixing, and unfixable CWE-20 vulnerabilities."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "All abstract claims are supported: 48.14% advantage (Tables 1-2), weak multi-round fixing (Tables 5, 8, 10, 11), complexity variation (Tables 12-13, Fig. 25), >89% vulnerabilities fixed (Tables 18, 21), and non-determinism effects (Tables 22-32)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper is primarily descriptive/evaluative. Where comparative claims are made (before vs after 2021), Wilcoxon rank-sum tests with effect sizes are used. Claims about multi-round fixing effects are based on before/after measurement within the same conversation. No strong unsupported causal claims are made."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper bounds its scope: it specifies the model (gpt-3.5-turbo-0301), the benchmark (LeetCode algorithm problems), the languages tested (5 specified), and the CWE scenarios. The Threats to Validity section (Sec 5.3) explicitly acknowledges that 'LeetCode problems... may not fully represent the complexity and diversity of real-world coding tasks.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Several alternatives are addressed: (1) post-2021 problems might be reformulations of pre-2021 ones (checked via 'similar questions' feature and manual analysis of 142 pairs, ruled out), (2) non-determinism as confound (RQ5, temperature 0 analysis), (3) token limitation impact (Sec 5.1, simulated incomplete generation)."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures functional correctness (LeetCode acceptance), complexity (cyclomatic/cognitive), and security (CodeQL detection), and frames results at the same granularity — 'code generation quality' across these three specific dimensions. No broader proxy gap exists."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Sec 3.2 states: 'we utilize the model version gpt-3.5-turbo-0301 of ChatGPT' — exact API version specified."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Full prompt examples are provided: Fig. 3 shows a complete prompt with all four components (<Content>, <Examples>, <Template>, <Command>) using a real problem. Multi-round fixing prompt examples are shown throughout Sec 4.2 with actual error messages."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Temperature 0.7 (default, Sec 3.2), token limitation 4,096 (Sec 3.2), and the token-limitation strategy for multi-round conversations are reported. Temperature 0 is also tested in RQ5."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The multi-round workflow is described in detail with a diagram (Fig. 2): prompt construction → ChatGPT response → testing (LeetCode/CodeQL) → error feedback → re-prompting, including the token-limitation strategy for managing conversation context."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Data collection and preprocessing are well-documented: problem sampling strategy (1:2:1 difficulty ratio), before/after 2021 split criteria (2022-01-01 divider), CWE scenario sources, prompt construction process, code extraction from responses (between triple backticks), and filtering of constant function outputs."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Dedicated 'Limitations' section (Sec 5.2) discussing closed-source model constraints and evolving model concerns, plus a 'Threats to Validity' section (Sec 5.3) with five specific threats."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats discussed: LeetCode problems may not represent real-world coding, LeetCode terminates testing at first failure (affecting pass rates), CodeQL may report false positives (mitigated by manual inspection), limited language coverage for vulnerability detection, and ChatGPT's randomness (Sec 5.3)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Explicit scope boundaries: 'LeetCode problems are designed specifically for coding practice and interview preparation... they may not fully represent the complexity and diversity of real-world coding tasks' (Sec 5.3). The model is bounded to GPT-3.5-turbo-0301 and its limitations are noted (Sec 5.2)."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The paper states 'The experimental scripts, results, and raw data are available at: [28]' (Zenodo archive https://zenodo.org/records/10556350)."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Detailed collection description: 728 algorithm problems from LeetCode (354 after 2021, 374 sampled before 2021 following 1:2:1 difficulty ratio), 18 CWEs with 54 scenarios from MITRE Top 25 CWEs and [23]. Sampling criteria, time divider, and problem selection process documented in Sec 3.1 and 4.1."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants in the study. Data sources are standard benchmarks (LeetCode platform, CWE scenarios from MITRE and prior work). Graduate student annotators are internal researchers, not recruited participants."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Full pipeline documented: problem collection → prompt construction → ChatGPT querying → code extraction → LeetCode judgment/CodeQL analysis → error classification → multi-round fixing. Each step is described with clear criteria and counts (e.g., 1,870 + 1,770 prompts generated)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding or acknowledgments section is present in the paper. Funding sources are not disclosed."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: ShanghaiTech University, University of Glasgow, Hong Kong Polytechnic University, and Nanjing University. Authors are academic researchers evaluating a third-party product (ChatGPT/OpenAI)."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence cannot be assessed. The authors are from academic institutions with no apparent affiliation with OpenAI, but without explicit disclosure, this criterion is not met."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Sec 4.1 states 'ChatGPT is trained on text data before 2021' and uses this as the basis for the before/after 2021 experimental split."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Extensively discussed: the paper acknowledges 'Bef. problems and corresponding solutions may have a high probability to appear in its training set' (Sec 4.1). The authors manually check 142 similar-question pairs between periods. They find that 6 of 250 sampled solutions (2.4%) are Type-2 clones of ground-truth solutions, which is direct evidence of some training data memorization."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "The entire experimental design splits before/after 2021 to address contamination. The 48.14 percentage-point performance gap between periods is attributed partly to training data exposure. Type-2 clone analysis provides direct evidence of memorization."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. Graduate students serve as code analysts/annotators but are not study participants."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study evaluates an AI system on public benchmarks."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. Graduate student annotators are described only as 'two graduate students with experience in algorithm analysis.'"
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No API costs, token consumption, or per-query latency are reported despite making thousands of API calls (3,640+ prompts in the main experiment plus multi-round fixing and non-determinism studies)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Hardware is specified (Sec 3.3) but total computational budget (wall-clock time, total API spend, number of tokens consumed) is not quantified."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Main experiments (RQ1-RQ4) query each prompt once. RQ5 explores non-determinism with 10 trials on 18 problems, showing significant variation at temperature 0.7, but this is a separate analysis on a small subset rather than seed sensitivity for the main results."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Sec 4.1: 'we input every prompt once into it to ask for generating code.' RQ5 explicitly states 10 trials. The number of runs is clear throughout."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Default temperature 0.7 is used with no hyperparameter search. Temperature 0 is tested only in RQ5 for non-determinism analysis. No systematic search over prompt designs or model parameters is reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Sec 3.2 explicitly states: 'The goal of our prompt design is not to find the optimal prompt that maximizes ChatGPT's performance. Instead, our goal is to provide a reasonable prompt that simulates real-world usage scenarios.' Default temperature 0.7 is used to 'simulate real-world usage scenarios.'"
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": true,
    322         "justification": "Holm-Bonferroni correction is applied for multiple comparisons: 'In cases of multiple comparisons, we apply Holm-Bonferroni correction, a commonly used technique, to adjust p-values' (Sec 4.1)."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors do not discuss potential biases in their evaluation design (e.g., whether their prompt template advantages or disadvantages ChatGPT, or whether their defect classification might be biased)."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Multi-round fixing uses progressively more API calls (up to 5 or 10 rounds) but performance is not reported as a function of compute cost. No cost comparison between one-round and multi-round approaches."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Sec 5.3 discusses construct validity: 'LeetCode problems are designed specifically for coding practice and interview preparation... they may not fully represent the complexity and diversity of real-world coding tasks. Real-world coding scenarios often involve various external factors, domain-specific requirements.' CWE scenarios are also acknowledged as potentially incomplete."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "The paper evaluates a single model (ChatGPT) with a consistent prompting approach. No multi-model comparison across different scaffolds is performed, so the scaffold confound does not apply."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "The before/after 2021 split is explicitly designed to address temporal leakage. The paper demonstrates a 48.14 percentage-point performance gap consistent with training data exposure, and manually checks for Type-2 clones to detect memorization."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information. The prompts include method signatures and examples from LeetCode which could provide hints. Error messages in multi-round fixing provide specific feedback, but this is the intended design rather than accidental leakage."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": true,
    359         "justification": "The paper checks for similarity between before/after 2021 problems using LeetCode's 'similar questions' feature, finding 142 pairs. Manual analysis by two graduate students confirms these are 'either having similar scenarios but completely different solution goals, or different scenarios and conditions but can be solved using similar algorithms' (Sec 4.1)."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Type-2 clone detection is applied: 50 sampled problems (250 solutions) are compared against 5 ground truth solutions each. 6 solutions in easy/medium difficulties were found to be Type-2 clones, providing direct evidence of training data memorization (Sec 4.1)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "ChatGPT's Accepted rate on pre-2021 problems is 48.14 percentage points higher than on post-2021 problems (68.41% vs 20.27%)",
    371       "evidence": "Tables 1-2 show per-language, per-difficulty status rates across 1,870 and 1,770 prompts for before and after 2021 problems. Wilcoxon rank-sum test p=0.008, Cliff's Delta effect size=1 (Sec 4.1).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Multi-round fixing can only fix a small fraction (<32%) of erroneous code to Accepted status",
    376       "evidence": "Table 5: 25/157 (15.9%) W.A. pairs fixed. Table 8: 40/155 (26%) C.E. pairs fixed to A. Table 10: 52/194 (27%) R.E. pairs fixed to A. Table 11: 44/140 (31.4%) T.L.E. pairs fixed (Sec 4.2).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The multi-round fixing process generally preserves or increases code complexity levels",
    381       "evidence": "Figure 25 shows heatmaps where diagonal cells (same complexity) exceed 50% in all language-complexity combinations, and cells above the diagonal (increased complexity) consistently have higher percentages than below-diagonal cells (Sec 4.3).",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "ChatGPT-generated code has security vulnerabilities (33.32% of valid CWE scenario code) but multi-round fixing addresses 89.4% of them",
    386       "evidence": "Table 19: 994/2,983 valid CWE code snippets are vulnerable. Table 21: 143/160 sampled vulnerable snippets fixed. Table 18: all sampled algorithm problem vulnerabilities fixed (Sec 4.4).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "ChatGPT's non-determinism affects code generation quality, but temperature 0 mitigates this in one-round process",
    391       "evidence": "Tables 22-25: at temperature 0.7, same problem-language pairs produce different statuses, complexity levels, and vulnerability rates across 10 trials. Tables 24-26: at temperature 0, one-round outputs are identical across all trials except one case (Sec 4.5).",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Code generated in C has higher complexity and lower acceptance rates than other languages (C++, Java, Python3, JavaScript)",
    396       "evidence": "Table 12: C has 54.9% low+moderate cyclomatic complexity vs 66%+ for other languages. Tables 1-2: C acceptance rate of 31.28% overall vs 44-50% for other languages (Sec 4.1, 4.3).",
    397       "supported": "strong"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Single model evaluation without competing models",
    403       "detail": "The paper evaluates only ChatGPT (GPT-3.5-turbo-0301) without comparing against other code generation models (Codex, CodeGen, Copilot, GPT-4). This limits the ability to contextualize ChatGPT's performance relative to the state of the art."
    404     },
    405     {
    406       "flag": "Single-run main experiment despite documented non-determinism",
    407       "detail": "RQ5 demonstrates significant non-determinism at temperature 0.7 (the default used), yet the main experiments (RQ1-RQ4) query each prompt only once. Tables 22-23 show acceptance rates varying from 0% to 100% across trials for the same problem, suggesting main results could be unreliable."
    408     },
    409     {
    410       "flag": "No cost reporting for extensive API usage",
    411       "detail": "The study makes thousands of API calls (3,640+ prompts in the main experiment, plus multi-round fixing and non-determinism studies) without reporting any cost, token consumption, or time metrics."
    412     },
    413     {
    414       "flag": "Outdated model version",
    415       "detail": "Uses gpt-3.5-turbo-0301 (March 2023 snapshot). GPT-3.5 and GPT-4 have been substantially updated since. Results may not generalize to current model versions, as acknowledged in Sec 5.2."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Evaluating large language models trained on code",
    421       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    422       "year": 2021,
    423       "arxiv_id": "2107.03374",
    424       "relevance": "Introduces Codex and HumanEval benchmark for evaluating LLM code generation — foundational benchmark-eval paper."
    425     },
    426     {
    427       "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions",
    428       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    429       "year": 2022,
    430       "relevance": "Evaluates Copilot's security code generation using CWE scenarios — directly comparable methodology for assessing LLM code security."
    431     },
    432     {
    433       "title": "An empirical evaluation of github copilot's code suggestions",
    434       "authors": ["N. Nguyen", "S. Nadi"],
    435       "year": 2022,
    436       "relevance": "Evaluates Copilot's code suggestion quality across 33 LeetCode problems — earlier benchmark-eval study of LLM code generation."
    437     },
    438     {
    439       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    440       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    441       "year": 2023,
    442       "arxiv_id": "2305.01210",
    443       "relevance": "Proposes EvalPlus framework for more rigorous evaluation of LLM code generation, addressing benchmark adequacy."
    444     },
    445     {
    446       "title": "Automated repair of programs from large language models",
    447       "authors": ["Z. Fan", "X. Gao", "A. Roychoudhury", "S. H. Tan"],
    448       "year": 2022,
    449       "arxiv_id": "2205.10583",
    450       "relevance": "Studies whether APR techniques including Codex can fix incorrect LLM-generated code — related to multi-round fixing evaluation."
    451     },
    452     {
    453       "title": "Self-collaboration code generation via chatgpt",
    454       "authors": ["Y. Dong", "X. Jiang", "Z. Jin", "G. Li"],
    455       "year": 2023,
    456       "arxiv_id": "2304.07590",
    457       "relevance": "Proposes multi-role ChatGPT framework for code generation, relevant to agentic LLM workflows."
    458     },
    459     {
    460       "title": "Examining zero-shot vulnerability repair with large language models",
    461       "authors": ["H. Pearce", "B. Tan", "B. Ahmad", "R. Karri", "B. Dolan-Gavitt"],
    462       "year": 2023,
    463       "relevance": "Evaluates LLMs for zero-shot vulnerability repair — directly related to security code fixing evaluation."
    464     },
    465     {
    466       "title": "Codegen: An open large language model for code with multi-turn program synthesis",
    467       "authors": ["E. Nijkamp", "B. Pang", "H. Hayashi"],
    468       "year": 2022,
    469       "arxiv_id": "2203.13474",
    470       "relevance": "Open LLM for multi-turn code generation — relevant baseline for code generation capability evaluation."
    471     },
    472     {
    473       "title": "Measuring coding challenge competence with apps",
    474       "authors": ["D. Hendrycks", "S. Basart", "S. Kadavath"],
    475       "year": 2021,
    476       "arxiv_id": "2105.09938",
    477       "relevance": "APPS benchmark for evaluating code generation using programming challenges — related benchmark approach."
    478     },
    479     {
    480       "title": "Refining chatgpt-generated code: Characterizing and mitigating code quality issues",
    481       "authors": ["Y. Liu", "T. Le-Cong", "R. Widyasari"],
    482       "year": 2023,
    483       "arxiv_id": "2307.12596",
    484       "relevance": "Characterizes code quality issues in ChatGPT-generated code including correctness and maintainability — closely related study."
    485     },
    486     {
    487       "title": "Choose your programming copilot: A comparison of the program synthesis performance of github copilot and genetic programming",
    488       "authors": ["D. Sobania", "M. Briesch", "F. Rothlauf"],
    489       "year": 2022,
    490       "relevance": "Compares Copilot against genetic programming for code synthesis — benchmark-eval of AI code generation."
    491     },
    492     {
    493       "title": "Security weaknesses of copilot generated code in github",
    494       "authors": ["Y. Fu", "P. Liang", "A. Tahir"],
    495       "year": 2023,
    496       "arxiv_id": "2310.02059",
    497       "relevance": "Studies security weaknesses in Copilot-generated code found in GitHub repositories — real-world security evaluation."
    498     }
    499   ]
    500 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs