scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24697B)
      1 {
      2   "paper": {
      3     "title": "Can LLM Replace Stack Overflow? A Study on Robustness and Reliability of Large Language Model Code Generation",
      4     "authors": ["Li Zhong", "Zilong Wang"],
      5     "year": 2023,
      6     "venue": "AAAI 2024",
      7     "arxiv_id": "2308.10335"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a GitHub link: https://github.com/FloridSleeves/RobustAPI. The dataset and evaluator are stated as open-sourced."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The dataset of 1208 Stack Overflow questions is released as part of the GitHub repository. The paper states 'We open-source our dataset and evaluator on GitHub.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency versions are mentioned in the paper. There is no 'Environment Setup' section with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper does not include step-by-step reproduction instructions. While the code is released, no README with commands to run or a 'Reproducing Results' section is described in the paper itself."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., '62.97% misuse rate') with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., 'GPT-4 actually has a higher API misuse rate than GPT-3.5') but provides no statistical significance tests. All comparisons are made by directly comparing percentages."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Raw percentages are given but without standardized effect size measures or baseline context beyond the raw numbers."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The dataset size of 1208 questions is stated but not justified with a power analysis or rationale for why this number is sufficient. The selection of 18 APIs is also not statistically justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results appear to be from single runs (Pass@1 unless specified). No standard deviations, variance across runs, or spread measures are reported. The temperature experiment (Table 5) shows different temperature settings but these are different conditions, not repeated runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares 4 main LLMs (GPT-3.5, GPT-4, Llama-2, Vicuna-1.5) plus DeepSeek-Coder variants against each other. While there is no non-LLM baseline, the multi-model comparison across settings serves as a baseline comparison."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The models evaluated (GPT-3.5, GPT-4, Llama-2, Vicuna-1.5, DeepSeek-Coder) were all contemporary and state-of-the-art at the time of the study (2023)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper includes ablation-style experiments: zero-shot vs. one-shot-irrelevant vs. one-shot-relevant settings, temperature variations (Table 5), and API rules vs. one-shot examples (Table 6). These systematically vary components to measure their effect."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Three metrics are defined and reported: API Misuse Rate, Compilation Rate, and Overall API Misuse Percentage. Pass@k results are also reported (Table 4)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The evaluation is entirely automated via AST-based static analysis. No human evaluation of the generated code quality or API misuse detection accuracy is reported. Given that the paper makes claims about code reliability for developers, human evaluation of the outputs would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The one-shot demonstration examples are stated to not be present in the testing dataset: 'The question in the demo example is not present in the testing dataset.' The entire 1208 questions serve as the test set with separate demonstration examples."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Figure 4 provides a per-API breakdown of misuse rates for each model across all experiment settings. Table 1 shows the domain-level breakdown of the dataset."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The 'Error Analysis' section and 'Case Study: API Misuse in GPT-3.5' section discuss specific failure cases, showing concrete code examples where GPT-3.5 generates API misuse under different settings."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are reported: irrelevant shots increase misuse rate rather than decreasing it (Finding 3), API usage rules in prompts do not help reduce misuse (Table 6, Finding 5), and relevant shots do not help Llama."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'even GPT-4 has 62% of the generated code that contains API misuses' which is supported by Table 2 (GPT-4 zero-shot Overall Misuse = 62.09%). Other abstract claims about the dataset and evaluation framework are supported in the methodology and results sections."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims like 'the irrelevant shot provided to the large language models actually encourages the models to give a lengthy code solution, which increases the chance of API misuse' without adequate causal evidence. This is speculation about mechanisms without controlled testing of this specific hypothesis."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title asks 'Can LLM Replace Stack Overflow?' which implies general conclusions, but results are limited to 18 Java APIs and 4-6 LLMs. The Discussion section acknowledges the Java focus ('Extend to Other Language') but the title and framing significantly overreach the tested scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not substantively discuss alternative explanations. For example, the finding that GPT-4 has higher misuse than GPT-3.5 is noted as 'counter-intuition' but not deeply explored. No threats-to-validity section exists, and possible confounds (e.g., training data composition, prompt sensitivity) are not systematically addressed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper refers to models as 'GPT-3.5', 'GPT-4', 'Llama-2', 'Vicuna-1.5' without specifying exact API versions or snapshot dates (e.g., no 'gpt-4-0613'). DeepSeek-Coder variants are specified by size (6.7b-base, 6.7b-instruct) but not by exact version."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes the prompt structure in natural language ('we start with the task introduction and the required response format') and mentions it is adapted from Patil et al. 2023, but the actual full prompt text is not provided in the paper or appendix. Only a template description is given."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'We use the default hyper-parameter settings of each model without further extensive hyper-parameter tuning' but does not specify what those defaults are (temperature, top-p, max_tokens). Table 5 shows temperature experiments but only for GPT-3.5 one-shot-relevant."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The evaluation is a straightforward prompt-response setup with no tool use, retry logic, or multi-turn interaction."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data collection pipeline is described: starting from ExampleCheck dataset, selecting 18 APIs, crawling Stack Overflow questions, filtering for those with online answers containing API misuse, resulting in 1208 questions. The filtering criteria and conversion to JSON format are documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The Discussion section briefly mentions extending to other languages but does not constitute a substantive limitations discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. Potential issues such as the accuracy of the AST-based checker, the representativeness of the 18 selected APIs, or the impact of prompt design on results are not addressed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. The Discussion mentions extending to other languages but does not bound the claims to Java or to the specific API set tested. No explicit statements about what claims the authors are NOT making."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The dataset is open-sourced on GitHub (https://github.com/FloridSleeves/RobustAPI), allowing independent verification of the questions, API usage rules, and evaluation results."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection procedure is described in the 'Data Collection' section: based on ExampleCheck dataset, 18 popular Java APIs selected, questions crawled from Stack Overflow, filtered to keep only questions with answers containing API misuse, yielding 1208 questions."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data source is a standard benchmark constructed from Stack Overflow questions."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from ExampleCheck to final dataset is documented: API selection from ExampleCheck -> Stack Overflow crawling -> filtering for questions with answers containing API misuse -> JSON conversion -> prompt generation. The evaluator pipeline (AST extraction -> call sequence comparison -> misuse detection) is also documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding sources are disclosed. The Acknowledgments section only thanks AAAI reviewers and chairs without mentioning any funding."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are disclosed: both authors are from University of California, San Diego. They are not affiliated with any of the LLM companies being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure statement means this criterion is not satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates are stated for any of the evaluated models. This is relevant because the Stack Overflow questions in the benchmark could have been in the training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper does not discuss potential overlap between the Stack Overflow questions in ROBUSTAPI and the training data of the evaluated LLMs. Stack Overflow content is commonly used in LLM training data, making this a significant omission."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The benchmark is constructed from Stack Overflow questions that were available online well before the training cutoffs of GPT-3.5/4, Llama-2, etc. No discussion of contamination risk is provided despite the high likelihood that these models were trained on Stack Overflow data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this benchmark evaluation study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this benchmark evaluation study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this benchmark evaluation study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this benchmark evaluation study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this benchmark evaluation study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this benchmark evaluation study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this benchmark evaluation study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or per-example costs are reported. Only execution time for the static analysis checker is provided (Table 3), not the LLM inference cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours, or API spend is stated. Table 3 only shows the static analysis execution time (~6-7 minutes), not the LLM query costs."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Even GPT-4 has 62% of generated code containing API misuses in zero-shot setting.",
    286       "evidence": "Table 2 shows GPT-4 zero-shot Overall API Misuse Percentage of 62.09% and API Misuse Rate of 68.81% among compilable code.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "GPT-4 has a higher API misuse rate than GPT-3.5, despite being promoted as more advanced.",
    291       "evidence": "Table 2 shows GPT-4 zero-shot misuse rate of 68.81% vs. GPT-3.5 at 62.97%. However, no significance test is performed on this comparison.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Irrelevant one-shot examples do not reduce API misuse rate but trigger more valid answers.",
    296       "evidence": "Figure 3 and Table 2 show that one-shot-irrelevant setting increases compilation rate but generally maintains or increases misuse rate across models (Finding 2, Finding 3).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Relevant one-shot examples with correct API usage significantly reduce misuse rate for GPT-3.5, GPT-4, and Vicuna.",
    301       "evidence": "Table 2 shows GPT-3.5 misuse rate drops from 62.97% (zero-shot) to 38.56% (one-shot-relevant), GPT-4 from 68.81% to 54.40%, Vicuna from 45.66% to 42.53%.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Code-specialized LLMs (DeepSeek-Coder) generate more compilable samples but do not significantly improve API misuse rate.",
    306       "evidence": "Table 2 shows DeepSeek-Coder-6.7b-instruct achieves 96.61% compilation rate (one-shot-irrelevant) but 59.04% misuse rate, comparable to other models.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Changing temperature or replacing one-shot examples with API rules does not significantly affect API misuse rate.",
    311       "evidence": "Table 5 shows similar misuse rates across temperatures (38.56% at T=0, 39.77% at T=0.5, 39.06% at T=1.0). Table 6 shows API rules actually increase misuse rate to 65.01% vs 38.56% for one-shot-relevant.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "The paper introduces ROBUSTAPI, a benchmark of 1208 Stack Overflow questions across 18 Java APIs to evaluate API misuse in LLM-generated code. All evaluated models (GPT-3.5, GPT-4, Llama-2, Vicuna-1.5, DeepSeek-Coder) exhibit high rates of API misuse (38-70% among compilable code), even when generating syntactically correct and functionally adequate code. Providing one-shot examples with correct API usage reduces misuse rates for some models (GPT-3.5 drops from 63% to 39%), while irrelevant examples or explicit API rules are ineffective. Counter-intuitively, GPT-4 shows higher API misuse rates than GPT-3.5 despite being considered more capable.",
    317   "red_flags": [
    318     {
    319       "flag": "No contamination analysis",
    320       "detail": "The benchmark is constructed from Stack Overflow questions that were publicly available before the training data cutoffs of all evaluated models. LLMs are commonly trained on Stack Overflow data. The paper does not address whether the models may have seen these questions during training, which could mean the results reflect memorization rather than generalization."
    321     },
    322     {
    323       "flag": "No statistical significance testing",
    324       "detail": "All comparative claims between models are based on raw percentage comparisons with no significance tests. With 1208 questions, differences of a few percentage points may not be statistically significant, yet the paper draws conclusions from them (e.g., GPT-4 vs GPT-3.5 comparison)."
    325     },
    326     {
    327       "flag": "No uncertainty quantification",
    328       "detail": "Results appear to be from single runs with no confidence intervals, error bars, or variance across runs reported. Stochastic LLM outputs could produce different results on repeated runs."
    329     },
    330     {
    331       "flag": "Overclaimed title and framing",
    332       "detail": "The title 'Can LLM Replace Stack Overflow?' implies a broad conclusion, but the study only examines 18 Java APIs for API misuse patterns. It does not address other dimensions of Stack Overflow utility (debugging, conceptual explanations, architecture decisions, etc.)."
    333     },
    334     {
    335       "flag": "Missing limitations section",
    336       "detail": "The paper has no dedicated limitations or threats-to-validity section. The accuracy of the AST-based checker, representativeness of selected APIs, and prompt design sensitivity are not discussed as potential threats."
    337     },
    338     {
    339       "flag": "Model versions unspecified",
    340       "detail": "Exact model versions (API snapshots, dates) for GPT-3.5 and GPT-4 are not specified. Given that model behavior changes across versions, results may not be reproducible."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Evaluating large language models trained on code",
    346       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    347       "year": 2021,
    348       "arxiv_id": "2107.03374",
    349       "relevance": "Introduces HumanEval benchmark for code generation evaluation, a foundational benchmark in the field."
    350     },
    351     {
    352       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    353       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    354       "year": 2023,
    355       "arxiv_id": "2310.06770",
    356       "relevance": "Proposes real-world software engineering benchmark using GitHub issues, directly relevant to evaluating LLM coding capabilities."
    357     },
    358     {
    359       "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions",
    360       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan"],
    361       "year": 2022,
    362       "relevance": "Evaluates security of LLM-generated code (Copilot), finding ~40% of code is vulnerable, relevant to code quality assessment."
    363     },
    364     {
    365       "title": "Do users write more insecure code with AI assistants?",
    366       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    367       "year": 2022,
    368       "arxiv_id": "2211.03622",
    369       "relevance": "Large-scale user study on security implications of AI code assistants, directly relevant to AI coding productivity and safety."
    370     },
    371     {
    372       "title": "Lost at c: A user study on the security implications of large language model code assistants",
    373       "authors": ["Gustavo Sandoval", "Hammond Pearce", "Teo Nys"],
    374       "year": 2023,
    375       "arxiv_id": "2208.09727",
    376       "relevance": "User study assessing security of low-level code generated by AI assistants, complementary study to AI code quality research."
    377     },
    378     {
    379       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    380       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    381       "year": 2023,
    382       "arxiv_id": "2305.01210",
    383       "relevance": "Rigorous evaluation of ChatGPT code generation correctness, enlarging HumanEval with higher-coverage test cases."
    384     },
    385     {
    386       "title": "Large Language Models and Simple, Stupid Bugs",
    387       "authors": ["Kevin Jesse", "Toufique Ahmed", "Premkumar T. Devanbu"],
    388       "year": 2023,
    389       "arxiv_id": "2303.11455",
    390       "relevance": "Studies simple bugs in LLM-generated code, showing AI assistants can introduce hard-to-detect bugs."
    391     },
    392     {
    393       "title": "Assessing the quality of GitHub copilot's code generation",
    394       "authors": ["Burak Yetistiren", "Isik Ozsoy", "Eray Tuzun"],
    395       "year": 2022,
    396       "relevance": "Assesses Copilot code quality across compilation correctness, functional correctness, and code efficiency."
    397     },
    398     {
    399       "title": "Gorilla: Large language model connected with massive apis",
    400       "authors": ["Shishir G. Patil", "Tianjun Zhang", "Xin Wang", "Joseph E. Gonzalez"],
    401       "year": 2023,
    402       "arxiv_id": "2305.15334",
    403       "relevance": "Addresses LLMs connected to APIs, relevant to understanding API usage and hallucination in code generation."
    404     },
    405     {
    406       "title": "A systematic evaluation of large language models of code",
    407       "authors": ["Frank F. Xu", "Uri Alon", "Graham Neubig", "Vincent J. Hellendoorn"],
    408       "year": 2022,
    409       "relevance": "Systematic evaluation of code LLMs, relevant to the broader evaluation methodology for AI code generation."
    410     }
    411   ]
    412 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs