scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28092B)
      1 {
      2   "paper": {
      3     "title": "Bugs in Large Language Models Generated Code: An Empirical Study",
      4     "authors": [
      5       "Florian Tambon",
      6       "Arghavan Moradi Dakhel",
      7       "Amin Nikanjam",
      8       "Foutse Khomh",
      9       "Michel C. Desmarais",
     10       "Giuliano Antoniol"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2403.08937"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The replication package is referenced as [18] and available at https://github.com/FlowSs/BugsInLLMs. The paper states: 'We make the dataset used in this study publicly available online [18] for other researchers and practitioners to replicate or build upon our work.'"
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The dataset and all collected artifacts are shared via the replication package at [18]. The paper states: 'All collected artifacts and data generated during our study are accessible in our replication package [18].' Survey responses and labeling data are included."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No environment specification (requirements.txt, Dockerfile, or dependency list with versions) is mentioned in the paper. The study involves manual labeling and a survey; the paper references specific versions of the CoderEval dataset but does not specify a computational environment for reproducing the analysis."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While the replication package is provided, the paper does not include step-by-step reproduction instructions. The methodology is described in detail for the manual labeling and survey process, but there are no specific scripts or commands to replicate the analysis pipeline."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper reports percentages and weighted averages for bug patterns and survey results but does not include confidence intervals or error bars on these figures. A '95% confidence interval and an error rate of 5%' is cited only to justify the sample size; no CIs are reported on the actual results."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper uses Spearman's Rho correlation to test the relationship between survey-reported bug frequency and the distribution observed in the sample set (Section 3.2.3, Section 4.2.2). The correlation values (rho = 0.47 for Codex, 0.28 for PanGu, -0.18 for CodeGen) are reported."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Spearman's Rho values are reported with context (e.g., 'ρ = 0.47, medium correlation' for Codex), which provides effect size information. The percentage distributions of bug patterns across models also provide baseline context for interpreting magnitudes."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The sample size is explicitly justified in Section 3.1.1: 'At a 95% confidence interval and an error rate of 5%, this necessitated the analysis of 323 samples. We sampled 333 code fragments.' The survey sample size (34 responses) is contextualized against the 382 invitations sent."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No variance, standard deviation, or spread measures are reported for the weighted average Likert-scale scores or the bug pattern distributions. Only point estimates (percentages and weighted averages) are provided."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The study compares bug patterns across three LLMs (CodeGen, PanGu-Coder, and Codex), and relates findings to prior bug taxonomies from human-written code and prior studies (Fan et al., Liu et al., Jesse et al.). The survey validation also serves as a baseline comparison."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The three LLMs studied (CodeGen, PanGu-Coder, and Codex) are from 2022 and earlier. At the time of the 2024 publication, more capable models like GPT-4, Claude, and Code Llama were available and widely used. The paper acknowledges this as a limitation tied to the CoderEval dataset version used."
     76       },
     77       "ablation_study": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "This is a taxonomy construction and qualitative analysis study, not a system with components to ablate. The study characterizes bug patterns rather than proposing a method with separable components."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The study uses multiple metrics: bug pattern distribution percentages across models, inter-rater agreement (78.2%), Spearman correlation between survey and sample set frequencies, and four Likert-scale dimensions (frequency, diagnosing, complexity, fixing) for survey validation."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The taxonomy was validated through an online survey with 34 practitioners and researchers (Section 3.2, Section 4.2). Participants assessed each bug pattern on frequency, diagnosing difficulty, complexity, and fixing difficulty using Likert scales."
     91       },
     92       "held_out_test_set": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "This is not a machine learning evaluation study; there is no train/test split. The study constructs a taxonomy through manual analysis, so a held-out test set is not applicable."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Detailed per-category breakdowns are provided in Table 2 (bug pattern distribution per LLM), Table 3 (bug pattern distribution per runnable level), Figure 3 (heatmap per task), and Table 4/Figure 4 (survey results per bug pattern)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper extensively discusses failure cases through illustrative code examples (Listings 1-10) for each bug pattern. Specific failure modes like prompt ambiguity leading to Misinterpretation and version mismatches are discussed."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports that the correlation between survey responses and sample distributions was low for some models (rho = -0.18 for CodeGen) and discusses discrepancies between survey results and sample-based findings (e.g., Prompt-biased Code being more prevalent in survey responses than in CoderEval)."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract's claims (10 bug patterns identified from 333 bugs across 3 LLMs, validated by 34 practitioners) are all supported in the results. The claim that participants 'generally asserted the significance and prevalence' is supported by the survey results in Section 4.2."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper is mostly descriptive and avoids strong causal claims. Where causal-like language appears (e.g., 'limited information in the prompt can drastically lead the model astray'), it is presented as observations with illustrative examples rather than tested causal relationships. The taxonomy is presented as descriptive rather than explanatory."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper bounds its claims to the three studied LLMs and Python code from CoderEval. Section 7 (Threats to Validity) explicitly notes limitations to Python, the three specific LLMs, and the CoderEval dataset. The external validity section notes: 'future works should consider expanding our study to cover a more diverse set of LLMs and functions.'"
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper discusses alternative explanations for observed patterns. For instance, it considers whether prompt ambiguity rather than LLM weakness drives certain bugs, whether runnable levels bias results, and why survey frequencies differ from sample frequencies (e.g., CoderEval's limited docstrings vs. practitioners' richer prompts). Section 4.2.2 discusses how prompt type explains discrepancies."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper names the three LLMs (CodeGen, PanGu-Coder, Codex) and provides parameter count ranges but does not specify exact model versions, snapshot dates, or checkpoints used. For CodeGen, 350M to 16.1B parameter variants are mentioned but the specific variant used is not stated. For Codex, the version is not specified beyond 'Codex.'"
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The prompts are the docstrings from CoderEval functions, which are shown in the code listings (Listings 1-10) throughout the paper. The paper states code was generated 'using the docstring' as prompt, and the exact CoderEval dataset version is linked (commit hash ec1177750cf10b5faa414a0e76d1430e75141a44)."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No hyperparameters (temperature, top-p, max tokens, sampling strategy) are reported for the LLM code generation. The paper uses code generated within the CoderEval dataset but does not mention the generation settings used."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The LLMs generate code directly from prompts (docstrings) in a single-turn generation setting."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The data filtering pipeline is well documented in Section 3.1.1: starting from 6,900 Python samples, filtering by runnable level to 1,997, then filtering by buggy flag, then sampling 333. Filtering criteria at each stage and the rationale are clearly stated."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 7 'Threats to Validity' provides a dedicated and substantive discussion organized into construct, internal, external, and conclusion validity threats, spanning approximately 2 pages."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The threats are specific to this study: the limitation to Python, the three specific LLMs, the CoderEval dataset's prompt style, the 8.9% response rate for the survey, the focus on runnable levels up to plib_runnable, and the risk of missing bug patterns due to sampling. These are concrete and study-specific."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper explicitly states scope boundaries: limited to Python only, three specific LLMs, CoderEval tasks, runnable levels up to plib_runnable. Section 7 notes: 'future works should consider expanding our study to include other programming languages' and 'to cover a more diverse set of LLMs.' The paper also notes that type-based bugs and memory management issues cannot be detected in Python."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The replication package at [18] (https://github.com/FlowSs/BugsInLLMs) includes the labeled buggy code samples, the labeling data, and anonymized survey responses, enabling independent verification."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3.1.1 describes the data collection in detail: the CoderEval dataset source, the specific GitHub commit used, the filtering criteria (runnable levels, buggy flag), the sampling method (95% CI, 5% error rate), and balanced distribution across LLMs."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 3.2.1 describes participant recruitment: collecting emails from GitHub collaborators on LLM code repositories (200 emails via PyDriller), researcher emails from Google Scholar papers (182 emails from 56 papers), and Reddit posts on LocalLLaMA and MachineLearning channels. Total: 382 unique email addresses."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full pipeline is documented from CoderEval (6,900 Python samples) → filtering by runnable level (1,997) → filtering by buggy flag → sampling (333). The labeling process is documented in Table 1 with initial agreements and conflicts per round. Survey analysis methodology is described in Section 3.2.3."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Funding is disclosed in the paper's first page footnote: 'This work was supported by: Fonds de Recherche du Québec (FRQ), the Canadian Institute for Advanced Research (CIFAR) as well as the DEEL project CRDPJ 537462-18 funded by NSERC and CRIAQ, together with its industrial partners Thales Canada inc, Bell Textron Canada Limited, CAE inc and Bombardier inc.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All authors are affiliated with Polytechnique Montréal, which is clearly stated. The authors are academics and do not work for any of the LLM companies whose products are evaluated (OpenAI for Codex, Huawei for PanGu-Coder, Salesforce for CodeGen)."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The funders (FRQ, CIFAR, NSERC, CRIAQ, Thales, Bell Textron, CAE, Bombardier) are not producers of the evaluated LLMs and have no direct financial interest in the bug taxonomy findings."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is included in the paper. There is no declaration of conflicts of interest."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper evaluates LLMs (CodeGen, PanGu-Coder, Codex) on the CoderEval benchmark but does not state the training data cutoff dates for any of the three models. This is relevant because CoderEval tasks are from GitHub and could potentially overlap with training data."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No discussion of potential train/test overlap between the CoderEval benchmark tasks and the LLMs' training data. The CoderEval tasks were extracted from GitHub repositories, which could have been in the training data of the models."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not discuss whether the CoderEval functions could have appeared in the training data of the three LLMs. Since all three models were trained on GitHub code and CoderEval is sourced from GitHub, this is a relevant contamination risk that is not addressed."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No pre-registration is mentioned for the survey study. There is no link to OSF, AsPredicted, or any pre-registration platform."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No IRB or ethics board approval is mentioned for the survey involving 34 human participants."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Section 4.2.1 reports participant demographics: job titles (12 PhD students, 4 researchers, 8 undergrad/grad students, 1 lecturer, 6 developers, 2 data scientists, 1 CTO), LLMs used, and programming languages used."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 3.2.1 describes inclusion criteria: participants were either GitHub users who collaborated on repositories containing LLM-generated code, or researchers who published on LLM code generation. The selection methodology is described in detail."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "This is not an experimental study with treatment/control groups. The survey is cross-sectional, collecting perceptions from all participants about the same bug patterns. Randomization is not applicable."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "This is a survey study collecting participants' perceptions of bug patterns. Blinding is not applicable to this design."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "The paper reports: 382 surveys sent, 34 responses received (8.9% response rate). Reddit distribution is also mentioned. The response rate is contextualized against similar SE surveys (8.4% to 13.3%)."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This study analyzes existing LLM-generated code from the CoderEval dataset rather than proposing a new method with inference costs. The paper is a taxonomy construction study, not a system proposal."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "The paper reports the manual labeling effort: 'The entire process required approximately 108 person-hours' (Section 3.1.2). While this is human effort rather than compute cost, it quantifies the resource budget for the main analysis."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "10 distinctive bug patterns were identified in LLM-generated code from a sample of 333 bugs across three LLMs (CodeGen, PanGu-Coder, Codex).",
    293       "evidence": "Section 4.1.1 presents the full taxonomy with percentages per category. Table 2 shows distribution across models. Figure 2 visualizes the taxonomy.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "Misinterpretation is the most common bug pattern overall (20.77%), while Missing Corner Case is the most common for the stronger model Codex (23.53%).",
    298       "evidence": "Table 2 in Section 4.1.2 provides exact percentages per model and overall. The difference between stronger (Codex) and weaker models (CodeGen, PanGu-Coder) is discussed.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "Several LLM bug patterns (Hallucinated Object, Wrong Attribute, Silly Mistake) are different from bugs typically found in human-written code.",
    303       "evidence": "Section 4.1.2 discusses how these patterns would rarely occur in human code, supported by the observation that IDEs and linters would catch many of these. Survey respondents rated these patterns as low complexity (unlikely for human developers).",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "The frequency of bug patterns reported by survey respondents correlates with the distribution observed for Codex (Spearman rho = 0.47).",
    308       "evidence": "Section 4.2.2 reports the Spearman correlation: rho = 0.47 (medium) for Codex, 0.28 for PanGu-Coder, -0.18 for CodeGen.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "Missing Corner Case bugs are the hardest to diagnose according to survey respondents (score > 3 on 5-point Likert scale).",
    313       "evidence": "Figure 4 and Section 4.2.3 show Missing Corner Cases has a diagnosing score of 3.35, the highest among all categories. Participant quotes support this finding.",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "Bug patterns are evenly distributed over different tasks and runnable levels in the CoderEval dataset.",
    318       "evidence": "Figure 3 (heatmap) and Table 3 (runnable levels) show generally even distribution with few exceptions. Some trends are noted (Missing Corner Case increases with runnable level).",
    319       "supported": "moderate"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "qualitative",
    324     "observational"
    325   ],
    326   "key_findings": "The study identifies 10 distinctive bug patterns in LLM-generated code through manual analysis of 333 buggy code samples from CodeGen, PanGu-Coder, and Codex on the CoderEval benchmark. Misinterpretation is the most common pattern overall (20.77%), while stronger models like Codex show more Missing Corner Case bugs (23.53%). Several bug patterns (Hallucinated Object, Wrong Attribute, Silly Mistake) are qualitatively different from typical human-made bugs. A validation survey with 34 practitioners confirms the relevance of the taxonomy, with respondents reporting similar bug frequencies especially for Codex-like models (Spearman rho = 0.47).",
    327   "red_flags": [
    328     {
    329       "flag": "Outdated models evaluated",
    330       "detail": "The three LLMs studied (CodeGen, PanGu-Coder, Codex) are from 2022 and earlier. By the 2024 publication date, much more capable models (GPT-4, Claude, Code Llama) were widely available. The paper acknowledges this but the taxonomy may not fully represent current LLM bug patterns."
    331     },
    332     {
    333       "flag": "Benchmark contamination not addressed",
    334       "detail": "The CoderEval tasks are sourced from GitHub repositories, and all three models were trained on GitHub code. The paper does not discuss whether the models may have seen these exact functions during training, which could affect which bugs they generate."
    335     },
    336     {
    337       "flag": "Small survey sample with low response rate",
    338       "detail": "The validation survey has only 34 respondents from 382 invitations (8.9% response rate). While contextualized against similar SE surveys, this limits the statistical power of the survey-based validation."
    339     },
    340     {
    341       "flag": "No IRB/ethics approval mentioned",
    342       "detail": "The survey involved 34 human participants but no IRB or ethics board approval is mentioned."
    343     },
    344     {
    345       "flag": "Missing hyperparameters for LLM generation",
    346       "detail": "The paper uses the LLM-generated code provided in the CoderEval dataset but does not report the generation hyperparameters (temperature, top-p, sampling strategy), which can significantly affect the types of bugs generated."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "Evaluating large language models trained on code",
    352       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    353       "year": 2021,
    354       "arxiv_id": "2107.03374",
    355       "relevance": "Introduces the Codex model and HumanEval benchmark, foundational to evaluating LLM code generation capabilities."
    356     },
    357     {
    358       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    359       "authors": ["H. Yu", "B. Shen", "D. Ran"],
    360       "year": 2023,
    361       "arxiv_id": "2302.00288",
    362       "relevance": "The benchmark dataset used in this study, designed for evaluating LLMs on real-world programming tasks rather than competitive programming."
    363     },
    364     {
    365       "title": "Github copilot ai pair programmer: Asset or liability?",
    366       "authors": ["A. Moradi Dakhel", "V. Majdinasab", "A. Nikanjam"],
    367       "year": 2023,
    368       "doi": "10.1016/j.jss.2023.111734",
    369       "relevance": "Assesses code quality of Copilot-generated code and compares repair effort with human-written code."
    370     },
    371     {
    372       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    373       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    374       "year": 2023,
    375       "arxiv_id": "2305.01210",
    376       "relevance": "Evaluates correctness of ChatGPT-generated code and shows existing test cases may miss LLM-specific bugs."
    377     },
    378     {
    379       "title": "Automated repair of programs from large language models",
    380       "authors": ["Z. Fan", "X. Gao", "M. Mirchev", "A. Roychoudhury", "S. H. Tan"],
    381       "year": 2023,
    382       "relevance": "Studies automated program repair for LLM-generated buggy code, finding Codex outperforms APR tools at fixing its own bugs."
    383     },
    384     {
    385       "title": "Lost in translation: A study of bugs introduced by large language models while translating code",
    386       "authors": ["R. Pan", "A. R. Ibrahimzada", "R. Krishna"],
    387       "year": 2023,
    388       "arxiv_id": "2308.03109",
    389       "relevance": "Develops a bug taxonomy for LLM-generated code in code translation tasks, complementary to this study's code generation focus."
    390     },
    391     {
    392       "title": "Refining chatgpt-generated code: Characterizing and mitigating code quality issues",
    393       "authors": ["Y. Liu", "T. Le-Cong", "R. Widyasari"],
    394       "year": 2023,
    395       "relevance": "Analyzes code quality issues in ChatGPT-generated code using static analysis tools on LeetCode problems."
    396     },
    397     {
    398       "title": "Large language models and simple, stupid bugs",
    399       "authors": ["K. Jesse", "T. Ahmed", "P. T. Devanbu", "E. Morgan"],
    400       "year": 2023,
    401       "relevance": "Examines LLM propensity to generate simple bugs (SStuBs) and the difficulty of distinguishing them from correct code."
    402     },
    403     {
    404       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    405       "authors": ["P. Vaithilingam", "T. Zhang", "E. L. Glassman"],
    406       "year": 2022,
    407       "relevance": "User study showing Copilot users had lower task success rates, relevant to understanding LLM bug impact on developer productivity."
    408     },
    409     {
    410       "title": "Taxonomy of real faults in deep learning systems",
    411       "authors": ["N. Humbatova", "G. Jahangirova", "G. Bavota"],
    412       "year": 2020,
    413       "relevance": "Establishes methodology for bug taxonomy construction in AI systems using open coding, which this paper adapts."
    414     },
    415     {
    416       "title": "Adaptive test generation using a large language model",
    417       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    418       "year": 2023,
    419       "arxiv_id": "2302.06527",
    420       "relevance": "Explores LLM-based test generation, relevant to automated approaches for detecting bugs in LLM-generated code."
    421     },
    422     {
    423       "title": "CodaMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    424       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    425       "year": 2023,
    426       "relevance": "Combines LLMs with search-based testing to improve code coverage, relevant to testing LLM-generated code."
    427     }
    428   ]
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs