ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28077B)


      1 {
      2   "paper": {
      3     "title": "Beyond Synthetic Benchmarks: Evaluating LLM Performance on Real-World Class-Level Code Generation",
      4     "authors": ["Musfiqur Rahman", "SayedHassan Khatoonabadi", "Emad Shihab"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.26130",
      8     "doi": "10.1145/nnnnnnn.nnnnnnn"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper provides a GitHub replication package at https://github.com/mrsumitbd/RealClassEval-Replication (reference [3]). Section 11 states 'We make our data and scripts available to facilitate replication and future research.'"
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The RealClassEval dataset is released as part of the replication package (reference [3]). The paper explicitly states data and scripts are available."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No mention of requirements.txt, Dockerfile, conda environment, or specific library versions. The paper mentions using Python's ast library and PYNGUIN but does not provide environment setup details sufficient to recreate the experimental environment."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper does not include step-by-step reproduction instructions. While a replication package is referenced, the paper itself does not describe how to run the experiments, what commands to execute, or provide a README-level walkthrough."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Bootstrap 95% confidence intervals are reported throughout for Cliff's Delta, mean differences, and proportion differences. Tables 3 and 4 include 95% CIs for all comparisons (e.g., 'Mean Diff (95% CI)' column)."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper uses appropriate statistical tests throughout: Kruskal-Wallis, Mann-Whitney U, Friedman, Wilcoxon signed-rank, chi-square, and sign tests. FDR correction via Benjamini-Hochberg is consistently applied for multiple comparisons."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Cliff's Delta with established thresholds (negligible, small, medium, large) is reported for per-class analyses, and Cramér's V for chi-square analyses. Both are reported with bootstrap 95% CIs. Mean percentage point differences are also provided throughout."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 9.1.1 explicitly discusses sample size limitations and justification. Power analysis is reported for all RQs (e.g., 'Power analysis across all seven models indicated high statistical power (0.974–0.983)' in Section 4.2.1). The paper also acknowledges budget constraints limiting to 200 classes per version."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports distributions via violin plots (Figure 3), bootstrap confidence intervals across analyses, and discusses skewness across comparisons (e.g., 'High skewness (|skewness| > 1 in 11 of 28 comparisons)' in Section 5.1.3). Results are reported with spread measures throughout."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares real-world performance against the synthetic ClassEval benchmark as a baseline. Seven LLMs are compared against each other, and ablation conditions (with/without docstrings, with/without RAG) serve as controlled baselines."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The seven evaluated models include GPT-5, GPT-4.1, GPT-OSS (August 2025), Llama 4 Maverick (April 2025), DeepSeek-V3 (August 2025), and other recent models. ClassEval (2023) is the most recent comparable class-level benchmark."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "RQ2 is an ablation study of docstring completeness (full, partial, no docstrings). RQ3 ablates the presence/absence of RAG across docstring conditions. Both systematically vary individual factors while controlling others."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The primary metric is pass rate (proportion of passed tests). While the paper also analyzes error type distributions, it does not use multiple quantitative metrics for code generation quality (e.g., no CodeBLEU, edit distance, or other complementary metrics)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "There is no human evaluation of the generated code quality. All evaluation is automated through test suite execution. The paper makes claims about code quality and practical capability that could benefit from human review of generated outputs."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The Post-Cutoff dataset (200 classes from repositories created after March 31, 2025) serves as a held-out set unseen by any evaluated model. This temporal split guarantees no training data contamination."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by dataset (ClassEval, Pre-Cutoff, Post-Cutoff), by model (7 models), by docstring condition (full, partial, none), and by error type (13 error categories). Tables 3, 4, 6, 9, and 12 provide detailed per-condition breakdowns."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "RQ4 (Section 7) is entirely dedicated to error analysis. The paper provides detailed failure mode analysis including error type distributions, qualitative analysis of test suite differences, and the RAG error substitution mechanism."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Several negative results are reported: docstrings provide only negligible improvements (RQ2), RAG provides no benefit with full docstrings (RQ3), GPT-5 shows negative effects with RAG, and RAG introduces new dependency errors (ImportError, KeyError increases)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims of 84-89% synthetic vs 25-34% real-world correctness are supported by Tables 3-4 and Figure 2. The 1-3% docstring improvement and 4-7% RAG gains are supported by Tables 5-6 and 8-9. Error analysis claims match Table 10."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims through ablation studies (RQ2: docstring impact, RQ3: RAG impact) with within-subjects designs where each class is tested under all conditions, controlling for snippet-specific characteristics. The phrasings 'RAG improves' and 'docstrings improve' are supported by controlled single-variable manipulations."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 10.2 explicitly bounds generalization: 'All evaluated code is Python-based, and findings may not generalize to statically typed languages.' Section 9.1.3 states findings 'may not generalize to other granularities.' The title specifies 'Class-Level Code Generation' rather than making broad claims."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper discusses multiple alternative explanations: the performance gap could be due to code complexity (investigated via mixed-effects model in Section 4.3), test suite quality differences (qualitatively analyzed), and model-specific architectural differences explaining GPT-5's anomalous RAG behavior (Section 6.3.3)."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Table 2 lists model names and release dates but not exact API versions or snapshot dates. 'GPT-4.1', 'GPT-5', 'GPT-OSS', 'DeepSeek-V3' are listed without specific API version strings (e.g., no 'gpt-4.1-2025-04-14' or equivalent). Codestral's knowledge cutoff is listed as 'Unknown'."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 3.2.2 provides the exact prompt template used: 'You are an expert Python programmer who can correctly implement complete Python classes based on the provided class skeletons. Implement the following class. Do not explain the code. The given class skeleton is as follows: [CLASS SKELETON].' The class skeleton content is fully specified by the dataset."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper does not report temperature, top-p, max tokens, or other generation hyperparameters used for the LLM API calls. Section 9.1.5 acknowledges 'consistent generation settings' but does not specify what those settings were."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The paper uses direct prompt-to-completion generation for each class skeleton. RAG retrieval is a simple embedding-based similarity search, not an agentic workflow."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.1 describes the full data pipeline: data collection from CodeSearchNet and GitHub API (3.1.1), filtering criteria including license, project size/activity, and code quality (3.1.2), static analysis and class extraction (3.1.3), and post-processing with random sampling (3.1.4). Figure 1 provides a workflow diagram."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 9 is titled 'Limitations and Future Work' with Section 9.1 dedicated to limitations. Section 10 is titled 'Threats to Validity' with detailed subsections on internal, external, and construct validity."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The paper discusses specific threats: sample size of 200 classes per version with detailed budget analysis (~$1000 API cost, 500+ CPU hours), Python-only evaluation limiting generalization to other languages, Pynguin test suite limitations, standardized prompts vs model-specific optimization, and open-source vs proprietary codebase differences."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 9.1.3 explicitly states: 'Findings may not generalize to other granularities: single-function generation may show higher success rates... while generating entire projects likely shows even larger challenges.' Section 10.2 states 'findings may not generalize to statically typed languages' and 'potentially limiting generalizability to proprietary enterprise codebases.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The replication package at https://github.com/mrsumitbd/RealClassEval-Replication includes the dataset and scripts. The paper states 'We make our data and scripts available to facilitate replication and future research' (Section 11)."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3.1 provides detailed data collection procedures: Pre-cutoff data from CodeSearchNet, post-cutoff from GitHub API for repos created after March 31, 2025, with specific criteria (10+ commits, Python primary, not forks). Figure 1 illustrates the workflow."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants in this study. Data sources are standard benchmarks and public GitHub repositories."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The full pipeline is documented in Section 3.1: collection from two sources, filtering for engineered projects (license, stars 10+, commits 50+, code quality), static analysis with Understand tool, class extraction via ast library, post-processing removal of problematic classes, and random sampling of 200 per version."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding or acknowledgments section is present in the paper. The authors are affiliated with Concordia University but no grants or funding sources are mentioned."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All three authors are listed with their affiliation at Concordia University, Canada. No evaluated product is affiliated with their institution."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding source is disclosed, so independence cannot be assessed. The paper evaluates commercial products (GPT-4.1, GPT-5, GPT-OSS from OpenAI; Codestral from Mistral; etc.) without disclosing whether any API credits or funding came from these companies."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or financial disclosure is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Table 2 lists knowledge cutoff dates for each model: Qwen 2.5 Coder (June 2024), GPT-4.1 (June 2024), GPT-5 (October 2024), GPT-OSS (June 2024), Codestral (Unknown), DeepSeek-V3 (July 2024), Llama 4 Maverick (August 2024). The post-cutoff data boundary (March 31, 2025) is explicitly stated."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "The entire study design addresses contamination through seen/unseen partitions. Section 3.1.1 explains: pre-cutoff data from CodeSearchNet is 'likely encountered during training' while post-cutoff data from repos created after March 31, 2025 guarantees they 'were not in training data.' The finding that seen vs unseen performance is equivalent is a key result."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The paper explicitly addresses this through the seen/unseen design. Section 3.1.1 states pre-cutoff data uses CodeSearchNet which 'has been used as a pre-training corpus for numerous prominent models.' Post-cutoff data guarantees no contamination via temporal separation. The result that contamination does not explain performance gaps is a central finding."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study. This is a benchmark evaluation study using automated test suites."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper does not report per-class or per-model inference costs, API token usage, or wall-clock time for generation. Section 6.1.1 mentions budget constraints ('it was beyond our budget to use k > 2') but does not quantify per-inference costs."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Section 9.1.1 states: 'the total generation cost, including trial and error, was approximately $1000 for API calls alone, plus over 500 CPU hours for test execution.' This provides the total computational budget."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "LLMs achieve 84-89% correctness on synthetic benchmarks (ClassEval) but only 25-34% on real-world class-level code generation tasks.",
    287       "evidence": "Figure 2 and Tables 3-4 show ClassEval pass rates of 83.60-88.27% vs Pre-Cutoff 24.59-33.60% and Post-Cutoff 27.40-31.00%, with medium to large effect sizes (Cliff's Delta 0.46-0.64). All comparisons statistically significant after FDR correction.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "There is no meaningful performance difference between seen (pre-cutoff) and unseen (post-cutoff) real-world code, indicating memorization does not explain real-world failures.",
    292       "evidence": "Table 3 shows 6 of 7 models have no significant Pre-Cutoff vs Post-Cutoff differences after FDR correction. All effect sizes are negligible. GPT-OSS shows statistical significance (p=0.019) but with negligible effect size (Cliff's Delta=-0.125). Table 4 pooled analysis confirms: all FDR-adjusted p > 0.07.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Complete docstrings provide only marginal improvements (1-3%) in code generation correctness, with only 2 of 7 models showing statistically significant benefits.",
    297       "evidence": "Table 5 shows only 2/28 comparisons are significant after FDR correction, both for full vs partial (Codestral 2.72%, DeepSeek-V3 1.51%). All Cliff's Delta values are negligible (0.003-0.077). Pooled chi-square shows no significant differences (all pFDR > 0.05).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "RAG provides significant benefits (4-7%) specifically when documentation is incomplete (partial docstrings), but minimal value with complete or absent documentation.",
    302       "evidence": "Table 8 shows 5/7 models significant with partial docstrings, 0/7 with full docstrings, and 0/7 with no docstrings. Table 9 details: Codestral 5.27% (p=0.030), DeepSeek-V3 6.94% (p=0.023), GPT-4.1 4.63% (p=0.023), Llama 4 Maverick 4.25% (p=0.024), Qwen 2.5 5.18% (p=0.024). Power analysis confirms adequate detection (0.634-0.838).",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "AttributeError, TypeError, and AssertionError account for 84% of all failures in LLM-generated class-level code.",
    307       "evidence": "Table 10 shows AttributeError 43.84%, TypeError 21.65%, AssertionError 18.51%, totaling 84%. Chi-square goodness-of-fit confirms non-uniform distribution (chi-squared = 37847.3, p < 10^-300).",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "RAG operates through an error substitution mechanism, reducing logic and object access errors while occasionally introducing dependency errors.",
    312       "evidence": "Table 13 and Figure 5 show: RAG reduces AttributeError (-60), Other (-53), AssertionError (-43), but increases ImportError (+5, +250%) and KeyError (+5, +45%). Chi-square tests significant for 3 models with partial docstrings (all pFDR < 0.05).",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Synthetic benchmarks fundamentally mischaracterize real-world failure modes, with ClassEval dominated by AssertionError (71.80%) versus real-world code dominated by AttributeError (45-49%) and TypeError (22-24%).",
    317       "evidence": "Table 12 shows ClassEval: 71.80% AssertionError, 2.14% AttributeError vs Pre-Cutoff: 5.34% AssertionError, 49.41% AttributeError. Table 11 shows large effect sizes (Cramér's V 0.59-0.85) for all synthetic vs real-world comparisons.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": ["benchmark-eval", "observational"],
    322   "key_findings": "LLMs achieve 84-89% correctness on synthetic class-level benchmarks but only 25-34% on real-world classes from open-source projects, with no meaningful difference between seen and unseen codebases. This performance gap stems from fundamentally different error patterns: synthetic benchmarks test logical correctness (AssertionError-dominated) while real-world code requires object attribute mastery and type consistency (AttributeError and TypeError-dominated). Docstrings provide only marginal improvements (1-3%, mostly non-significant), while RAG yields 4-7% gains specifically when documentation is incomplete, operating through an error substitution mechanism that trades logic failures for dependency errors.",
    323   "red_flags": [
    324     {
    325       "flag": "Missing hyperparameters",
    326       "detail": "Temperature, top-p, max tokens, and other generation settings are not reported despite the paper acknowledging 'consistent generation settings.' These parameters significantly affect LLM output quality and reproducibility."
    327     },
    328     {
    329       "flag": "Missing model version specifics",
    330       "detail": "Models are identified by marketing names (GPT-4.1, GPT-5, etc.) without API version strings or snapshot dates. Codestral's knowledge cutoff is listed as 'Unknown,' making contamination assessment incomplete for that model."
    331     },
    332     {
    333       "flag": "Single-metric primary evaluation",
    334       "detail": "Pass rate is the sole quantitative metric for code generation quality. No complementary metrics (CodeBLEU, semantic similarity, etc.) are used, which could provide different perspectives on code quality beyond test-passing."
    335     },
    336     {
    337       "flag": "No funding disclosure",
    338       "detail": "The paper reports spending approximately $1000 on API calls to commercial models but does not disclose whether API credits were provided by any of the companies whose products were evaluated."
    339     },
    340     {
    341       "flag": "Automated test suite limitations acknowledged but not mitigated",
    342       "detail": "Section 9.1.4 acknowledges Pynguin-generated test suites 'primarily assess that code executes without exceptions, but may not validate whether methods compute correct results.' No manual validation of test quality was performed for any subset."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation",
    348       "authors": ["X. Du", "M. Liu", "K. Wang", "H. Wang", "J. Liu", "Y. Chen", "J. Feng", "C. Sha", "X. Peng", "Y. Lou"],
    349       "year": 2023,
    350       "arxiv_id": "2308.01861",
    351       "relevance": "The primary baseline benchmark for class-level code generation that this paper extends with real-world evaluation."
    352     },
    353     {
    354       "title": "Evaluating Large Language Models Trained on Code",
    355       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    356       "year": 2021,
    357       "arxiv_id": "2107.03374",
    358       "relevance": "Introduced HumanEval benchmark and Pass@k metric widely used in code generation evaluation."
    359     },
    360     {
    361       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    362       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    363       "year": 2023,
    364       "relevance": "Introduced EvalPlus showing HumanEval overestimates LLM capabilities due to weak test suites, directly motivating this paper's real-world benchmark."
    365     },
    366     {
    367       "title": "CodeRAG: Supportive Code Retrieval on Bigraph for Real-World Code Generation",
    368       "authors": ["J. Li", "X. Shi", "K. Zhang"],
    369       "year": 2025,
    370       "arxiv_id": "2504.10046",
    371       "relevance": "Repository-level RAG framework for code generation, directly related to this paper's RAG evaluation."
    372     },
    373     {
    374       "title": "Retrieval-Augmented Code Generation: A Survey with Focus on Repository-Level Approaches",
    375       "authors": ["Y. Tao", "Y. Qin", "Y. Liu"],
    376       "year": 2025,
    377       "arxiv_id": "2510.04905",
    378       "relevance": "Survey of RAG for code generation providing context for this paper's RAG evaluation methodology."
    379     },
    380     {
    381       "title": "Bugs in Large Language Models Generated Code: An Empirical Study",
    382       "authors": ["F. Tambon", "A. Moradi-Dakhel", "A. Nikanjam", "F. Khomh", "M. C. Desmarais", "G. Antoniol"],
    383       "year": 2025,
    384       "relevance": "Prior work on error patterns in LLM-generated function-level code that this paper extends to class-level generation."
    385     },
    386     {
    387       "title": "Towards Understanding the Characteristics of Code Generation Errors Made by Large Language Models",
    388       "authors": ["Z. Wang", "Z. Zhou", "D. Song"],
    389       "year": 2025,
    390       "relevance": "Analysis of 557 incorrect code snippets from six LLMs on HumanEval, providing error taxonomy context for this paper's class-level error analysis."
    391     },
    392     {
    393       "title": "HumanEval Pro and MBPP Pro: Evaluating Large Language Models on Self-Invoking Code Generation",
    394       "authors": ["Z. Yu", "Y. Zhao", "A. Cohan", "X.-P. Zhang"],
    395       "year": 2024,
    396       "arxiv_id": "2412.21199",
    397       "relevance": "Extended HumanEval with self-invoking tasks showing performance degradation, related to this paper's finding that complex code generation is harder."
    398     },
    399     {
    400       "title": "CodeRAG-Bench: Can Retrieval Augment Code Generation?",
    401       "authors": ["Z. Z. Wang", "A. Asai", "X. V. Yu"],
    402       "year": 2024,
    403       "arxiv_id": "2406.14497",
    404       "relevance": "Comprehensive RAG benchmark for code generation evaluating retrievers and LLMs, directly relevant to this paper's RAG analysis."
    405     },
    406     {
    407       "title": "A Survey on Large Language Models for Code Generation",
    408       "authors": ["J. Jiang", "F. Wang", "J. Shen", "S. Kim", "S. Kim"],
    409       "year": 2024,
    410       "arxiv_id": "2406.00515",
    411       "relevance": "Survey covering LLM code generation capabilities and benchmarks, providing context for this study's contributions."
    412     },
    413     {
    414       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    415       "authors": ["H. Yu", "B. Shen", "D. Ran"],
    416       "year": 2024,
    417       "relevance": "Benchmark for non-standalone function-level code generation from real-world projects, addressing similar real-world evaluation concerns."
    418     },
    419     {
    420       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    421       "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"],
    422       "year": 2024,
    423       "arxiv_id": "2403.17134",
    424       "relevance": "Agentic approach to LLM-based program repair, relevant to understanding LLM code capabilities in software engineering."
    425     }
    426   ]
    427 }

Impressum · Datenschutz