scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28395B)
      1 {
      2   "paper": {
      3     "title": "ProjectEval: A Benchmark for Programming Agents Automated Evaluation on Project-Level Code Generation",
      4     "authors": [
      5       "Kaiyuan Liu",
      6       "Youcheng Pan",
      7       "Yang Xiang",
      8       "Daojing He",
      9       "Jing Li",
     10       "Yexing Du",
     11       "Tianrun Gao"
     12     ],
     13     "year": 2025,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2503.07010",
     16     "doi": "10.48550/arXiv.2503.07010"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Paper states 'Dataset, code and constructed evaluation machine are available at https://github.com/RyanLoil/ProjectEval/' in a footnote on page 1."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmark dataset (20 tasks with 284 testcases) is released at the same GitHub URL mentioned in the paper."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No mention of requirements.txt, Dockerfile, conda environment, or dependency specifications in the paper. Only mentions that Selenium and subprocess are used for testing."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper describes the evaluation process conceptually (Section 3.3) but does not provide step-by-step reproduction instructions, commands to run, or a reproducing-results guide."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 4, 5, and 7 are reported as point estimates with no confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes numerous comparative claims (e.g., 'close-source LLM agents are doing better than open-source ones', 'cascade generation is better than direct generation') based solely on comparing raw numbers without any statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Results are reported as raw Pass@K percentages and objective indicator scores. No formal effect sizes (Cohen's d, odds ratios, etc.) are provided. The one mention of '2.06% higher' for cascade vs direct does not include baseline context adequate for interpreting magnitude."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The benchmark contains only 20 tasks. No justification is given for why 20 tasks is sufficient, and no power analysis is discussed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No standard deviation, variance, or spread measures are reported across experimental runs despite the Pass@K metric implying multiple runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper evaluates 14+ models across open-source AGI LLMs, closed-source AGI LLMs, code generation LLMs, and a production coding agent (OpenHands), providing extensive baseline comparisons."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include contemporary models: GPT-4o, Gemini 2.0, Phi-4, Llama 3.2, Gemma 2, and OpenHands. These represent the state of the art at the time of writing."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The cascade vs direct generation comparison (RQ2) and three-level input comparison function as ablation studies, isolating the effect of step-by-step vs direct generation and input granularity."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper uses Pass@K plus four objective indicators: Sentence Transformer similarity for checklists, CodeBLEU for skeletons and code, and Levenshtein Distance for parameter values (Table 3)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated via test suites (Selenium for web, subprocess for console). No human evaluation of the LLM-generated outputs is performed. Human involvement is limited to benchmark construction."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The benchmark tasks serve as the test set. No models are tuned or fine-tuned on the benchmark data; all are evaluated zero-shot with temperature 0."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 5 provides per-part breakdowns (Checklist, Skeleton, Code, Parameter Values) across all models and input levels. Results are also broken down by cascade vs direct and by input level."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5.1 discusses three typical failure cases: Invalid Output Format, Missing Essential Files, and Omitted Content, with explanations of likely causes."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that code LLMs 'have almost no effective results,' that many open-source models score near zero, that OpenHands got lower scores than raw GPT-4o, and that cascade generation can hurt GPT-4o performance."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims about automated evaluation via user interaction simulation, three input levels, and findings about systematic engineering and overall project understanding are supported by the experimental results in Sections 5.1-5.4."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper makes causal-sounding claims: 'cascade generation... mimics the CoT process' causing better performance, and 'asking the LLM agents to generate according to the thought steps we set induces the LLM to tend to activate parameters about natural language.' These are mechanistic claims without adequate causal justification."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims 'Programming Agents Automated Evaluation on Project-Level Code Generation' broadly, and the conclusion says results apply to 'real-world production environments.' However, the benchmark is 20 Python/Django tasks. The limitation section acknowledges Python-only and Django-focused constraints, but the title and abstract do not bound the generalization."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper briefly speculates that GPT-4o's better direct generation might be due to 'activating parameters about natural language' (Section 5.2) but provides no systematic discussion of alternative explanations for the main findings."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper measures pass rates on 20 Django-heavy tasks with Selenium/subprocess tests and frames this as measuring 'project-level code generation' capability broadly, without discussing whether these 20 tasks are a valid proxy for general project-level programming ability."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Closed-source models are specified only by marketing name: 'GPT-4o', 'GPT-3.5-turbo', 'Gemini 1.5 pro', 'Gemini 2.0-flash' without snapshot dates or API versions. Open-source models include parameter counts (e.g., 'Llama-3.1-7B') but no exact checkpoint identifiers."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix D provides the actual reasoning prompt template used, and the fill values ({description} and {technical_stack}) are deterministically derived from the benchmark inputs which are fully specified."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Appendix B states 'All the models are running under temperature zero with all settings default in their releases.'"
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "The main experiments use direct LLM prompting without agentic scaffolding. OpenHands is evaluated as a third-party black-box tool whose internal scaffolding the authors cannot be expected to describe."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 3.2 and Figure 2 document the full construction pipeline: NL Prompt sources → LLM generation → human review → canonical solutions → Masker → skeleton. Each step and its purpose are described."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 7 'Limitation' provides a dedicated limitations section with four specific bullet points."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The limitations are specific to this study: Django as primary tech stack may not align with all project patterns, canonical answer supports only Python, JSON format is not universally compatible with all LLMs, and CPV may not be the only valid parameter values."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "While the limitation section mentions Python-only and Django-focused constraints, the paper does not explicitly state what the results do NOT show or what populations/settings are excluded. The boundaries are implicit rather than explicitly stated."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The GitHub repository includes the benchmark tasks, test suites, canonical solutions, and canonical parameter values."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 3.2 describes the full construction process: 7 tasks sourced from SoftwareDev and ProjectDev, rest created originally. LLM-generated components reviewed by humans. Construction cost of $2.95 (GPT-4o) and $420 (human review) reported."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "Human annotators were hired from 'a third-party company with contract' with no further detail on annotator qualifications, selection criteria, or potential biases. The 8-page Chinese instruction guideline is mentioned but not included."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Figure 2 and Section 3.2 document the full pipeline from NL Prompt sources through LLM generation, human review, test suite creation, canonical solution generation, and skeleton masking. Appendix A provides the full version."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Acknowledgments section states: 'This work is supported by the National Science and Technology Innovation 2030 Major program (Grant No. 2024ZD01NL00101).'"
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Harbin Institute of Technology and Pengcheng Laboratory. No authors are affiliated with the LLM companies whose models are evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The funder is a Chinese government research program with no financial interest in which LLM performs best on the benchmark."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial disclosure is included in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the models evaluated, despite this being critical for assessing whether models could have seen the benchmark tasks during training."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "7 of 20 tasks are sourced from SoftwareDev and ProjectDev, both published benchmarks. No discussion of whether the evaluated models may have been trained on these published tasks."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "SoftwareDev (2024) and ProjectDev (2024) tasks were published before the evaluation. No discussion of whether models trained after publication could have seen these tasks. No contamination analysis performed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. Human annotators were involved in benchmark construction only, not as research subjects."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 4.2 reports 'The total evaluation cost of Pass@5 with GPT-4o is $28.02, average $5.60 for each round.' Section 3.2 reports construction cost of $2.95 (GPT-4o) and $420 (human review)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Only monetary API costs are reported. No GPU hours, hardware specifications, or total wall-clock time for running all model evaluations is stated."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No seed sensitivity analysis. The paper states temperature 0 (deterministic) but Pass@5 scores differ from Pass@1, suggesting non-determinism that is not investigated."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The paper reports Pass@1 and Pass@5 metrics but does not explicitly state how many total samples were generated per task to compute these metrics."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": false,
    311         "answer": false,
    312         "justification": "No hyperparameter search was performed. All models use temperature 0 with default settings."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": false,
    316         "answer": false,
    317         "justification": "No configuration selection was performed. All models use default settings."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "The authors evaluate third-party LLMs on their benchmark, not their own model against baselines. The Lucic et al. concern about re-implementing baselines does not apply."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The paper compares 3B-14B parameter open-source models against large closed-source models (GPT-4o, Gemini 1.5 pro) without controlling for or discussing compute budget differences. The finding that closed-source models outperform open-source ones is unsurprising given the compute disparity."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper argues user interaction simulation is better than test units but provides no empirical validation that their 20 Django-heavy tasks with Selenium tests actually measure 'project-level code generation capability' as claimed."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "OpenHands (GPT-4o + scaffold) is compared against raw GPT-4o without systematically controlling for the scaffold effect. The paper notes OpenHands scored lower but attributes it to task origin without analyzing the scaffold confound."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
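    349         "justification": "7 of 20 tasks are sourced from SoftwareDev (2024) and ProjectDev (2024), which were publicly available before this evaluation. Because the paper does not state the models' training cutoffs, exposure to these tasks during training cannot be ruled out. No temporal leakage analysis is performed."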
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information. The cascade mode provides increasingly detailed inputs (NL → Checklist → Skeleton) but the paper does not analyze whether this constitutes feature leakage from the canonical solution structure."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the 20 tasks are independent or share structural patterns (e.g., Django-heavy tasks may share common patterns that inflate scores for models trained on Django code)."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination procedures."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "ProjectEval is hard for current LLM agents, with only GPT-4o reaching Pass@5 of approximately 15%.",
    371       "evidence": "Table 4 shows GPT-4o achieves 12.49% overall average Pass@5, with the next best (Gemini 1.5 pro) at 6.65%. All open-source models are below 1.13%.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Closed-source LLM agents perform better than open-source ones on project-level code generation.",
    376       "evidence": "Tables 4 and 5 consistently show GPT-4o and Gemini models outperforming all open-source models across all metrics and input levels.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Cascade generation (step-by-step) is better than direct generation on average.",
    381       "evidence": "Table 5 objective indicators show cascade is 2.06% higher at Level 1 input on average. However, Table 4 shows GPT-4o achieves higher Pass@K scores in direct mode, contradicting the general claim.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "LLM agents are best at generating Checklists compared to other parts of ProjectEval.",
    386       "evidence": "Table 5 shows checklist scores (29.28% average) are substantially higher than skeleton (7.59%), code (11.34%), and parameter values (11.10%) in cascade generation.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Code generation LLMs (StarCoder-2, CodeGemma, CodeLlama) cannot produce effective project-level code.",
    391       "evidence": "Table 4 shows all code generation LLMs have near-zero Pass@K scores (0.00-1.20% Pass@5 on Level 3 skeleton only). Table 5 confirms negligible objective indicator scores.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Systematic engineering, overall project understanding, and comprehensive analysis are key capabilities for LLM agents.",
    396       "evidence": "This is stated as a finding in the abstract and Section 5.3, derived from observing that closed-source models do better across all parts. The claim is an interpretation rather than a directly measured outcome.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "methodology_tags": ["benchmark-eval"],
    401   "key_findings": "ProjectEval introduces a 20-task benchmark for project-level code generation with automated evaluation via user interaction simulation (Selenium/subprocess). The best model (GPT-4o) achieves only ~12.5% Pass@5 overall, demonstrating that project-level code generation remains extremely challenging. Closed-source models substantially outperform open-source ones, and code-specialized LLMs perform worst. Cascade (step-by-step) generation shows mixed results: better on objective similarity metrics but worse for GPT-4o on actual pass rates.",
    402   "red_flags": [
    403     {
    404       "flag": "Very small benchmark size",
    405       "detail": "Only 20 tasks with 284 total testcases. This is too small to draw robust conclusions about model capabilities, especially for per-model or per-level comparisons. No statistical power analysis or justification for the sample size."
    406     },
    407     {
    408       "flag": "GPT-4o used for construction and evaluation",
    409       "detail": "GPT-4o was used to generate the benchmark components (canonical solutions, checklists, test suites, parameter descriptions) and is also the best-performing model on the benchmark. This creates a potential bias: the tasks and evaluation criteria may implicitly favor GPT-4o's coding style and patterns."
    410     },
    411     {
    412       "flag": "Non-standard or unclear Pass@K metric",
    413       "detail": "Pass@5 scores are consistently lower than Pass@1 scores across most models (e.g., GPT-4o Cascade Level 1: Pass@1=10.21, Pass@5=8.52), which contradicts the standard definition where Pass@K >= Pass@1 for K >= 1. The paper claims to follow HumanEval convention but the metric appears to work differently without adequate explanation."
    414     },
    415     {
    416       "flag": "No contamination analysis despite reused tasks",
    417       "detail": "7 of 20 tasks (35%) are sourced from SoftwareDev and ProjectDev, both publicly published benchmarks. Models trained after these publications may have seen the tasks. No contamination analysis is performed."
    418     },
    419     {
    420       "flag": "No error bars or statistical tests",
    421       "detail": "All comparative claims are made by directly comparing point estimates without any statistical tests, confidence intervals, or measures of uncertainty. With only 20 tasks, stochastic variation could easily account for observed differences."
    422     },
    423     {
    424       "flag": "Django-heavy benchmark presented as general",
    425       "detail": "The benchmark is heavily biased toward Django web development (acknowledged in limitations as 'primary technical stack'). Results are presented as measuring general 'project-level code generation' capability, which Django-specific tasks do not represent."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Evaluating large language models trained on code",
    431       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    432       "year": 2021,
    433       "arxiv_id": "2107.03374",
    434       "relevance": "Introduced HumanEval, the foundational benchmark for LLM code generation evaluation."
    435     },
    436     {
    437       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    438       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    439       "year": 2024,
    440       "relevance": "Multi-agent framework for software development; used as a source for ProjectEval tasks and represents SoftwareDev benchmark."
    441     },
    442     {
    443       "title": "AgileCoder: Dynamic collaborative agents for software development based on agile methodology",
    444       "authors": ["Minh Huynh Nguyen", "Thang Phan Chau", "Phong X. Nguyen", "Nghi D. Q. Bui"],
    445       "year": 2024,
    446       "arxiv_id": "2406.11912",
    447       "relevance": "Multi-agent software development system using agile methodology; represents ProjectDev benchmark used as a source for ProjectEval tasks."
    448     },
    449     {
    450       "title": "ChatDev: Communicative agents for software development",
    451       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    452       "year": 2024,
    453       "relevance": "LLM-based multi-agent system for collaborative software development via natural language communication."
    454     },
    455     {
    456       "title": "SWE-bench: Can language models resolve real-world github issues?",
    457       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    458       "year": 2024,
    459       "relevance": "Major benchmark for evaluating LLM code generation on real-world GitHub issue resolution."
    460     },
    461     {
    462       "title": "Prompting large language models to tackle the full software development lifecycle: A case study",
    463       "authors": ["Bowen Li", "Wenhan Wu", "Ziwei Tang"],
    464       "year": 2024,
    465       "arxiv_id": "2403.08604",
    466       "relevance": "DevBench project-level benchmark with automated test-unit evaluation, the primary comparison point for ProjectEval."
    467     },
    468     {
    469       "title": "OpenHands: An open platform for ai software developers as generalist agents",
    470       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    471       "year": 2025,
    472       "arxiv_id": "2407.16741",
    473       "relevance": "Production coding agent platform evaluated on ProjectEval; represents state-of-the-art agentic coding systems."
    474     },
    475     {
    476       "title": "Commit0: Library generation from scratch",
    477       "authors": ["Wenting Zhao", "Nan Jiang", "Celine Lee"],
    478       "year": 2024,
    479       "arxiv_id": "2412.01769",
    480       "relevance": "Benchmark for generating entire libraries from scratch, closely related to project-level code generation evaluation."
    481     },
    482     {
    483       "title": "Program synthesis with large language models",
    484       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    485       "year": 2021,
    486       "arxiv_id": "2108.07732",
    487       "relevance": "Introduced MBPP benchmark for LLM program synthesis, foundational work in code generation evaluation."
    488     },
    489     {
    490       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    491       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    492       "year": 2023,
    493       "arxiv_id": "2308.01861",
    494       "relevance": "Class-level code generation benchmark bridging function-level and project-level evaluation granularity."
    495     },
    496     {
    497       "title": "Exploring and evaluating hallucinations in LLM-powered code generation",
    498       "authors": ["Fang Liu", "Yang Liu", "Lin Shi"],
    499       "year": 2024,
    500       "arxiv_id": "2404.00971",
    501       "relevance": "Studies hallucination in LLM code generation, directly relevant to understanding code quality issues."
    502     }
    503   ]
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs