scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30046B)
      1 {
      2   "paper": {
      3     "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
      4     "authors": [
      5       "Jia Li",
      6       "Ge Li",
      7       "Yunfei Zhao",
      8       "Yongmin Li",
      9       "Huanyu Liu",
     10       "Hao Zhu",
     11       "Lecheng Wang",
     12       "Kaibo Liu",
     13       "Zheng Fang",
     14       "Lanshen Wang",
     15       "Jiazheng Ding",
     16       "Xuanming Zhang",
     17       "Yuqi Zhu",
     18       "Yihong Dong",
     19       "Zhi Jin",
     20       "Binhua Li",
     21       "Fei Huang",
     22       "Yongbin Li"
     23     ],
     24     "year": 2024,
     25     "venue": "Annual Meeting of the Association for Computational Linguistics",
     26     "arxiv_id": "2405.19856",
     27     "doi": "10.48550/arXiv.2405.19856"
     28   },
     29   "scan_version": 3,
     30   "active_modules": ["experimental_rigor", "data_leakage"],
     31   "methodology_tags": ["benchmark-eval"],
     32   "key_findings": "DevEval is a new code generation benchmark of 1,874 samples from 117 real-world Python repositories, annotated by 13 developers, with distributions aligned to 500 real-world repos. Evaluation of 8 LLMs shows dramatic performance drops compared to HumanEval — gpt-4-turbo's best Pass@1 is only 53.04%, reached in the local-file infilling setting (vs 80% on HumanEval). Adding local-file context raises gpt-4's Pass@1 by up to 205% relative to the no-context setting (17.40 → 53.04), but LLMs still struggle with long heterogeneous contexts, often hallucinating non-existent functions instead of using available dependencies.",
     33   "checklist": {
     34     "artifacts": {
     35       "code_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "A GitHub repository is provided: https://github.com/seketeam/DevEval. The abstract states 'DevEval, prompts, and LLMs' predictions have been released.'"
     39       },
     40       "data_released": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The benchmark data is released at the same GitHub repository. The paper states the benchmark, prompts, and predictions have been released."
     44       },
     45       "environment_specified": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using setuptools and Pytest but does not provide reproducible environment details for the LLM evaluation."
     49       },
     50       "reproduction_instructions": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No step-by-step reproduction instructions are provided in the paper. While the GitHub repo contains data and prompts, the paper lacks a 'Reproducing Results' section or specific instructions for rerunning the experiments."
     54       }
     55     },
     56     "statistical_methodology": {
     57       "confidence_intervals_or_error_bars": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Table 5 reports only point estimates (e.g., 53.04% Pass@1) with no confidence intervals, error bars, or uncertainty quantification."
     61       },
     62       "significance_tests": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No statistical significance tests are used. Comparisons between LLMs (e.g., 'gpt-4 achieves higher Pass@1') are based solely on comparing point estimates without any tests."
     66       },
     67       "effect_sizes_reported": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper reports relative improvements over the no-context baseline, e.g., 'the Pass@1 of gpt-4 is improved by 205% and 173% in two settings' (Section 4.4), and provides both baseline and improved values throughout."
     71       },
     72       "sample_size_justified": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The final benchmark contains 1,874 samples but no justification for why this number is sufficient for the claims made. The construction pipeline is described but no power analysis or sample size reasoning is given."
     76       },
     77       "variance_reported": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No variance or standard deviation is reported across experimental runs. For k=1, greedy search is used (single deterministic output); for k>1, only 20 samples are drawn per example, with no repeated trials or spread measures reported."
     81       }
     82     },
     83     "evaluation_design": {
     84       "baselines_included": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Eight LLMs are compared (Table 5), and performance is compared against prior benchmarks (e.g., gpt-4's 80% Pass@1 on HumanEval vs 53.04% on DevEval). Three experimental settings serve as internal baselines."
     88       },
     89       "baselines_contemporary": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The evaluated models (gpt-4-turbo-1106, DeepSeek Coder, StarCoder 2, CodeLLaMa) were among the most capable code LLMs available at the time of writing (late 2023 / early 2024)."
     93       },
     94       "ablation_study": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Three experimental settings ablate the role of context: without context, local file completion (code above), and local file infilling (code above and below). Results in Table 5 and Figures 5-6 also break down by program type and dependency type."
     98       },
     99       "multiple_metrics": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Two complementary metrics are used: Pass@k (functional correctness via test execution) and Recall@k (recall of reference dependencies). Both are reported at k=1,3,5,10."
    103       },
    104       "human_evaluation": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "Evaluation of LLM outputs is entirely automated via test case execution and static dependency extraction. The 13 developers annotated the benchmark data but did not evaluate LLM-generated code."
    108       },
    109       "held_out_test_set": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "DevEval serves as the test set. The 8 LLMs are evaluated off-the-shelf without any fine-tuning on DevEval data, so there is no dev/test contamination within the experimental protocol."
    113       },
    114       "per_category_breakdown": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Results are broken down by program type (standalone vs non-standalone, Figure 5), dependency type (intra-class, intra-file, cross-file, Figure 6), and experimental setting (Table 5)."
    118       },
    119       "failure_cases_discussed": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Section 4.4 includes 'Error Case Analyses' with Figure 4 showing a specific failure where gpt-3.5 invokes a non-existent function 'create_connection' instead of the available 'connect'. Two root causes are identified."
    123       },
    124       "negative_results_reported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper reports that all LLMs perform poorly on DevEval compared to HumanEval, that LLMs struggle with long heterogeneous contexts, and that cross-file dependency generation remains particularly weak."
    128       }
    129     },
    130     "claims_and_evidence": {
    131       "abstract_claims_supported": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The abstract claims gpt-4-turbo's highest Pass@1 is 53.04%, which matches Table 5 (Local File Infilling). Claims about benchmark alignment are supported by Tables 2-3."
    135       },
    136       "causal_claims_justified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The main causal claim is that code context improves LLM performance. The three experimental settings (no context vs completion vs infilling) form a controlled comparison that adequately supports this. Other causal language is modest."
    140       },
    141       "generalization_bounded": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The title claims alignment with 'Real-World Code Repositories' broadly, but the benchmark is Python-only and English-only. While the Limitations section (Section 9) acknowledges this is monolingual, the title and main framing overgeneralize beyond what was tested."
    145       },
    146       "alternative_explanations_discussed": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 4.4 discusses two reasons LLMs can generate dependencies without context (reasoning from requirements, guessing from naming conventions). It also explains why gpt-family vs open-source models behave differently (instruction tuning). Error analysis considers context length and heterogeneity as alternative failure explanations."
    150       },
    151       "proxy_outcome_distinction": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Pass@k explicitly measures functional correctness via test execution, and Recall@k measures dependency recall. The paper is precise about what these metrics capture and introduces Recall@k specifically to address a gap not captured by Pass@k alone. Claims match measurement granularity."
    155       }
    156     },
    157     "setup_transparency": {
    158       "model_versions_specified": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Table 4 specifies model versions: gpt-4-turbo-1106, gpt-3.5-turbo-1106 (with snapshot dates), and open-source model sizes (StarCoder 2 15B/7B, DeepSeek Coder 33B/6.7B, CodeLLaMa 13B/7B)."
    162       },
    163       "prompts_provided": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The paper states 'DevEval, prompts, and LLMs' predictions have been released' at https://github.com/seketeam/DevEval. The GitHub repository is stated to contain the prompts used in experiments."
    167       },
    168       "hyperparameters_reported": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.3 reports: greedy search for k=1, nucleus sampling with temperature 0.4 and top-p 0.95 for k>1, max generation length 500, n=20 programs sampled per requirement."
    172       },
    173       "scaffolding_described": {
    174         "applies": false,
    175         "answer": false,
    176         "justification": "No agentic scaffolding is used. LLMs are prompted directly with requirement, signature, and context to generate code."
    177       },
    178       "data_preprocessing_documented": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 documents the full pipeline across five stages with filtering at each step: 500 repos → 590,365 functions → 3,764 with tests → 2,846 after annotation → 1,874 final samples. Criteria at each stage are described."
    182       }
    183     },
    184     "limitations_and_scope": {
    185       "limitations_section_present": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 9 'Limitations' provides substantive discussion of three specific limitations: monolingual scope, Recall@k bias from static analysis, and limited context settings."
    189       },
    190       "threats_to_validity_specific": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Specific threats are discussed: monolingual limitation (Python/English only), Recall@k parser bias quantified at 0.16 from a 50-program sample comparison with human annotators, and the limitation of only using local file context."
    194       },
    195       "scope_boundaries_stated": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The paper explicitly states it is Python-only, English-only, uses only local file contexts (not imported or sibling files), and that Recall@k has slight bias. Future work lists specific extensions planned (multilingual, more projects, more test cases)."
    199       }
    200     },
    201     "data_integrity": {
    202       "raw_data_available": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The benchmark data (samples, repositories, test cases, prompts, and LLM predictions) is released at https://github.com/seketeam/DevEval, enabling independent verification."
    206       },
    207       "data_collection_described": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Section 3 describes data collection in detail across five stages: repository selection from PyPI (top 10 topics, top 50 repos each, three selection criteria), function parsing, test construction, human annotation (674 person-hours, dual annotation), and final benchmark construction."
    211       },
    212       "recruitment_methods_described": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "The 13 developers who annotated requirements and dependencies are mentioned but their recruitment is not described — no information on how they were selected, their qualifications, programming experience levels, or affiliations beyond 'countries of residence' for payment purposes."
    216       },
    217       "data_pipeline_documented": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The full pipeline is documented in Section 3 with counts at each stage: 500 repos (10 topics × 50) → 590,365 candidate functions → 3,764 with tests → 2,846 after annotation → 1,874 final samples (73% non-standalone, 27% standalone)."
    221       }
    222     },
    223     "conflicts_of_interest": {
    224       "funding_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Section 8 states: 'This research was supported by the National Natural Science Foundation of China (Nos. 62192731, 62152730).'"
    228       },
    229       "affiliations_disclosed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Author affiliations are clearly stated: Peking University (School of Computer Science, Key Laboratory) and Alibaba Group."
    233       },
    234       "funder_independent_of_outcome": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The National Natural Science Foundation of China is a government funding agency with no commercial stake in benchmark outcomes. While Alibaba authors are involved, Alibaba's own models are not being evaluated."
    238       },
    239       "financial_interests_declared": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No competing interests or financial interest statement is included in the paper."
    243       }
    244     },
    245     "contamination": {
    246       "training_cutoff_stated": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The paper does not state the training data cutoff dates for any of the 8 evaluated models. Section 5 uses 'release dates' as a proxy for training data cutoffs but does not report the actual cutoff dates."
    250       },
    251       "train_test_overlap_discussed": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Section 5 'Data leakage' discusses train/test overlap: repos are divided into 'unseen' (released after LLM) and 'potentially seen' (released before LLM). Average Pass@1 difference of 0.36 between groups is reported as slight."
    255       },
    256       "benchmark_contamination_addressed": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 5 provides three arguments: (1) requirements are newly written and never in training data, (2) no overfitting tendencies between seen/unseen repos (0.36 average difference), (3) repo links released to help future LLM trainers omit them."
    260       }
    261     },
    262     "human_studies": {
    263       "pre_registered": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects study was conducted. The 13 developers were annotators for the benchmark, not participants in a study."
    267       },
    268       "irb_or_ethics_approval": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects study was conducted. The developers were compensated annotators, not research participants."
    272       },
    273       "demographics_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study was conducted."
    277       },
    278       "inclusion_exclusion_criteria": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human subjects study was conducted."
    282       },
    283       "randomization_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human subjects study was conducted."
    287       },
    288       "blinding_described": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human subjects study was conducted."
    292       },
    293       "attrition_reported": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "No human subjects study was conducted."
    297       }
    298     },
    299     "cost_and_practicality": {
    300       "inference_cost_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No inference costs, API costs, or latency figures are reported for any of the 8 LLMs evaluated, despite using commercial APIs (gpt-4, gpt-3.5) and generating 20 programs per requirement across 1,874 samples."
    304       },
    305       "compute_budget_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No computational budget is stated. The paper mentions 674 person-hours for annotation but does not report GPU hours, API spend, or hardware used for the LLM evaluation experiments."
    309       }
    310     },
    311     "experimental_rigor": {
    312       "seed_sensitivity_reported": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No seed sensitivity analysis is reported. The nucleus sampling introduces randomness but results are not reported across multiple random seeds."
    316       },
    317       "number_of_runs_stated": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Section 4.3 states: 'we generate n≥k programs per requirement' and specifies n=20 for sampling. For k=1, greedy search generates a single deterministic program."
    321       },
    322       "hyperparameter_search_budget": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No hyperparameter search budget is reported. The generation parameters (temperature=0.4, top-p=0.95, max length=500) are stated but no justification or search process for selecting these values is provided."
    326       },
    327       "best_config_selection_justified": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "All three experimental settings (without context, completion, infilling) are reported in Table 5 with full results. No cherry-picking of configurations — all conditions are transparently shown."
    331       },
    332       "multiple_comparison_correction": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No statistical tests are performed, so correction for multiple comparisons is not applicable."
    336       },
    337       "self_comparison_bias_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper proposes DevEval and compares it with prior benchmarks (Tables 1-2) but does not acknowledge the potential bias of authors designing a benchmark that supports their own narrative that existing benchmarks are insufficient."
    341       },
    342       "compute_budget_vs_performance": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "Models of vastly different sizes (7B to gpt-4) are compared without any discussion of compute budget differences. No performance-per-compute analysis is provided."
    346       },
    347       "benchmark_construct_validity": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "Section 2.4 and Tables 2-3 extensively analyze alignment with 500 real-world repos across code distributions, dependency types, and repository scale. The paper explicitly discusses what makes a benchmark valid for evaluating real-world coding ability."
    351       },
    352       "scaffold_confound_addressed": {
    353         "applies": false,
    354         "answer": false,
    355         "justification": "No scaffolding is used. LLMs are prompted directly for code generation."
    356       }
    357     },
    358     "data_leakage": {
    359       "temporal_leakage_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "Section 5 addresses temporal leakage by dividing repos into 'unseen repositories released later than LLMs and potentially seen repositories released earlier than LLMs,' finding an average Pass@1 difference of 0.36."
    363       },
    364       "feature_leakage_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No discussion of whether the evaluation setup (providing local file context) might leak answer information or provide hints not available in realistic development scenarios."
    368       },
    369       "non_independence_addressed": {
    370         "applies": true,
    371         "answer": false,
    372         "justification": "No discussion of whether training data of the LLMs might contain code from the same repositories or structurally similar code, beyond the temporal analysis."
    373       },
    374       "leakage_detection_method": {
    375         "applies": true,
    376         "answer": true,
    377         "justification": "The paper uses a concrete temporal split method: comparing Pass@1 on repos released before vs after each LLM's release date, finding an average difference of 0.36 (vs 5.15 average variation between LLMs)."
    378       }
    379     }
    380   },
    381   "claims": [
    382     {
    383       "claim": "Existing code generation benchmarks are poorly aligned with real-world code repositories in code distribution, dependencies, and annotations.",
    384       "evidence": "Tables 1-3 compare DevEval with prior benchmarks across multiple dimensions. Most benchmarks are 100% standalone functions with 0 dependencies, while 500 real-world repos show 73% non-standalone code with 3.22 dependencies per sample.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "gpt-4-turbo achieves a highest Pass@1 of only 53.04% on DevEval, compared to 80% on HumanEval.",
    389       "evidence": "Table 5 shows Pass@1 of 53.04% for gpt-4 in the Local File (Infilling) setting. The 80% HumanEval figure is cited from prior work.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Code context from local files improves gpt-4's Pass@1 by up to 205%.",
    394       "evidence": "Table 5: gpt-4 without context achieves 17.40 Pass@1, vs 53.04 in Local File (Infilling), a 205% improvement. Reported in Section 4.4.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "DevEval's distributions are aligned with 500 real-world repositories.",
    399       "evidence": "Table 2 shows DevEval has 27% standalone / 73% non-standalone code matching 500 repos exactly. Table 3 shows dependency type distribution (38%/32%/30%) close to 500 repos (42%/29%/30%). Average dependencies per sample: 3.41 vs 3.22.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "LLMs struggle with understanding long and heterogeneous contexts, leading to hallucinations.",
    404       "evidence": "Section 4.4 error case analysis shows gpt-3.5 generating non-existent function 'create_connection' despite 'connect' being in context. Two root causes identified: context length and heterogeneity. Supported by one qualitative example (Figure 4).",
    405       "supported": "moderate"
    406     },
    407     {
    408       "claim": "Data leakage has only a slight impact on DevEval results.",
    409       "evidence": "Section 5 reports average Pass@1 difference of 0.36 between seen and unseen repos, compared to 5.15 average variation between LLMs. Three arguments provided, but analysis uses release dates as proxy for training cutoffs.",
    410       "supported": "moderate"
    411     }
    412   ],
    413   "red_flags": [
    414     {
    415       "flag": "No error bars or uncertainty quantification",
    416       "detail": "All results in Table 5 are point estimates with no confidence intervals, standard deviations, or significance tests. Claims of superiority between models (e.g., gpt-4 vs gpt-3.5) rest entirely on comparing single numbers."
    417     },
    418     {
    419       "flag": "No significance testing for model comparisons",
    420       "detail": "Multiple claims about which models are better are made without any statistical tests. Small differences (e.g., DeepSeek Coder 33B vs CodeLLaMa 13B at 46.32 vs 41.94) may not be significant."
    421     },
    422     {
    423       "flag": "Annotator recruitment and qualifications undisclosed",
    424       "detail": "The 13 developers who annotated requirements and dependencies — a critical component of benchmark quality — are not characterized beyond 'developers.' Their experience levels, selection criteria, and inter-annotator agreement are not reported."
    425     },
    426     {
    427       "flag": "Training cutoff dates not established",
    428       "detail": "The data leakage analysis uses model release dates as a proxy for training data cutoffs, which is imprecise. Actual training cutoff dates for the 8 models are not stated."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Evaluating large language models trained on code",
    434       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    435       "year": 2021,
    436       "arxiv_id": "2107.03374",
    437       "relevance": "Introduces HumanEval, the most widely used code generation benchmark that DevEval aims to improve upon."
    438     },
    439     {
    440       "title": "Program synthesis with large language models",
    441       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"],
    442       "year": 2021,
    443       "arxiv_id": "2108.07732",
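    444       "relevance": "Introduces the MBPP benchmark, which, like DevEval, judges generated programs by executing test cases; the Pass@k metric used throughout DevEval's evaluation was formalized by Chen et al. (2021)."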
    445     },
    446     {
    447       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    448       "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"],
    449       "year": 2023,
    450       "arxiv_id": "2302.00288",
    451       "relevance": "Prior real-world code generation benchmark that DevEval extends with better alignment to repository distributions."
    452     },
    453     {
    454       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    455       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    456       "year": 2023,
    457       "arxiv_id": "2308.01861",
    458       "relevance": "Class-level code generation benchmark; DevEval addresses its limitations in repository context and dependency coverage."
    459     },
    460     {
    461       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    462       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    463       "year": 2023,
    464       "arxiv_id": "2310.06770",
    465       "relevance": "Repository-level benchmark for issue resolution; contrasted with DevEval's focus on code generation from requirements."
    466     },
    467     {
    468       "title": "DeepSeek-Coder: When the large language model meets programming",
    469       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    470       "year": 2024,
    471       "arxiv_id": "2401.14196",
    472       "relevance": "One of the evaluated code LLMs, demonstrating competitive performance on DevEval."
    473     },
    474     {
    475       "title": "StarCoder 2 and the Stack v2: The next generation",
    476       "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"],
    477       "year": 2024,
    478       "arxiv_id": "2402.19173",
    479       "relevance": "Open-source code LLM evaluated on DevEval, representing the state of open-source code generation."
    480     },
    481     {
    482       "title": "Code Llama: Open foundation models for code",
    483       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    484       "year": 2023,
    485       "arxiv_id": "2308.12950",
    486       "relevance": "Open-source code LLM evaluated on DevEval; widely used baseline for code generation."
    487     },
    488     {
    489       "title": "RepoBench: Benchmarking repository-level code auto-completion systems",
    490       "authors": ["Tianyang Liu", "Canwen Xu", "Julian J. McAuley"],
    491       "year": 2023,
    492       "arxiv_id": "2306.03091",
    493       "relevance": "Repository-level code completion benchmark; lacks natural language requirements that DevEval provides."
    494     },
    495     {
    496       "title": "CrossCodeEval: A diverse and multilingual benchmark for cross-file code completion",
    497       "authors": ["Yangruibo Ding", "Zijian Wang", "Wasi Uddin Ahmad"],
    498       "year": 2023,
    499       "arxiv_id": "2310.11248",
    500       "relevance": "Cross-file code completion benchmark; complements DevEval's focus on code generation with requirements."
    501     },
    502     {
    503       "title": "Lost in the middle: How language models use long contexts",
    504       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"],
    505       "year": 2023,
    506       "arxiv_id": "2307.03172",
    507       "relevance": "Study on LLMs ignoring information in long contexts, directly relevant to DevEval's error analysis findings."
    508     },
    509     {
    510       "title": "Repository-level prompt generation for large language models of code",
    511       "authors": ["Disha Shrivastava", "Hugo Larochelle", "Daniel Tarlow"],
    512       "year": 2023,
    513       "relevance": "Proposes methods for extracting relevant context from repositories, directly motivating DevEval's experimental settings."
    514     },
    515     {
    516       "title": "Measuring coding challenge competence with APPS",
    517       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    518       "year": 2021,
    519       "relevance": "Competition-style code generation benchmark; DevEval argues such benchmarks are poorly aligned with real-world development."
    520     }
    521   ],
    522   "engagement_factors": {
    523     "practical_relevance": {
    524       "score": 2,
    525       "justification": "Benchmark is publicly released and directly usable for evaluating code generation LLMs on realistic tasks."
    526     },
    527     "surprise_contrarian": {
    528       "score": 1,
    529       "justification": "Shows LLMs perform much worse on real-world code than HumanEval, which is somewhat expected but well-quantified."
    530     },
    531     "fear_safety": {
    532       "score": 0,
    533       "justification": "No safety, security, or AI risk concerns are raised."
    534     },
    535     "drama_conflict": {
    536       "score": 1,
    537       "justification": "Implicitly challenges HumanEval and similar benchmarks as insufficient, but framed constructively rather than controversially."
    538     },
    539     "demo_ability": {
    540       "score": 2,
    541       "justification": "Benchmark, prompts, and model predictions released on GitHub for immediate use and reproduction."
    542     },
    543     "brand_recognition": {
    544       "score": 1,
    545       "justification": "Peking University and Alibaba Group are moderately well-known in the AI research community."
    546     }
    547   }
    548 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs