scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27702B)
      1 {
      2   "paper": {
      3     "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
      4     "authors": [
      5       "Terry Yue Zhuo",
      6       "Minh Chien Vu",
      7       "Jenny Chim",
      8       "Han Hu",
      9       "Wenhao Yu",
     10       "Ratnadira Widyasari",
     11       "Imam Nur Bani Yusuf",
     12       "Haolan Zhan",
     13       "Junda He",
     14       "Indraneil Paul",
     15       "Simon Brunner",
     16       "Chen Gong",
     17       "Thong Hoang",
     18       "Armel Zebaze",
     19       "Xiaoheng Hong",
     20       "Wen-Ding Li",
     21       "Jean Kaddour",
     22       "Ming Xu",
     23       "Zhihan Zhang",
     24       "Prateek Yadav",
     25       "Naman Jain",
     26       "Alex Gu",
     27       "Zhoujun Cheng",
     28       "Jiawei Liu",
     29       "Qian Liu",
     30       "Zijian Wang",
     31       "Binyuan Hui",
     32       "Niklas Muennighoff",
     33       "David Lo",
     34       "Daniel Fried",
     35       "Xiaoning Du",
     36       "Harm de Vries",
     37       "Leandro von Werra"
     38     ],
     39     "year": 2024,
     40     "venue": "ICLR 2025",
     41     "arxiv_id": "2406.15877",
     42     "doi": ""
     43   },
     44   "checklist": {
     45     "artifacts": {
     46       "code_released": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper provides multiple GitHub links for the evaluation framework (https://github.com/bigcode-project/bigcodebench), annotation framework (https://github.com/bigcode-project/bigcodebench-annotation), and Hugging Face dataset (https://huggingface.co/datasets/bigcode/bigcodebench). These are listed in Appendix H (Table 4)."
     50       },
     51       "data_released": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The benchmark dataset is publicly available on Hugging Face (https://huggingface.co/datasets/bigcode/bigcodebench) and a Croissant endpoint is provided. Listed explicitly in Appendix H, Table 4."
     55       },
     56       "environment_specified": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Appendix I.3 provides a detailed requirements.txt with pinned library versions (e.g., pandas==2.0.3, scikit-learn==1.3.1, numpy==1.21.2, etc.). The evaluation framework is also available as a PyPI package (bigcodebench)."
     60       },
     61       "reproduction_instructions": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper provides a PyPI-installable evaluation framework, prompt templates (Appendix K.3, Figures 14-17), execution hardware details (Appendix K.1-K.2), and the full dataset. The GitHub repositories contain documentation for reproducing the benchmark evaluation."
     65       }
     66     },
     67     "statistical_methodology": {
     68       "confidence_intervals_or_error_bars": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper reports Pass@1 and Pass@5 scores as point estimates. No confidence intervals or error bars are reported for the main results in Tables 6 and 7 or Figure 6."
     72       },
     73       "significance_tests": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper makes comparative claims (e.g., 'LLMs perform much worse on BigCodeBench-Instruct than BigCodeBench-Complete with an average decrease of 8.5%') but no statistical significance tests are applied to any of these comparisons."
     77       },
     78       "effect_sizes_reported": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper reports performance differences with baseline context, e.g., 'the best performing LLM, GPT-4o, solves merely 60% of tasks on BigCodeBench-Complete and less than 50% on BigCodeBench-Instruct' compared to '97%' human performance. The 8.5% average decrease between Complete and Instruct is provided with both absolute scores visible in tables."
     82       },
     83       "sample_size_justified": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The benchmark contains 1,140 tasks, but there is no justification for why this number is sufficient. The human evaluation sample for quality validation uses only 33 tasks (Section 2.3) with no justification for this sample size."
     87       },
     88       "variance_reported": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The paper reports greedy decoding results (single-run, deterministic) for the main experiments. For random sampling experiments (Appendix L), Pass@1 and Pass@5 are reported as single values with no standard deviation or variance across runs."
     92       }
     93     },
     94     "evaluation_design": {
     95       "baselines_included": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper extensively compares BigCodeBench against prior benchmarks (HumanEval, MBPP, DS-1000, ODEX, APPS) in Table 1 and evaluates 60 LLMs, including comparisons between instruction-tuned and base models, closed and open models."
     99       },
    100       "baselines_contemporary": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The evaluated models include contemporary state-of-the-art at time of writing: GPT-4o, Claude-3-Opus, Llama-3-70B, Qwen2-72B, DeepSeek-V2, and others from 2024. Benchmark comparisons include recent works like DS-1000 (2023) and SWE-bench (2023)."
    104       },
    105       "ablation_study": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper includes ablation-like analyses: comparing Complete vs. Instruct variants (Section 4.1), base vs. instruction-tuned models (Section 4.1), examining tool-level performance across domains (Section 4.2), and constructing BigCodeBench-Hard with specific filtering criteria (Appendix F)."
    109       },
    110       "multiple_metrics": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports Pass@1 (greedy), calibrated Pass@1, Pass@1 (random sampling), and Pass@5 (random sampling) in Table 6. Domain-specific pass rates are also reported in Figure 7."
    114       },
    115       "human_evaluation": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Human annotators validated benchmark quality: 33 randomly sampled tasks were assigned to 11 annotators to write solutions, achieving 97% pass rate (Section 2.3). The benchmark construction involved 20 human annotators across three stages."
    119       },
    120       "held_out_test_set": {
    121         "applies": false,
    122         "answer": false,
    123         "justification": "This is a benchmark paper, not a model training paper. The benchmark itself serves as a test set for models, and there is no train/validation/test split within BigCodeBench itself."
    124       },
    125       "per_category_breakdown": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Figure 7 provides per-domain breakdowns across 7 domains (General, Computation, System, Visualization, Time, Network, Cryptography). Section 4.2 and Table 2 provide tool-level analysis."
    129       },
    130       "failure_cases_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 4.1 discusses 'model laziness' where LLMs omit import statements. Section 4.2 and Table 2 analyze how models use different function calls and why this leads to failures. Appendix M provides qualitative failure examples."
    134       },
    135       "negative_results_reported": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper reports several negative findings: even the best model fails on roughly 40% of tasks, instruction-tuned LLMs omit essential details (Section 4.1), models perform worse on NL-oriented instructions (8.5% average decrease), and GPT-4 shows 'model laziness' with long prompts."
    139       }
    140     },
    141     "claims_and_evidence": {
    142       "abstract_claims_supported": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The abstract claims 'scores up to 60%' which matches GPT-4o's 0.602 Pass@1 in Table 6. The '97%' human performance claim is supported by Section 2.3 (32/33 tasks). The claim about 60 LLMs is supported by Table 6."
    146       },
    147       "causal_claims_justified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper's causal claims are generally well-supported through controlled comparisons: instruction tuning's effect is shown by comparing base vs. instruction-tuned versions of the same model (Section 4.1). The 'model laziness' claim is supported by the calibration mechanism that adds back missing imports."
    151       },
    152       "generalization_bounded": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The title claims to benchmark 'Code Generation' generally, but the benchmark is Python-only and covers only 139 libraries. While the Limitations section (Appendix G.1) acknowledges the Python-only limitation, the abstract and main text make broad claims about 'LLMs' capabilities without consistently scoping to Python."
    156       },
    157       "alternative_explanations_discussed": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.1 discusses alternative explanations for performance differences: lower BigCodeBench-Instruct scores could be due to ambiguity rather than capability gaps ('While it is possible that the lower verbosity may introduce more ambiguity'). The Saturation discussion in Appendix G.1 addresses why SWE-bench's low performance may be due to under-specified instructions rather than task difficulty."
    161       }
    162     },
    163     "setup_transparency": {
    164       "model_versions_specified": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Appendix H (Table 4) provides exact model versions for all 60 evaluated models: GPT-4-Turbo (gpt-4-turbo-2024-04-09), GPT-4 (gpt-4-0613), Claude-3-Opus (claude-3-opus-20240229), etc. HuggingFace model IDs are given for all open models."
    168       },
    169       "prompts_provided": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Appendix K.3 provides the exact prompt templates used for each API provider (Figures 14-17). The data synthesis prompts are given in Appendix J.1. The benchmark tasks themselves serve as the prompts."
    173       },
    174       "hyperparameters_reported": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4 states: 'greedy decoding for the main experiments in the zero-shot setting' and 'N=5 samples with a temperature of 0.8 and top-p of 0.95' for random sampling experiments."
    178       },
    179       "scaffolding_described": {
    180         "applies": false,
    181         "answer": false,
    182         "justification": "The paper evaluates LLMs in a direct code generation setting (zero-shot prompting) without agentic scaffolding. No retry logic, feedback mechanisms, or iterative refinement is used in the evaluation."
    183       },
    184       "data_preprocessing_documented": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 2 describes the three-stage construction pipeline in detail: data synthesis from ODEX seeds using GPT-4 (Section 2.1, starting with 4,718 samples), semi-automatic refactoring reducing to 1,223 tasks (Section 2.2), and human curation producing 1,140 final tasks (Section 2.3). Obfuscation and perturbation steps are also described."
    188       }
    189     },
    190     "limitations_and_scope": {
    191       "limitations_section_present": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Appendix G.1 contains a dedicated 'Limitations' subsection covering multilingualism, saturation, reliability, rigorousness, generalization, evolution, interaction, and diversity."
    195       },
    196       "threats_to_validity_specific": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The limitations are specific to this benchmark: 'BigCodeBench is Python-only' (multilingualism), 'some test cases are flaky' with 'uncontrollable changes of Pass@1 under 0.6%' (reliability), 'high test coverage for the ground-truth solutions... does not guarantee that any code generated by LLMs will be correctly assessed' (rigorousness). These are concrete, actionable threats."
    200       },
    201       "scope_boundaries_stated": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Appendix G.1 explicitly states what BigCodeBench does NOT cover: other programming languages, private/emerging libraries, agentic interaction, out-of-distribution tools. The paper also explicitly positions against SWE-bench's repository-level scope."
    205       }
    206     },
    207     "data_integrity": {
    208       "raw_data_available": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The full benchmark dataset with all 1,140 tasks, test cases, and ground-truth solutions is available on Hugging Face (https://huggingface.co/datasets/bigcode/bigcodebench). The annotation framework is also publicly available."
    212       },
    213       "data_collection_described": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Section 2 describes the three-stage data collection in detail: seed examples from ODEX/StackOverflow, GPT-4-based synthesis with 2-shot in-context learning, human annotation by 20 authors over one year. The data synthesis prompt is provided in Appendix J.1."
    217       },
    218       "recruitment_methods_described": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Appendix B provides demographics of the 20 annotators (age distribution, Python experience, academic background). Section 2.2 states '13 authors as human annotators' and describes task assignment. Appendix A details each contributor's role."
    222       },
    223       "data_pipeline_documented": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "The pipeline is fully documented: 4,718 synthesized samples (Section 2.1) → program analysis filtering → 1,223 refactored tasks (Section 2.2) → human curation and cross-checking → 1,140 final tasks (Section 2.3). Each stage's filtering criteria and counts are provided."
    227       }
    228     },
    229     "conflicts_of_interest": {
    230       "funding_disclosed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The Acknowledgements section discloses: 'CSIRO's Data61 PhD Scholarships, the National Research Foundation... Investigatorship Grant (NRF-NRFI08-2022-0002), and Xiaoning Du's Google Research Scholar Program Award.' Computational resources from Sea AI Lab and MASSIVE are also acknowledged."
    234       },
    235       "affiliations_disclosed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "All author affiliations are listed on the first page, including companies like Intel, Uber, Sea AI Lab, AWS AI Labs, Contextual AI, ServiceNow Research, and Hugging Face. One author has a specific disclaimer: '∗The work does not relate to the author's position at Amazon.'"
    239       },
    240       "funder_independent_of_outcome": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "The disclosed funders (CSIRO Data61, National Research Foundation Singapore, Google Research Scholar) are research funding bodies without a direct stake in the benchmark results. The disclaimer that one author's work 'does not relate to the author's position at Amazon' further demonstrates independence awareness."
    244       },
    245       "financial_interests_declared": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No competing interests or financial interests statement is present. Several authors are affiliated with companies whose models are evaluated (e.g., Hugging Face authors evaluating StarCoder2), but no formal competing interests declaration is made."
    249       }
    250     },
    251     "contamination": {
    252       "training_cutoff_stated": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "The paper does not state the training data cutoff dates for the 60 evaluated models. While some model references point to papers with this information, the paper itself does not consolidate or discuss cutoff dates."
    256       },
    257       "train_test_overlap_discussed": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "Appendix D provides a dedicated 'Data Contamination' section with N-gram overlap experiments (10-gram and 13-gram) against ODEX, Stack Overflow, and StarCoderData. Table 3 shows overlap percentages are below 2.5%. The paper also discusses hosting on Hugging Face rather than GitHub to mitigate contamination."
    261       },
    262       "benchmark_contamination_addressed": {
    263         "applies": true,
    264         "answer": true,
    265         "justification": "Section 2.1 describes obfuscation (replacing function names with dummy names) and perturbation (back-translation of docstrings) to mitigate contamination. Appendix D provides quantitative contamination analysis. The paper discusses future contamination risks from API providers collecting inference data."
    266       }
    267     },
    268     "human_studies": {
    269       "pre_registered": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "The human involvement is for benchmark construction (annotation), not a human subjects study. Pre-registration is not applicable."
    273       },
    274       "irb_or_ethics_approval": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "The human involvement is expert annotation for benchmark construction, not a human subjects study requiring IRB approval."
    278       },
    279       "demographics_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants study is conducted. The annotator demographics in Appendix B (age, experience, education) describe benchmark creators, not study participants."
    283       },
    284       "inclusion_exclusion_criteria": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants study is conducted. Annotators are co-authors, not study participants."
    288       },
    289       "randomization_described": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants study is conducted."
    293       },
    294       "blinding_described": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "No human participants study is conducted."
    298       },
    299       "attrition_reported": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "No human participants study is conducted."
    303       }
    304     },
    305     "cost_and_practicality": {
    306       "inference_cost_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The paper does not report the cost of running inference on 60 models across 1,140 tasks. No API costs, token counts, or wall-clock times for the evaluation are provided."
    310       },
    311       "compute_budget_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "While Appendix K states 'A100 GPUs' for inference and 'Intel(R) Xeon(R) Gold 6150 CPU' for execution, no total GPU hours, API spend, or overall computational budget is reported. The paper acknowledges 'limited computational resources' (Section 4) but does not quantify them."
    315       }
    316     }
    317   },
    318   "claims": [
    319     {
    320       "claim": "GPT-4o achieves the highest Pass@1 of 60.2% on BigCodeBench-Complete (calibrated: 61.1%), far below human performance of 97%.",
    321       "evidence": "Table 6 shows GPT-4o at 0.602 original and 0.611 calibrated Pass@1. Section 2.3 reports 97% human performance (32/33 tasks passed).",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "LLMs perform substantially worse on BigCodeBench-Instruct than BigCodeBench-Complete, with an average decrease of 8.5% on Pass@1.",
    326       "evidence": "Table 7 shows the delta between Instruct and Complete variants. Section 4.1 reports the 8.5% average decrease, with individual model deltas visible in the table.",
    327       "supported": "strong"
    328     },
    329     {
    330       "claim": "Instruction-tuned LLMs omit essential import statements due to 'model laziness' with long prompts.",
    331       "evidence": "Section 4.1 describes this phenomenon. Figure 6 shows calibrated vs. original Pass@1 differences, with GPT-4 showing the largest gap (0.484 original vs. 0.572 calibrated on Complete). The difference is less than 0.3% on Instruct (shorter prompts).",
    332       "supported": "strong"
    333     },
    334     {
    335       "claim": "Instruction tuning improves the capability of following complex programming constraints, with instruction-tuned LLMs averaging 40.7% vs. 35.7% for base LLMs.",
    336       "evidence": "Section 4.1 provides the comparison between instruction-tuned and base model means. Table 6 shows paired comparisons (e.g., CodeLlama-instruct-70B at 0.496 vs. CodeLlama-base-70B at 0.443).",
    337       "supported": "strong"
    338     },
    339     {
    340       "claim": "BigCodeBench is the first benchmark combining diverse function calls across 139 libraries and 7 domains with complex instructions for 1,140 tasks.",
    341       "evidence": "Table 1 compares BigCodeBench statistics with prior benchmarks. The benchmark covers 723 function calls from 139 libraries, more than any compared benchmark.",
    342       "supported": "strong"
    343     },
    344     {
    345       "claim": "BigCodeBench has minimal data contamination, with less than 2.5% overlap in N-gram analysis.",
    346       "evidence": "Appendix D, Table 3 shows 10-gram overlap of 0.09% with ODEX, 1.49% with Stack Overflow, and 2.49% with StarCoderData. 13-gram overlap is 0.00% with ODEX and 0.18% with Stack Overflow.",
    347       "supported": "moderate"
    348     },
    349     {
    350       "claim": "Model rankings show strong positive correlation between BigCodeBench-Complete and BigCodeBench-Instruct (Pearson's r = 0.982).",
    351       "evidence": "Section 4.1 reports the Pearson's r correlation. The model rankings in Figure 6 are visually consistent.",
    352       "supported": "strong"
    353     }
    354   ],
    355   "methodology_tags": [
    356     "benchmark-eval"
    357   ],
    358   "key_findings": "BigCodeBench is a benchmark of 1,140 Python programming tasks requiring diverse function calls across 139 libraries and 7 domains, constructed through human-LLM collaboration. Evaluation of 60 LLMs shows the best model (GPT-4o) achieves only 60% Pass@1, far below 97% human performance. LLMs perform worse on natural-language-oriented instructions (8.5% average decrease) and exhibit 'model laziness' by omitting essential code when given long prompts. The benchmark reveals that while models generally follow scaling laws, domain-specific performance varies significantly.",
    359   "red_flags": [
    360     {
    361       "flag": "No error bars or variance across evaluation runs",
    362       "detail": "Main results use greedy decoding (deterministic, single-run). The random sampling results (Pass@1 and Pass@5 with temperature=0.8) also report single point estimates without variance across seeds or runs, making it impossible to assess result stability."
    363     },
    364     {
    365       "flag": "Human performance estimate from small, non-independent sample",
    366       "detail": "The 97% human performance claim is based on only 33 tasks (out of 1,140) evaluated by the benchmark's own annotators (11 of the authors). This sample is small and the evaluators are not independent of the benchmark creators, which could inflate the human baseline."
    367     },
    368     {
    369       "flag": "Potential conflict of interest with evaluated models",
    370       "detail": "Several authors are affiliated with organizations whose models are evaluated: Hugging Face (StarCoder2), ServiceNow Research, and others. While affiliations are disclosed, no formal competing interests statement addresses this."
    371     },
    372     {
    373       "flag": "Benchmark constructed partially with GPT-4, which is also evaluated",
    374       "detail": "GPT-4 (gpt-4-0613) was used for data synthesis and program refactoring, and is also one of the evaluated models. While the paper applies obfuscation and perturbation to mitigate self-preference bias, the model may still have subtle advantages on tasks it helped construct."
    375     }
    376   ],
    377   "cited_papers": [
    378     {
    379       "title": "Evaluating large language models trained on code",
    380       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    381       "year": 2021,
    382       "arxiv_id": "2107.03374",
    383       "relevance": "Introduced HumanEval, one of the most widely used code generation benchmarks that BigCodeBench aims to supersede."
    384     },
    385     {
    386       "title": "SWE-bench: Can language models resolve real-world github issues?",
    387       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    388       "year": 2023,
    389       "relevance": "Repository-level code generation benchmark that evaluates agent-level capabilities, complementary to BigCodeBench's function-level evaluation."
    390     },
    391     {
    392       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    393       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    394       "year": 2024,
    395       "arxiv_id": "2403.07974",
    396       "relevance": "Addresses data contamination in code benchmarks through temporal splits, directly relevant to benchmark methodology quality."
    397     },
    398     {
    399       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    400       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    401       "year": 2024,
    402       "relevance": "EvalPlus framework for rigorous code evaluation, used as inspiration for BigCodeBench's evaluation approach."
    403     },
    404     {
    405       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    406       "authors": ["Yuhang Lai", "Chengxi Li", "Yiming Wang"],
    407       "year": 2023,
    408       "relevance": "Domain-specific code generation benchmark for data science, one of BigCodeBench's key comparison baselines."
    409     },
    410     {
    411       "title": "Code llama: Open foundation models for code",
    412       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    413       "year": 2023,
    414       "arxiv_id": "2308.12950",
    415       "relevance": "Major open code LLM family evaluated in BigCodeBench, relevant to understanding code model capabilities."
    416     },
    417     {
    418       "title": "Starcoder 2 and the stack v2: The next generation",
    419       "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"],
    420       "year": 2024,
    421       "arxiv_id": "2402.19173",
    422       "relevance": "Open code LLM and training data used for contamination analysis in BigCodeBench."
    423     },
    424     {
    425       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    426       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig"],
    427       "year": 2024,
    428       "relevance": "Agent-based approach to software engineering tasks, representing the agentic paradigm that BigCodeBench's function-level evaluation complements."
    429     },
    430     {
    431       "title": "Magicoder: Empowering code generation with oss-instruct",
    432       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    433       "year": 2024,
    434       "relevance": "Instruction-tuned code model using synthetic data, evaluated on BigCodeBench and relevant to understanding training data effects on code generation."
    435     },
    436     {
    437       "title": "SciCode: A research coding benchmark curated by scientists",
    438       "authors": ["Minyang Tian", "Luyu Gao", "Dylan Zhang"],
    439       "year": 2024,
    440       "relevance": "Scientific code generation benchmark representing domain-specific evaluation that extends beyond BigCodeBench's scope."
    441     },
    442     {
    443       "title": "Tool learning with foundation models",
    444       "authors": ["Yujia Qin", "Shengding Hu", "Yankai Lin"],
    445       "year": 2023,
    446       "arxiv_id": "2304.08354",
    447       "relevance": "Survey on tool-use capabilities of LLMs, directly relevant to BigCodeBench's focus on function-call-as-tool evaluation."
    448     },
    449     {
    450       "title": "OctoPack: Instruction tuning code large language models",
    451       "authors": ["Niklas Muennighoff", "Qian Liu", "Armel Randy Zebaze"],
    452       "year": 2023,
    453       "relevance": "Instruction tuning approach for code models that created HumanEvalPack, a predecessor to BigCodeBench-Instruct's NL-oriented evaluation."
    454     }
    455   ]
    456 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs