scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27064B)
      1 {
      2   "paper": {
      3     "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
      4     "authors": ["Naman Jain", "King Han", "Alex Gu", "Wen-Ding Li", "Fanjia Yan", "Tianjun Zhang", "Sida I. Wang", "Armando Solar-Lezama", "Koushik Sen", "Ion Stoica"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2403.07974"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "LiveCodeBench introduces a continuously updated, contamination-free benchmark for evaluating LLMs on code across four scenarios (generation, self-repair, execution, test output prediction). The benchmark reveals likely contamination in DeepSeek and GPT-4-O models on older LeetCode problems, and shows that many fine-tuned open models that perform well on HumanEval do not generalize to LiveCodeBench, suggesting overfitting. GPT-4-Turbo and Claude-3-Opus lead across all scenarios, with closed models maintaining a large gap over open models.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides a website (https://livecodebench.github.io/) and states 'We will release all prompts and model completions for further community analysis, along with a general toolkit for adding new scenarios and models' (Abstract). The website serves as the release platform."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The benchmark problems (511 problems) are released through the LiveCodeBench platform. The paper states problems are collected from public contest platforms and the benchmark is publicly available."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. They mention using vLLM for inference but do not provide a full environment setup."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While prompts are provided in the appendix and the toolkit is mentioned, no step-by-step reproduction instructions are included in the paper itself."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates (Pass@1) with no confidence intervals or error bars. The paper acknowledges '1-1.5% performance variations' from bootstrapping (Section 7) but does not report CIs in the main results."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are used despite numerous comparative claims (e.g., 'GPT-4-Turbo outperforms all other models'). Differences are assessed by comparing raw numbers."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports percentage differences with baseline context, e.g., 'DS-Ins-33B is merely 4.3 point behind GPT-4-Turbo on HumanEval+ but 16.2 points (69%) on LCB' (Section 5.2), and 'GPT-4-Turbo leads DS-Ins-33B by 96% and 134%' on other scenarios."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper does not justify why 511 problems were collected or perform a power analysis. The Limitations section acknowledges the sample size issue but provides no formal justification."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Pass@1 is computed from 10 samples per problem but no variance across runs or seeds is reported. Only point estimates appear in all results tables."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares against HumanEval/HumanEval+ as existing benchmarks and evaluates 52 models as mutual baselines across four scenarios."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Models evaluated include GPT-4-Turbo, Claude-3-Opus, Gemini-1.5-Pro, Llama-3-70B, and other contemporary models as of 2024."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "This is a benchmark paper, not a system with components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses Pass@1 across four distinct scenarios (code generation, self-repair, code execution, test output prediction), providing multiple complementary evaluation dimensions. Per-difficulty breakdowns (Easy/Medium/Hard) are also provided."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation of model outputs is performed. Evaluation is entirely automated via test case execution. The paper mentions 'manual inspection' of code execution filters (Appendix A.3) but this is for data curation, not output evaluation."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The benchmark uses hidden test cases (avg 17 per problem) not shown to models. The temporal split mechanism ensures models are evaluated on problems released after their training cutoff."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by difficulty (Easy/Medium/Hard), platform (LeetCode/AtCoder/CodeForces), time window, and scenario. Tables 3-6 provide detailed per-category breakdowns."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Appendix E provides qualitative examples of code execution failures by GPT-4 with CoT. Section 5.2 discusses where models struggle (Hard problems, open vs closed gap)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that CoT prompting sometimes hurts open models on code execution ('open models perform even worse in comparison to the direct code execution baseline', Section 7). Contamination is reported as a negative finding for specific models."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about contamination detection, holistic evaluation, overfitting, and model comparisons are all supported by specific results in Sections 5.1 and 5.2 with figures and tables."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims about contamination ('models are likely trained on the older LeetCode problems') based on temporal performance drops, but this is observational evidence — the drop could have other explanations (e.g., problem difficulty variation over time). Similarly, 'overfitting to HumanEval' is inferred from correlation patterns without controlled experiments."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Holistic and Contamination Free Evaluation of Large Language Models for Code' but the benchmark only covers Python and competitive programming problems. The Limitations section acknowledges this but the title and framing are broad."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 7 discusses alternative explanations: performance drops could be due to problem difficulty variation, prompt sensitivity, or domain specificity. The paper acknowledges 'different domains might have individual requirements' and that competition problems may not represent general coding."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper is clear that it measures Pass@1 on competitive programming tasks and does not frame this as measuring general 'coding ability'. It explicitly states LiveCodeBench 'might not be representative of the most general notion of LLM programming capabilities' (Section 7)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Table 2 (Appendix C.1) lists exact model IDs for all 52 models, including specific versions like 'gpt-4-0613', 'gpt-4-1106-preview', 'claude-3-opus-20240229', and HuggingFace model paths."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompts are provided in Appendix C.2-C.5 for all four scenarios: code generation, self-repair, code execution (with and without CoT), and test output prediction. Figures 6-7 provide input generator prompts."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4 states: 'We use nucleus sampling with temperature 0.2 and top p 0.95' and 'generate 10 candidate answers for each problem.'"
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. Models are evaluated via direct prompting."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3 describes data collection in detail: HTML scraping, formula parsing, image exclusion, difficulty filtering thresholds, test generation procedures. Appendix A.3 details filtering criteria for code execution (character length 100-500, runtime filters, step count limits)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 7 'Limitations' is a dedicated section covering benchmark size, Python-only focus, prompt robustness, and problem domain limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 identifies specific threats: '1-1.5% performance variations' from bootstrapping, prompt sensitivity particularly for open models on CoT, Python-only limitation, and competition-only domain. These are specific to this study."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 explicitly states: 'LiveCodeBench currently only focuses on Python', 'focuses on competition problems sourced from three platforms', and 'might not be representative of the most general notion of LLM programming capabilities.' Recommends using it as 'a starting point.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states they will release 'all prompts and model completions for further community analysis' (Abstract), and the benchmark data is available through the website."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes data collection in detail: HTML scrapers for LeetCode, AtCoder, CodeForces; contest dates from May 2023 to May 2024; what metadata is collected (problem statements, test cases, ground truth solutions)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data is sourced from public competitive programming platforms (standard benchmark construction)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: scraping → parsing/filtering → difficulty classification → test generation (GPT-4-Turbo generators, Appendix A.2) → validation on correct programs → scenario construction. Filtering criteria and statistics are provided in Table 1 and Appendix A.3."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgements section lists NSF grants (CCF:1900968, CCF:1908870), SKY Lab industrial sponsors (Google, IBM, Intel, Microsoft, etc.), and NSF Graduate Research Fellowship."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: UC Berkeley, MIT, Cornell. No authors are affiliated with the model providers being evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Industrial sponsors include Google, IBM, Intel, Microsoft — companies whose models are evaluated. While the sponsors are for the SKY Lab generally (not this specific paper), the funders have a stake in LLM benchmark outcomes."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Table 2 (Appendix C.1) lists 'Approximate Cutoff Date' for all 52 models. Section 5.1 discusses specific cutoff dates for DeepSeek (Aug 2023) and GPT-4-O (Nov 2023)."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 5.1 extensively analyzes potential contamination through temporal performance analysis, showing performance drops after cutoff dates for DeepSeek and GPT-4-O models."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Contamination avoidance is a core contribution. The temporal split mechanism (evaluating only on problems released after model cutoff) directly addresses this. Section 5.1 provides empirical evidence of contamination in existing benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, tokens consumed, or wall-clock time for evaluations are reported despite evaluating 52 models × 10 samples × 511 problems across four scenarios."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget (GPU hours, API spend) is stated. The paper mentions using vLLM for open models but does not quantify compute."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Results are from sampling (temperature 0.2, 10 samples) but no seed sensitivity analysis is performed. No variance across seeds or runs is reported."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 4 states: 'we generate 10 candidate answers for each problem' and Pass@1 is computed from these 10 samples."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The paper uses temperature 0.2 and top-p 0.95 without justifying these choices or reporting alternatives tried."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Prompt and sampling parameters appear chosen without systematic justification. Section 7 acknowledges 'we either do not tune prompts across models or make minor adjustments' but does not explain the selection process."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, let alone multiple comparison corrections, despite comparing 52 models across multiple scenarios."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors construct the benchmark and evaluate models on it without acknowledging potential biases in their benchmark construction choices (e.g., platform selection, difficulty thresholds)."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "The paper evaluates models via inference only; it does not propose a new method with compute tradeoffs against baselines."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 7 discusses construct validity: 'Programming is a vast domain' and LiveCodeBench 'currently focuses on competition problems' which 'might not be representative of the most general notion of LLM programming capabilities.' The comparison with HumanEval (Figure 5) also examines whether benchmarks measure the same construct."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved; models are evaluated via direct prompting."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Temporal leakage is a core focus. The paper tags problems with release dates and evaluates models only on problems released after their training cutoff dates (Section 1, Section 5.1)."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information through prompt formatting, example inputs, or other contextual cues."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether problems across platforms share structural similarities, or whether similar problem types appear in both training corpora and the benchmark."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "The paper uses temporal performance analysis as a concrete detection method: measuring Pass@1 across monthly time windows and identifying performance drops after cutoff dates (Figures 1, 10-12). This is an empirical leakage detection approach."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "DeepSeek and GPT-4-O show likely contamination on older LeetCode problems, with stark performance drops after their training cutoff dates.",
    364       "evidence": "Figure 1 shows DS-Ins-33B performance drops after Aug 2023 (its release date) and GPT-4-O drops after Nov 2023 (its cutoff date). DS-Base-33B drops from Pass@1 ~60 on May LeetCode problems to ~0 on Sep LeetCode problems (Section 5.1).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Many fine-tuned open models overfit to HumanEval and their performance does not generalize to LiveCodeBench.",
    369       "evidence": "Figure 5 shows two clusters: base/closed models where HumanEval+ and LCB-Easy performance align, and fine-tuned open models that perform well on HumanEval+ but poorly on LCB-Easy. DS-Ins-1.3B achieves 59.8% on HumanEval+ but only 26.3% on LCB-Easy (Section 5.2).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Closed API models consistently outperform open models across all four scenarios, with the gap amplifying on harder tasks.",
    374       "evidence": "Tables 3-6 and Figure 2 show GPT-4-Turbo and Claude-3-Opus leading across scenarios. Only L3-Ins-70B, Mixtral, and DS-Ins-33B among open models approach closed model performance (Section 5.2).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Model performances are correlated across the four scenarios but relative differences vary, highlighting the need for holistic evaluation.",
    379       "evidence": "Figure 13 shows correlations >0.88 across all scenario pairs, with 0.98 for generation-repair and 0.96 for test output-execution. Claude-3-Opus outperforms GPT-4-Turbo on test output prediction despite trailing on code generation (Section 5.2).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LiveCodeBench better highlights the gap between SoTA and open models compared to HumanEval.",
    384       "evidence": "DS-Ins-33B is 4.3 points behind GPT-4-Turbo on HumanEval+ but 16.2 points (69%) behind on LCB code generation. This gap amplifies on other scenarios: 96% and 134% on test output prediction and code execution (Section 5.2).",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No statistical tests for comparative claims",
    391       "detail": "The paper makes extensive comparative claims ('outperforms', 'significantly worse') across 52 models based solely on raw Pass@1 numbers without any significance tests, confidence intervals, or uncertainty quantification. The acknowledged 1-1.5% bootstrap variation suggests some close comparisons may not be meaningful."
    392     },
    393     {
    394       "flag": "Industrial sponsors with stake in outcomes",
    395       "detail": "SKY Lab sponsors include Google, Microsoft, Intel, and Samsung, whose models and products are evaluated. While the authors are academic, the funding relationship creates a potential conflict that is not explicitly acknowledged."
    396     },
    397     {
    398       "flag": "Contamination claims based on correlational evidence",
    399       "detail": "The contamination claims for DeepSeek and GPT-4-O are based on temporal performance drops, but the paper does not rule out that later problems could simply be harder. The relatively stable AtCoder performance (Figure 11) provides partial control, but LeetCode-specific difficulty trends are not analyzed."
    400     }
    401   ],
    402   "cited_papers": [
    403     {
    404       "title": "Evaluating large language models trained on code",
    405       "authors": ["Mark Chen", "Jerry Tworek", "et al."],
    406       "year": 2021,
    407       "arxiv_id": "2107.03374",
    408       "relevance": "Introduced HumanEval benchmark and Codex model — the primary baseline benchmark LiveCodeBench critiques for contamination."
    409     },
    410     {
    411       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    412       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    413       "year": 2023,
    414       "arxiv_id": "2305.01210",
    415       "relevance": "Released HumanEval+ with more tests revealing up to 8% performance drops, demonstrating test insufficiency in existing benchmarks."
    416     },
    417     {
    418       "title": "SWE-bench: Can language models resolve real-world github issues?",
    419       "authors": ["Carlos E Jimenez", "John Yang", "et al."],
    420       "year": 2023,
    421       "arxiv_id": "2310.06770",
    422       "relevance": "Real-world software engineering benchmark complementary to LiveCodeBench's competitive programming focus."
    423     },
    424     {
    425       "title": "CRUXEval: A benchmark for code reasoning, understanding and execution",
    426       "authors": ["Alex Gu", "Baptiste Rozière", "et al."],
    427       "year": 2024,
    428       "arxiv_id": "2401.03065",
    429       "relevance": "Inspired LiveCodeBench's code execution scenario; evaluates code comprehension through output prediction."
    430     },
    431     {
    432       "title": "Demystifying gpt self-repair for code generation",
    433       "authors": ["Theo X Olausson", "et al."],
    434       "year": 2023,
    435       "arxiv_id": "2306.09896",
    436       "relevance": "Foundational work on LLM self-repair capabilities that inspired LiveCodeBench's self-repair scenario."
    437     },
    438     {
    439       "title": "Code generation with alphacodium: From prompt engineering to flow engineering",
    440       "authors": ["Tal Ridnik", "Dedy Kredo", "Itamar Friedman"],
    441       "year": 2024,
    442       "arxiv_id": "2401.08500",
    443       "relevance": "Demonstrates that multi-step LLM pipelines (reasoning, test generation, self-repair) outperform naive generation, motivating holistic evaluation."
    444     },
    445     {
    446       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    447       "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"],
    448       "year": 2024,
    449       "relevance": "Uses edit distance and AST-based similarity to detect code benchmark contamination — directly relevant to contamination methodology."
    450     },
    451     {
    452       "title": "Competition-level code generation with alphacode",
    453       "authors": ["Yujia Li", "et al."],
    454       "year": 2022,
    455       "relevance": "Competitive programming benchmark and evaluation methodology for code generation at scale."
    456     },
    457     {
    458       "title": "Program synthesis with large language models",
    459       "authors": ["Jacob Austin", "et al."],
    460       "year": 2021,
    461       "arxiv_id": "2108.07732",
    462       "relevance": "Introduced MBPP benchmark — one of the key existing benchmarks LiveCodeBench aims to improve upon."
    463     },
    464     {
    465       "title": "Reflexion: Language agents with verbal reinforcement learning",
    466       "authors": ["Noah Shinn", "et al."],
    467       "year": 2023,
    468       "relevance": "Self-repair and iterative refinement approach relevant to agentic coding workflows and LiveCodeBench's self-repair scenario."
    469     }
    470   ]
    471 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs