scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27288B)
      1 {
      2   "paper": {
      3     "title": "CodeCriticBench: A Holistic Code Critique Benchmark for Large Language Models",
      4     "authors": [
      5       "Alexander Zhang",
      6       "Marcus Dong",
      7       "Jiaheng Liu",
      8       "Wei Zhang",
      9       "Yejie Wang",
     10       "Jian Yang",
     11       "Ge Zhang",
     12       "Tianyu Liu",
     13       "Zhongyuan Peng",
     14       "Yingshui Tan",
     15       "Yuanxing Zhang",
     16       "Zhexu Wang",
     17       "Weixun Wang",
     18       "Yancheng He",
     19       "Ken Deng",
     20       "Wangchunshu Zhou",
     21       "Wenhao Huang",
     22       "Zhaoxiang Zhang"
     23     ],
     24     "year": 2025,
     25     "arxiv_id": "2502.16614"
     26   },
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "A GitHub link is provided in the paper header: https://github.com/multimodal-art-projection/CodeCriticBench. An anonymous data link is also given in Appendix A.1 (https://anonymous.4open.science/r/CodeCriticBench-D657/)."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The dataset is released via the anonymous link in Appendix A.1 and the GitHub repository. The paper states 'We have released our data on an anonymous website.'"
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section listing library versions is provided in the paper. Model links are given in Appendix C (Tables 7, 8), but no execution environment specifications are provided."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions, README with commands, or 'Reproducing Results' section is included in the paper. The GitHub repository is linked but the paper itself contains no specific instructions for replicating experiments."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "All results in Tables 3, 4, 5, and throughout the appendix are reported as single point estimates (e.g., ACC or MSE values) with no confidence intervals, error bars, or ± notation."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper makes numerous comparative claims (e.g., 'performance generally improves with more parameters', 'o1-like models achieving milestone results') but no statistical significance tests (p-values, t-tests, etc.) are reported."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper reports effect sizes with baseline context, e.g., 'Qwen2.5-Coder-3B-Instruct shows gains of 7.5% in ACC and 1.31 in MSE' (Section 4.3, CoT evaluation), and improvements from critiques (Figure 11, scores from 5.72 to 6.91). Results are reported with enough context to understand the magnitude of differences."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The benchmark contains 4,300 samples and subsets like CodeCritic_400 (400 randomly sampled instances), but no justification or power analysis is provided for why these sizes are sufficient for the claims made."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No standard deviations, variance, or spread measures across experimental runs are reported. All results appear to be single-run numbers with no indication of result stability across multiple trials."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper evaluates 38 models including both open-source and closed-source models of varying sizes, and compares against prior critique benchmarks in Table 2 (CriticBench, CriticEval, JudgeBench, RealCritic)."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The 38 evaluated models include state-of-the-art systems at the time of writing: DeepSeek-R1, OpenAI o1-Preview, Claude3.5-Sonnet, GPT-4o, Gemini2.0-Flash-Thinking, and Qwen2.5 series models."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper includes ablation-style analyses: CoT vs. no-CoT evaluation (Table 5), effect of critique quality by model size (Figure 11), basic vs. advanced evaluation comparison (Figure 9), and performance across difficulty levels (Figure 5)."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Multiple metrics are used: ACC (Accuracy) for basic evaluation, MSE (Mean Squared Error) for advanced evaluation, and Pass@1 Accuracy for bug identification (Section 3.4)."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Human evaluation is included. Section 4.3 describes ranking 8 models based on human evaluations (Figure 9), comparing human rankings against basic and advanced critique rankings. Additionally, 20 volunteers assessed correctness for the Code QA subset (Section 3.3)."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper states code generation data is collected 'restricted to test sets' from CodeForces, MBPP, and LiveCodeBench (Section 3.2). The entire benchmark serves as a held-out evaluation set for the 38 models being evaluated."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Extensive per-category breakdowns are provided: by difficulty level (Easy/Medium/Hard in Figure 5), by task type (Code Gen vs. Code QA in Table 3), by application scenario (11 scenarios in Figure 4), by error type (23 categories in Table 6), and by model size group."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Error analysis is provided: specific error types where models struggle are discussed (e.g., 'most models, except Doubao-Coder-Preview and Claude3.5-Sonnet, struggle with identifying Performance Issue'), and hard difficulty cases show accuracy drops to ~30% (Section 4.3)."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Negative results are reported: 'DeepSeek-R1 and OpenAI o1-Preview underperform on Easy to Medium data, likely due to overthinking' (Section 4.3). Some models reach only ~30% accuracy on hard problems. Some error categories show near-zero detection rates (Table 6)."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract claims the benchmark includes two code tasks with different difficulties, basic and advanced evaluations with fine-grained checklists, and 'extensive experimental results' — all supported by the paper's Tables 1-3, Sections 3.1-3.4, and Section 4."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper makes causal claims: 'This improvement is due to the model having access to more detailed context' (CoT evaluation), 'We suggest this is due to insufficient code-related optimization' (error studies), 'likely due to overthinking' (o1 models). These causal attributions are not supported by controlled experiments isolating the claimed mechanisms."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title claims 'Holistic Code Critique Benchmark' but the Limitations section acknowledges 'our evaluation is confined to single-file scenarios' and 'presently focused solely on code.' The benchmark covers only algorithmic problems and StackOverflow-style QA, not all code critique scenarios. The broad title oversells the scope."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Alternative explanations for the results are not substantively discussed. For instance, the scaling law finding and CoT improvements are not examined for confounds. The paper presents one interpretation without considering alternatives (e.g., could CoT improvements be due to formatting rather than reasoning?)."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Models are referred to by marketing names only: 'Claude3.5-Sonnet', 'GPT 4o', 'DeepSeek-v3', 'Gemini2.0-Flash-Thinking'. No API version strings, snapshot dates, or specific model identifiers (e.g., 'gpt-4o-2024-05-13') are provided. Tables 7-8 link to model pages but do not specify exact versions used."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt texts are provided in Appendix B: Figure 18 (basic evaluation prompt), Figure 19 (advanced evaluation prompt), Figure 20 (error type lists), Figure 21 (bug insertion prompt), Figures 22-28 (additional prompts including CoT variants). The prompts use template variables ($Question, $Answer, $Checklists) but the template structure is fully specified."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No hyperparameters (temperature, top-p, max tokens, sampling settings) are reported for any of the 38 models evaluated. These settings can significantly affect LLM output quality."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The evaluation is a direct prompt-response setup where models receive prompts and generate critiques."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.2 describes the data collection pipeline: for Code Gen, data from CodeForces/MBPP/LiveCodeBench restricted to test sets, DeepSeek-v3 rewriting, bug insertion with sandbox verification and manual review. For Code QA, rule-based filtering of StackOverflow content, Qwen2.5-72B question generation, manual and LLM-assisted review. Section 3.3 documents the difficulty labeling and score calibration process."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "A dedicated 'Limitations' section is present after the Conclusion (page 13), discussing single-file scope and code-only domain limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The Limitations section contains only two generic points: evaluation limited to single-file scenarios, and benchmark focused solely on code. No specific threats to validity are discussed, such as potential biases from using LLMs to construct the dataset, inter-annotator agreement among the 20 volunteers, or the representativeness of StackOverflow-sourced data."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The Limitations section explicitly states: 'our evaluation is confined to single-file scenarios' and 'CodeCriticBench is presently focused solely on code.' These are specific scope boundaries about what was not tested."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The raw dataset is available via both the GitHub repository (https://github.com/multimodal-art-projection/CodeCriticBench) and an anonymous data link (Appendix A.1), enabling independent verification of the benchmark data."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.2 describes data collection in detail: sources (CodeForces, MBPP, LiveCodeBench for Code Gen; StackOverflow for Code QA), how problems were rewritten by DeepSeek-v3, how bugs were inserted and verified, and how QA pairs were generated and filtered."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper mentions '20 volunteers with coding experience' for correctness assessment (Section 3.3) and 'human experts' for iterative discussions, but does not describe how these volunteers or experts were recruited, their backgrounds beyond 'coding experience', or whether the recruitment could introduce bias."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline is documented: Figure 2 illustrates the data collection process, Section 3.2 describes extraction and filtering steps, Section 3.3 documents difficulty labeling (12 models, 80%/60% thresholds), and score annotation (20% manual annotation → 3 LLMs → linear regression calibration). Final counts are provided (4,300 samples: 1,517 Easy, 1,084 Medium, 1,699 Hard)."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding sources, grants, or acknowledgments section is present in the paper. Authors are affiliated with multiple institutions and companies (Alibaba, Kuaishou, OPPO, M-A-P) but no funding disclosure is made."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are listed in the header: NJU, M-A-P, Alibaba, BUAA, CASIA, BUPT, Kuaishou, OPPO. Authors affiliated with companies whose models are evaluated (e.g., Alibaba → Qwen models) are identifiable."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence cannot be assessed. However, several authors are from Alibaba (which produces Qwen models, heavily featured in the evaluation), making funder independence questionable even if funding were disclosed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement, patent disclosures, or financial interest declarations are present in the paper. Several authors are employed by companies (Alibaba, Kuaishou, OPPO) that produce or use the evaluated models."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper evaluates 38 pre-trained models on the benchmark but does not state any training data cutoff dates for the models. This is relevant because CodeForces and MBPP problems may be in training data."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No analysis of whether benchmark problems appeared in model training data is provided. The paper uses MBPP (published 2021) and CodeForces problems, which could be in the training data of many tested models. LiveCodeBench is referenced for contamination-free evaluation but this property is not verified for the full CodeCriticBench."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "MBPP and CodeForces problems have been publicly available for years and are likely in the training data of many evaluated models. The paper cites LiveCodeBench, whose title describes it as 'contamination free', but does not address contamination risk for its own benchmark, which also includes MBPP and CodeForces data. DeepSeek-v3 rewriting may partially mitigate this, but the extent is not analyzed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "The human annotation (20 volunteers for Code QA correctness and 400 instances for ranking comparison) is part of benchmark construction and a small validation study, not a formal human subjects study requiring pre-registration."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "The 20 volunteers assessed code correctness and 8 models were ranked — this is benchmark construction work rather than a human subjects study requiring IRB approval."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "The volunteer annotation and human evaluation ranking are part of benchmark construction, not a human subjects study. Demographics are not structurally required."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No formal human subjects study was conducted. The volunteers participated in annotation tasks as part of benchmark construction."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No experimental study with human participants requiring randomization. The human involvement is limited to annotation tasks."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No experimental study with human participants requiring blinding. Annotation tasks do not structurally require blinding."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No formal human subjects study was conducted, so attrition reporting is not applicable."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API expenses, tokens consumed, or wall-clock time are reported for running 38 models on the 4,300-sample benchmark. The evaluation likely incurred substantial API costs for closed-source models."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget, GPU hours, total API spend, or hardware specifications are mentioned for running the benchmark evaluation or dataset construction."
    298       }
    299     }
    300   },
    301   "claims": [
    302     {
    303       "claim": "CodeCriticBench is the first holistic code critique benchmark that covers both code generation and code QA tasks with basic and advanced evaluations.",
    304       "evidence": "Table 2 compares CodeCriticBench against CriticBench, CriticEval, JudgeBench, and RealCritic, showing it is the only benchmark that combines Code Gen (3,200 samples) and Code QA (1,100 samples) tasks with both basic and advanced evaluation settings.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "Performance on CodeCriticBench generally improves with model scale, supporting scaling laws.",
    309       "evidence": "Figure 3 shows ACC trending upward with parameter count across model families. Table 3 shows o1-like models (largest effective compute) achieving the highest ACC scores (72-75%).",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "Advanced critique evaluation rankings align more closely with human evaluation than basic critique evaluation.",
    314       "evidence": "Figure 9 shows that model rankings from the advanced setting closely match human evaluation rankings across 8 models on CodeCritic_400, while basic evaluation rankings diverge.",
    315       "supported": "moderate"
    316     },
    317     {
    318       "claim": "Chain-of-thought evaluation improves both ACC and MSE compared to direct evaluation.",
    319       "evidence": "Table 5 shows CoT improves ACC by 7.5-15.5% and MSE by 1.31-3.13 across four Qwen2.5-Coder models on CodeCritic_400.",
    320       "supported": "moderate"
    321     },
    322     {
    323       "claim": "o1-like models (DeepSeek-R1, OpenAI o1-Preview) underperform on Easy/Medium data but excel on Hard data due to their architecture.",
    324       "evidence": "Figure 5 shows DeepSeek-R1 at 92.21% on Easy (below others at 93-98%) but 51.97% on Hard (vs. ~27-34% for most models). The paper hypothesizes this is 'partly due to o1-like architecture' but provides no mechanistic evidence.",
    325       "supported": "weak"
    326     },
    327     {
    328       "claim": "Model-generated critiques improve answer quality, and critique quality increases with model size.",
    329       "evidence": "Figure 11 shows fine-grained scores improving from 5.72 (original) to 6.76-6.91 after applying critiques from Qwen2.5-Coder models of increasing size (1.5B to 14B), evaluated by GPT-4o on CodeCritic_400.",
    330       "supported": "moderate"
    331     }
    332   ],
    333   "methodology_tags": [
    334     "benchmark-eval"
    335   ],
    336   "key_findings": "CodeCriticBench introduces a 4,300-sample benchmark spanning code generation and code QA with basic (correctness) and advanced (fine-grained scoring) evaluation modes. Evaluation of 38 LLMs shows performance scales with model size, with o1-like reasoning models (DeepSeek-R1, OpenAI o1-Preview) achieving the highest accuracy (72-75% ACC) especially on hard problems. The advanced fine-grained evaluation correlates better with human judgments than binary correctness evaluation, and chain-of-thought prompting improves evaluation accuracy by 7.5-15.5% over direct prompting.",
    337   "red_flags": [
    338     {
    339       "flag": "No uncertainty quantification",
    340       "detail": "All results across 38 models are single-run point estimates with no error bars, confidence intervals, standard deviations, or repeated trials. For LLM evaluations where outputs can vary with sampling, this is a significant omission that makes it impossible to assess whether observed differences are meaningful."
    341     },
    342     {
    343       "flag": "No hyperparameters reported",
    344       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 38 models. These settings substantially affect LLM output quality and can change the results entirely."
    345     },
    346     {
    347       "flag": "Potential conflicts of interest",
    348       "detail": "Authors from Alibaba (which produces Qwen/Qwen2.5-Coder models that are heavily featured and among the best-performing open-source models in the evaluation) and other companies do not disclose any conflicts of interest. Qwen models are used in both dataset construction (Qwen2.5-72B for QA generation, Qwen2.5-Coder for responses) and evaluation."
    349     },
    350     {
    351       "flag": "Benchmark contamination risk unaddressed",
    352       "detail": "MBPP (published 2021) and CodeForces problems are included in the benchmark. Many of the evaluated models were trained well after these datasets became public, creating a serious contamination risk that is not discussed or analyzed."
    353     },
    354     {
    355       "flag": "Circular dataset construction",
    356       "detail": "LLMs (DeepSeek-v3, Qwen2.5-Coder, GPT-4o, Claude3.5-Sonnet) were used to construct the dataset (rewrite problems, generate QA pairs, generate evaluation checklists, assign scores), and then the same or similar models are evaluated on the dataset. This creates potential circularity where models may perform better on data they helped create."
    357     },
    358     {
    359       "flag": "Model versions unspecified",
    360       "detail": "No specific API versions or snapshot dates are provided for any of the evaluated models (e.g., Claude3.5-Sonnet, GPT-4o, DeepSeek-v3), making results unreproducible as model behavior changes across versions."
    361     }
    362   ],
    363   "cited_papers": [
    364     {
    365       "title": "Evaluating Large Language Models Trained on Code",
    366       "authors": ["Mark Chen", "Jerry Tworek"],
    367       "year": 2021,
    368       "arxiv_id": "2107.03374",
    369       "relevance": "Introduces HumanEval, a foundational code generation benchmark used in the CodeCriticBench comparison."
    370     },
    371     {
    372       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    373       "authors": ["Naman Jain"],
    374       "year": 2024,
    375       "relevance": "Source of code problems (presented by its authors as contamination free) used in CodeCriticBench's code generation subset."
    376     },
    377     {
    378       "title": "CriticBench: Benchmarking LLMs for Critique-Correct Reasoning",
    379       "authors": ["Zicheng Lin"],
    380       "year": 2024,
    381       "relevance": "Prior critique benchmark that CodeCriticBench directly compares against and aims to improve upon."
    382     },
    383     {
    384       "title": "CriticEval: Evaluating Large-Scale Language Model as Critic",
    385       "authors": ["Tian Lan"],
    386       "year": 2024,
    387       "relevance": "Large-scale critique evaluation benchmark compared against in Table 2."
    388     },
    389     {
    390       "title": "JudgeBench: A Benchmark for Evaluating LLM-based Judges",
    391       "authors": ["Sijun Tan"],
    392       "year": 2024,
    393       "relevance": "Benchmark for LLM-as-judge evaluation, directly compared in Table 2."
    394     },
    395     {
    396       "title": "RealCritic: Towards Effectiveness-Driven Evaluation of Language Model Critiques",
    397       "authors": ["Zhengyang Tang"],
    398       "year": 2025,
    399       "arxiv_id": "2501.14492",
    400       "relevance": "Recent critique benchmark that CodeCriticBench positions itself against."
    401     },
    402     {
    403       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    404       "authors": ["Aman Madaan"],
    405       "year": 2023,
    406       "relevance": "Foundational work on LLM self-critique and iterative refinement, motivating critique benchmark development."
    407     },
    408     {
    409       "title": "LLM Critics Help Catch LLM Bugs",
    410       "authors": ["Nat McAleese"],
    411       "year": 2024,
    412       "arxiv_id": "2407.00215",
    413       "relevance": "Demonstrates LLM critique capacity for finding bugs, directly related to CodeCriticBench's bug detection evaluation."
    414     },
    415     {
    416       "title": "Qwen2.5-Coder Technical Report",
    417       "authors": ["Binyuan Hui"],
    418       "year": 2024,
    419       "arxiv_id": "2409.12186",
    420       "relevance": "Technical report for the Qwen2.5-Coder model family heavily featured in CodeCriticBench evaluations and dataset construction."
    421     },
    422     {
    423       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    424       "authors": ["Daya Guo"],
    425       "year": 2024,
    426       "arxiv_id": "2401.14196",
    427       "relevance": "Technical report for DeepSeek-Coder models evaluated in the benchmark."
    428     },
    429     {
    430       "title": "Program Synthesis with Large Language Models",
    431       "authors": ["Jacob Austin"],
    432       "year": 2021,
    433       "arxiv_id": "2108.07732",
    434       "relevance": "Introduces MBPP, a code generation benchmark used as a data source for CodeCriticBench."
    435     },
    436     {
    437       "title": "FullStack Bench: Evaluating LLMs as Full Stack Coder",
    438       "authors": ["Siyao Liu"],
    439       "year": 2024,
    440       "arxiv_id": "2412.00535",
    441       "relevance": "Full-stack code evaluation benchmark with category definitions adopted for CodeCriticBench's Code QA classification."
    442     }
    443   ]
    444 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs