scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27500B)
      1 {
      2   "paper": {
      3     "title": "CodeScope: An Execution-based Multilingual Multitask Multidimensional Benchmark for Evaluating LLMs on Code Understanding and Generation",
      4     "authors": [
      5       "Weixiang Yan",
      6       "Haitian Liu",
      7       "Yunkun Wang",
      8       "Yunzhe Li",
      9       "Qian Chen",
     10       "Wen Wang",
     11       "Tingyu Lin",
     12       "Weishan Zhao",
     13       "Li Zhu",
     14       "Hari Sundaram",
     15       "Shuiguang Deng"
     16     ],
     17     "year": 2023,
     18     "venue": "arXiv",
     19     "arxiv_id": "2311.08588"
     20   },
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub repository URL: https://github.com/WeixiangYAN/CodeScope (stated in abstract and throughout the paper)."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The benchmark dataset is publicly released at the same GitHub repository. The abstract states: 'The CodeScope benchmark and code are publicly available at https://github.com/WeixiangYAN/CodeScope.'"
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Table 8 in the appendix provides specific model versions, model sizes, and inference hardware (e.g., 'NVIDIA Tesla A800 * 4' for LLaMA 2, 'NVIDIA GeForce RTX 4090 * 4' for Code LLaMA), and the paper specifies the MultiCodeEngine execution environment, which supports 47 compiler/interpreter versions. However, no requirements.txt or Dockerfile is mentioned, so software dependency specifications are absent and the environment can only be partially reconstructed from the hardware information. This is borderline; answering YES given the level of detail across Table 8 and the compiler specifications."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "While the paper mentions code is available at the GitHub repository, there are no step-by-step reproduction instructions in the paper itself. No README with commands, no 'Reproducing Results' section, and no scripts for replicating experiments are described in the paper text."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No confidence intervals or error bars are reported for any results. All tables (Tables 3-6, 9-12, etc.) report only point estimates."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are used despite numerous comparative claims (e.g., 'WizardCoder demonstrates the best performance'). Rankings and comparisons are based solely on numerical differences without any significance testing."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. While raw performance numbers and differences can be inferred from tables, there is no explicit effect size reporting or contextual framing beyond raw scores."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is provided for why specific sample sizes were chosen for each task (e.g., 200 samples for code smell, 900 for code review, 100 per language for automated testing). The numbers appear arbitrary with no power analysis or justification."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The SD reported in Table 3 is standard deviation across length categories (short/medium/long), not across experimental runs. No variance across runs, random seeds, or repeated experiments is reported. All results appear to be single-run numbers."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Eight baseline LLMs are evaluated: GPT-4, GPT-3.5, PaLM 2, LLaMA 2, StarCoder, Code LLaMA, WizardCoder, and Vicuna. The paper also compares CodeScope rankings against HumanEval and MBPP benchmarks (Table 6)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The baselines include GPT-4, GPT-3.5, PaLM 2, LLaMA 2, Code LLaMA, and WizardCoder, which were all contemporary and competitive models at the time of writing (2023). Table 8 specifies gpt-4-0613 and gpt-3.5-turbo-0613."
     81       },
     82       "ablation_study": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "This is a benchmark paper that evaluates existing LLMs rather than proposing a system with multiple components. There is no system to ablate."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used across tasks: BLEU, METEOR, ROUGE, BERTScore for code summarization; accuracy, precision, recall, F1 for code smell and code review; pass rate, line coverage, branch coverage for automated testing; Pass@k for program synthesis/translation; DSR@K for code repair; Opt@K for code optimization."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation of the LLMs' outputs is conducted. All evaluations are automated through execution-based metrics or matching-based metrics. While the paper argues execution-based metrics are more reliable, no human judgment is used to validate outputs."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The paper does not describe a separation between development/validation and test sets. The benchmark appears to be used directly without any dev/test split for tuning evaluation methodology."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Extensive per-category breakdowns are provided: per-language results for all tasks (Tables 9-12, 14-36 in appendix), per-difficulty breakdowns (Easy vs. Hard in Table 4), per-length breakdowns (Short/Medium/Long in Table 3), and per-efficiency-dimension breakdowns (Memory vs. Time in Table 5)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Case studies in the appendix (Tables 37-52) include both high-quality and low-quality examples. For example, Table 39 shows a low-quality code summarization by PaLM 2 where it 'misunderstood the task and inserted natural language comments back into the code.' The paper discusses GPT-4's poor performance on automated testing and models' struggles with hard problems."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports that GPT-4 'does not exhibit leading performance' in understanding tasks (ranking 5th), and most models 'struggle to provide correct solutions for hard problems.' It notes open-source models perform poorly on code generation tasks. The paper also discusses that LLMs optimize C code worst, especially execution time."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims the benchmark covers 43 programming languages and 8 tasks, which is verified in Table 1. It claims to demonstrate 'superior breadth and challenges of CodeScope,' which is supported by the comparison with HumanEval and MBPP in Table 6 and Table 2."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper makes causal attributions such as 'This advantage is attributed to its Evol-Instruct approach, which significantly enhances the model's understanding' (Section 4.1 on WizardCoder). This is a causal claim without adequate justification — the authors cannot attribute performance differences to specific training methods without controlled experiments."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper's title presents CodeScope as a benchmark 'for Evaluating LLMs on Code Understanding and Generation' broadly, but evaluations are limited to 8 specific LLMs. The paper states 'CodeScope evaluates the coding proficiency of LLMs' without sufficiently bounding claims to the tested models. Claims like 'GPT-4 performs the best' are stated without acknowledging this is only among the 8 tested models."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Alternative explanations for performance differences are not discussed. For example, when attributing WizardCoder's performance to Evol-Instruct, no alternative explanations (training data composition, model size relative to task complexity, etc.) are considered. The Limitations section focuses only on data leakage, not on alternative interpretations of results."
    138       }
    139     },
    140     "setup_transparency": {
    141       "model_versions_specified": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Table 8 provides specific model versions: gpt-4-0613, gpt-3.5-turbo-0613, text-bison-001, LLaMA-2-70b-chat-hf, starchat-beta, Code LLaMA-34b-Instruct-hf, WizardCoder-15B-V1.0, and vicuna-13b-v1.5-16k."
    145       },
    146       "prompts_provided": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Full prompt texts are provided in the appendix case studies (Tables 37-52), showing complete prompt templates with actual content for each task type. The prompts include specific instructions, problem descriptions, and response format specifications."
    150       },
    151       "hyperparameters_reported": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "No hyperparameters are reported. Temperature, top-p, max tokens, and other sampling parameters for the LLM API calls are not mentioned anywhere in the paper. The paper states 'we detail the specific configuration information for each LLM' in Table 8, but Table 8 only lists model versions, sizes, and GPU hardware — no sampling/generation hyperparameters."
    155       },
    156       "scaffolding_described": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "No agentic scaffolding is used. The LLMs are prompted directly for each task without any agentic workflow, retry logic, or multi-step reasoning framework."
    160       },
    161       "data_preprocessing_documented": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Data preprocessing is documented in detail for each task in Sections 3.1-3.2 and Appendix A.3.1-A.3.8. For example, code summarization describes collecting from Rosetta Code, selecting 170 tasks and 4,838 samples; program synthesis describes excluding problems with fewer than 10 test cases, non-deterministic problems, and brute force solutions exceeding 5,000 tokens."
    165       }
    166     },
    167     "limitations_and_scope": {
    168       "limitations_section_present": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "A dedicated 'Limitations' section is present (immediately following the Conclusion), discussing data leakage concerns from three perspectives."
    172       },
    173       "threats_to_validity_specific": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The limitations section discusses specific threats: data memorization as a confound for benchmark evaluation, the technical infeasibility of constructing a zero-leakage dataset, and how five independent data sources were used to mitigate reliance on any single source. These are specific to this benchmark study."
    177       },
    178       "scope_boundaries_stated": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper does not explicitly state what the results do NOT show or what claims it is NOT making. The limitations section focuses exclusively on data leakage but does not bound the scope of conclusions — for example, it does not state that results may not generalize beyond the 8 tested models, or that performance on Codeforces-sourced problems may not reflect real-world development tasks."
    182       }
    183     },
    184     "data_integrity": {
    185       "raw_data_available": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The benchmark data is publicly available at https://github.com/WeixiangYAN/CodeScope, enabling independent verification of the underlying data."
    189       },
    190       "data_collection_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Data collection is described in detail for each task: Rosetta Code for code summarization (43 languages), Madeyski/Lewowski and Slivka et al. datasets for code smell, Li et al. (2022b) for code review, and Codeforces for program synthesis/translation/repair/optimization. Time periods and selection criteria are specified."
    194       },
    195       "recruitment_methods_described": {
    196         "applies": false,
    197         "answer": false,
    198         "justification": "No human participants were recruited. The data sources are publicly available code repositories and programming competition platforms (Rosetta Code, Codeforces, GitHub)."
    199       },
    200       "data_pipeline_documented": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The data pipeline is documented for each task. For example, program synthesis: collect from Codeforces -> filter by 10+ test cases -> exclude non-deterministic problems -> execute validation -> exclude failing submissions -> exclude brute force >5000 tokens. Code summarization: collect from Rosetta Code -> select 170 tasks -> manually create summaries -> GPT-4 paraphrase -> manual review."
    204       }
    205     },
    206     "conflicts_of_interest": {
    207       "funding_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The paper states: 'Work is supported by Alibaba Group' (footnote on page 1)."
    211       },
    212       "affiliations_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Author affiliations are clearly listed, including that Qian Chen and Wen Wang are from Alibaba Group, with alibaba-inc.com email addresses."
    216       },
    217       "funder_independent_of_outcome": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Alibaba Group funded the work, and no Alibaba product is being evaluated. The benchmark evaluates OpenAI, Google, Meta, and independent models. Alibaba has no direct financial stake in the relative performance of these models on this benchmark."
    221       },
    222       "financial_interests_declared": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No competing interests or financial interests statement is present in the paper. Two authors are from Alibaba Group, which develops its own LLMs, but no declaration of financial interests is made."
    226       }
    227     },
    228     "contamination": {
    229       "training_cutoff_stated": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No training data cutoff dates are stated for any of the evaluated models. This is important because several data sources (Codeforces, Rosetta Code) are publicly available and could have been in the training data."
    233       },
    234       "train_test_overlap_discussed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The Limitations section extensively discusses data leakage risks, acknowledging that 'data leakage is a likely problem for benchmarks for evaluating LLMs' and discussing it from three perspectives. They note using 'five independent data sources' to mitigate leakage. However, no concrete analysis of actual overlap is performed."
    238       },
    239       "benchmark_contamination_addressed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "While the paper discusses data leakage conceptually in the Limitations section, it does not address the specific contamination risk that Codeforces solutions (a major data source) and Rosetta Code are publicly available and likely included in the training data of the evaluated models. The paper argues data leakage is 'unavoidable' rather than taking concrete steps to quantify it; the only mitigation offered is drawing on five independent data sources, which does not measure or remove contamination."
    243       }
    244     },
    245     "human_studies": {
    246       "pre_registered": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in the study. This is a benchmark evaluation of LLMs."
    250       },
    251       "irb_or_ethics_approval": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study."
    255       },
    256       "demographics_reported": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the study."
    260       },
    261       "inclusion_exclusion_criteria": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "randomization_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "blinding_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "attrition_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       }
    281     },
    282     "cost_and_practicality": {
    283       "inference_cost_reported": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time for running the benchmark are reported, despite using closed-source API models (GPT-4, GPT-3.5, PaLM 2) which incur monetary costs."
    287       },
    288       "compute_budget_stated": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "While GPU types are listed in Table 8, no total compute budget (GPU hours, API spend, total runtime) is provided. Running 8 LLMs across 8 tasks with ~13,000 total samples likely required significant compute, but this is not quantified."
    292       }
    293     }
    294   },
    295   "claims": [
    296     {
    297       "claim": "CodeScope is the first comprehensive benchmark covering 43 programming languages and 8 coding tasks with execution-based evaluation.",
    298       "evidence": "Table 1 lists 8 tasks and their language coverage. Table 2 compares CodeScope with 13 other benchmarks, showing it is the only one that is execution-based, multilingual, multitask, and multidimensional.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "WizardCoder demonstrates the best overall performance among tested LLMs on code understanding tasks with a score of 50.14.",
    303       "evidence": "Table 3 (Length dimension) and Table 6 show WizardCoder at 50.14 overall for code understanding, ahead of LLaMA 2 (48.79) and GPT-3.5 (48.10).",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "GPT-4 leads in code generation tasks but ranks only 5th in code understanding on CodeScope.",
    308       "evidence": "Table 6 shows GPT-4 at 31.47 for generation (1st) but 47.16 for understanding (5th). Section 5 discusses this discrepancy.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "CodeScope rankings differ substantially from HumanEval and MBPP rankings, demonstrating its complementary evaluation value.",
    313       "evidence": "Table 6 shows GPT-4 ranks 1st on HumanEval/MBPP and 1st on CodeScope Generation, but only 5th on CodeScope Understanding. WizardCoder ranks 1st on CodeScope Understanding but 2nd/4th on HumanEval/MBPP.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "WizardCoder's strong performance is attributed to its Evol-Instruct training approach.",
    318       "evidence": "Section 4.1 states: 'This advantage is attributed to its Evol-Instruct approach, which significantly enhances the model's understanding.' This is an attribution claim without controlled evidence.",
    319       "supported": "weak"
    320     },
    321     {
    322       "claim": "LLMs optimize Python code best but C code worst, especially in terms of execution time.",
    323       "evidence": "Table 5 shows optimization results across 4 languages. For time optimization, most models achieve higher scores in Python than in C. GPT-4 achieves 36.67% time optimization for Python vs. 6.67% for C.",
    324       "supported": "moderate"
    325     }
    326   ],
    327   "methodology_tags": [
    328     "benchmark-eval"
    329   ],
    330   "key_findings": "CodeScope is presented as the first comprehensive execution-based benchmark for evaluating LLMs on coding tasks, covering 43 programming languages and 8 tasks across 3 evaluation dimensions (length, difficulty, efficiency). Evaluation of 8 LLMs reveals that model rankings differ substantially from existing benchmarks like HumanEval and MBPP, with WizardCoder leading in code understanding but GPT-4 leading in code generation. The paper finds that strong code generation performance does not imply strong code understanding, and that most open-source LLMs struggle significantly on hard programming problems, with near-zero pass rates. The MultiCodeEngine execution environment supporting 14 programming languages enables more reliable evaluation than matching-based metrics.",
    331   "red_flags": [
    332     {
    333       "flag": "No hyperparameters reported",
    334       "detail": "Temperature, top-p, max tokens, and other sampling/generation parameters are not reported for any model. These parameters significantly affect LLM output quality and reproducibility. Table 8 lists only model versions and hardware."
    335     },
    336     {
    337       "flag": "Major contamination risk unaddressed",
    338       "detail": "Codeforces solutions (used for 4 of 8 tasks) and Rosetta Code (used for code summarization) are publicly available websites likely crawled for LLM training data. The paper acknowledges data leakage conceptually but does not analyze whether specific models were trained on this data, nor does it state training cutoff dates."
    339     },
    340     {
    341       "flag": "No statistical significance testing",
    342       "detail": "All comparative claims ('WizardCoder demonstrates the best performance,' 'GPT-4 excels') are based on raw numerical differences without any significance testing. With single-run results and no error bars, it is impossible to determine whether performance differences are meaningful."
    343     },
    344     {
    345       "flag": "Unsupported causal attributions",
    346       "detail": "The paper attributes WizardCoder's performance to its Evol-Instruct approach and GPT-4's struggles to difficulties with 'tracking and analyzing data flow.' These are causal claims without controlled experiments to isolate the effects of specific training methods."
    347     },
    348     {
    349       "flag": "Reference summaries partially generated by GPT-4",
    350       "detail": "Code summarization reference summaries were created using a three-step process involving GPT-4 paraphrasing. While the authors claim this was NL-to-NL paraphrasing that doesn't bias toward GPT-4, using GPT-4 in the reference creation process introduces a potential confound when GPT-4 is also being evaluated."
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "Evaluating large language models trained on code",
    356       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    357       "year": 2021,
    358       "arxiv_id": "2107.03374",
    359       "relevance": "Introduces HumanEval, the most widely used code generation benchmark against which CodeScope explicitly compares."
    360     },
    361     {
    362       "title": "Program synthesis with large language models",
    363       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"],
    364       "year": 2021,
    365       "arxiv_id": "2108.07732",
    366       "relevance": "Introduces MBPP benchmark for code generation evaluation, directly compared with CodeScope."
    367     },
    368     {
    369       "title": "CodeXGLUE: A machine learning benchmark dataset for code understanding and generation",
    370       "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"],
    371       "year": 2021,
    372       "relevance": "Influential code benchmark using matching-based metrics that CodeScope aims to improve upon with execution-based evaluation."
    373     },
    374     {
    375       "title": "XCodeEval: A large scale multilingual multitask benchmark for code understanding, generation, translation and retrieval",
    376       "authors": ["Mohammad Abdullah Matin Khan", "M. Saiful Bari"],
    377       "year": 2023,
    378       "arxiv_id": "2303.03004",
    379       "relevance": "Most directly comparable prior benchmark to CodeScope; uses execution-based metrics in multilingual multitask setting."
    380     },
    381     {
    382       "title": "WizardCoder: Empowering code large language models with evol-instruct",
    383       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    384       "year": 2023,
    385       "relevance": "Code LLM that demonstrates the best understanding performance on CodeScope, relevant for evaluating LLM coding capabilities."
    386     },
    387     {
    388       "title": "Code LLaMA: Open foundation models for code",
    389       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    390       "year": 2023,
    391       "arxiv_id": "2308.12950",
    392       "relevance": "Major open-source code LLM evaluated in CodeScope, relevant for benchmark evaluation of code generation models."
    393     },
    394     {
    395       "title": "StarCoder: may the source be with you!",
    396       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    397       "year": 2023,
    398       "arxiv_id": "2305.06161",
    399       "relevance": "Open-source code LLM trained on 80+ programming languages, evaluated in CodeScope."
    400     },
    401     {
    402       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    403       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    404       "year": 2023,
    405       "arxiv_id": "2308.01861",
    406       "relevance": "Benchmark evaluating LLMs on more complex class-level code generation, complementary to CodeScope's function-level evaluation."
    407     },
    408     {
    409       "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation",
    410       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    411       "year": 2022,
    412       "relevance": "Translates HumanEval/MBPP to 18 languages, directly relevant as a multilingual code benchmark approach."
    413     },
    414     {
    415       "title": "Measuring coding challenge competence with APPS",
    416       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    417       "year": 2021,
    418       "relevance": "Challenging program synthesis benchmark; CodeScope aims to provide similar difficulty assessment but across more languages."
    419     },
    420     {
    421       "title": "No more manual tests? Evaluating and improving ChatGPT for unit test generation",
    422       "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"],
    423       "year": 2023,
    424       "arxiv_id": "2305.04207",
    425       "relevance": "Evaluates LLM capabilities for automated test generation, directly relevant to CodeScope's automated testing task."
    426     },
    427     {
    428       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    429       "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"],
    430       "year": 2023,
    431       "arxiv_id": "2302.00288",
    432       "relevance": "Benchmark addressing limitations of HumanEval for real-world code generation evaluation."
    433     }
    434   ]
    435 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs