scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28278B)
      1 {
      2   "paper": {
      3     "title": "PythonSaga: Redefining the Benchmark to Evaluate Code Generating LLMs",
      4     "authors": [
      5       "Ankit Yadav",
      6       "Himanshu Beniwal",
      7       "Mayank Singh"
      8     ],
      9     "year": 2024,
     10     "venue": "Conference on Empirical Methods in Natural Language Processing (EMNLP 2024 Findings)",
     11     "arxiv_id": "2401.03855",
     12     "doi": "10.18653/v1/2024.findings-emnlp.996"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval",
     21     "qualitative"
     22   ],
     23   "key_findings": "HumanEval and MBPP are heavily biased toward basic programming concepts (72-77% of problems in 5 concepts) and easy difficulty (85-90% Easy), with 14 of 38 programming concepts entirely absent. PythonSaga, a new 185-problem benchmark with balanced concept and difficulty representation, reveals dramatically lower Code-LLM performance: open-source models achieve <4.5% pass@1 and GPT-4 achieves only 12.4% pass@1, far below scores on existing benchmarks. Six programming concepts were unsolved by all tested models.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract states: 'The code and data set are openly available to the NLP community at https://anonymous.4open.science/r/PythonSaga.' A URL is provided, though it is an anonymous review link."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The dataset is released at the same URL mentioned in the abstract. PythonSaga's 185 problems are described as openly available."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions 'a single Tesla V100' as hardware but provides no software environment details — no requirements.txt, Dockerfile, library versions, or Python version."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. There is no description of how to run the evaluation pipeline or replicate the benchmarking experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Table 2 reports only point estimates for pass@1 and pass@10. No confidence intervals or error bars are provided for any results."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims 'closed-source models performed considerably better than open-source models' and discusses performance differences across concept categories, but no statistical significance tests are used anywhere."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only raw pass@k scores are reported. No formal effect sizes (Cohen's d, odds ratios, etc.) are computed for any comparisons."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The choice of 5 problems per concept (185 total) is not justified with any statistical reasoning or power analysis. The 5-annotator sample for human evaluation is also unjustified."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Despite generating n=20 samples per problem, no variance, standard deviation, or spread measures are reported across the samples or across problems."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares 14 open-source and 2 closed-source models, and contrasts PythonSaga results against HumanEval and MBPP performance (Figure 6)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Models tested include Llama 3, StarCoder2, DeepSeek Coder, GPT-4 — all contemporary at time of writing (2024)."
     84       },
     85       "ablation_study": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "PythonSaga is a benchmark, not a multi-component system. There is nothing to ablate."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Both pass@1 and pass@10 are reported for all models in Table 2."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The human annotation study evaluates the benchmark problems themselves (concept labels and difficulty), not model outputs. No human evaluation of generated code quality is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "PythonSaga is a newly created benchmark. Models are evaluated on it without any tuning on PythonSaga data, making it a held-out test set."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 3 provides a detailed heatmap of per-concept performance for each model. Section 5.3 also reports breakdowns by basic/intermediate/advanced categories."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5.4 identifies and categorizes 9 error types including invalid syntax, incomplete code, hallucination, and non-compliance with problem statements."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that no model could solve any problem in 6 specific concepts (Basic Data Structures, Recursion, Hashing, Context Managers, Concurrency and Parallelism, Max Flow)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims about concept bias (supported by Section 4.3, Figure 2), prevalence of easy tasks (supported by Section 4.3 difficulty analysis), and poor Code-LLM performance (supported by Table 2) are all backed by results in the paper."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper asserts that 'current benchmarks overstate the generalization abilities of existing code-LLMs' and that easy/biased benchmarks 'inflate model performance estimations.' However, the performance difference between PythonSaga and HumanEval/MBPP could be due to confounds other than concept diversity (e.g., problem rephrasing style, source platform differences, test case quality). No controlled analysis isolates the effect of concept diversity."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly states 'we focus on Python Programming language' and the Limitations section acknowledges that 'the generalizability of these findings to other languages requires further investigation.'"
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations for poor model performance on PythonSaga, such as the rephrasing style making problems harder to parse, platform-specific problem formulations, low test case coverage, or other confounds beyond concept diversity."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures pass@k and discusses results in terms of pass@k without inflating it to broader claims about 'code generation ability' in general. Claims match the measurement granularity."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Models are listed by general name and size (e.g., 'Code Llama 7B', 'GPT-3.5', 'GPT-4') without specific version identifiers or snapshot dates. No API version strings are provided."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The benchmark problems serve as the prompts in HumanEval-style evaluation. Representative examples are provided in Appendices A.2-A.4, and the full dataset is released."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper states n=20 samples per problem but does not report temperature, top-p, max tokens, or any other generation hyperparameters for any model."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. Models generate code directly from prompts."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 5.1 describes the curation methodology: problems selected from GeeksForGeeks and LeetCode, 5 per concept, manually rephrased without AI assistance. Section 4.2 describes the annotation protocol including guidelines and constraints."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section discusses three specific limitations of the study."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section identifies specific threats: (1) random sampling of 164/974 MBPP problems may introduce selection bias, (2) annotators are all postgrad CS students limiting annotator diversity, (3) findings limited to Python."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The Limitations section explicitly states that findings may not generalize to the full MBPP benchmark, other annotator populations, or other programming languages."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The benchmark dataset is released, but raw model outputs (the 20 generated code samples per problem per model) and raw annotation data from the 5 annotators are not made available."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 5.1 describes curation from GeeksForGeeks and LeetCode with manual rephrasing. Section 4.2 describes the annotation procedure including annotator qualifications, guidelines, and constraints."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "Annotators are described as 'postgraduate students in Computer Science with at least three years of experience' who 'willingly volunteered,' but how they were recruited (likely convenience sampling from the same lab) is not described, and selection bias is not discussed."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The pipeline from problem selection to final benchmark is described at a high level, but key details are missing: how many candidate problems were initially considered, how many were filtered at each stage, and what criteria were used to select the final 5 per concept."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "The Acknowledgements section thanks annotators and the LINGO research group but does not mention any funding sources, grants, or sponsors."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations with IIT Gandhinagar are clearly disclosed in the paper header. No commercial affiliation with any evaluated model is present."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "No funder is identified. This appears to be unfunded academic work from a university research group."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates are stated for any of the 16 evaluated models. This is critical since PythonSaga problems are sourced from public coding platforms."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether PythonSaga problems (or their originals from GeeksForGeeks/LeetCode) appeared in any model's training data. The manual rephrasing is framed as making problems harder, not as decontamination."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "HumanEval and MBPP were published in 2021; many tested models were trained after that. PythonSaga draws from public platforms that are common training data sources. Contamination is not discussed for any benchmark."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No pre-registration is mentioned for the human annotation study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "The Ethics Statement mentions informed consent but does not mention IRB or ethics board approval."
    263       },
    264       "demographics_reported": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Annotators are described as 'postgraduate students in Computer Science with at least three years of experience in Python programming and competitive programming' (Section 4.2)."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "Section 4.2 specifies annotators must be postgraduate CS students with at least 3 years of Python and competitive programming experience."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This is an annotation study, not an experimental study with treatment/control conditions. Randomization of conditions is not applicable."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is an annotation study where blinding is not applicable — annotators are classifying problems by concept and difficulty, not evaluating experimental conditions."
    283       },
    284       "attrition_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper states annotators 'willingly volunteered throughout the entire duration' but does not explicitly report whether any annotators started but did not complete the task, or how many total were initially recruited."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference costs, API costs, or per-example latency figures are reported despite evaluating 16 models on 185 problems with 20 samples each."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper mentions 'a single Tesla V100' as a hardware constraint but does not quantify total GPU hours, training time, or API spend."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No seed sensitivity analysis is reported. Results appear to be from a single experimental run with n=20 samples per problem."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper explicitly states 'we consistently generated n = 20 samples from both open and closed source models' (Section 5.3)."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search is described. The paper does not report what generation settings were used or whether any tuning was performed."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No description of how generation configurations were chosen for each model. Temperature, top-p, and other settings are not reported or justified."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed in the paper, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors created PythonSaga and evaluate all models on it. They do not acknowledge or address the bias of evaluating using their own benchmark."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Models range from 2.7B to 13B parameters (plus unknown-size closed-source), but performance is not analyzed as a function of compute budget. Closed-source models are compared with open-source without accounting for compute differences."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper's central contribution is a construct validity analysis: it argues that HumanEval and MBPP fail to measure general code generation ability due to biased concept coverage and difficulty distribution, and designs PythonSaga to address these validity gaps."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is used. Models generate code directly from prompts."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353       "justification": "PythonSaga problems are sourced from GeeksForGeeks and LeetCode, whose problems were publicly available before most tested models were trained. This temporal overlap is not discussed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information. The problem format (function signature + docstring) is standard but feature leakage is not addressed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Problems from GeeksForGeeks and LeetCode may have structural overlap with model training data from these same platforms. This non-independence is not discussed."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection method is applied. The manual rephrasing is presented as making problems harder, not as a decontamination technique, and its effectiveness at preventing memorization is not tested."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "HumanEval and MBPP are biased toward a limited set of programming concepts, with 72.1% and 77.3% of problems in just 5 concepts (Mathematics, Control Flow, Basic Data Structures, Variables & Data Types, In-Built Functions)",
    375       "evidence": "Section 4.3 and Figure 2 show concept distributions based on 5-annotator majority voting across 164 HumanEval and 164 MBPP problems.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "14 out of 38 programming concepts (37.8%) are completely absent from both HumanEval and MBPP",
    380       "evidence": "Figure 2 shows zero problems in concepts including OOPs, Linked Lists, Tree, Graph, Backtracking, and 9 others.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "More than 80% of problems in both benchmarks are perceived as Easy (84.8% HumanEval, 89.6% MBPP)",
    385       "evidence": "Section 4.3 reports difficulty distributions from 5-annotator majority voting. Inter-annotator agreement: 39% complete agreement on HumanEval, 40.2% on MBPP.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Existing Code-LLMs perform poorly on PythonSaga: open-source models achieve <4.5% pass@1, closed-source <13% pass@1",
    390       "evidence": "Table 2 shows pass@1 scores: best open-source (Code Llama Python 13B) at 4.05%, GPT-4 at 12.43%, with n=20 samples per problem.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "No model could solve any problem in six specific concepts: Basic Data Structures, Recursion, Hashing, Context Managers, Concurrency and Parallelism, and Max Flow",
    395       "evidence": "Figure 3 heatmap shows zero solved problems for these concepts across all 16 models.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Current benchmarks overestimate LLM generalization abilities for code generation",
    400       "evidence": "Comparison between high pass@1 on HumanEval/MBPP (Figure 6) and low pass@1 on PythonSaga (Table 2), combined with concept diversity analysis showing benchmark bias.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Very small annotator pool",
    407       "detail": "Only 5 annotators from the same institution (IIT Gandhinagar, same research group) classify all problems. With such a small, homogeneous pool, the concept and difficulty classifications may not generalize. No inter-annotator reliability metric beyond agreement percentage is reported."
    408     },
    409     {
    410       "flag": "No statistical tests for any comparisons",
    411       "detail": "All comparisons (model vs model, concept vs concept, benchmark vs benchmark) are made by comparing raw numbers without any statistical significance testing."
    412     },
    413     {
    414       "flag": "Benchmark contamination unaddressed",
    415       "detail": "PythonSaga problems are sourced from GeeksForGeeks and LeetCode — among the most common code training data sources. Manual rephrasing may not prevent memorization of solution patterns. No contamination analysis is performed."
    416     },
    417     {
    418       "flag": "Very few test cases per problem",
    419       "detail": "PythonSaga averages 3.7 test cases per problem (max 4). This is notably low and may not adequately verify functional correctness, potentially over- or underestimating model performance."
    420     },
    421     {
    422       "flag": "Missing generation hyperparameters",
    423       "detail": "Temperature, top-p, and other generation settings are not reported for any model. These settings significantly affect pass@k scores and make results non-reproducible."
    424     },
    425     {
    426       "flag": "Anonymous review URL as dataset link",
    427       "detail": "The dataset URL (anonymous.4open.science) is an anonymous review platform link, which may not be a permanent or reliable distribution method for the published version."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Evaluating large language models trained on code",
    433       "authors": ["Mark Chen", "Jerry Tworek"],
    434       "year": 2021,
    435       "arxiv_id": "2107.03374",
    436       "relevance": "Introduces HumanEval benchmark and Codex model, the primary baseline benchmark analyzed in this paper."
    437     },
    438     {
    439       "title": "Program synthesis with large language models",
    440       "authors": ["Jacob Austin", "Augustus Odena"],
    441       "year": 2021,
    442       "arxiv_id": "2108.07732",
    443       "relevance": "Introduces MBPP benchmark, the second primary baseline benchmark analyzed in this paper."
    444     },
    445     {
    446       "title": "Starcoder: may the source be with you!",
    447       "authors": ["Raymond Li", "Loubna Ben Allal"],
    448       "year": 2023,
    449       "arxiv_id": "2305.06161",
    450       "relevance": "Major open-source Code-LLM evaluated in the study; established pass@k evaluation conventions."
    451     },
    452     {
    453       "title": "Code llama: Open foundation models for code",
    454       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    455       "year": 2023,
    456       "arxiv_id": "2308.12950",
    457       "relevance": "Leading open-source Code-LLM family evaluated across multiple variants (base, instruct, Python)."
    458     },
    459     {
    460       "title": "Deepseek-coder: When the large language model meets programming",
    461       "authors": ["Daya Guo", "Qihao Zhu"],
    462       "year": 2024,
    463       "arxiv_id": "2401.14196",
    464       "relevance": "Open-source Code-LLM evaluated in the study; representative of recent code-focused models."
    465     },
    466     {
    467       "title": "Measuring coding challenge competence with APPS",
    468       "authors": ["Dan Hendrycks", "Steven Basart"],
    469       "year": 2021,
    470       "arxiv_id": "2105.09938",
    471       "relevance": "Code generation benchmark with difficulty levels from competitive programming platforms, directly relevant to benchmark design."
    472     },
    473     {
    474       "title": "DS-1000: A natural and reliable benchmark for data science code generation",
    475       "authors": ["Yuhang Lai", "Chengxi Li"],
    476       "year": 2023,
    477       "relevance": "Python-specific code generation benchmark focused on data science, relevant to benchmark diversity discussions."
    478     },
    479     {
    480       "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation",
    481       "authors": ["Federico Cassano", "John Gouwar"],
    482       "year": 2022,
    483       "arxiv_id": "2208.08227",
    484       "relevance": "Multi-language extension of HumanEval, directly relevant to code generation benchmark design."
    485     },
    486     {
    487       "title": "Dynabench: Rethinking benchmarking in NLP",
    488       "authors": ["Douwe Kiela", "Max Bartolo"],
    489       "year": 2021,
    490       "arxiv_id": "2104.14337",
    491       "relevance": "Addresses benchmark saturation and dynamic evaluation in NLP, a key motivation for PythonSaga."
    492     },
    493     {
    494       "title": "OpenCodeInterpreter: Integrating code generation with execution and refinement",
    495       "authors": ["Tianyu Zheng", "Ge Zhang"],
    496       "year": 2024,
    497       "arxiv_id": "2402.14658",
    498       "relevance": "Code generation model with execution feedback, evaluated in the study."
    499     },
    500     {
    501       "title": "StarCoder 2 and The Stack v2: The next generation",
    502       "authors": ["Anton Lozhkov", "Raymond Li"],
    503       "year": 2024,
    504       "arxiv_id": "2402.19173",
    505       "relevance": "Recent open-source Code-LLM evaluated in the study; represents latest generation of code models."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 1,
    511       "justification": "The benchmark could be used by researchers evaluating Code-LLMs, but is primarily an academic contribution rather than a practitioner tool."
    512     },
    513     "surprise_contrarian": {
    514       "score": 2,
    515       "justification": "The finding that models achieving 60-80% on HumanEval score below 13% on a more balanced benchmark challenges the narrative that Code-LLMs are approaching human-level capability."
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No AI safety or security concerns are raised by the paper."
    520     },
    521     "drama_conflict": {
    522       "score": 1,
    523       "justification": "Implicitly criticizes the HumanEval/MBPP ecosystem as misleadingly easy, but does so in measured academic language."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "The dataset is released and could be downloaded and used, but there is no live demo or easy-to-use evaluation tool."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "From IIT Gandhinagar (not a top-tier AI lab), but evaluates recognizable models like GPT-4 and Code Llama."
    532     }
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs