scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26437B)
      1 {
      2   "paper": {
      3     "title": "CoDet-M4: Detecting Machine-Generated Code in Multi-Lingual, Multi-Generator and Multi-Domain Settings",
      4     "authors": ["Daniil Orel", "Dilshod Azizov", "Preslav Nakov"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2503.13733"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a HuggingFace link for data and pre-trained models: https://huggingface.co/datasets/DaniilOr/CoDET-M4 (Appendix A) and mentions 'We release our data and code' with a link in Section 1."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The dataset is released on HuggingFace: https://huggingface.co/datasets/DaniilOr/CoDET-M4 (footnote 2 and Appendix A). The paper explicitly states 'We release our data and code.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided in the paper. The paper mentions using vLLM for serving but does not specify library versions or a reproducible environment setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README with commands, or 'Reproducing Results' section is provided in the paper. While the experimental setup is described, there are no concrete instructions to replicate the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported for any results. All tables (Tables 2-14) report only point estimates for precision, recall, F1, and accuracy with no uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No statistical significance tests are used despite multiple comparative claims (e.g., 'UniXcoder is superior', 'CodeT5 achieves better performance'). All comparisons are based solely on comparing raw numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. While raw performance numbers are provided, there is no systematic reporting of effect magnitudes beyond raw metric values."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for sample sizes used in the various experiments. For example, the unseen models experiment uses only 126 samples, the unseen domains experiment uses 5,451 samples, and the unseen languages experiment uses 6,388 samples, but none of these choices are justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. The paper does not mention running experiments multiple times with different seeds or reporting any spread measures."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper includes a zero-shot baseline (Fast-DetectGPT), traditional ML baselines (SVM, CatBoost), and multiple DNN-based models (CodeBERT, CodeT5, UniXcoder) as described in Section 4.1."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines include contemporary models: Fast-DetectGPT (2024), UniXcoder (2022), CodeT5+ (2023), CodeBERT (2020). The paper also tests GPT-4o as a detector. These represent a reasonable range of contemporary approaches for this task."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed to identify which components of the proposed framework contribute most to performance. The paper compares different models but does not ablate specific design choices within any model (e.g., feature selection in SVM/CatBoost, or fine-tuning strategies in DNNs)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports four evaluation metrics: Macro F1 score (F), precision (P), recall (R), and accuracy (A), as stated in Section 4.1 ('Evaluation Measures')."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not relevant here since the task is binary/multi-class classification with ground-truth labels. The evaluation is appropriately automated using standard classification metrics against known labels."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The dataset is split into train/validation/test in an 8:1:1 ratio (Section 3.4), and results are reported on the test set. The out-of-domain experiments also use separate held-out datasets (unseen models, domains, languages)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper provides extensive per-category breakdowns: per-language results (Table 3), per-source/domain results (Table 4), per-generator results (Tables 6, 8, Figure 8), per-language results for unseen languages (Table 10), and per-domain results for unseen domains (Table 11)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses failure cases including: CodeWhisperer being hard to detect (Section 4.5.1), performance drops in unseen domains (Section 4.5.2), JavaScript being most challenging (Section 4.5.3), and hybrid authorship detection failing (Section 4.5.4, Table 13). An error analysis of the baseline is provided in Appendix I."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are reported: GPT-4o performs worse than traditional ML models for detection (Table 5, Section 4.3), significant performance drops in unseen domains (Table 9), hybrid authorship detection fails (Table 13), and performance degrades with increasing human code proportion (Figure 3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the framework 'effectively distinguishes human-written from LLM-generated program code, setting a new benchmark for the task.' The results in Tables 2-4 support the in-domain detection claim (98.65% F1 for UniXcoder). The out-of-domain results (Tables 8-13) add nuance showing where the framework struggles, which is honestly reported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims about why certain models perform better (e.g., 'UniXcoder depends on these structural elements to effectively capture relationships' in Section 4.5.2; 'CodeT5 appears to rely on more general patterns, making it more adaptable'). These are speculative explanations without controlled experiments to establish causation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is careful to bound its generalizations. The Limitations section explicitly acknowledges that results are constrained to three main programming languages, function- and class-level code, and a narrow range of prompts. The conclusion also specifically notes performance drops in unseen domains and hybrid scenarios."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not systematically discuss alternative explanations for its results. For example, the high detection accuracy could partly be due to formatting artifacts from the generation process (comments/docstrings removed, length filtering), but this is not explored. The SHAP analysis in Appendix G touches on what features matter but does not discuss alternative explanations for the observed performance patterns."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "For the code generators, model sizes are specified (CodeLlama 7B, Llama3.1 8B, CodeQwen 1.5 7B, Nxcode-orpo 7B) but not specific version hashes or snapshot dates. GPT-4o is used without a version/snapshot identifier. For detector models (CodeBERT, UniXcoder, CodeT5), no specific model checkpoint versions are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt texts are provided in Appendices C, E.1, E.2, E.3, E.4, and E.5 for all experimental settings: GPT-4o detection prompts, LeetCode generation prompts, CodeForces prompts, GitHub prompts, MBPP prompts, and hybrid generation prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Training hyperparameters are reported in Section 4.1. DNNs: 5 epochs, learning rate 3e-4, weight decay 1e-3, batch size 256, linear LR scheduler. CatBoost: 2,000 trees, learning rate 0.1. Generation: temperature chosen randomly between 0.4 and 1.0. SVM: RBF kernel, primal formulation."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The approach uses standard model fine-tuning and inference pipelines."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.3 (Quality Assurance) documents preprocessing: filtering CodeForces/LeetCode solutions that passed all test cases, removing HTML tags, filtering irrelevant LLM responses, removing comments/docstrings using regex, length filtering (excluding below 5th and above 95th percentile in token count), and deduplication."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section is present after Section 5, covering four specific limitations: Generalizability (language constraints), Corpus Update (dataset obsolescence), Prompt Diversity, and Applied Models."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations are specific to this study: 'Our research predominantly focuses on three programming languages' (generalizability), 'our study utilizes a narrow range of prompts' (prompt diversity), 'any dataset created to detect LLM-generated code can quickly become outdated' (corpus update), and 'We primarily relied on pre-existing models' (applied models)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope boundaries: it focuses on three main programming languages (Java, C++, Python), function- and class-level code, and a specific set of LLMs, and it acknowledges the limitation of narrow prompt diversity. The Limitations section explicitly states what the dataset does NOT cover."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The full dataset is available on HuggingFace (https://huggingface.co/datasets/DaniilOr/CoDET-M4), allowing independent verification of the data used in experiments."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.1 (Data Collection) describes in detail: sources used (LeetCode, CodeForces via Kaggle, GitHub via CodeSearchNet and API), languages covered, how code was collected from each source, and the resulting sample sizes. Appendix A provides additional detail on the 3-month collection period (September-November 2024)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data consists of publicly available code samples and LLM-generated code. This is a standard benchmark study without human subjects."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data pipeline is documented: collection from multiple sources (Section 3.1), code generation process (Section 3.2), quality assurance with filtering steps (Section 3.3), and dataset splitting (Section 3.4). Table 1 provides detailed counts for each split by language and source."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information or acknowledgments section is present in the paper. The authors are from Mohamed bin Zayed University of Artificial Intelligence (MBZUAI), but no grants or funding sources are disclosed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly disclosed: all three authors are from Mohamed bin Zayed University of Artificial Intelligence (MBZUAI), UAE. No products of the authors' institution are being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of funding disclosure means this criterion cannot be satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state training data cutoff dates for any of the models used (CodeBERT, UniXcoder, CodeT5, or the code generators). This is relevant because the detector models are pre-trained on code corpora that may overlap with the test data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper mentions deduplication within the dataset (Section 3.3) and ensures no overlap with The Vault repositories (Section 4.5.2), but does not discuss whether the pre-trained detector models (CodeBERT, UniXcoder, CodeT5) may have seen the evaluation code during their pre-training."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The detector models (CodeBERT, UniXcoder, CodeT5) were pre-trained on code data that could include LeetCode solutions, CodeForces solutions, or GitHub code used in the evaluation. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or tokens consumed are reported. The paper uses multiple LLMs for code generation (including GPT-4o API calls) and fine-tunes multiple models but does not report any cost information."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget is stated. The paper mentions using vLLM for serving but does not report GPU hours, hardware used, training time, or total API spend."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "UniXcoder achieves the best binary classification performance with 98.65% F1 score for detecting LLM-generated code.",
    286       "evidence": "Table 2 shows UniXcoder achieving P=98.65, R=98.66, F=98.65, A=98.65, outperforming all other models (Section 4.2).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "GPT-4o is ineffective at identifying LLM-generated code, even with few-shot learning, performing worse than traditional ML models.",
    291       "evidence": "Table 5 shows GPT-4o achieving at most 42.13% accuracy (3-shot), compared to SVM at 72.19% and CatBoost at 88.79% (Section 4.3, Table 2).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Models generalize well to unseen code generators, with UniXcoder achieving 93.22% F1 and 87.30% accuracy.",
    296       "evidence": "Table 8 shows UniXcoder achieving R=87.30, F=93.22, A=87.30 on code from 7 LLMs not in the training set (Section 4.5.1).",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Detection models experience significant performance drops in unseen domains, with the best model (CodeT5) achieving only 58.22% F1.",
    301       "evidence": "Table 9 shows CodeT5 at P=78.43, R=59.18, F=58.22, A=74.11 for unseen domains (Section 4.5.2).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "UniXcoder generalizes relatively well to unseen programming languages, achieving 88.96% F1.",
    306       "evidence": "Table 12 shows UniXcoder at P=89.13, R=89.20, F=88.96, A=88.96 across four unseen languages (Section 4.5.3).",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Detection models fail on hybrid authorship scenarios where LLMs complement or rewrite human-written code.",
    311       "evidence": "Table 13 shows UniXcoder achieving only R=33.22, F=39.36, A=64.71 on hybrid-generated code (Section 4.5.4).",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "LLMs have unique writing styles that can be identified, enabling authorship attribution across generators.",
    316       "evidence": "Table 7 shows UniXcoder achieving 79.35% accuracy for 6-class authorship identification. Figure 2 shows confusion between related models (Nxcode/CodeQwen) as expected (Section 4.4).",
    317       "supported": "moderate"
    318     }
    319   ],
    320   "methodology_tags": ["benchmark-eval"],
    321   "key_findings": "CoDet-M4 introduces a 500K-sample multi-lingual, multi-generator, multi-domain dataset for detecting LLM-generated code. Fine-tuned pre-trained language models (UniXcoder, CodeT5) achieve near-perfect detection on in-domain data (98.65% F1), significantly outperforming traditional ML and zero-shot approaches including GPT-4o. However, detection performance drops substantially in out-of-domain settings: unseen code domains (55-58% F1), hybrid authorship scenarios (39% F1), and partially for unseen programming languages. The work highlights that current detection methods are brittle outside their training distribution, particularly for hybrid human-LLM code that reflects real-world usage patterns.",
    322   "red_flags": [
    323     {
    324       "flag": "No uncertainty quantification",
    325       "detail": "All results across 14+ tables are reported as single-run point estimates with no confidence intervals, error bars, or standard deviations. The paper does not mention running experiments multiple times with different seeds, making it impossible to assess result stability."
    326     },
    327     {
    328       "flag": "No statistical significance tests",
    329       "detail": "Multiple comparative claims are made (e.g., UniXcoder vs. CodeT5, DNN vs. classical models) without any significance testing. Raw metric differences are treated as meaningful without assessing whether they could be due to chance."
    330     },
    331     {
    332       "flag": "Very small OOD test sets",
    333       "detail": "The unseen models experiment uses only 126 samples (Table 8). The unseen domains experiment uses 5,451 samples and unseen languages 6,388 samples, but no power analysis justifies these sizes for the claims being made."
    334     },
    335     {
    336       "flag": "Potential data artifacts from preprocessing",
    337       "detail": "Comments and docstrings were removed from all code samples, and length filtering excluded below 5th and above 95th percentile. These preprocessing choices could introduce systematic differences between human and LLM code that the models exploit, rather than detecting genuine stylistic differences. This confound is not discussed."
    338     },
    339     {
    340       "flag": "Pre-trained model contamination risk unaddressed",
    341       "detail": "CodeBERT, UniXcoder, and CodeT5 were pre-trained on large code corpora that likely include LeetCode and GitHub code used in the evaluation. The high in-domain performance could partly reflect data memorization rather than genuine detection capability."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Distinguishing LLM-Generated from Human-Written Code by Contrastive Learning",
    347       "authors": ["Xiaodan Xu", "Chao Ni", "Xinrong Guo", "Shaoxuan Liu", "Xiaoya Wang", "Kui Liu", "Xiaohu Yang"],
    348       "year": 2025,
    349       "relevance": "Directly relevant prior work on detecting LLM-generated code using contrastive learning with UniXcoder-based semantic encoder."
    350     },
    351     {
    352       "title": "GPTSniffer: A CodeBERT-based classifier to detect source code written by chatgpt",
    353       "authors": ["Phuong T. Nguyen", "Juri Di Rocco", "Claudio Di Sipio", "Riccardo Rubei", "Davide Di Ruscio", "Massimiliano Di Penta"],
    354       "year": 2024,
    355       "relevance": "Prior work on binary classification to detect ChatGPT-generated code using CodeBERT, relevant baseline approach for machine-generated code detection."
    356     },
    357     {
    358       "title": "Automatic Detection of LLM-generated Code: A Case Study of Claude 3 Haiku",
    359       "authors": ["Musfiqur Rahman", "Sayed Hossein Khatoonabadi", "Ahmad Abdellatif", "Emad Shihab"],
    360       "year": 2024,
    361       "arxiv_id": "2409.01382",
    362       "relevance": "Case study on detecting Claude 3-generated code using ML models on CodeSearchNet dataset."
    363     },
    364     {
    365       "title": "Assessing AI Detectors in Identifying AI-Generated Code: Implications for Education",
    366       "authors": ["Wei Hung Pan", "Ming Jie Chok", "Jonathan Leong Shan Wong"],
    367       "year": 2024,
    368       "relevance": "Evaluation of AI detectors for code, revealing limitations in detection of LLM-generated code, used as foundation dataset in this work."
    369     },
    370     {
    371       "title": "Whodunit: Classifying code as human authored or GPT-4 generated – A case study on CodeChef problems",
    372       "authors": ["Oseremen Joy Idialu", "Noble Saji Mathews", "Rungroj Maipradit", "Joanne M. Atlee", "Mei Nagappan"],
    373       "year": 2024,
    374       "relevance": "Used stylometric features to identify GPT-4-generated code at class level, methodology adopted for SVM/CatBoost features in this work."
    375     },
    376     {
    377       "title": "M4: Multi-generator, Multi-domain, and Multi-lingual Black-Box Machine-Generated Text Detection",
    378       "authors": ["Yuxia Wang", "Jonibek Mansurov", "Petar Ivanov"],
    379       "year": 2024,
    380       "relevance": "Large-scale dataset for machine-generated text detection across domains, languages, and generators; inspired the multi-dimensional evaluation approach."
    381     },
    382     {
    383       "title": "Fast-detectgpt: Efficient zero-shot detection of machine-generated text via conditional probability curvature",
    384       "authors": ["Guangsheng Bao", "Yanbin Zhao", "Zhiyang Teng", "Linyi Yang", "Yue Zhang"],
    385       "year": 2024,
    386       "relevance": "Zero-shot AI-generated content detector used as the baseline in this paper's experiments."
    387     },
    388     {
    389       "title": "LLM-DetectAIve: a tool for fine-grained machine-generated text detection",
    390       "authors": ["Mervat Abassy", "Kareem Elozeiri", "Alexander Aziz"],
    391       "year": 2024,
    392       "relevance": "Tool for fine-grained detection of machine-generated text, relevant to the broader AI-generated content detection landscape."
    393     },
    394     {
    395       "title": "Evaluating large language models trained on code",
    396       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    397       "year": 2021,
    398       "arxiv_id": "2107.03374",
    399       "relevance": "Introduced HumanEval benchmark for code generation evaluation; foundational work for code LLM assessment."
    400     },
    401     {
    402       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    403       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    404       "year": 2020,
    405       "relevance": "Pre-trained model for code understanding used as one of the main detection models in this work."
    406     },
    407     {
    408       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    409       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    410       "year": 2024,
    411       "arxiv_id": "2403.07974",
    412       "relevance": "Contamination-free evaluation benchmark for code LLMs, relevant to benchmark contamination concerns in code generation evaluation."
    413     }
    414   ]
    415 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs