scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30186B)
      1 {
      2   "paper": {
      3     "title": "Importance Sampling is All You Need: Predict LLM's performance on new benchmark by reusing existing benchmark",
      4     "authors": [
      5       "Junjie Shi",
      6       "Wei Ma",
      7       "Shi Ying",
      8       "Lingxiao Jiang",
      9       "Yang Liu",
     10       "Bo Du"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2508.01203",
     15     "doi": "10.48550/arXiv.2508.01203"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "theoretical"],
     20   "key_findings": "BIS proposes a prompt-centric framework using importance sampling and IWAE to predict LLM performance on code generation benchmarks without ground-truth execution, achieving ~1.1% average absolute error on CodeBLEU predictions across CodeLlama models (7B–70B). The method outperforms alternative distribution modeling approaches (GMM, RBM, MaxEnt, VAE) and regression baselines (MLP, RNN, LR, etc.) in cross-benchmark prediction between BigCodeBench and EvoEval. Semantic-level metrics (security score, cyclomatic complexity) are more accurately predicted than code-level metrics (Halstead complexity), with errors of 4.3% vs 10.7% at worst.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available benchmarks: BigCodeBench, HumanEval, and EvoEval (7 sub-benchmarks). All are standard public benchmarks that are freely accessible."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions renting '8 server with L20 GPUs' (Section 4.1) but provides no requirements.txt, Dockerfile, library versions, or environment setup details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results tables (Tables 3–13) report only point estimates of error. No confidence intervals, error bars, or ± notation appear anywhere."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims 'our approach achieves the lowest prediction error among all compared schemes' (Section 4.4) based solely on comparing raw numbers across Tables 7–8. No p-values, t-tests, or any statistical significance tests are used."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Error rates are reported with full baseline context. For example, BIS achieves avg absolute error 0.011 vs GMM(80) at 0.290 and RNN at 0.298 (Tables 7–8), providing clear magnitude comparisons."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper uses ~8,528 data points across 4 models and 9 benchmarks but provides no justification for why this is sufficient, no power analysis, and no discussion of statistical adequacy."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No standard deviation, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Extensive baselines are compared: distribution modeling methods (GMM, RBM, MaxEnt, VAE) in Table 7, and regression methods (RSR, LR, DTR, RR, MLP, RNN) in Table 8."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include VAE (a direct contemporary alternative) and standard ML/DL approaches. For the specific task of prompt distribution modeling, these represent reasonable and appropriate comparisons."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section 4.5 (RQ4) presents a thorough ablation studying four factors: feature dimensionality (Tables 9–10), IWAE sample count (Table 11), weight truncation percentile (Table 12), and prompt set size (Table 13)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics evaluated: CodeBLEU, pass@1, security score, cyclomatic complexity, and four Halstead metrics (length, volume, effort, time) in Table 6."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation of any kind. All evaluation is automated comparison of predicted vs actual metric values."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The framework uses cross-prediction between entirely separate benchmark suites: BigCode predicts Evo and vice versa (Section 4.1). Source and target are disjoint datasets."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by model size (7B, 13B, 34B, 70B), by source/target direction, and by metric type (semantic vs code-level) across all tables."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The paper shows general degradation trends (e.g., small prompt sets, extreme truncation) but does not analyze specific failure cases or show qualitative examples where the framework produced particularly poor predictions."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative results reported: code-level metrics perform much worse than semantic metrics (Table 6, up to 10.7% error), linear layer dimensionality reduction severely degrades performance (Table 10, errors exceeding 0.4), and small prompt sets cause large errors (Table 13)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract's claim of '1.1% average absolute prediction error' is supported by Table 3 (0.8% and 1.4% average absolute errors); the '0.3% and 1.9%' best/worst errors are visible in Table 3; and '2.15% for pass@1' matches Table 4 (2.2% and 2.1%)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The core causal claim—that prompt distributions determine LLM performance—is supported by a mathematical proof (Theorem 3.1, Section 3.3). Ablation studies in Section 4.5 use controlled single-variable manipulation to support component-level causal claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Importance Sampling is All You Need' and abstract claims of 'broad applicability' are unbounded. The paper tests only CodeLlama models (one family) on code generation tasks in Python. Section 5 acknowledges cross-scenario and cross-language as future work but the title and abstract do not reflect these scope limitations."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not consider alternative explanations for its results. For example, the low prediction errors could be due to high distributional similarity between BigCodeBench and EvoEval (both Python code generation) rather than the method's generalizability. No robustness checks against confounds are performed."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "CodeBLEU measures syntactic/structural similarity to reference solutions, not actual code correctness. The paper uses CodeBLEU as its primary 'correctness metric' (Section 4.2: 'To represent correctness of code generated by LLM, we employ the CodeBLEU score') without discussing the gap between reference similarity and actual functional correctness. Pass@1 (which measures execution correctness) is only tested on a single model."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model versions are stated: 'CodeLlama-7B, CodeLlama-13B, CodeLlama-34B, and CodeLlama-70B' (Section 4.1), with citation to the Code Llama paper [27]. These are unique, identifiable model releases."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The evaluation prompts come from publicly available benchmarks (BigCodeBench, HumanEval, EvoEval) which provide their prompts openly. The paper's method operates on these benchmark prompts."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "IWAE hyperparameters are studied in the ablation (K=10 samples, 0.9 truncation percentile, 768-dim embeddings). However, LLM generation hyperparameters (temperature, top-p, max tokens) for the CodeLlama runs are never reported."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The framework consists of BERT embeddings, IWAE models, and importance weight computation—no agent loops or tool use."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4.1 documents preprocessing: merging HumanEval with EvoEval into 'Evo' dataset (992 samples), BigCodeBench as standalone (1,140 samples), min-max normalization to [0,1], BERT CLS token extraction for embeddings."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 5 'Discussion' provides substantive discussion of limitations including cross-scenario/cross-language applicability, Assumption 1 violations, and weight instability issues."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 5 and Section 3.3.1 discuss specific threats: violation of Assumption 1 when source and target distributions don't overlap, extreme weight distributions dominating predictions, and increased weight variance in cross-domain settings."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what its results do NOT show. Section 5 frames scope limitations as 'future work' directions rather than explicit boundaries. There is no statement such as 'our results do not generalize to non-code tasks' or 'results are limited to the CodeLlama family.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data (model outputs, computed scores, embeddings, importance weights) is released for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4.1 describes: 8,528 data points collected across 4 CodeLlama models and 9 benchmarks, using 8 servers with L20 GPUs. Table 2 provides per-benchmark data point counts."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data comes from running CodeLlama models on standard public benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The pipeline from model outputs to final predictions is described at a high level (Figure 1), but critical details are missing: how many generations per prompt, inference settings for CodeLlama, exact IWAE training procedure, and how CodeBLEU reference solutions were obtained."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Section 7: 'This research is supported by the Ministry of Education, Singapore under its Academic Research Fund Tier 3 (Award ID: MOET32020-0004).'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations clearly listed: Nanyang Technological University, Singapore Management University, and Wuhan University. No affiliation with Meta (CodeLlama developer) or benchmark creators."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The Ministry of Education, Singapore has no financial interest in the outcome of CodeLlama benchmarking research."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement appears anywhere in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No explicit training data cutoff date for CodeLlama is stated. The paper says 'we employed open-source models released before the publication of these benchmarks' (Section 4.1) but does not specify when training data collection ended."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper claims models were 'released before the publication of these benchmarks' but this is inaccurate for HumanEval (published 2021, CodeLlama released 2023). The paper does not analyze whether CodeLlama's training data included HumanEval problems."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "HumanEval (2021) was publicly available well before CodeLlama's training. The paper's claim that models predate benchmarks is incorrect for HumanEval. No contamination analysis is performed despite this being a core motivation of the paper."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Section 4.1: 'we rented 8 server with L20 GPUs with the bill 280$.' This states the total compute cost for the experiments."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Section 4.1 states: '8 server with L20 GPUs with the bill 280$,' providing the total hardware and cost budget."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No seed sensitivity analysis for IWAE training or model inference. For pass@1, CodeLlama-7B was run 10 times but this is inherent to the metric definition, not a sensitivity analysis."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not stated for the main experiments (CodeBLEU predictions). Only pass@1 states 'ten times' (Section 4.2), but this is metric-specific, not a replication count."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The ablation study (Section 4.5) explores different hyperparameter settings but does not report the total search budget or how many configurations were evaluated before selecting the final settings."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Section 4.5 justifies configuration selection: K=10 IWAE samples selected based on Table 11 (avoiding both too-few and too-many extremes), truncation at 0.9 percentile based on Table 12, full 768-dim embeddings based on Tables 9–10."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparisons across 8+ baselines, 4 models, and 2 dataset directions without any correction for multiple comparisons."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement all baseline methods themselves but do not acknowledge the risk that author-implemented baselines may systematically underperform."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Different methods (IWAE vs GMM vs RNN vs MLP) have different compute requirements, but performance is never reported as a function of compute. The comparison is accuracy-only with no compute normalization."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "CodeBLEU is used as the primary 'correctness' metric but it measures reference similarity, not functional correctness. The paper does not discuss whether CodeBLEU validly measures code correctness, nor whether predicting CodeBLEU scores has practical value for benchmark evaluation."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved in the evaluation. Models generate code directly from prompts."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "HumanEval (published 2021) predates CodeLlama's training (2023). The paper claims models predate benchmarks but this is incorrect for HumanEval. Temporal leakage is not analyzed per-benchmark."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. For example, whether BERT embeddings of prompts could encode benchmark-specific signals that inflate prediction accuracy."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "EvoEval is explicitly derived from HumanEval (evolved versions of HumanEval problems). The paper merges them into one 'Evo' dataset and uses it for cross-prediction with BigCodeBench, but does not discuss whether this shared origin could inflate distributional similarity or affect results."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is applied despite the paper's stated motivation of addressing data contamination."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "BIS achieves an average absolute prediction error of 1.1% for CodeBLEU code correctness scores across 4 CodeLlama models.",
    372       "evidence": "Table 3 (Section 4.2) shows average absolute errors of 0.8% (BigCode→Evo) and 1.4% (Evo→BigCode), averaging approximately 1.1%.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "BIS generalizes to pass@1 with average absolute errors of 2.15%.",
    377       "evidence": "Table 4 (Section 4.2) shows errors of 2.2% and 2.1% for the two prediction directions, but only on CodeLlama-7B.",
    378       "supported": "weak"
    379     },
    380     {
    381       "claim": "BIS outperforms all baseline distribution modeling and regression methods.",
    382       "evidence": "Tables 7–8 (Sections 4.3–4.4) show BIS achieving 0.011 avg absolute error vs next best VAE (0.015), MaxEnt (0.017), and RBM (0.018). Non-IS baselines: LR/RR (0.017), RSR (0.031), MLP (0.105), RNN (0.298).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Under fixed model and evaluation metrics, prompt distribution uniquely determines expected model performance.",
    387       "evidence": "Theorem 3.1 (Section 3.3) provides a mathematical proof using importance sampling theory, with Assumption 1 (absolute continuity) as the key requirement.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Semantic-level metrics are more accurately predicted than code-level metrics.",
    392       "evidence": "Table 6 (Section 4.3) shows max absolute errors of 4.3% and 4.6% for security score and cyclomatic complexity, versus 10.7%, 8%, 5%, and 4.5% for Halstead metrics.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "BIS can reduce the cost and effort of benchmarking LLMs by eliminating need for ground-truth test suites.",
    397       "evidence": "The framework predicts performance from prompt distributions alone (Section 3.2). However, this is only validated on two benchmark groups using a single model family, and relies on having a fully annotated source benchmark.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Extremely narrow model diversity",
    404       "detail": "All 4 models are from the CodeLlama family, sharing a common lineage (acknowledged in Section 4.1: 'models within this family share a common lineage'). Results cannot demonstrate generalization across architecturally diverse models. No instruction-tuned, chat, or closed-source models are tested."
    405     },
    406     {
    407       "flag": "No code or data released",
    408       "detail": "Despite proposing a novel framework, no source code, trained models, or raw experimental data is released. The results cannot be independently verified or reproduced."
    409     },
    410     {
    411       "flag": "No statistical tests for comparative claims",
    412       "detail": "Claims of outperforming baselines are based entirely on comparing point estimates without any significance tests, confidence intervals, or variance measures. The margins between BIS (0.011) and VAE (0.015) or MaxEnt (0.017) are small and could be within noise."
    413     },
    414     {
    415       "flag": "Contamination contradiction",
    416       "detail": "The paper motivates itself partly by addressing data contamination risk, yet does not analyze contamination in its own experiments. HumanEval (2021) was public before CodeLlama's training, creating a contamination risk that goes unaddressed."
    417     },
    418     {
    419       "flag": "CodeBLEU as 'correctness' proxy",
    420       "detail": "The primary metric CodeBLEU measures syntactic/structural similarity to reference solutions, not functional correctness. The paper calls it a 'correctness metric' (Section 4.2) without discussing this validity gap. Pass@1 (actual execution correctness) is only tested on a single model."
    421     },
    422     {
    423       "flag": "Non-independence: EvoEval is derived from HumanEval",
    424       "detail": "EvoEval benchmarks are evolved from HumanEval problems (ref [36]). Merging them into one 'Evo' dataset and cross-predicting with BigCodeBench may benefit from shared distributional characteristics that wouldn't exist with truly independent benchmarks."
    425     },
    426     {
    427       "flag": "Overclaiming title and abstract",
    428       "detail": "The title 'Importance Sampling is All You Need' and abstract claims of 'broad applicability' significantly overstate the scope of experiments limited to one model family on Python code generation tasks."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Evaluating large language models trained on code",
    434       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    435       "year": 2021,
    436       "arxiv_id": "2107.03374",
    437       "relevance": "Introduces HumanEval, one of the most widely used LLM code generation benchmarks."
    438     },
    439     {
    440       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    441       "authors": ["Terry Yue Zhuo", "Minh Chien Vu"],
    442       "year": 2024,
    443       "arxiv_id": "2406.15877",
    444       "relevance": "Major code generation benchmark requiring complex function calls, used as primary evaluation dataset in this paper."
    445     },
    446     {
    447       "title": "Code Llama: Open foundation models for code",
    448       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    449       "year": 2023,
    450       "arxiv_id": "2308.12950",
    451       "relevance": "Open-source code LLM family used as the sole evaluation models in this paper."
    452     },
    453     {
    454       "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM",
    455       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Lingming Zhang"],
    456       "year": 2024,
    457       "arxiv_id": "2403.19114",
    458       "relevance": "Creates evolved benchmarks from HumanEval to test robustness of code generation evaluation."
    459     },
    460     {
    461       "title": "SWE-Dev: Evaluating and Training Autonomous Feature-Driven Software Development",
    462       "authors": ["Yaxin Du", "Yuzhu Cai"],
    463       "year": 2025,
    464       "arxiv_id": "2505.16975",
    465       "relevance": "Code generation benchmark for realistic software engineering settings."
    466     },
    467     {
    468       "title": "Swe-bench: Can language models resolve real-world github issues?",
    469       "authors": ["Carlos E Jimenez", "John Yang"],
    470       "year": 2023,
    471       "arxiv_id": "2310.06770",
    472       "relevance": "Influential benchmark for evaluating LLMs on real-world software engineering tasks."
    473     },
    474     {
    475       "title": "A survey on evaluating large language models in code generation tasks",
    476       "authors": ["Liguo Chen", "Qi Guo"],
    477       "year": 2024,
    478       "arxiv_id": "2408.16498",
    479       "relevance": "Comprehensive survey of LLM code generation evaluation methodologies."
    480     },
    481     {
    482       "title": "TestGenEval: A real world unit test generation and test completion benchmark",
    483       "authors": ["Kush Jain", "Gabriel Synnaeve", "Baptiste Rozière"],
    484       "year": 2024,
    485       "arxiv_id": "2410.00752",
    486       "relevance": "Benchmark for automated test generation showing only 39.2% accuracy, motivating the need for cheaper evaluation methods."
    487     },
    488     {
    489       "title": "LessLeakBench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks",
    490       "authors": ["Xin Zhou", "Martin Weyssow"],
    491       "year": 2025,
    492       "arxiv_id": "2502.06215",
    493       "relevance": "Investigates data leakage across SE benchmarks, directly relevant to the contamination concerns this paper claims to address."
    494     },
    495     {
    496       "title": "Does Data Contamination Detection Work (Well) for LLMs? A Survey and Evaluation on Detection Assumptions",
    497       "authors": ["Yujuan Fu", "Ozlem Uzuner"],
    498       "year": 2024,
    499       "arxiv_id": "2410.18966",
    500       "relevance": "Surveys data contamination detection methods for LLMs, relevant to the contamination mitigation claims of BIS."
    501     },
    502     {
    503       "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis",
    504       "authors": ["Shuo Ren", "Daya Guo"],
    505       "year": 2020,
    506       "arxiv_id": "2009.10297",
    507       "relevance": "Primary evaluation metric used in this paper for measuring code generation quality."
    508     },
    509     {
    510       "title": "Task contamination: Language models may not be few-shot anymore",
    511       "authors": ["Changmao Li", "Jeffrey Flanigan"],
    512       "year": 2024,
    513       "relevance": "Demonstrates that contaminated benchmark samples can inflate LLM scores by up to 5x, supporting the motivation for contamination-resistant evaluation."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 1,
    519       "justification": "The framework could help benchmark developers screen candidate tasks, but requires training IWAE models and having a fully annotated source benchmark, limiting immediate practical use."
    520     },
    521     "surprise_contrarian": {
    522       "score": 1,
    523       "justification": "Predicting performance without execution is a novel framing, but the underlying idea that similar prompts yield similar performance is fairly intuitive."
    524     },
    525     "fear_safety": {
    526       "score": 0,
    527       "justification": "No safety or security concerns raised by this work."
    528     },
    529     "drama_conflict": {
    530       "score": 1,
    531       "justification": "Touches on the data contamination debate in benchmarks, which is a contentious topic, but does not make dramatic claims about specific benchmarks or models being compromised."
    532     },
    533     "demo_ability": {
    534       "score": 0,
    535       "justification": "No code, demo, or tool released."
    536     },
    537     "brand_recognition": {
    538       "score": 0,
    539       "justification": "Authors from NTU, SMU, and Wuhan University; not about well-known products or from prominent AI labs."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs