scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25954B)
      1 {
      2   "paper": {
      3     "title": "Sherlock: Reliable and Efficient Agentic Workflow Execution",
      4     "authors": ["Yeonju Ro", "Haoran Qiu", "Íñigo Goiri", "Rodrigo Fonseca", "Ricardo Bianchini", "Aditya Akella", "Zhangyang Wang", "Mattan Erez", "Esha Choukse"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2511.00330",
      8     "doi": "10.48550/arXiv.2511.00330"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No source code repository URL, GitHub link, or archive is provided anywhere in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available benchmarks: CoTCollection, OMEGA, LiveCodeBench, HumanEval, MBPP, HotpotQA, DROP, GSM8k, MATH-500, GTA. All are referenced with citations or HuggingFace links."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Hardware is specified (8×A100 80GB, vLLM serving) and model IDs are given (Table 2), but no requirements.txt, Dockerfile, or library version list is provided."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the methodology but does not provide commands or procedures to replicate results."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Main results in Figures 12-14 and Table 3 report point estimates only. No confidence intervals, error bars, or ± notation on the key accuracy, latency, or cost numbers."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims Sherlock 'consistently achieves higher accuracy' and 'delivers an 18.3% accuracy gain' but no statistical significance tests are reported for any comparison."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are given with baseline context: '18.3% accuracy gain on average', 'up to 48.7% execution time reduction', '26.0% cost reduction'. Table 3 provides detailed latency reduction percentages per benchmark and percentile."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification for the number of benchmark examples used. The paper mentions '100+ different graphs and 15K+ execution traces' for fault injection but does not justify these quantities."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be single-run numbers."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines are compared: non-verifying baseline, random and even verifier placement, static single-verifier configurations, AFlow (Monte Carlo search), tabular approach, and an oracle (§8)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "AFlow (Zhang et al., 2024a) and Flow planner (Niu et al., 2025) are contemporary. The verifier methods (Self-Refine, Debate, LLM-as-a-Judge) are all from 2023-2024."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Each component is evaluated in isolation: verifier placement (§8.2, Figure 12), verifier selection (§8.3, Figures 13-14), and speculative execution (§8.4, Figure 15, Table 3)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Three metric dimensions are used: accuracy (percentage points, pass@1), latency (Texec and Tvrf at multiple percentiles), and cost (normalized cost per 1K problems)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All evaluation is automated using benchmark metrics. No human evaluation of workflow output quality is included, though it could be relevant for instruction-following tasks."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Standard benchmark test sets are used (CoTCollection, OMEGA, LiveCodeBench). The verifier selector is trained on microbenchmark data and evaluated on the agentic benchmarks (§8.3)."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by task category (instruction, code, math, tool) in Figures 3-5, and by benchmark (CoTCollection, LiveCodeBench, OMEGA) in Figures 12, 14, and Table 3."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses where lightweight similarity metrics fail (code and math tasks, Table 5, AUC ≈ 0.5), cases where verification reduces accuracy (Figure 3), and rollback scenarios (§7.2)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative findings are reported: verification can reduce accuracy in some cases (Figure 3, e.g., Self-Refine on Code), similarity metrics fail for code/math tasks (Table 5), and fan-out shows no correlation with vulnerability (§5.3)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of '18.3% accuracy gain' (supported by Figure 14 comparisons), 'up to 48.7% execution time reduction' (Table 3, LiveCodeBench Tvrf mean), and '26.0% cost reduction compared to Monte Carlo search' (Figure 14 vs AFlow) are all supported by the experimental results."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper uses controlled ablation experiments isolating each component (placement in §8.2, selection in §8.3, speculation in §8.4). Each comparison changes one variable while holding others constant, which is adequate for the causal claims made."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title claims 'Reliable and Efficient Agentic Workflow Execution' broadly, but results are on three specific benchmarks with one base executor model (Llama-3.1-8B-Instruct). The paper notes domain on-boarding is needed but does not explicitly bound generalizations to the tested settings."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No discussion of alternative explanations for the results. For example, could the gains come from the specific model pairing rather than the framework? Could the topological policy be an artifact of the workflow generator used?"
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures accuracy (benchmark-specific metrics), latency (wall-clock time), and cost (dollar amounts via cost model). Claims match measurement granularity — e.g., 'accuracy gain on CoTCollection' rather than broader 'reliability' claims."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Exact HuggingFace model IDs are provided: 'meta-llama/Llama-3.1-8B-Instruct', 'meta-llama/Llama-3.3-70B-Instruct', 'Qwen/Qwen2.5-7B-Instruct', 'AtlaAI/Selene-1-Mini-Llama-3.1-8B' (Table 2, §8.1)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt texts are provided in Appendix D for the scorer (D.1), judge (D.2), majority vote (D.3), rollback (D.4), self-refine (D.5), debate rounds (D.6-D.7), and LLM planner (D.8)."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Temperature is set to 0 for fault injection (§5.2), and λ is mentioned as a tunable parameter. However, GRPO training hyperparameters (learning rate, epochs, batch size), vLLM serving parameters, and sampling settings for the main executor are not reported."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The agentic workflow structure is described in detail: workflow generation via Flow planner (§2.1), verifier placement policy (§5.4, Algorithm 1), verifier selector architecture (§6), and speculative execution runtime with state machine (§7, Appendix B)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The workflow generation process is documented (§8.1, Flow planner with customized prompts in Appendix D.8), fault injection methodology is detailed (§5.1-5.2, Table 1), and benchmark descriptions are provided (§8.1)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations section. The paper goes directly from Evaluation (§8) to Related Work (§9) to Conclusion (§10) without discussing limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed anywhere in the paper."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings, model types, or workflow structures the results do NOT apply to."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw execution traces, fault injection results, or training data for the verifier selector are released."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Fault injection methodology is described in detail (§5.1-5.2): failure modes from Cemri et al. (2025), fault injection procedure, '100+ different graphs and 15K+ execution traces' generated from benchmarks."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. All experiments use standard public benchmarks."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from benchmark tasks → workflow generation (Flow planner) → fault injection → vulnerability estimation → verifier selector training is documented across §5-6. Table 1 shows failure mode frequencies sourced from prior empirical work."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding acknowledgment section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Microsoft Azure Research, Microsoft Azure, and The University of Texas at Austin. The paper evaluates open-source models, not Microsoft products."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed. Given Microsoft Azure Research affiliation, corporate funding is implied but not stated. Microsoft has commercial interest in agentic workflow serving (Azure AI services), making funder independence unclear."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interest statement is included in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The training data cutoff for Llama-3.1-8B and other models used is not stated."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether benchmark examples (HumanEval published 2021, CoTCollection 2023) may overlap with training data of the models used."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "HumanEval (2021) and other benchmarks predate the models' training. LiveCodeBench is designed to be contamination-free, but the paper does not discuss contamination risk for any benchmark."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Inference cost is reported as normalized cost per 1K problems (Figure 13) and as average cost comparisons (Figure 14); the cost model is detailed in Appendix A.2, including the GPU cost formula and the $13.60/hour rate."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Hardware is specified: '8×NVIDIA A100 GPUs (80 GB each)' with GPU allocation per model (Table 2). Cost is $13.60/hour. 15K+ execution traces generated for fault injection analysis."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No results across multiple random seeds are reported. Temperature is fixed to 0 for fault injection (§5.2), but main results do not show seed sensitivity."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of runs for main evaluation results is not stated. '15K+ execution traces' is mentioned for fault injection analysis but not for the end-to-end evaluation."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget is reported for the verifier selector training (GRPO) or any other component."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The paper shows Pareto frontiers (Figure 13) and mentions λ as a tunable knob, but does not justify how specific configurations were selected for the end-to-end evaluation results."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors implement all baselines (random placement, even placement, tabular approach) and do not acknowledge potential bias from evaluating their own system against their own baseline implementations."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Figure 13 shows the accuracy-cost tradeoff explicitly. The paper systematically compares methods at various cost levels and shows Sherlock follows the Pareto frontier."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "No discussion of whether CoTCollection, OMEGA, and LiveCodeBench adequately measure 'reliable and efficient agentic workflow execution' as claimed. The paper does not question construct validity of any benchmark."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "All methods use the same Flow planner for workflow generation and the same base executor (Llama-3.1-8B-Instruct). Verifier placement is the only variable in §8.2, verifier selection in §8.3, and speculation in §8.4."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether Llama-3.1's training data includes solutions to benchmark problems created before its training cutoff."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information through context or prompts."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of train/test independence for the verifier selector (trained on microbenchmarks, tested on agentic benchmarks) or any other component."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection or prevention method is applied to any benchmark."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Sherlock delivers an 18.3% accuracy gain on average across benchmarks compared to the non-verifying baseline.",
    363       "evidence": "End-to-end results in Figure 14 comparing baseline, AFlow, tabular, and Sherlock accuracy across CoTCollection, LiveCodeBench, and OMEGA.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "Sherlock reduces workflow execution time by up to 48.7% over non-speculative execution.",
    368       "evidence": "Table 3 shows LiveCodeBench mean Tvrf reduction of 48.7%. CoTCollection Tvrf mean reduction is 21.9%, OMEGA is 30.6%.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Sherlock lowers verification cost by 26.0% compared to Monte Carlo search-based method (AFlow).",
    373       "evidence": "Figure 14 (right panel) shows normalized cost comparison between AFlow and Sherlock across three benchmarks.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Terminal nodes are most vulnerable to faults, followed by initial nodes, while intermediate nodes contribute least to end-to-end failure propagation.",
    378       "evidence": "Figure 8 (left) shows vulnerability estimates by node position across CoTCollection, HumanEval, and OMEGA benchmarks, with terminal > root > middle.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Fan-in degree shows strong positive correlation with node vulnerability.",
    383       "evidence": "Figure 8 (right) shows monotonically increasing vulnerability estimates (0.15 to 0.27 for CoTCollection, 0.14 to 0.21 for HumanEval) as fan-in increases from 1 to 5.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Sherlock's verifier selector follows the Pareto frontier and approaches oracle performance.",
    388       "evidence": "Figure 13 shows Sherlock points on the accuracy-cost Pareto frontier, close to the oracle.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "Sherlock is a principled framework for selective verification in agentic workflows: it uses counterfactual fault injection to identify error-prone nodes and a GRPO-trained selector to choose verifiers. Terminal and high-fan-in nodes are most vulnerable. Speculative execution reduces latency by up to 48.7% by overlapping verification with downstream computation, while the learned verifier selector achieves near-Pareto-optimal accuracy-cost tradeoffs across instruction-following, coding, math, and tool-use tasks.",
    394   "red_flags": [
    395     {
    396       "flag": "No limitations section",
    397       "detail": "The paper has no limitations, threats to validity, or scope boundary discussion despite making broad claims about 'reliable and efficient agentic workflow execution.' This is a significant methodological omission."
    398     },
    399     {
    400       "flag": "No variance or error bars on main results",
    401       "detail": "All key accuracy, latency, and cost results are reported as point estimates without any uncertainty quantification. LLM outputs are stochastic, so single-run results may not be stable."
    402     },
    403     {
    404       "flag": "Company evaluating related capability",
    405       "detail": "Multiple authors are from Microsoft Azure Research, which has commercial interest in agentic workflow serving infrastructure. No funding or competing interests are disclosed."
    406     },
    407     {
    408       "flag": "Single base model",
    409       "detail": "All experiments use Llama-3.1-8B-Instruct as the base executor. Generalization of the topology-based vulnerability policy and verifier selector to other models or model sizes is not tested."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Self-refine: Iterative refinement with self-feedback",
    415       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    416       "year": 2023,
    417       "relevance": "Key LLM verification method (self-reflection) that Sherlock builds upon and compares against."
    418     },
    419     {
    420       "title": "Improving factuality and reasoning in language models through multiagent debate",
    421       "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"],
    422       "year": 2023,
    423       "relevance": "Multi-agent debate verification method used as a baseline in Sherlock's verifier selection."
    424     },
    425     {
    426       "title": "Self-consistency improves chain of thought reasoning in language models",
    427       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    428       "year": 2023,
    429       "arxiv_id": "2203.11171",
    430       "relevance": "Self-consistency verification method evaluated and compared in Sherlock's framework."
    431     },
    432     {
    433       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    434       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    435       "year": 2023,
    436       "relevance": "LLM-as-a-Judge paradigm used as a verifier type in Sherlock."
    437     },
    438     {
    439       "title": "Why do multi-agent LLM systems fail?",
    440       "authors": ["Mert Cemri", "Mike Z. Pan", "Shijie Yang"],
    441       "year": 2025,
    442       "arxiv_id": "2503.13657",
    443       "relevance": "Provides the empirical failure mode distribution used to calibrate Sherlock's fault injection model."
    444     },
    445     {
    446       "title": "AFlow: Automating agentic workflow generation",
    447       "authors": ["Jiale Zhang", "Jiahui Xiang", "Zhiyuan Yu"],
    448       "year": 2024,
    449       "relevance": "Monte Carlo search-based workflow optimization baseline compared against Sherlock."
    450     },
    451     {
    452       "title": "Flow: Modularized agentic workflow automation",
    453       "authors": ["Boyu Niu", "Yifan Song", "Kaituo Lian"],
    454       "year": 2025,
    455       "relevance": "LLM planner used for workflow generation in Sherlock's evaluation."
    456     },
    457     {
    458       "title": "Speculative Actions: A lossless framework for faster agentic systems",
    459       "authors": ["Nan Ye", "Abhinav Ahuja", "Georgios Liargkovas"],
    460       "year": 2025,
    461       "arxiv_id": "2510.04371",
    462       "relevance": "Related speculative execution approach for agentic systems, complementary to Sherlock."
    463     },
    464     {
    465       "title": "Murakkab: Resource-efficient agentic workflow orchestration in cloud platforms",
    466       "authors": ["Ghufran I. Chaudhry", "Esha Choukse", "Haoran Qiu"],
    467       "year": 2025,
    468       "relevance": "Related work on agentic workflow serving infrastructure from overlapping author group."
    469     },
    470     {
    471       "title": "Circinus: Efficient query planner for compound ML serving",
    472       "authors": ["Banruo Liu", "Wei-Yu Lin", "Muhan Fang"],
    473       "year": 2025,
    474       "relevance": "SLO-aware query planner for compound AI workloads, complementary serving infrastructure."
    475     },
    476     {
    477       "title": "Evaluating large language models trained on code",
    478       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    479       "year": 2021,
    480       "arxiv_id": "2107.03374",
    481       "relevance": "HumanEval benchmark used in Sherlock's fault injection and evaluation experiments."
    482     },
    483     {
    484       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    485       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    486       "year": 2024,
    487       "arxiv_id": "2403.07974",
    488       "relevance": "Contamination-free code benchmark used for Sherlock's end-to-end evaluation."
    489     }
    490   ]
    491 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs