scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25679B)
      1 {
      2   "paper": {
      3     "title": "CodeJudgeBench: Benchmarking LLM-as-a-Judge for Coding Tasks",
      4     "authors": [
      5       "Hongchao Jiang",
      6       "Yiming Chen",
      7       "Yushi Cao",
      8       "Hung-yi Lee",
      9       "Robby T. Tan"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2507.10535"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "A GitHub repository is provided: https://github.com/hongcha0/CodeJudgeBench (Section 1, after the abstract)."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "A HuggingFace dataset is provided: https://huggingface.co/datasets/mattymchen/codejudgebench (Section 1, after the abstract)."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No environment setup instructions, requirements.txt, Dockerfile, or dependency specifications are mentioned in the paper."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided in the paper. While code and data links are given, no README, reproduction guide, or scripts for replicating the main experiments are described."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "All results are reported as point estimates (e.g., accuracy percentages in Tables 3, 4, 5, 7) with no confidence intervals or error bars."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes multiple comparative claims (e.g., 'thinking models significantly outperform non-thinking models') but uses no statistical significance tests. Comparisons rest solely on raw accuracy numbers."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No formal effect sizes (e.g., Cohen's d, odds ratios) are reported. While raw accuracy differences are visible in tables, there is no contextual framing of effect magnitudes beyond comparing numbers."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The benchmark contains 5,352 pairs (Table 1) but no justification is given for why this size is sufficient for the claims being made. No power analysis or sample size rationale is discussed."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Each model appears to have been evaluated once per configuration with single-run point estimates."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper evaluates 26 LLM-as-a-Judge models including both specialized judge models (AceCodeRM, Prometheus, Self-Taught, Skywork-Critic) and general-purpose models, providing baseline comparisons (Table 2, Table 3)."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines include very recent models: Claude 4, Gemini 2.5, Qwen3, DeepSeek R1 (2025), RM-R1 (2025), and other current state-of-the-art models."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "RQ3 provides ablation-style analysis: point-wise vs. pair-wise evaluation (Table 4), candidate pre-processing strategies (Table 5: raw response vs. full code vs. no comments), and response ordering effects (Section 5.2)."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper uses only accuracy as its evaluation metric throughout all experiments. No additional metrics (e.g., F1, AUC, Cohen's kappa for agreement) are reported."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No human evaluation of the system outputs is included. All evaluation is automated via unit test pass/fail verification. Given that the paper studies LLM judgment quality, human agreement with LLM judges would have been informative."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "There is no explicit separation of development and test splits. All 5,352 pairs are used for evaluation with no held-out set mentioned. The difficulty categorization (easy/medium/hard) is determined post-hoc based on model performance."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are broken down by task (CodeGen, CodeRepair, TestGen), by difficulty level (easy, medium, hard), by response source model, and by response ordering (position A vs. B). See Tables 3, 5, and Figures 4-5."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 5.2 discusses failure modes including position bias and model-specific variance. The paper shows models that perform poorly and discusses why (e.g., RM-R1 recency bias, non-thinking models approaching random baseline)."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Several negative results are reported: fine-tuning thinking models for LLM-as-a-Judge does not improve performance (Section 5.1), point-wise evaluation significantly underperforms pair-wise (Section 5.3, Table 4), removing comments hurts performance (Table 5)."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims that thinking models outperform non-thinking models (supported by Table 3), small thinking models outperform large judge-tuned models (Qwen3-8B vs. Prometheus-14B/Self-Taught-70B in Table 3), position sensitivity exists (supported by Figure 4), and pair-wise outperforms point-wise (supported by Table 4). All claims are supported by results."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal-sounding claims such as 'thinking models allocate more tokens for code analysis, which enhances their ability to understand and accurately judge code responses' (Section 5.1) and attributes fine-tuning failures to 'insufficient code-related training data.' These causal explanations are speculative without controlled experiments isolating these factors."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper title claims to benchmark 'LLM-as-a-Judge for Coding Tasks' broadly, but results are limited to competitive programming problems from LiveCodeBench (LeetCode, AtCoder, CodeForces). Real-world software engineering tasks, different programming languages beyond Python, and non-algorithmic coding are not tested. The scope is not explicitly bounded to competitive programming."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No substantive discussion of alternative explanations. For example, the paper does not consider whether thinking models' advantage comes from longer outputs rather than reasoning quality, whether position bias results from training data artifacts, or whether the benchmark's difficulty is an artifact of the response generation process."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Most models lack specific version identifiers. The paper uses 'Claude-3.7-Sonnet', 'Claude-4-Sonnet', 'Gemini-2.5-Pro', 'Gemini-2.5-Flash' without API snapshot identifiers. The references list access dates (e.g., 'Claude 3.7' accessed 2025-5-15 and 'Claude 4' accessed 2025-5-25), but no API snapshot identifiers are provided. Marketing names without snapshot dates do not count per the schema."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Section 4.3 states 'we use the pair-wise prompt from [39]' and 'we follow the prompts and sampling parameters provided in their official implementations' for judge-tuned LLMs, but the actual prompt text is not provided in the paper or appendix. The reader would need to chase multiple external references to reconstruct prompts."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the evaluated models. Section 4.3 mentions following 'sampling parameters provided in their official implementations' for judge-tuned models but does not state what these are. General LLM settings are not specified."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. The paper evaluates LLMs as judges via direct prompting without multi-step workflows, tool use, or agent loops."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Sections 3.2-3.4 describe the data construction pipeline in detail: response collection from specific models, verification via unit tests (pass all = good, fail any = bad), and pairing strategy (random selection of one good and one bad). Filtering criteria (discard problems with all-correct or all-incorrect responses) are stated. Table 1 provides counts."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations section. The conclusion briefly mentions 'Future work will focus on expanding CodeJudgeBench' but does not discuss limitations of the current work."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats to validity are discussed. There is no discussion of potential issues such as benchmark construction bias, the narrow scope of competitive programming problems, or the limitation of binary correctness as ground truth."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that results are limited to competitive programming, to Python, to execution-free judging, or that the benchmark may not represent real-world coding evaluation scenarios."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The dataset is released on HuggingFace (https://huggingface.co/datasets/mattymchen/codejudgebench), allowing independent verification of the benchmark data."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Sections 3.1-3.5 describe the data collection process: problems sourced from LiveCodeBench-v6 (1,055 problems from May 2023 to April 2025), responses generated by specific LLMs (Claude-3.7-Sonnet, Gemini-2.5-Flash, Gemini-2.5-Pro, etc.), verification via unit tests."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The data source is a standard benchmark (LiveCodeBench) with LLM-generated responses."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The full pipeline is documented in Sections 3.2-3.4 and Figure 3: response collection from LLMs, verification via unit tests, and pairing of good/bad responses. Table 1 provides counts at each stage per source model and task."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding information or acknowledgments section is present in the paper."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly stated: ASUS Intelligent Cloud Services (AICS) and National Taiwan University. However, ASUS is a technology company, and no conflict-of-interest statement addresses this."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so independence cannot be assessed. The primary affiliation is ASUS Intelligent Cloud Services, a commercial entity that could have interest in LLM evaluation outcomes."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement or financial disclosures are provided in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper evaluates 26 LLMs as judges on benchmark data but does not state training data cutoff dates for any of the models. LiveCodeBench-v6 uses problems from May 2023 to April 2025, but it is unclear which models may have been trained on earlier portions of this data."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Section 3.1 states they use LiveCodeBench which 'mitigates data contamination by continually collecting new problems from platforms such as LeetCode, AtCoder, and CodeForces.' This acknowledges the contamination concern and relies on LiveCodeBench's temporal design, though no direct analysis of overlap is performed."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "The paper explicitly chooses LiveCodeBench to mitigate contamination risk (Section 3.1): it uses problems published between May 2023 and April 2025, and the benchmark continually collects new problems to stay ahead of model training cutoffs. This is a reasonable mitigation strategy, though individual model cutoffs are not verified."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No inference costs, API costs, or latency figures are reported despite evaluating 26 models across 5,352+ data points with multiple configurations. The cost of running the benchmark is not discussed."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget, GPU hours, API spend, or hardware specifications are stated."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "Thinking models significantly outperform non-thinking models on coding judgment tasks.",
    292       "evidence": "Table 3 shows thinking models (Claude-4-Sonnet 79.93%, Gemini-2.5-Pro 82.12%, QwQ 76.40%) substantially outperform non-thinking models (Claude-3.5-Sonnet 59.12%, Gemini-2.0-Flash 54.80%) on average accuracy across all tasks.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Small thinking models (e.g., Qwen3-8B) can outperform specially trained LLM-as-a-Judge models up to 70B in size.",
    297       "evidence": "Table 3 shows Qwen3-8B (65.60% avg) outperforms Prometheus-14B (57.04%), Self-Taught-70B (56.36%), and Skywork-Critic-70B (55.98%).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Fine-tuning thinking models specifically for LLM-as-a-Judge tasks does not yield improved performance.",
    302       "evidence": "Table 3 shows RM-R1-32B (62.60%) underperforms general-purpose Qwen3-32B (72.42%) and QwQ (76.40%), despite RM-R1 being specifically trained for judge tasks (Section 5.1).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Response ordering significantly impacts LLM-as-a-Judge accuracy, with discrepancies reaching up to 14%.",
    307       "evidence": "Figure 4 shows accuracy differences between correct-at-position-A vs. correct-at-position-B configurations, with some models showing substantial gaps (e.g., RM-R1-32B, Claude-3.7) across all three tasks.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Pair-wise evaluation outperforms point-wise evaluation for coding judgment tasks.",
    312       "evidence": "Table 4 shows point-wise evaluation produces ~50% ties for most models on CodeGen, significantly reducing effective accuracy compared to pair-wise results in Table 3 (Section 5.3).",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Providing the full raw model response (including reasoning/comments) leads to better judge performance than code-only input.",
    317       "evidence": "Table 5 shows 'raw response' (RR) averaging 71.43% vs. 'full code' (FC) at 70.31% vs. 'no comments' (NC) at 69.27% overall. The difference is modest but consistent.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "CodeJudgeBench is more challenging than existing benchmarks like JudgeBench and RM-Bench.",
    322       "evidence": "Figure 1 shows frontier models achieve 93-98% on JudgeBench and 78-87% on RM-Bench code splits, but only 66-82% on CodeJudgeBench CodeGen task.",
    323       "supported": "strong"
    324     }
    325   ],
    326   "methodology_tags": [
    327     "benchmark-eval"
    328   ],
    329   "key_findings": "CodeJudgeBench evaluates 26 LLM-as-a-Judge models across code generation, code repair, and unit test generation using 5,352 curated pairs from LiveCodeBench. Thinking models (e.g., Claude-4-Sonnet at 79.93%, Gemini-2.5-Pro at 82.12%) dramatically outperform non-thinking models (most below 60% accuracy), and even small 8B thinking models surpass large 70B judge-specialized models. All models exhibit significant position bias in pairwise evaluation, with accuracy swings up to 14% when response order is swapped. Pair-wise evaluation substantially outperforms point-wise evaluation due to frequent ties in scalar scoring.",
    330   "red_flags": [
    331     {
    332       "flag": "No statistical rigor",
    333       "detail": "All comparisons are based on single-run point estimates with no confidence intervals, significance tests, or variance reporting. Claims of 'significant' differences are based solely on comparing raw accuracy numbers, making it impossible to distinguish genuine performance differences from noise."
    334     },
    335     {
    336       "flag": "Single metric (accuracy only)",
    337       "detail": "The entire evaluation uses only accuracy. No additional metrics (e.g., agreement with human judges, Cohen's kappa, calibration measures) are reported, limiting understanding of judge quality beyond binary correctness."
    338     },
    339     {
    340       "flag": "No limitations section",
    341       "detail": "The paper has no dedicated limitations or threats-to-validity discussion, which is a significant omission for a benchmark paper. There is no acknowledgment that competitive programming problems may not represent real-world coding tasks."
    342     },
    343     {
    344       "flag": "Overly broad claims from narrow scope",
    345       "detail": "The paper claims to benchmark 'LLM-as-a-Judge for Coding Tasks' broadly, but all problems come from competitive programming (LiveCodeBench). Real-world software engineering, multi-file projects, different programming languages, and non-algorithmic coding are not represented."
    346     },
    347     {
    348       "flag": "Prompts and hyperparameters not provided",
    349       "detail": "The actual prompts used for evaluation are not included in the paper, and no temperature/sampling parameters are reported. Readers cannot fully reproduce the experiments without chasing multiple external references."
    350     },
    351     {
    352       "flag": "Corporate affiliation without conflict disclosure",
    353       "detail": "The primary affiliation is ASUS Intelligent Cloud Services (a commercial entity), yet no funding or competing interests statement is provided."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    359       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    360       "year": 2023,
    361       "relevance": "Foundational work on LLM-as-a-judge paradigm demonstrating high correlation between GPT-4 judgments and human evaluations."
    362     },
    363     {
    364       "title": "JudgeBench: A Benchmark for Evaluating LLM-based Judges",
    365       "authors": ["Sijun Tan", "Siyuan Zhuang", "Kyle Montgomery"],
    366       "year": 2025,
    367       "relevance": "Prior LLM-as-a-Judge benchmark that CodeJudgeBench directly compares against and claims to surpass in difficulty."
    368     },
    369     {
    370       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    371       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    372       "year": 2025,
    373       "relevance": "Source of coding problems used to construct CodeJudgeBench; uses temporal approach to mitigate data contamination."
    374     },
    375     {
    376       "title": "RM-Bench: Benchmarking Reward Models of Language Models with Subtlety and Style",
    377       "authors": ["Yantao Liu", "Zijun Yao", "Rui Min"],
    378       "year": 2025,
    379       "relevance": "Reward model benchmark that includes a coding split; CodeJudgeBench shows models score much lower on its tasks."
    380     },
    381     {
    382       "title": "Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models",
    383       "authors": ["Seungone Kim", "Juyoung Suk", "Shayne Longpre"],
    384       "year": 2024,
    385       "relevance": "Open-source LLM judge evaluated in CodeJudgeBench; demonstrates that specialized judge training does not transfer well to coding tasks."
    386     },
    387     {
    388       "title": "Self-Taught Evaluators",
    389       "authors": ["Tianlu Wang", "Ilia Kulikov", "Olga Golovneva"],
    390       "year": 2024,
    391       "arxiv_id": "2408.02666",
    392       "relevance": "LLM-as-a-Judge trained iteratively on synthetic data; evaluated in CodeJudgeBench where it underperforms smaller thinking models."
    393     },
    394     {
    395       "title": "AceCoder: Acing Coder RL via Automated Test-Case Synthesis",
    396       "authors": ["Huaye Zeng", "Dongfu Jiang", "Haozhe Wang"],
    397       "year": 2025,
    398       "relevance": "Develops AceCodeRM, a specialized code judge model evaluated in CodeJudgeBench that underperforms general thinking models."
    399     },
    400     {
    401       "title": "RM-R1: Reward Modeling as Reasoning",
    402       "authors": ["Xiusi Chen", "Gaotang Li", "Ziqi Wang"],
    403       "year": 2025,
    404       "arxiv_id": "2505.02387",
    405       "relevance": "Thinking LLM-as-a-Judge model using chain-of-rubrics; evaluated extensively in CodeJudgeBench showing mixed results."
    406     },
    407     {
    408       "title": "From Code to Courtroom: LLMs as the New Software Judges",
    409       "authors": ["Junda He", "Jieke Shi", "Terry Yue Zhuo"],
    410       "year": 2025,
    411       "relevance": "Survey of LLM-as-a-Judge for coding tasks, providing taxonomy of judging criteria including code functionality and code quality."
    412     },
    413     {
    414       "title": "A Survey on LLM-as-a-Judge",
    415       "authors": ["Jiawei Gu", "Xuhui Jiang", "Zhichao Shi"],
    416       "year": 2024,
    417       "arxiv_id": "2411.15594",
    418       "relevance": "Comprehensive survey of the LLM-as-a-Judge paradigm that contextualizes the need for domain-specific benchmarks like CodeJudgeBench."
    419     },
    420     {
    421       "title": "CodeMonkeys: Scaling Test-Time Compute for Software Engineering",
    422       "authors": ["Ryan Ehrlich", "Bradley Brown", "Jordan Juravsky"],
    423       "year": 2025,
    424       "relevance": "Demonstrates inference-time scaling for code, related to CodeJudgeBench's Best-of-N evaluation using judge models."
    425     },
    426     {
    427       "title": "Scoring Verifiers: Evaluating Synthetic Verification for Code and Reasoning",
    428       "authors": ["Aleksander Ficek", "Somshubra Majumdar"],
    429       "year": 2025,
    430       "relevance": "Code verification benchmark using ranking evaluation; one of the coding judge benchmarks that CodeJudgeBench extends."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs