scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23204B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Benchmarking and Studying the LLM-based Code Review",
      6     "authors": [
      7       "Zhengran Zeng",
      8       "Ruikai Shi",
      9       "Keke Han",
     10       "Yixin Li",
     11       "Kaicheng Sun",
     12       "Yidong Wang",
     13       "Zhuohao Yu",
     14       "Rui Xie",
     15       "Wei Ye",
     16       "Shikun Zhang"
     17     ],
     18     "year": 2025,
     19     "venue": "arXiv",
     20     "arxiv_id": "2509.01494",
     21     "doi": null
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All four abstract claims are verified in the body: 1000 manually verified PRs confirmed in Section III, ~90% evaluator agreement confirmed in Figure 5a (89.2–94.9%), underperformance confirmed in Table III (best F1 19.38%), and 43.67% F1 boost from multi-review confirmed quantitatively in RQ3.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper claims reasoning-enhanced training is 'a crucial factor' for code review improvement, but the supporting evidence compares entirely different model versions (Qwen-Chat vs Qwen-R1) that differ on many dimensions beyond reasoning training. No controlled ablation isolates the reasoning component specifically.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Conclusions like 'current ACR approaches demonstrate limited performance' are stated broadly, but the benchmark covers only 12 Python open-source GitHub projects. Conclusions are not explicitly bounded to Python, open-source, or that specific project selection.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The finding that Claude-4 underperforms Claude-3.7 is only 'speculatively' attributed to training misalignment with no alternative explanations considered. The hypothesis that evolutionary changes are harder due to subjectivity is asserted without testing alternatives such as annotation bias.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper explicitly distinguishes between what is measured (semantic 'hit' of ground-truth change-points per Figure 4) and practical code review quality, validates the proxy via a human agreement study, and explicitly deprioritizes severity scoring because it is too subjective.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section V 'THREATS TO VALIDITY' covers internal, external, and construct validity threats in dedicated subsections, well beyond a single sentence in the conclusion.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Threats are stated at a generic level: 'potential bugs in implementation', 'quality of selected projects', 'LLMs and ACR tools chosen'. No specifics such as observed annotation disagreement rates, edge cases in the SZZ filtering, or known model failure modes are provided.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper does not explicitly state what conclusions should NOT be drawn. The external validity section notes that representative tools and popular projects were chosen but does not state that results do not generalize to compiled languages, proprietary codebases, or non-PR review workflows.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No funding acknowledgment appears anywhere in the paper. Author affiliations are disclosed but no funding sources are mentioned.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Author affiliations (Peking University and Northwestern Polytechnical University) are clearly disclosed on the title page.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No funding is disclosed, so independence of funders cannot be assessed.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests statement or financial disclosure appears anywhere in the paper.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Key terms are defined: 'Automated Code Review (ACR)' is introduced and contextualized, 'change-points' is explicitly defined in Section II-A as verifiable instances where reviewer suggestions led to subsequent code modifications, and 'Change-PRs' vs 'Clean-PRs' are defined in the construction pipeline.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Four numbered contributions are explicitly listed in the introduction: the SWR-Bench benchmark, the objective evaluation method, the systematic study, and the multi-review aggregation approach.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section II-C provides a detailed comparison with prior benchmarks (Tufano et al. series, CodeReviewer), specifically identifying their limitations in scope (method/hunk vs PR-level), context (missing full codebase), and metrics (BLEU, Exact-Match), then directly addressing each limitation in SWR-Bench's design.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "benchmark-creation": {
    125       "construct_design": {
    126         "construct_validity_argued": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The paper argues explicitly why PR-level review measures real-world ACR capability better than method/hunk-level evaluation, and why 'hit'-based LLM evaluation measures semantic identification of actual reviewer-confirmed change-points rather than textual similarity to reference comments.",
    130           "source": "haiku"
    131         },
    132         "difficulty_distribution_characterized": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper shows change-type distribution (Figure 3) and basic PR statistics (Table II), but does not characterize difficulty with easy/medium/hard tiers or any explicit difficulty measurement. The functional vs. evolutionary breakdown provides some variance information but not a difficulty characterization.",
    136           "source": "haiku"
    137         },
    138         "ceiling_floor_effects_checked": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No explicit ceiling/floor effect analysis is performed. While results cluster below 20% F1 (ruling out ceiling effects de facto), there is no analysis of whether the benchmark discriminates across the full range of model capabilities or whether any subset of instances is trivially easy or hard.",
    142           "source": "haiku"
    143         },
    144         "human_baseline_included": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Humans are used only to validate the evaluation methodology (Figure 5), not as a performance baseline on the benchmark task itself. No F1 score is reported for what trained human reviewers would achieve on SWR-Bench, making the 19.38% maximum score uninterpretable relative to human capability.",
    148           "source": "haiku"
    149         },
    150         "scoring_rubric_justified": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "The paper justifies using F1 based on TP/FP/FN with binary 'hit' as the primary metric, explicitly arguing against BLEU/Exact-Match (poor correlation with human judgment per cited prior work) and against subjective LLM ratings. The choice to separate 'hit', 'type', and 'severity' metrics by reliability level is explicitly argued in Section IV-B.",
    154           "source": "haiku"
    155         }
    156       },
    157       "robustness": {
    158         "contamination_resistance_designed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "The benchmark uses GitHub PRs from 12 popular Python projects (the same projects as SWE-Bench) that are highly likely to appear in LLM training data. No contamination resistance measures such as temporal splits after training cutoffs, canary strings, or dynamic generation are mentioned or implemented.",
    162           "source": "haiku"
    163         },
    164         "temporal_robustness_discussed": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The paper does not discuss whether SWR-Bench will remain useful as LLMs improve, whether it could be gamed, or how scores might saturate. There is no plan for benchmark versioning or updates.",
    168           "source": "haiku"
    169         },
    170         "failure_modes_discussed": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "The paper discusses ACR tool failure modes (high false positives, low precision) but not failure modes of the benchmark itself — e.g., what review capabilities the 'hit' metric fails to capture, what types of changes the annotation process systematically misses, or what behaviors could game the metric without actually reviewing well.",
    174           "source": "haiku"
    175         },
    176         "baseline_implementations_provided": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Code is made publicly available at anonymous.4open.science/status/swrbench-1D0E, and multiple baseline implementations are provided: LLM-Reviewer, SWR-Agent (adapted from SWE-Agent), with experimental configurations specified (temperature=0.2, default configurations for third-party tools).",
    180           "source": "haiku"
    181         }
    182       },
    183       "documentation": {
    184         "dataset_documentation_complete": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The construction pipeline is described in Section III-A but there is no formal data card or source description listing the specific 12 projects used. Critical documentation gaps include: no inter-annotator agreement statistics from the 5-annotator manual verification phase, and no description of the annotation guidelines given to annotators.",
    188           "source": "haiku"
    189         },
    190         "licensing_and_access_clear": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "The code repository link (anonymous.4open.science) is provided but no license is disclosed for either the code or the benchmark dataset. Long-term availability and legal reuse terms are undefined, and the anonymous link may expire post-publication.",
    194           "source": "haiku"
    195         },
    196         "intended_use_specified": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "The paper specifies that SWR-Bench evaluates ACR tools in realistic PR-level scenarios but does not state what should NOT be concluded from benchmark scores, for example that results do not apply to non-Python codebases or that scores should not be used as a proxy for developer productivity.",
    200           "source": "haiku"
    201         }
    202       }
    203     }
    204   },
    205   "claims": [
    206     {
    207       "claim": "SWR-Bench's LLM-based evaluation achieves approximately 90% agreement with human expert judgment on the primary 'hit' metric",
    208       "evidence": "Figure 5a shows hit-agreement ranging from 89.2% to 94.9% across all pairwise combinations of 3 human experts and 2 LLM evaluators on 100 randomly sampled code review reports",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Current SOTA ACR systems are not ready for real-world code review deployment",
    213       "evidence": "Table III shows the best-performing combination (PR-Review with Gemini-2.5-Pro) achieves only 19.38% F1, with most systems below 13%; precision is below 10% for four of five techniques",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "ACR tools detect functional changes substantially better than evolutionary/stylistic changes",
    218       "evidence": "Table V shows F1 scores for functional changes consistently exceed those for evolutionary changes (F.2 Logic: 26.20%, F.1 Interface: 23.55% vs. highest evolutionary E.3.1 Organization: 16.45%)",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Reasoning-enhanced LLMs outperform standard LLMs on automated code review",
    223       "evidence": "Table IV shows Qwen-2.5-R1-14B achieves F1 of 15.95% vs Qwen-2.5-14B at 9.01%; Gemini-2.5-Pro and DeepSeek-R1 generally outperform standard counterparts, though model differences confound the comparison",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Multi-review aggregation boosts F1 scores by up to 43.67%",
    228       "evidence": "Gemini-2.5-Flash Self-Agg (n=10) achieves Overall F1 of 21.91%, versus approximately 15.25% F1 for the baseline PR-Review with Gemini-2.5-Flash, with recall increasing by 118.83%",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Claude-4 (Opus and Sonnet) underperforms its predecessor Claude-3.7-Sonnet on SWR-Bench",
    233       "evidence": "Table IV shows Claude-4-Opus at 16.99% F1 and Claude-4-Sonnet at 16.61%, both below Claude-3.7-Sonnet at 18.23%; authors speculate this reflects training misalignment with code review demands",
    234       "supported": "moderate"
    235     },
    236     {
    237       "claim": "PR-Review's prompt engineering approach outperforms multi-agent approaches including CR-Agent",
    238       "evidence": "Table III shows PR-Review achieves mean Overall-F1 of 18.73% while CR-Agent achieves only 9.22%, attributed to interaction overhead and error propagation in multi-agent systems",
    239       "supported": "strong"
    240     }
    241   ],
    242   "methodology_tags": [
    243     "benchmark-eval",
    244     "observational"
    245   ],
    246   "key_findings": "SWR-Bench is a new automated code review benchmark with 1,000 real-world GitHub PRs (500 with reviewer-confirmed change-points, 500 clean) providing full project context, validated at 89–95% agreement with human judgment. Current SOTA ACR systems achieve at most 19.38% F1, with excessive false positives being the primary bottleneck rather than recall. Reasoning-enhanced LLMs generally outperform standard counterparts on this task. A simple multi-review aggregation strategy improves F1 by up to 43.67%, substantially outperforming complex multi-agent architectures.",
    247   "red_flags": [
    248     {
    249       "flag": "No human performance baseline",
    250       "detail": "The benchmark includes no measurement of what F1 score trained human reviewers would achieve, making it impossible to assess whether the 19.38% maximum score represents a small or large gap from human-level performance."
    251     },
    252     {
    253       "flag": "Contamination unaddressed",
    254       "detail": "The benchmark uses PRs from the same 12 popular Python GitHub projects used by SWE-Bench — projects almost certainly included in LLM training data. No temporal cutoff analysis or contamination mitigation is performed."
    255     },
    256     {
    257       "flag": "Annotation LLM overlap with evaluated models",
    258       "detail": "Gemini-2.5-Pro was used for LLM-based change-point annotation during benchmark construction AND is one of the primary evaluated ACR tools, creating potential circularity in its evaluation results."
    259     },
    260     {
    261       "flag": "Only 12 Python GitHub projects",
    262       "detail": "All benchmark data comes from 12 Python open-source projects selected to match SWE-Bench. Generalizability to other languages, paradigms, enterprise codebases, or non-open-source projects is entirely uncharacterized."
    263     },
    264     {
    265       "flag": "Anonymous-only code repository with no license",
    266       "detail": "Code is available only via an anonymous link (anonymous.4open.science) with no license disclosed. Long-term availability is unknown and legal reuse terms are undefined."
    267     },
    268     {
    269       "flag": "Missing inter-annotator agreement statistics",
    270       "detail": "Manual verification by 5 graduate students (at least 2 per PR) is described as the critical quality step, but neither inter-annotator agreement statistics nor a disagreement resolution procedure is reported for this phase."
    271     }
    272   ],
    273   "cited_papers": [
    274     {
    275       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    276       "relevance": "Primary reference benchmark; SWR-Bench uses the same 12 Python projects and explicitly compares LLM performance rankings against SWE-Bench findings to highlight code-review-specific challenges"
    277     },
    278     {
    279       "title": "Automating Code Review Activities by Large-Scale Pre-Training (CodeReviewer)",
    280       "relevance": "Key prior ACR benchmark at hunk-level granularity that SWR-Bench directly addresses, identifying its limitations in scope and evaluation metrics"
    281     },
    282     {
    283       "title": "Towards Automating Code Review Activities (Tufano et al., ICSE 2021)",
    284       "relevance": "Foundational method-level ACR dataset series that SWR-Bench identifies as insufficient for evaluating modern LLMs due to fine-grained focus and Exact-Match/BLEU metrics"
    285     },
    286     {
    287       "title": "CodeAgent: Autonomous Communicative Agents for Code Review",
    288       "relevance": "Multi-agent ACR tool (CR-Agent) evaluated as one of the primary baselines; its underperformance relative to prompt engineering is a key finding"
    289     },
    290     {
    291       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    292       "relevance": "Architecture directly adapted for the SWR-Agent baseline; demonstrates cross-task adaptation of SE agents to code review"
    293     },
    294     {
    295       "title": "A Survey on LLM-as-a-Judge",
    296       "relevance": "Background for the evaluation approach; the paper's objective hit-based LLM-judge is explicitly contrasted against the subjective approaches surveyed here"
    297     },
    298     {
    299       "title": "Deep Assessment of Code Review Generation Approaches: Beyond Lexical Similarity",
    300       "relevance": "Validates the central methodological claim that BLEU and similar metrics correlate poorly with human judgment in code review, motivating the new evaluation framework"
    301     },
    302     {
    303       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    304       "relevance": "Reference benchmark used to contrast LLM ranking differences on SWR-Bench, showing code review ability diverges from general coding ability"
    305     }
    306   ],
    307   "engagement_factors": {
    308     "practical_relevance": {
    309       "score": 3,
    310       "justification": "Code review is a universal software development practice and the benchmark, evaluation methodology, and multi-review aggregation strategy are directly usable by teams evaluating or deploying ACR tools."
    311     },
    312     "surprise_contrarian": {
    313       "score": 2,
    314       "justification": "The finding that Claude-4 underperforms Claude-3.7 on code review, and that complex multi-agent approaches underperform simple prompt engineering, challenge typical assumptions about model progression and architectural complexity."
    315     },
    316     "fear_safety": {
    317       "score": 0,
    318       "justification": "No AI safety or risk angle; purely a software engineering evaluation methodology paper."
    319     },
    320     "drama_conflict": {
    321       "score": 1,
    322       "justification": "Implicitly critiques the adequacy of existing ACR benchmarks and the practical value of current tools, but no direct controversy or field-level conflict is invoked."
    323     },
    324     "demo_ability": {
    325       "score": 2,
    326       "justification": "Code is available (anonymously) and the multi-review aggregation strategy is simple enough to implement on top of any LLM; practitioners can evaluate their own models against the benchmark."
    327     },
    328     "brand_recognition": {
    329       "score": 2,
    330       "justification": "Evaluates GPT-4o, Claude series, Gemini series, and DeepSeek — all prominent commercial LLM brands — providing comparative rankings with high visibility."
    331     }
    332   },
    333   "hn_data": {
    334     "threads": [
    335       {
    336         "hn_id": "24396301",
    337         "title": "Mining Security Vulnerabities from Secret Integration Channels",
    338         "points": 11,
    339         "comments": 0,
    340         "url": "https://news.ycombinator.com/item?id=24396301",
    341         "created_at": "2020-09-07T03:06:56Z"
    342       },
    343       {
    344         "hn_id": "44024987",
    345         "title": "Can You Trust Code Copilots? Evaluating LLMs from a Code Security Perspec",
    346         "points": 11,
    347         "comments": 2,
    348         "url": "https://news.ycombinator.com/item?id=44024987",
    349         "created_at": "2025-05-18T23:09:48Z"
    350       },
    351       {
    352         "hn_id": "45047314",
    353         "title": "Learning Facts at Scale with Active Reading",
    354         "points": 7,
    355         "comments": 0,
    356         "url": "https://news.ycombinator.com/item?id=45047314",
    357         "created_at": "2025-08-28T01:32:47Z"
    358       },
    359       {
    360         "hn_id": "45115249",
    361         "title": "When Do Consumers Lose from Variable Electricity Pricing?",
    362         "points": 3,
    363         "comments": 0,
    364         "url": "https://news.ycombinator.com/item?id=45115249",
    365         "created_at": "2025-09-03T13:05:57Z"
    366       },
    367       {
    368         "hn_id": "43724556",
    369         "title": "PIM-LLM: A High-Throughput Hybrid PIM Architecture for 1-Bit LLMs",
    370         "points": 3,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=43724556",
    373         "created_at": "2025-04-18T02:58:59Z"
    374       },
    375       {
    376         "hn_id": "42966672",
    377         "title": "Develop AI Agents for System Engineering in Factorio",
    378         "points": 3,
    379         "comments": 0,
    380         "url": "https://news.ycombinator.com/item?id=42966672",
    381         "created_at": "2025-02-06T21:28:58Z"
    382       },
    383       {
    384         "hn_id": "24375913",
    385         "title": "The Sound of Silence: Mining Security Vulns from Secret Integration Channels",
    386         "points": 2,
    387         "comments": 0,
    388         "url": "https://news.ycombinator.com/item?id=24375913",
    389         "created_at": "2020-09-04T15:40:05Z"
    390       },
    391       {
    392         "hn_id": "44030713",
    393         "title": "Cosmos: Predictable and Cost-Effective Adaptation of LLMs",
    394         "points": 1,
    395         "comments": 0,
    396         "url": "https://news.ycombinator.com/item?id=44030713",
    397         "created_at": "2025-05-19T15:11:09Z"
    398       },
    399       {
    400         "hn_id": "28430785",
    401         "title": "Is Machine Learning ready for Traffic Engineering optimization?",
    402         "points": 1,
    403         "comments": 0,
    404         "url": "https://news.ycombinator.com/item?id=28430785",
    405         "created_at": "2021-09-06T05:53:23Z"
    406       }
    407     ],
    408     "top_points": 11,
    409     "total_points": 42,
    410     "total_comments": 2
    411   }
    412 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs