scan-v5.json (18606B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "From Code to Courtroom: LLMs as the New Software Judges", 6 "authors": [ 7 "Junda He", 8 "Jieke Shi", 9 "Terry Yue Zhuo", 10 "Christoph Treude", 11 "Jiamou Sun" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2503.02246", 16 "doi": "10.48550/arXiv.2503.02246" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims a review of existing studies, identification of limitations, and a research roadmap — all three are delivered in Sections 3, 4, and the conclusion respectively.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": false, 28 "answer": false, 29 "justification": "The paper is a forward-looking vision and literature review; it makes no causal claims about interventions improving outcomes.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper generalizes broadly about the future of LLM-as-a-Judge in all of software engineering based on only 16 reviewed studies, without bounding claims to what that corpus can support.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper presents a one-sided pro-LLM-as-judge vision; it acknowledges field-level limitations but does not consider the alternative that LLMs may be fundamentally unsuitable as evaluation surrogates.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly discusses alignment with human judgment as the key validation criterion and distinguishes LLM assessments from actual software quality throughout the limitations section.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 4 ('The Road Ahead') contains six explicitly numbered limitations of the current field (e.g., Limitation 1–6).", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "The limitations discuss the reviewed field's shortcomings, not threats to the paper's own review methodology; there is no discussion of selection bias in the 16 papers chosen or the non-systematic nature of the review.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper notes it is 'not intended to be a definitive guide' but never explicitly states what its review does not cover or what claims cannot be drawn from 16 informally selected papers.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "There is no acknowledgments or funding section anywhere in the paper text.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All seven authors list their affiliations explicitly (Singapore Management University, Monash University, CSIRO's Data61, Australian National University).", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, so independence of funder cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement is present in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 2 provides a formal mathematical definition of LLM-as-a-Judge with typed inputs (T, C, X, R) and outputs (Y, E, F), and explicitly distinguishes it from broader LLM-based evaluation approaches.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper explicitly lists three contributions: a review of 16 primary studies, analysis of limitations and research gaps, and a forward-looking vision with a research roadmap.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper actively engages with prior work throughout — Section 3 maps 16 studies to SE tasks, and the definition section explicitly distinguishes this paper's framing from Wang et al.'s broader definition.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "survey": { 120 "search_and_selection": { 121 "search_strategy_reproducible": { 122 "applies": true, 123 "answer": false, 124 "justification": "No search strategy is described; the 16 papers are listed without any explanation of how they were identified or retrieved.", 125 "source": "haiku" 126 }, 127 "inclusion_exclusion_explicit": { 128 "applies": true, 129 "answer": false, 130 "justification": "No inclusion or exclusion criteria are stated anywhere in the paper; the selection of 16 studies is presented without methodology.", 131 "source": "haiku" 132 }, 133 "prisma_or_structured_protocol": { 134 "applies": true, 135 "answer": false, 136 "justification": "No PRISMA or other structured review protocol is mentioned or followed.", 137 "source": "haiku" 138 }, 139 "search_terms_provided": { 140 "applies": true, 141 "answer": false, 142 "justification": "No search queries or terms are provided.", 143 "source": "haiku" 144 }, 145 "databases_listed": { 146 "applies": true, 147 "answer": false, 148 "justification": "No databases or search sources are listed.", 149 "source": "haiku" 150 }, 151 "screening_process_documented": { 152 "applies": true, 153 "answer": false, 154 "justification": "No screening process with counts at each stage is documented; papers appear selected informally.", 155 "source": "haiku" 156 }, 157 "review_scope_justified": { 158 "applies": true, 159 "answer": false, 160 "justification": "The topic scope (LLM-as-a-Judge in SE) is stated but no justification is given for why these particular years, venues, or task types were chosen.", 161 "source": "haiku" 162 } 163 }, 164 "synthesis_quality": { 165 "conflicting_findings_acknowledged": { 166 "applies": true, 167 "answer": true, 168 "justification": "Limitation 2 explicitly discusses 'Inconsistent Empirical Findings,' citing that Wang et al. found traditional metrics outperform LLM-as-a-Judge while Wu et al. found the opposite for code summarization.", 169 "source": "haiku" 170 }, 171 "quality_assessment_of_sources": { 172 "applies": true, 173 "answer": false, 174 "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of the 16 reviewed papers is performed; all are treated equally regardless of sample size or methodological rigor.", 175 "source": "haiku" 176 }, 177 "publication_bias_discussed": { 178 "applies": true, 179 "answer": false, 180 "justification": "Publication bias is never mentioned; the paper does not acknowledge that its 16 reviewed studies may skew toward positive results for LLM-as-a-Judge.", 181 "source": "haiku" 182 }, 183 "quantitative_synthesis_present": { 184 "applies": true, 185 "answer": false, 186 "justification": "The synthesis is entirely narrative; no meta-analysis, vote counting, or effect size aggregation is performed.", 187 "source": "haiku" 188 }, 189 "recommendations_supported_by_evidence": { 190 "applies": true, 191 "answer": false, 192 "justification": "The 'opportunities' and roadmap items are largely speculative future directions not grounded in the reviewed evidence; they follow logically from identified gaps but are not empirically supported.", 193 "source": "haiku" 194 } 195 } 196 } 197 }, 198 "claims": [ 199 { 200 "claim": "84% of SE researchers agree that human evaluation is problematic due to time constraints, cost, and need for specialized knowledge.", 201 "evidence": "Cited from Buse et al. [7], a 2011 OOPSLA paper on benefits and barriers of user evaluation.", 202 "supported": "moderate" 203 }, 204 { 205 "claim": "There are only 16 primary studies on LLM-as-a-Judge in software engineering, indicating the field is in early stages.", 206 "evidence": "Table 1 maps 16 references to SE tasks; the paper states 'the field remains in its early stages.'", 207 "supported": "moderate" 208 }, 209 { 210 "claim": "Existing LLM-as-a-Judge benchmarks use only small-scale datasets, limiting generalizability.", 211 "evidence": "Wang et al. [65] used 450 samples across three tasks; Ahmed et al. [1] used 420 samples for code summarization.", 212 "supported": "strong" 213 }, 214 { 215 "claim": "Conflicting empirical findings exist: Wang et al. found traditional metrics outperform LLM-as-a-Judge for code summarization, while Wu et al. found the opposite.", 216 "evidence": "Both studies are cited directly and the conflict is characterized as a major challenge requiring standardized evaluation.", 217 "supported": "strong" 218 }, 219 { 220 "claim": "LLMs do not experience fatigue, allowing consistent performance over extended periods unlike human evaluators.", 221 "evidence": "Stated as a motivating attribute with no citation or empirical support; presented as an inherent property.", 222 "supported": "unsupported" 223 }, 224 { 225 "claim": "LLM-as-a-Judge systems are susceptible to biases including position bias, verbosity bias, and egocentric bias in SE contexts.", 226 "evidence": "Cites external NLP/ML bias papers [36, 28, 76] but notes there is 'a lack of thorough empirical investigation' in SE specifically — i.e., the claim is extrapolated, not demonstrated.", 227 "supported": "weak" 228 } 229 ], 230 "methodology_tags": [ 231 "qualitative" 232 ], 233 "key_findings": "This SE 2030 vision paper reviews 16 studies on LLM-as-a-Judge in software engineering and identifies six major limitations: lack of large-scale human-annotated benchmarks, inconsistent empirical findings across studies, insufficient bias investigation, inadequate SE domain expertise in LLMs, over-reliance on internal LLM mechanisms, and insufficient research on adversarial threats. The paper proposes a research roadmap including creating comprehensive benchmarks, embedding expert tacit knowledge, integrating external SE tools, and developing adversarial defenses. The review is entirely non-systematic, with no stated search methodology, inclusion criteria, or quality assessment of the 16 source papers.", 234 "red_flags": [ 235 { 236 "flag": "Non-systematic selection", 237 "detail": "16 papers are reviewed with no search strategy, inclusion/exclusion criteria, or screening process documented — the review is not reproducible and may reflect author familiarity rather than comprehensive coverage." 238 }, 239 { 240 "flag": "Self-citation cluster", 241 "detail": "Multiple references ([55][56][57][74][75]) are co-authored by paper authors (Shi, He, Lo), creating potential citation bias in a paper arguing for a research agenda." 242 }, 243 { 244 "flag": "Speculative roadmap without empirical grounding", 245 "detail": "The 2030 vision and roadmap items are normative prescriptions not derivable from the 16 reviewed papers; they represent author opinion about future directions rather than evidence-based conclusions." 246 }, 247 { 248 "flag": "No paper-level limitations", 249 "detail": "The limitations section discusses the reviewed field's shortcomings, not the paper's own methodological limitations (non-systematic selection, small corpus, no quality assessment of sources)." 250 }, 251 { 252 "flag": "No funding disclosure", 253 "detail": "No acknowledgments or funding statement appears in the paper; this omission is notable given the authors' institutional affiliations with CSIRO's Data61 (a government research agency)." 254 } 255 ], 256 "cited_papers": [ 257 { 258 "title": "Can LLMs Replace Human Evaluators? An Empirical Study of LLM-as-a-Judge in Software Engineering", 259 "relevance": "Key empirical study reviewed; found traditional metrics outperform LLM-as-a-Judge for code summarization — directly motivates the paper's call for standardized benchmarks." 260 }, 261 { 262 "title": "Can Large Language Models Serve as Evaluators for Code Summarization?", 263 "relevance": "Conflicting empirical finding vs. Wang et al.; found LLM-as-a-Judge outperforms conventional metrics for code summarization, exemplifying the inconsistency problem." 264 }, 265 { 266 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 267 "relevance": "Original LLM-as-a-Judge paper from NLP domain that the SE application builds upon; cited as foundational." 268 }, 269 { 270 "title": "ICE-Score: Instructing Large Language Models to Evaluate Code", 271 "relevance": "Early SE-specific LLM evaluation work by a co-author; demonstrates reference-free evaluation of code generation." 272 }, 273 { 274 "title": "CodeJudge: Evaluating Code Generation with Large Language Models", 275 "relevance": "Demonstrates taxonomy-guided LLM evaluation of generated code; key example of multi-facet evaluation approach." 276 }, 277 { 278 "title": "Can LLMs Replace Manual Annotation of Software Engineering Artifacts?", 279 "relevance": "Directly evaluates LLM-as-a-Judge across multiple SE tasks including code summarization, patches, and requirements; one of the 16 primary reviewed studies." 280 }, 281 { 282 "title": "LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods", 283 "relevance": "Broader NLP survey on LLM evaluation that inspires the formal definition used in this paper." 284 }, 285 { 286 "title": "AIME: AI System Optimization via Multiple LLM Evaluators", 287 "relevance": "Proposes combining multiple LLM evaluators to approximate optimal evaluation; cited as a recent methodological advance." 288 } 289 ], 290 "engagement_factors": { 291 "practical_relevance": { 292 "score": 2, 293 "justification": "SE practitioners and researchers evaluating LLM-generated code face real challenges addressed by this roadmap, though the paper offers no immediately usable tools." 294 }, 295 "surprise_contrarian": { 296 "score": 1, 297 "justification": "The finding that only 16 studies exist in this rapidly growing area is somewhat surprising, but the paper's thesis (LLMs as judges are promising) is conventional wisdom." 298 }, 299 "fear_safety": { 300 "score": 1, 301 "justification": "Section 4.4 raises adversarial attacks on LLM judges (obfuscated code, deceptive commit messages) as a security concern, but the treatment is brief and not alarming." 302 }, 303 "drama_conflict": { 304 "score": 1, 305 "justification": "The conflicting findings between Wang et al. and Wu et al. on the same task are highlighted as a field-level problem, but not dramatized." 306 }, 307 "demo_ability": { 308 "score": 0, 309 "justification": "Pure vision/roadmap paper with no implementation, tool, or demo; nothing to try." 310 }, 311 "brand_recognition": { 312 "score": 1, 313 "justification": "Singapore Management University and CSIRO's Data61 are credible research institutions but not AI brand names that drive HN attention." 314 } 315 }, 316 "hn_data": { 317 "threads": [ 318 { 319 "hn_id": "43978357", 320 "title": "Type-constrained code generation with language models", 321 "points": 257, 322 "comments": 127, 323 "url": "https://news.ycombinator.com/item?id=43978357", 324 "created_at": "2025-05-13T22:15:30Z" 325 }, 326 { 327 "hn_id": "45141762", 328 "title": "Fantastic pretraining optimizers and where to find them", 329 "points": 42, 330 "comments": 4, 331 "url": "https://news.ycombinator.com/item?id=45141762", 332 "created_at": "2025-09-05T18:15:42Z" 333 }, 334 { 335 "hn_id": "30665928", 336 "title": "PERCEPT: Online change-point detection using topological data analysis", 337 "points": 8, 338 "comments": 0, 339 "url": "https://news.ycombinator.com/item?id=30665928", 340 "created_at": "2022-03-13T21:31:04Z" 341 }, 342 { 343 "hn_id": "43997113", 344 "title": "An Empirical Study on the Performance and Energy Usage of Compiled Python Code", 345 "points": 3, 346 "comments": 0, 347 "url": "https://news.ycombinator.com/item?id=43997113", 348 "created_at": "2025-05-15T17:12:36Z" 349 }, 350 { 351 "hn_id": "39686242", 352 "title": "Random Networks are not Random Functions", 353 "points": 3, 354 "comments": 0, 355 "url": "https://news.ycombinator.com/item?id=39686242", 356 "created_at": "2024-03-12T23:39:00Z" 357 }, 358 { 359 "hn_id": "44461553", 360 "title": "SegmentAnyMuscle: A muscle segmentation model across different locations in MRI", 361 "points": 2, 362 "comments": 0, 363 "url": "https://news.ycombinator.com/item?id=44461553", 364 "created_at": "2025-07-04T06:01:44Z" 365 }, 366 { 367 "hn_id": "43926603", 368 "title": "Pearch.ai beat LinkedIn's AI search in a head-to-head benchmark", 369 "points": 1, 370 "comments": 0, 371 "url": "https://news.ycombinator.com/item?id=43926603", 372 "created_at": "2025-05-08T14:50:43Z" 373 }, 374 { 375 "hn_id": "43908546", 376 "title": "Performance and Energy Usage of Compiled Python", 377 "points": 1, 378 "comments": 0, 379 "url": "https://news.ycombinator.com/item?id=43908546", 380 "created_at": "2025-05-06T19:03:58Z" 381 } 382 ], 383 "top_points": 257, 384 "total_points": 317, 385 "total_comments": 131 386 } 387 }