scan-v5.json (27145B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "On the Effectiveness of LLM-as-a-Judge for Code Generation and Summarization", 6 "authors": [ 7 "Giuseppe Crupi", 8 "Rosalia Tufano", 9 "Alejandro Velasco", 10 "Antonio Mastropaolo", 11 "Denys Poshyvanyk", 12 "Gabriele Bavota" 13 ], 14 "year": 2025, 15 "venue": "IEEE Transactions on Software Engineering", 16 "arxiv_id": "2507.16587", 17 "doi": "10.1109/TSE.2025.3586082" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims that GPT-4-turbo is best judge and smaller LLMs struggle are directly supported by Cohen's Kappa in Table 2 and Krippendorff's α in Table 5; the claim that even the best LLM frequently misjudges is supported by confusion matrices showing 50% false positive rate for wrong Java implementations.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": false, 29 "answer": false, 30 "justification": "The paper makes comparative observational claims about LLM judging performance, not causal claims; the study is descriptive and evaluative rather than interventional.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "External validity section explicitly bounds results to two tasks (code generation and code summarization) and two languages (Java and Python), with a call for differentiated replications.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": true, 42 "justification": "The paper explicitly tests and dismisses 'lack of coding context' as a major factor by rerunning analysis on self-contained functions; the false positive/negative qualitative analysis identifies specific alternative reasons for misjudgments.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 
"justification": "The Construct Validity section explicitly acknowledges that test execution is a proxy for code correctness and documents quality checks excluding unreliable test cases; human judgment as oracle for summarization is similarly discussed with inter-rater agreement measured.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 4 'Threats to Validity' covers construct, internal, and external validity as distinct subsections with specific discussion under each.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats include: CoderEval test suite quality (67 problems excluded with documented criteria), subjectivity in manual analysis mitigated by multi-author labeling with conflict resolution, and explicit restriction to Java/Python and two SE tasks.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "External validity explicitly states results 'are capped by (i) the two code-related tasks subject of the study and (ii) the focus on the Java and Python programming languages.'", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding source is mentioned anywhere in the paper.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All author affiliations are clearly disclosed in the paper header (SEART @ Università della Svizzera italiana, William & Mary).", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": false, 87 "answer": false, 88 "justification": "No funding is disclosed, making this criterion not applicable.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": 
true, 93 "answer": false, 94 "justification": "No competing interests statement or financial interest declaration appears in the paper.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "LLM-as-a-judge is defined in the introduction; code generation and code summarization are precisely defined with their evaluation challenges and existing metric limitations described.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper explicitly states its goal is to 'assess the effectiveness of LLMs-as-a-judge for software-related tasks' with a single focused research question stated in Section 2.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 5 explicitly positions this work relative to ICE-Score, CodeJudge, Weyssow et al., and Koutcheme et al., explaining methodological differences and how this study extends or improves on each.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": true, 125 "justification": "A replication package is referenced at GitHub [1] (https://github.com/crupig/LLMs-as-a-judge-for-SE-tse RP) containing prompts, extraction scripts, and data.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper explicitly states they 'build (and make publicly available [1]) our own dataset' of 1,163 summaries with human judgments; CoderEval is also publicly available.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "No requirements.txt, Dockerfile, or dependency specification is mentioned; HuggingFace inference endpoints are referenced but no 
environment specification is provided.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper references a replication package repeatedly but provides no step-by-step reproduction instructions within the paper itself.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "Main results (Kappa scores, Krippendorff's α, bias coefficients) are reported as point estimates without confidence intervals or error bars.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": true, 157 "justification": "Mann-Whitney tests with Benjamini-Hochberg correction for multiple testing are used for self-bias analysis; Krippendorff's α and Cohen's Kappa are used as agreement metrics with explicit interpretation thresholds.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Cliff's δ effect sizes are reported for all Mann-Whitney tests in Tables 3 and 6 with explicit interpretation thresholds (negligible/small/medium/large).", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "No power analysis or formal sample size justification is provided; sample sizes are determined by benchmark availability and resource constraints rather than statistical considerations.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "Agreement scores and bias coefficients are reported as point estimates without variance or standard deviation across repeated runs.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Eight LLMs of different sizes are 
compared against each other and against oracle ground truths (test execution for code generation, human judgments for summarization).", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "GPT-4-turbo, GPT-3.5-turbo, CodeLlama, and DeepSeek Coder were contemporary state-of-the-art models at time of study.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Four different prompting strategies (zero-shot, zero-shot W/O rationale, automated CoT, slow-thinking) are systematically compared for both tasks across all LLMs.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Code generation uses Cohen's Kappa, confusion matrices, bias coefficients, accuracy, and mutation testing; code summarization uses Krippendorff's α across three quality dimensions.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": true, 207 "justification": "Nine human judges independently evaluated 1,163 code summaries across three quality dimensions, with each summary rated by three judges and inter-rater agreement measured.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": false, 212 "answer": false, 213 "justification": "This is a benchmarking evaluation study, not a machine learning training/prediction task requiring train/test splits.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Results are broken down by language (Java vs Python), by LLM, by quality criterion (content adequacy, conciseness, fluency), and by prompt type.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "A dedicated qualitative analysis identifies reasons for false positives (uncaught wrong behavior 37%, coding 
context 32%, ambiguous requirements 27%) and false negatives (hallucination 33%, misunderstanding 19%).", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "The primary finding is negative: most LLMs cannot reliably judge code correctness; GPT-4-turbo achieves only 'fair' Kappa (0.21 Java, 0.10 Python) and smaller models completely fail.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": false, 239 "justification": "Open-source models have specific size variants (DeepSeek 1.3B/6.7B/33B, CodeLlama 7B/13B/34B), but GPT-3.5-turbo and GPT-4-turbo lack snapshot dates, which is critical given OpenAI's silent model updates.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Actual prompts are reproduced verbatim in the paper for zero-shot and automated CoT strategies for both code generation and code summarization tasks.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "Temperature, top-p, and other generation hyperparameters are not reported for any of the eight models.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No agentic scaffolding is used; this is direct prompt-based evaluation without orchestration frameworks.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Detailed quality assurance for CoderEval is documented (67 problems excluded with specific criteria); dataset construction for code summarization including function selection, LLM generation, and human annotation is described step-by-step.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": 
true, 270 "answer": true, 271 "justification": "The replication package [1] is stated to contain all collected judgments; the code summarization dataset with human ratings is explicitly made publicly available.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Section 2.3 describes in detail how 80,556 code generation judgments and 22,304 summarization judgments were collected, extracted (via scripts and manual verification), and cleaned.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": true, 282 "answer": false, 283 "justification": "Nine judges are described by qualifications (Master's/PhD, years of Java/Python experience) but the actual recruitment method (lab members, external, volunteer) is not stated.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The full pipeline from benchmark selection through quality assurance, code generation, judgment collection, manual cleaning, and statistical analysis is documented across Sections 2.1–2.4.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Training data cutoffs are not stated for any of the eight models; only vague descriptions like 'trained on a corpus of 2 trillion tokens' are provided without dates.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "The paper does not discuss whether CoderEval problems (from ICSE'24) or the code summarization functions may have appeared in the training data of the evaluated LLMs.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "No discussion of whether GPT-4-turbo or other models saw CoderEval problems during 
training, which could distort judging performance for familiar code patterns.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": true, 316 "answer": false, 317 "justification": "No pre-registration is mentioned for the human evaluation study involving nine judges assessing 1,163 summaries.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": true, 322 "answer": false, 323 "justification": "No IRB or ethics approval is mentioned despite the study involving nine human participants as paid/volunteer judges.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": true, 328 "answer": true, 329 "justification": "Judges' education (Master's or PhD in Informatics/CS), specialization (four with PhD in SE), and programming experience (avg 5.8 years Java, 6.9 years Python, with min/max) are reported.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": true, 334 "answer": true, 335 "justification": "Judges required to have 'code summarization background' and Master's or PhD degree in Informatics or Computer Science.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper states summaries were split among judges ensuring each assessed by three, but the randomization procedure for assignment is not described.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": true, 346 "answer": false, 347 "justification": "No blinding is described; judges could potentially identify human-written vs LLM-generated summaries, which could introduce evaluation bias.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "All nine judges appear to have completed their assignments with no mention of dropouts; attrition not applicable.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 
"inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "Despite citing cost as a key motivation for the LLM-as-a-judge paradigm, no actual API costs or inference costs are reported for running 80,556+ judgments.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Total computational budget for running all experiments across eight models, four prompts, and two tasks is not reported.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "GPT-4-turbo is the best LLM judge for both code generation and code summarization among all eight evaluated models.", 376 "evidence": "Cohen's Kappa of 0.21 (Java) and 0.10 (Python) for code generation is highest among all models; Krippendorff's α of 0.58–0.63 for content adequacy in summarization outperforms all others.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Smaller LLMs (DeepSeek Coder 1.3B/6.7B, CodeLlama 7B) are essentially unable to perform code correctness judgment, showing near-zero or negative Kappa scores.", 381 "evidence": "Table 2 shows DeepSeek Coder 1.3B and 6.7B achieve Kappa values near 0 or negative across all prompts and both languages; CodeLlama 7B shows similar results.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Even GPT-4-turbo frequently misjudges code correctness, classifying 50% of wrong Java implementations as correct.", 386 "evidence": "Confusion matrices in Fig. 
1 show GPT-4 has a 50% false positive rate for failing Java implementations despite being the best-performing model.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "All LLMs systematically underestimate the correctness of human-written code relative to LLM-generated code.", 391 "evidence": "Table 3 shows negative bias coefficients for human-written code for all judge models, statistically significant with large Cliff's δ effect sizes across all comparisons.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "GPT-4-turbo achieves moderate-to-substantial agreement with human judges for code summary content adequacy.", 396 "evidence": "Krippendorff's α of 0.58 (Java) and 0.63 (Python) for content adequacy with zero-shot prompt, compared to human inter-rater agreement of α=0.81 and 0.69.", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Prompt choice has limited impact on overall findings; model size is the dominant factor in judging capability.", 401 "evidence": "Table 2 shows Kappa scores for each LLM are relatively stable across four prompt variants; GPT-4 remains best-in-class regardless of prompt used.", 402 "supported": "moderate" 403 }, 404 { 405 "claim": "Lack of visible coding context (external dependencies) is not a major cause of LLM judging failures.", 406 "evidence": "Analysis restricted to 80 Java and 58 Python self-contained functions (no external deps) showed no change in judging effectiveness or model rankings.", 407 "supported": "moderate" 408 } 409 ], 410 "methodology_tags": [ 411 "benchmark-eval", 412 "observational", 413 "qualitative" 414 ], 415 "key_findings": "GPT-4-turbo is the best available LLM judge for code-related tasks but remains unreliable for code correctness assessment, misjudging 50% of incorrect Java implementations as correct (Cohen's Kappa = 0.21 Java, 0.10 Python). 
For code summarization, GPT-4-turbo achieves moderate-to-substantial agreement with human judges on content adequacy (Krippendorff's α ≈ 0.58–0.63), suggesting LLM-as-a-judge is more viable for natural language quality evaluation than for code correctness verification. A systematic anti-human bias was identified: all LLMs significantly underestimate the correctness of human-written code relative to LLM-generated code, with large effect sizes. Smaller LLMs (tens of billions of parameters in the CodeLlama and DeepSeek Coder families) largely fail at both judging tasks.", 416 "red_flags": [ 417 { 418 "flag": "Contamination unaddressed", 419 "detail": "The paper does not discuss whether GPT-4-turbo or other closed-source models may have seen CoderEval problems (published ICSE'24) during training, which could inflate or distort judging performance for familiar code patterns." 420 }, 421 { 422 "flag": "Hyperparameters not reported", 423 "detail": "Temperature, top-p, and other generation hyperparameters are not disclosed for any of the eight models, limiting reproducibility of results." 424 }, 425 { 426 "flag": "No confidence intervals on agreement metrics", 427 "detail": "Main results (Cohen's Kappa, Krippendorff's α, bias coefficients) are reported as point estimates without any uncertainty quantification." 428 }, 429 { 430 "flag": "OpenAI model snapshots undefined", 431 "detail": "GPT-3.5-turbo and GPT-4-turbo lack snapshot dates; these models are silently updated over time, undermining exact reproducibility of the key results." 432 }, 433 { 434 "flag": "Human study not pre-registered or ethics-reviewed", 435 "detail": "The study involving nine human judges evaluating 1,163 summaries was not pre-registered and no IRB/ethics approval is mentioned." 
436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models", 441 "relevance": "Primary benchmark used for both code generation evaluation and as source of functions for the summarization dataset" 442 }, 443 { 444 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 445 "relevance": "Foundational work proposing LLM-as-a-judge concept and identifying positional bias, verbosity, and self-enhancement bias" 446 }, 447 { 448 "title": "CodeJudge: Evaluating Code Generation with Large Language Models", 449 "relevance": "Most closely related prior work applying GPT-3.5 as judge for code correctness with slow-thinking prompts; this paper adopts and extends CodeJudge's prompt" 450 }, 451 { 452 "title": "ICE-Score: Instructing Large Language Models to Evaluate Code", 453 "relevance": "Prior work using GPT-3.5-turbo as judge for code implementations on HumanEval-X; this paper uses a harder benchmark and more LLMs" 454 }, 455 { 456 "title": "CodeUltraFeedback: An LLM-as-a-Judge Dataset for Aligning Large Language Models to Coding Preferences", 457 "relevance": "Related work exploiting LLM-as-a-judge for SE evaluation, source of the zero-shot prompt design" 458 }, 459 { 460 "title": "Large Language Models are Zero-Shot Reasoners", 461 "relevance": "Source of the automated chain-of-thought prompting strategy tested in this study" 462 }, 463 { 464 "title": "Reassessing Automatic Evaluation Metrics for Code Summarization Tasks", 465 "relevance": "Demonstrates shortcomings of BLEU/ROUGE/METEOR for code summarization, motivating LLM-as-a-judge as an alternative" 466 }, 467 { 468 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 469 "relevance": "One of the two open-source LLM families evaluated as judges in this study" 470 } 471 ], 472 "engagement_factors": { 473 "practical_relevance": { 474 "score": 3, 475 "justification": "Directly answers whether practitioners 
can replace human evaluation with LLMs for automated code review and summarization assessment, a question with immediate industry relevance." 476 }, 477 "surprise_contrarian": { 478 "score": 2, 479 "justification": "Finding that LLMs systematically underestimate the correctness of human-written code and that even GPT-4-turbo judges 50% of wrong Java implementations as correct challenges widespread enthusiasm for LLM-as-a-judge in SE research." 480 }, 481 "fear_safety": { 482 "score": 0, 483 "justification": "No safety or AI risk concerns are raised; the paper is a methodological evaluation of automated evaluation quality." 484 }, 485 "drama_conflict": { 486 "score": 1, 487 "justification": "Challenges the growing trend of using LLM-as-a-judge as a cheap substitute for human evaluation in SE, but framed constructively rather than confrontationally." 488 }, 489 "demo_ability": { 490 "score": 2, 491 "justification": "Prompts are provided verbatim, replication package is publicly available, and experiments use accessible APIs, enabling replication with moderate effort." 492 }, 493 "brand_recognition": { 494 "score": 1, 495 "justification": "Published in IEEE Transactions on Software Engineering (top venue) but authors are from USI and William & Mary rather than major AI labs." 
496 } 497 }, 498 "hn_data": { 499 "threads": [ 500 { 501 "hn_id": "45028439", 502 "title": "No evidence ageing/declining populations compromise socio-economic performance", 503 "points": 82, 504 "comments": 101, 505 "url": "https://news.ycombinator.com/item?id=45028439", 506 "created_at": "2025-08-26T16:05:54Z" 507 }, 508 { 509 "hn_id": "47213997", 510 "title": "Von Neumann on Consciousness in Quantum Mechanics", 511 "points": 3, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=47213997", 514 "created_at": "2026-03-02T04:46:53Z" 515 }, 516 { 517 "hn_id": "43557330", 518 "title": "Ultra-high resolution multimodal MRI dense labelled holistic brain atlas", 519 "points": 2, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=43557330", 522 "created_at": "2025-04-02T14:48:56Z" 523 } 524 ], 525 "top_points": 82, 526 "total_points": 87, 527 "total_comments": 101 528 } 529 }