scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (34771B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On the Effectiveness of LLM-as-a-Judge for Code Generation and Summarization",
      6     "authors": [
      7       "Giuseppe Crupi",
      8       "Rosalia Tufano",
      9       "Alejandro Velasco",
     10       "Antonio Mastropaolo",
     11       "Denys Poshyvanyk",
     12       "Gabriele Bavota"
     13     ],
     14     "year": 2025,
     15     "venue": "IEEE Transactions on Software Engineering",
     16     "arxiv_id": "2507.16587",
     17     "doi": "10.1109/TSE.2025.3586082"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims are supported: GPT-4-turbo as best judge (Tables 2, 5), smaller LLMs struggling (Tables 1, 2, 5), frequent misjudgments of code correctness (Fig. 1, 50% FP rate), moderate agreement on summarization (Table 5, α=0.58-0.63 for content adequacy).",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper primarily makes comparative/descriptive claims about judging effectiveness rather than strong causal claims. Where causal reasoning is used (e.g., coding context as a factor in misjudgments), it is tested empirically by analyzing self-contained functions separately (Section 3.1).",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 4 explicitly states: 'the generalizability of our findings is capped by (i) the two code-related tasks subject of the study and (ii) the focus on the Java and Python programming languages. Differentiated replications can help to corroborate/contradict our findings.'",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 4 discusses construct validity (tests as proxy for correctness), prompt impact, and manual analysis subjectivity. Section 3.1 investigates coding context as an alternative explanation for misjudgments. Section 3.1.2 systematically analyzes reasons behind false judgments.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Section 4 explicitly acknowledges: 'Using tests as a proxy for code correctness is a limitation of our study.' They also document cases where the proxy fails (unreliable test suites, Section 2.2.1) and excluded problematic cases.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 4 'Threats to Validity' is a dedicated section covering construct, internal, and external validity threats.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 4 discusses specific threats: test suites as imperfect proxies for correctness (with mitigation), subjectivity in manual analysis (mitigated by multiple evaluators), prompt sensitivity (tested with four variants), and language/task scope limitations.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 4 explicitly states what the results do NOT show: 'the generalizability of our findings is capped by (i) the two code-related tasks subject of the study and (ii) the focus on the Java and Python programming languages.' Future work specifies additional tasks and fine-tuning as unexplored directions.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding information or acknowledgments section is present in the paper. A 6-author paper from two universities likely received institutional funding but this is not disclosed.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: SEART @ Software Institute, Università della Svizzera italiana (Crupi, Tufano, Bavota) and William & Mary (W&M) (Velasco, Mastropaolo, Poshyvanyk). None of the authors are affiliated with the companies whose models are evaluated.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not establish that no funding exists.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "LLM-as-a-judge is defined in the introduction; 'correctness' is operationalized as passing test suites; the three summary quality criteria (content adequacy, conciseness, fluency & understandability) are defined with explicit scoring rubrics in Section 2.2.2.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly states its goal as assessing 'the effectiveness of LLMs-as-a-judge for software-related tasks' for code generation and summarization, contributing empirical evidence on judging capability including self-bias and failure mode analysis.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 5 explicitly compares with four closely related papers (ICE-Score, CodeJudge, Koutcheme et al., Weyssow et al.), explaining how this work extends prior work with more LLMs, a more complex benchmark, bias analysis, and an additional task.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The paper references a replication package at [1] with a GitHub URL (https://github.com/crupig/LLMs-as-a-judge-for-SE-tse_RP) mentioned in the references section and cited throughout the paper.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper states 'we built (and make publicly available [1]) our own dataset that features human judgments of 1,163 summaries' (Section 2.2.2). CoderEval is also a public benchmark. All data is in the replication package.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specification (requirements.txt, Dockerfile, library versions) is mentioned. The paper mentions using Hugging Face inference endpoints and ChatGPT APIs but provides no reproducible environment details.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are included in the paper. The replication package is referenced but no README with commands or a 'Reproducing Results' section is described.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results (Cohen's Kappa, Krippendorff's alpha, accuracy, bias coefficients) are reported as point estimates with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Mann-Whitney U tests with Benjamini-Hochberg correction are used for the self-bias analysis (Section 2.4, Tables 3 and 6). Adjusted p-values are reported at multiple significance levels.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Cliff's delta effect sizes are reported alongside statistical tests (Tables 3 and 6), with interpretation thresholds defined: negligible, small, medium, large (Section 2.4).",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No power analysis or justification for sample sizes. The 184 Java and 190 Python code generation problems, and 198 functions for summarization, are used without justifying adequacy.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance or standard deviation reported across experimental runs. LLM outputs are stochastic, but all results appear from single runs with no spread measures.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Eight LLMs of varying sizes are compared against each other and against ground truth (test execution for code generation, human judgments for summarization). Four different prompting strategies are compared (Table 2, Table 5).",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "For a paper published in July 2025, the models are dated. GPT-4-turbo was the strongest model tested, but GPT-4o (May 2024), Claude 3/3.5, Llama 3, and other 2024-2025 models are absent. No justification is given for the model selection timeframe.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "The paper ablates along multiple dimensions: four prompt strategies (Table 2, Table 5), self-contained vs. non-self-contained functions (Section 3.1), and the mutant injection + semantic equivalence study (Section 3.1.2, Fig. 2).",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple metrics used: Cohen's Kappa, confusion matrices (TP/TN/FP/FN rates, accuracy), bias coefficient for code generation; Krippendorff's alpha and scatterplots for summarization.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Nine human judges independently evaluated 1,163 code summaries across three quality criteria (content adequacy, conciseness, fluency & understandability) on a 1-5 scale, with each summary assessed by three judges (Section 2.2.2).",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Prompts were developed using 'toy examples' separate from the evaluation data. The main evaluation was conducted on CoderEval (code generation) and the purpose-built summarization dataset, which were not used for prompt tuning (Section 2.3.1).",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by language (Java/Python), by model, by model family, by prompt strategy, and by function type (self-contained vs. dependent). Confusion matrices are reported per model (Fig. 1), and summarization scores are reported per criterion (Table 5).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 3.1.2 provides detailed manual analysis of false positives (37% uncaught wrong behavior, 32% coding context, 27% ambiguous requirements) and false negatives (33% hallucination, 19% code misunderstanding). Table 1 documents judging failures.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The main findings are largely negative: smaller LLMs completely fail at judging, GPT-4-turbo misjudges 50% of wrong Java implementations. The paper explicitly reports what doesn't work. DeepSeek Coder family was excluded from summarization entirely due to inability to perform the task.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Models are named as 'GPT-3.5-turbo' and 'GPT-4-turbo' without snapshot dates or API versions. DeepSeek Coder and CodeLlama sizes are specified (1.3B, 6.7B, 33B; 7B, 13B, 34B) but exact checkpoint versions are not given.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Full prompt text is provided for the zero-shot code generation prompt (Section 2.3.1), the automated CoT prompts (Section 2.3.1), and the zero-shot summarization prompt (Section 2.3.2). Additional prompts are in the replication package [1].",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any model. These settings significantly affect LLM output and are essential for reproducibility.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. LLMs are prompted directly via API calls or Hugging Face inference endpoints.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Detailed quality assurance pipeline for CoderEval documented in Section 2.2.1: 230 Java → 210 (excluded 20 failing targets) → 201 (excluded 9 empty-body passes) → 184 (excluded 17 dummy-function passes). Similar filtering for Python: 230 → 191 → 190. Summarization dataset construction also documented.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "The replication package [1] is referenced as containing all data: prompts, judgments, generated code, human evaluations. The summarization dataset is explicitly stated as 'publicly available [1]' (Section 2.2.2).",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data collection is described in detail: CoderEval benchmark selection and quality filtering (Section 2.2.1), summarization dataset construction from top-100 longest functions per language (Section 2.2.2), LLM output extraction using lizard code analyzer (Section 2.3.1).",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "The 9 human judges are characterized by qualifications (Master's/PhD, years of experience) but how they were recruited is not described. No information on recruitment channels or whether they were lab members, students, or external participants.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline is documented: CoderEval quality filtering with counts at each stage (Section 2.2.1), code generation and extraction (Section 2.3.1), judgment collection with manual verification (Section 2.3.1), and total counts (80,556 code gen judgments, 22,304 summarization judgments).",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No training data cutoff dates are stated for any of the eight LLMs. This is relevant because CoderEval benchmark problems could have appeared in the training data of GPT-4-turbo, GPT-3.5-turbo, and other models.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether CoderEval problems or their solutions appeared in any model's training data. CoderEval was published at ICSE'24 and its code/solutions are publicly available.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "Benchmark contamination is not addressed. CoderEval problems are from open-source repositories and were publicly available before GPT-4-turbo's training cutoff, creating contamination risk for both the generation and judging tasks.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for the human evaluation study with 9 judges.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics board approval is mentioned for the human evaluation study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Section 2.2.2 reports: 'All nine judges have a Master's degree in Informatics or Computer Science, four of them have a Ph.D. in Software Engineering. On average, they have 5.8 years of experience (min=1, max=17) in Java programming and 6.9 in Python programming (min=4, max=10).'",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "The judges are described as having 'code summarization background' but no formal inclusion/exclusion criteria are stated. No screening process is described.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "This is a rating/evaluation study, not an experimental study with treatment and control conditions. The human judges are evaluators, not experimental participants assigned to conditions.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "This is an evaluation/rating study rather than a randomized experiment. While blinding judges to summary source (human vs. LLM) would have been methodologically valuable, the schema marks this as NA for non-experimental studies.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No attrition or dropout information is reported. The paper states 9 judges and 3,489 total judgments expected but does not confirm whether all judgments were completed or if any judges dropped out.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost or latency is reported despite running 80,556 code generation judgments and 22,304 summarization judgments across eight LLMs, including commercial API calls to GPT-3.5-turbo and GPT-4-turbo.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total computational budget is stated. The paper used Hugging Face inference endpoints for open models and ChatGPT APIs for GPT models, but neither API costs nor GPU hours are reported.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No mention of multiple random seeds or runs. LLM outputs are stochastic, but results appear to be from single runs without seed sensitivity analysis.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The number of experimental runs is not stated. It appears each judgment was obtained from a single LLM call, but this is never explicitly confirmed.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "No hyperparameter search budget is reported. Four prompts are compared, but the number of prompt variants tested during the trial-and-error development phase is not disclosed.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "Section 2.4 explicitly justifies the selection criterion: 'Since for both tasks there was one judge LLM which was the clear winner independently from the used prompt (i.e., GPT-4-turbo), we selected as best-performing prompt the one ensuring the best performance on it.' Results for all four prompts are shown (Tables 2, 5).",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": true,
    399           "justification": "Benjamini-Hochberg correction is applied to adjust p-values for multiple comparisons in the self-bias analysis (Section 2.4, Tables 3, 6).",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The paper does not discuss author-evaluation bias. The prompts, extraction scripts, and experimental design were developed by the authors, but no acknowledgment of potential bias in their own evaluation methodology is made.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "No analysis of performance as a function of compute budget. Larger models (GPT-4-turbo with estimated >1.5T parameters) are compared with smaller ones (1.3B) without discussing compute cost implications, despite the paper noting that 'the larger the language model, the higher its inference (judgment) cost.'",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": true,
    417           "justification": "Section 2.2.1 extensively analyzes CoderEval's construct validity: verifying target functions pass tests, checking for trivial passing implementations (empty functions, dummy returns), and excluding 86 of 460 problems with unreliable test suites (46 Java, 40 Python). Section 4 discusses tests as proxy for correctness.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No scaffolding is used. LLMs are prompted directly without agentic scaffolding or tool use.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "No discussion of temporal leakage. CoderEval functions are from open-source projects that could have been in training data for GPT-4-turbo and other models. The temporal relationship between benchmark creation and model training is not discussed.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the judging prompts (description + signature + candidate) provide information that could trigger memorized solutions from training data.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether CoderEval problems or their solutions overlap with LLM training data. The benchmark draws from open-source repositories that are likely in LLM training corpora.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination procedures are mentioned.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "GPT-4-turbo is the best LLM judge for code generation, achieving Cohen's Kappa of 0.21 for Java and 0.10 for Python against test execution ground truth.",
    458       "evidence": "Table 2 reports Kappa scores for 8 LLMs across 4 prompts; GPT-4-turbo consistently outperforms all others while smaller models score near zero or negative.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "GPT-4-turbo misjudges 50% of wrong Java implementations as correct (false positive rate), making it unreliable for automated code review.",
    463       "evidence": "Figure 1 confusion matrices show GPT-4-turbo correctly classifying only 50% of failing Java implementations as wrong under the best prompt (automated CoT).",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "All LLMs systematically underestimate the correctness of human-written code compared to LLM-generated code with large effect sizes.",
    468       "evidence": "Table 3 shows bias coefficients around -0.47 for human-written code vs. positive coefficients for code generated by every LLM; Mann-Whitney tests show statistically significant differences (*** p<0.001) with large Cliff's delta for all judge models.",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "GPT-4-turbo achieves moderate-to-substantial agreement with human judges for code summary content adequacy (Krippendorff's α=0.58 Java, 0.63 Python).",
    473       "evidence": "Table 5 reports these Krippendorff's alpha values for GPT-4-turbo under the zero-shot prompt, substantially above all other evaluated models.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Smaller LLMs (≤7B parameters) are largely unable to perform judging tasks, achieving near-zero or negative Kappa scores.",
    478       "evidence": "Table 2 shows Kappa ≤0.05 for DeepSeek Coder 1.3B/6.7B and CodeLlama 7B across both languages and all prompts; Figure 1 shows random or perverse judgment distributions.",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "GPT-4-turbo shows minimal but statistically detectable self-bias in code generation judging (significant p-value, negligible effect size).",
    483       "evidence": "Table 3 shows that GPT-4-turbo's Own vs LLMs comparison is statistically significant (marked ** in the table) but has a negligible (N) Cliff's delta; other strong models show no significant self-bias.",
    484       "supported": "moderate"
    485     },
    486     {
    487       "claim": "Automated Chain-of-Thought prompting yields the best judging performance for code generation across models.",
    488       "evidence": "Table 2 shows automated CoT achieves highest Kappa for GPT-4 on Java (0.21); the authors select it as best-performing and note findings are stable across prompts.",
    489       "supported": "moderate"
    490     }
    491   ],
    492   "methodology_tags": [
    493     "benchmark-eval",
    494     "observational"
    495   ],
    496   "key_findings": "GPT-4-turbo is the best LLM judge for both code generation and summarization but achieves only fair/weak agreement with test execution for code correctness (Kappa=0.21 Java, 0.10 Python), misjudging 50% of wrong Java implementations as correct. For code summarization, GPT-4 performs substantially better, achieving moderate-to-substantial agreement with human judges on content adequacy (α=0.58–0.63). Smaller LLMs fail almost entirely at both judging tasks. A consistent and statistically significant bias is found: all LLMs underestimate human-written code quality relative to LLM-generated code with large effect sizes, suggesting that LLM-as-a-judge frameworks structurally favor LLM outputs over human code.",
    497   "red_flags": [
    498     {
    499       "flag": "No model snapshot identifiers",
    500       "detail": "GPT-4-turbo and GPT-3.5-turbo are referenced without specific API version IDs or snapshot dates, making exact replication impossible as model behavior may change over time."
    501     },
    502     {
    503       "flag": "No CIs on main agreement statistics",
    504       "detail": "Cohen's Kappa and Krippendorff's alpha values are reported as point estimates without confidence intervals, making it impossible to assess statistical uncertainty around the central claims."
    505     },
    506     {
    507       "flag": "No contamination analysis",
    508       "detail": "CoderEval (ICSE 2024) may have appeared in GPT-4's training data; no training cutoff discussion or contamination analysis is presented, which could inflate GPT-4's judging performance."
    509     },
    510     {
    511       "flag": "No hyperparameters reported",
    512       "detail": "Temperature, top-p, and other decoding parameters are not disclosed for any model, preventing precise replication of results."
    513     },
    514     {
    515       "flag": "Human judge recruitment undisclosed",
    516       "detail": "The recruitment method for the nine human judges is not described (e.g., lab members, paid participants, or crowd workers), and no IRB approval is mentioned despite the study involving human subjects."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    522       "relevance": "Foundational work establishing the LLM-as-a-judge paradigm and identifying key biases (positional, verbosity, self-enhancement) that directly shaped this paper's study design."
    523     },
    524     {
    525       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    526       "relevance": "Direct predecessor using slow-thinking prompting with GPT-3.5 as a code judge; this paper replicates their approach on a harder benchmark with more LLMs."
    527     },
    528     {
    529       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    530       "relevance": "Prior work using GPT-3.5 on HumanEval-X for code correctness judgment; this paper extends to a more complex benchmark (CoderEval) and adds summarization."
    531     },
    532     {
    533       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    534       "relevance": "Primary benchmark used for code generation judging; the paper includes a novel quality assurance procedure excluding 68 CoderEval problems with unreliable test suites."
    535     },
    536     {
    537       "title": "Reassessing Automatic Evaluation Metrics for Code Summarization Tasks",
    538       "relevance": "Motivates the study by demonstrating shortcomings of BLEU/ROUGE for code summarization, establishing the need for human or LLM judgment approaches."
    539     },
    540     {
    541       "title": "CodeUltraFeedback: An LLM-as-a-Judge Dataset for Aligning LLMs to Coding Preferences",
    542       "relevance": "Related work using LLM-as-a-judge for SE tasks, differing by focusing on non-functional requirements rather than correctness verification."
    543     },
    544     {
    545       "title": "Evaluating Language Models for Generating and Judging Programming Feedback",
    546       "relevance": "Related work evaluating LLMs as judges for beginner-level programming feedback using other LLMs as oracles rather than tests or human judgment."
    547     }
    548   ],
    549   "engagement_factors": {
    550     "practical_relevance": {
    551       "score": 3,
    552       "justification": "Directly informs whether practitioners can trust LLMs to evaluate code quality in automated pipelines, CI/CD tools, and code review systems."
    553     },
    554     "surprise_contrarian": {
    555       "score": 2,
    556       "justification": "The finding that GPT-4 misjudges 50% of wrong implementations as correct is more alarming than expected and the systematic bias against human-written code is a novel and counterintuitive result."
    557     },
    558     "fear_safety": {
    559       "score": 1,
    560       "justification": "Raises concerns about unreliable automated code review but does not address safety-critical system contexts directly."
    561     },
    562     "drama_conflict": {
    563       "score": 1,
    564       "justification": "No major controversy; the paper largely quantifies known limitations of LLM judges without challenging prominent claims from high-profile labs."
    565     },
    566     "demo_ability": {
    567       "score": 2,
    568       "justification": "The public replication package with prompts and datasets allows practitioners to test LLM judging behavior on their own code samples immediately."
    569     },
    570     "brand_recognition": {
    571       "score": 1,
    572       "justification": "Authors are from USI Switzerland and William & Mary — respected SE research groups but not top-tier AI labs; GPT-4 evaluation provides some brand association."
    573     }
    574   },
    575   "hn_data": {
    576     "threads": [
    577       {
    578         "hn_id": "45028439",
    579         "title": "No evidence ageing/declining populations compromise socio-economic performance",
    580         "points": 82,
    581         "comments": 101,
    582         "url": "https://news.ycombinator.com/item?id=45028439",
    583         "created_at": "2025-08-26T16:05:54Z"
    584       },
    585       {
    586         "hn_id": "47213997",
    587         "title": "Von Neumann on Consciousness in Quantum Mechanics",
    588         "points": 3,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=47213997",
    591         "created_at": "2026-03-02T04:46:53Z"
    592       },
    593       {
    594         "hn_id": "43557330",
    595         "title": "Ultra-high resolution multimodal MRI dense labelled holistic brain atlas",
    596         "points": 2,
    597         "comments": 0,
    598         "url": "https://news.ycombinator.com/item?id=43557330",
    599         "created_at": "2025-04-02T14:48:56Z"
    600       }
    601     ],
    602     "top_points": 82,
    603     "total_points": 87,
    604     "total_comments": 101
    605   }
    606 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs