scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18606B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "From Code to Courtroom: LLMs as the New Software Judges",
      6     "authors": [
      7       "Junda He",
      8       "Jieke Shi",
      9       "Terry Yue Zhuo",
     10       "Christoph Treude",
     11       "Jiamou Sun"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2503.02246",
     16     "doi": "10.48550/arXiv.2503.02246"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims a review of existing studies, identification of limitations, and a research roadmap — all three are delivered in Sections 3, 4, and the conclusion respectively.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper is a forward-looking vision and literature review; it makes no causal claims about interventions improving outcomes.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper generalizes broadly about the future of LLM-as-a-Judge in all of software engineering based on only 16 reviewed studies, without bounding claims to what that corpus can support.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper presents a one-sided pro-LLM-as-judge vision; it acknowledges field-level limitations but does not consider the alternative that LLMs may be fundamentally unsuitable as evaluation surrogates.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly discusses alignment with human judgment as the key validation criterion and distinguishes LLM assessments from actual software quality throughout the limitations section.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 4 ('The Road Ahead') contains six explicitly numbered limitations of the current field (Limitations 1–6).",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations discuss the reviewed field's shortcomings, not threats to the paper's own review methodology; there is no discussion of selection bias in the 16 papers chosen or the non-systematic nature of the review.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper notes it is 'not intended to be a definitive guide' but never explicitly states what its review does not cover or what claims cannot be drawn from 16 informally selected papers.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "There is no acknowledgments or funding section anywhere in the paper text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors list their affiliations explicitly (Singapore Management University, Monash University, CSIRO's Data61, Australian National University).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence of funder cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 2 provides a formal mathematical definition of LLM-as-a-Judge with typed inputs (T, C, X, R) and outputs (Y, E, F), and explicitly distinguishes it from broader LLM-based evaluation approaches.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists three contributions: a review of 16 primary studies, analysis of limitations and research gaps, and a forward-looking vision with a research roadmap.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper actively engages with prior work throughout — Section 3 maps 16 studies to SE tasks, and the definition section explicitly distinguishes this paper's framing from Wang et al.'s broader definition.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No search strategy is described; the 16 papers are listed without any explanation of how they were identified or retrieved.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No inclusion or exclusion criteria are stated anywhere in the paper; the selection of 16 studies is presented without methodology.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No PRISMA or other structured review protocol is mentioned or followed.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No search queries or terms are provided.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No databases or search sources are listed.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No screening process with counts at each stage is documented; papers appear selected informally.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The topic scope (LLM-as-a-Judge in SE) is stated but no justification is given for why these particular years, venues, or task types were chosen.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Limitation 2 explicitly discusses 'Inconsistent Empirical Findings,' citing that Wang et al. found traditional metrics outperform LLM-as-a-Judge while Wu et al. found the opposite for code summarization.",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of the 16 reviewed papers is performed; all are treated equally regardless of sample size or methodological rigor.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Publication bias is never mentioned; the paper does not acknowledge that its 16 reviewed studies may skew toward positive results for LLM-as-a-Judge.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The synthesis is entirely narrative; no meta-analysis, vote counting, or effect size aggregation is performed.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The 'opportunities' and roadmap items are largely speculative future directions not grounded in the reviewed evidence; they follow logically from identified gaps but are not empirically supported.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "84% of SE researchers agree that human evaluation is problematic due to time constraints, cost, and need for specialized knowledge.",
    201       "evidence": "Cited from Buse et al. [7], a 2011 OOPSLA paper on benefits and barriers of user evaluation.",
    202       "supported": "moderate"
    203     },
    204     {
    205       "claim": "There are only 16 primary studies on LLM-as-a-Judge in software engineering, indicating the field is in early stages.",
    206       "evidence": "Table 1 maps 16 references to SE tasks; the paper states 'the field remains in its early stages.'",
    207       "supported": "moderate"
    208     },
    209     {
    210       "claim": "Existing LLM-as-a-Judge benchmarks use only small-scale datasets, limiting generalizability.",
    211       "evidence": "Wang et al. [65] used 450 samples across three tasks; Ahmed et al. [1] used 420 samples for code summarization.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Conflicting empirical findings exist: Wang et al. found traditional metrics outperform LLM-as-a-Judge for code summarization, while Wu et al. found the opposite.",
    216       "evidence": "Both studies are cited directly and the conflict is characterized as a major challenge requiring standardized evaluation.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "LLMs do not experience fatigue, allowing consistent performance over extended periods unlike human evaluators.",
    221       "evidence": "Stated as a motivating attribute with no citation or empirical support; presented as an inherent property.",
    222       "supported": "unsupported"
    223     },
    224     {
    225       "claim": "LLM-as-a-Judge systems are susceptible to biases including position bias, verbosity bias, and egocentric bias in SE contexts.",
    226       "evidence": "Cites external NLP/ML bias papers [36, 28, 76] but notes there is 'a lack of thorough empirical investigation' in SE specifically — i.e., the claim is extrapolated, not demonstrated.",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "qualitative"
    232   ],
    233   "key_findings": "This SE 2030 vision paper reviews 16 studies on LLM-as-a-Judge in software engineering and identifies six major limitations: lack of large-scale human-annotated benchmarks, inconsistent empirical findings across studies, insufficient bias investigation, inadequate SE domain expertise in LLMs, over-reliance on internal LLM mechanisms, and insufficient research on adversarial threats. The paper proposes a research roadmap including creating comprehensive benchmarks, embedding expert tacit knowledge, integrating external SE tools, and developing adversarial defenses. The review is entirely non-systematic, with no stated search methodology, inclusion criteria, or quality assessment of the 16 source papers.",
    234   "red_flags": [
    235     {
    236       "flag": "Non-systematic selection",
    237       "detail": "16 papers are reviewed with no search strategy, inclusion/exclusion criteria, or screening process documented — the review is not reproducible and may reflect author familiarity rather than comprehensive coverage."
    238     },
    239     {
    240       "flag": "Self-citation cluster",
    241       "detail": "Multiple references ([55][56][57][74][75]) are co-authored by paper authors (Shi, He, Lo), creating potential citation bias in a paper arguing for a research agenda."
    242     },
    243     {
    244       "flag": "Speculative roadmap without empirical grounding",
    245       "detail": "The 2030 vision and roadmap items are normative prescriptions not derivable from the 16 reviewed papers; they represent author opinion about future directions rather than evidence-based conclusions."
    246     },
    247     {
    248       "flag": "No paper-level limitations",
    249       "detail": "The limitations section discusses the reviewed field's shortcomings, not the paper's own methodological limitations (non-systematic selection, small corpus, no quality assessment of sources)."
    250     },
    251     {
    252       "flag": "No funding disclosure",
    253       "detail": "No acknowledgments or funding statement appears in the paper; this omission is notable given the authors' institutional affiliations with CSIRO's Data61 (a government research agency)."
    254     }
    255   ],
    256   "cited_papers": [
    257     {
    258       "title": "Can LLMs Replace Human Evaluators? An Empirical Study of LLM-as-a-Judge in Software Engineering",
    259       "relevance": "Key empirical study reviewed; found traditional metrics outperform LLM-as-a-Judge for code summarization — directly motivates the paper's call for standardized benchmarks."
    260     },
    261     {
    262       "title": "Can Large Language Models Serve as Evaluators for Code Summarization?",
    263       "relevance": "Conflicting empirical finding vs. Wang et al.; found LLM-as-a-Judge outperforms conventional metrics for code summarization, exemplifying the inconsistency problem."
    264     },
    265     {
    266       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    267       "relevance": "Original LLM-as-a-Judge paper from NLP domain that the SE application builds upon; cited as foundational."
    268     },
    269     {
    270       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    271       "relevance": "Early SE-specific LLM evaluation work by a co-author; demonstrates reference-free evaluation of code generation."
    272     },
    273     {
    274       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    275       "relevance": "Demonstrates taxonomy-guided LLM evaluation of generated code; key example of multi-facet evaluation approach."
    276     },
    277     {
    278       "title": "Can LLMs Replace Manual Annotation of Software Engineering Artifacts?",
    279       "relevance": "Directly evaluates LLM-as-a-Judge across multiple SE tasks including code summarization, patches, and requirements; one of the 16 primary reviewed studies."
    280     },
    281     {
    282       "title": "LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods",
    283       "relevance": "Broader NLP survey on LLM evaluation that inspires the formal definition used in this paper."
    284     },
    285     {
    286       "title": "AIME: AI System Optimization via Multiple LLM Evaluators",
    287       "relevance": "Proposes combining multiple LLM evaluators to approximate optimal evaluation; cited as a recent methodological advance."
    288     }
    289   ],
    290   "engagement_factors": {
    291     "practical_relevance": {
    292       "score": 2,
    293       "justification": "SE practitioners and researchers evaluating LLM-generated code face real challenges addressed by this roadmap, though the paper offers no immediately usable tools."
    294     },
    295     "surprise_contrarian": {
    296       "score": 1,
    297       "justification": "The finding that only 16 studies exist in this rapidly growing area is somewhat surprising, but the paper's thesis (LLMs as judges are promising) is conventional wisdom."
    298     },
    299     "fear_safety": {
    300       "score": 1,
    301       "justification": "Section 4.4 raises adversarial attacks on LLM judges (obfuscated code, deceptive commit messages) as a security concern, but the treatment is brief and not alarming."
    302     },
    303     "drama_conflict": {
    304       "score": 1,
    305       "justification": "The conflicting findings between Wang et al. and Wu et al. on the same task are highlighted as a field-level problem, but not dramatized."
    306     },
    307     "demo_ability": {
    308       "score": 0,
    309       "justification": "Pure vision/roadmap paper with no implementation, tool, or demo; nothing to try."
    310     },
    311     "brand_recognition": {
    312       "score": 1,
    313       "justification": "Singapore Management University and CSIRO's Data61 are credible research institutions but not AI brand names that drive HN attention."
    314     }
    315   },
    316   "hn_data": {
    317     "threads": [
    318       {
    319         "hn_id": "43978357",
    320         "title": "Type-constrained code generation with language models",
    321         "points": 257,
    322         "comments": 127,
    323         "url": "https://news.ycombinator.com/item?id=43978357",
    324         "created_at": "2025-05-13T22:15:30Z"
    325       },
    326       {
    327         "hn_id": "45141762",
    328         "title": "Fantastic pretraining optimizers and where to find them",
    329         "points": 42,
    330         "comments": 4,
    331         "url": "https://news.ycombinator.com/item?id=45141762",
    332         "created_at": "2025-09-05T18:15:42Z"
    333       },
    334       {
    335         "hn_id": "30665928",
    336         "title": "PERCEPT: Online change-point detection using topological data analysis",
    337         "points": 8,
    338         "comments": 0,
    339         "url": "https://news.ycombinator.com/item?id=30665928",
    340         "created_at": "2022-03-13T21:31:04Z"
    341       },
    342       {
    343         "hn_id": "43997113",
    344         "title": "An Empirical Study on the Performance and Energy Usage of Compiled Python Code",
    345         "points": 3,
    346         "comments": 0,
    347         "url": "https://news.ycombinator.com/item?id=43997113",
    348         "created_at": "2025-05-15T17:12:36Z"
    349       },
    350       {
    351         "hn_id": "39686242",
    352         "title": "Random Networks are not Random Functions",
    353         "points": 3,
    354         "comments": 0,
    355         "url": "https://news.ycombinator.com/item?id=39686242",
    356         "created_at": "2024-03-12T23:39:00Z"
    357       },
    358       {
    359         "hn_id": "44461553",
    360         "title": "SegmentAnyMuscle: A muscle segmentation model across different locations in MRI",
    361         "points": 2,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=44461553",
    364         "created_at": "2025-07-04T06:01:44Z"
    365       },
    366       {
    367         "hn_id": "43926603",
    368         "title": "Pearch.ai beat LinkedIn's AI search in a head-to-head benchmark",
    369         "points": 1,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=43926603",
    372         "created_at": "2025-05-08T14:50:43Z"
    373       },
    374       {
    375         "hn_id": "43908546",
    376         "title": "Performance and Energy Usage of Compiled Python",
    377         "points": 1,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=43908546",
    380         "created_at": "2025-05-06T19:03:58Z"
    381       }
    382     ],
    383     "top_points": 257,
    384     "total_points": 317,
    385     "total_comments": 131
    386   }
    387 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs