scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18945B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "From Code to Courtroom: LLMs as the New Software Judges",
      6     "authors": [
      7       "Junda He",
      8       "Jieke Shi",
      9       "Terry Yue Zhuo",
     10       "Christoph Treude",
     11       "Jiamou Sun",
     12       "Zhenchang Xing",
     13       "Xiaoning Du",
     14       "David Lo"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2503.02246",
     19     "doi": "10.48550/arXiv.2503.02246"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract claims are supported: the paper does review 16 studies (Section 3, Table 1), analyzes limitations (Section 4 with 6 labeled limitations), and proposes a research roadmap with specific directions.",
     27         "source": "opus"
     28       },
     29       "causal_claims_justified": {
     30         "applies": false,
     31         "answer": false,
     32         "justification": "The paper makes no causal claims. It describes the state of the field and proposes future research directions without claiming causal relationships.",
     33         "source": "opus"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper title ('LLMs as the New Software Judges') and vision claims are sweeping relative to the evidence base of 16 papers. The abstract hedges with 'While not intended to be a definitive guide' but the roadmap and 2030 vision make broad claims about the entire SE community's trajectory based on a small literature set.",
     39         "source": "opus"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "This is a pure survey and vision paper that presents no empirical results of its own. Alternative explanations are not applicable.",
     45         "source": "opus"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "The paper makes no measurements of its own. It is a literature review and roadmap with no proxy/outcome gap to address.",
     51         "source": "opus"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Section 4 discusses six limitations of the LLM-as-a-Judge FIELD, but there is no section discussing limitations of this review itself — its methodology, potential selection bias, or gaps in coverage.",
     59         "source": "opus"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No threats to validity of the survey methodology are discussed. The paper does not address whether its 16-paper sample is representative, whether its search was comprehensive, or whether the roadmap reflects author bias.",
     65         "source": "opus"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The abstract says 'While not intended to be a definitive guide' but does not state specific scope boundaries — what SE domains are excluded, what paper types were filtered out, or what the review does NOT cover. The formal definition in Section 2 excludes embedding-based methods, which is the closest to a scope boundary.",
     71         "source": "opus"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper.",
     79         "source": "opus"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are clearly listed: Singapore Management University, Monash University, CSIRO's Data61, and Australian National University.",
     85         "source": "opus"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of funding.",
     91         "source": "opus"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     97         "source": "opus"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "LLM-as-a-Judge is formally defined in Section 2 with a mathematical formulation E(T,C,X,R)→(Y,E,F) covering inputs and outputs, and the distinction from LLM-based evaluation is explicitly drawn.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 1 lists three explicit contributions: review of 16 primary studies, analysis of limitations and gaps, and a forward-looking vision with roadmap toward 2030.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper engages substantively with prior work, differentiating its stricter definition from Wang et al.'s broader definition and discussing specific findings from individual studies throughout Section 3.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "survey": {
    123       "search_and_selection": {
    124         "search_strategy_reproducible": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No search strategy is described; the paper reviews '16 primary studies' with no explanation of how they were identified, making reproduction impossible.",
    128           "source": "haiku"
    129         },
    130         "inclusion_exclusion_explicit": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No inclusion or exclusion criteria are stated; the basis for selecting exactly these 16 papers is entirely unexplained.",
    134           "source": "haiku"
    135         },
    136         "prisma_or_structured_protocol": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No PRISMA flowchart or any structured review protocol is mentioned or followed.",
    140           "source": "haiku"
    141         },
    142         "search_terms_provided": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No search queries or keywords used to identify papers are provided anywhere in the paper.",
    146           "source": "haiku"
    147         },
    148         "databases_listed": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No databases (e.g., IEEE Xplore, ACM DL, arXiv, Google Scholar) are listed as having been searched.",
    152           "source": "haiku"
    153         },
    154         "screening_process_documented": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No screening process, stage counts, or exclusion reasons are documented; papers appear to have been selected informally.",
    158           "source": "haiku"
    159         },
    160         "review_scope_justified": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The review covers LLM-as-a-Judge in SE without justifying why this scope, these years, or these venues were chosen.",
    164           "source": "haiku"
    165         }
    166       },
    167       "synthesis_quality": {
    168         "conflicting_findings_acknowledged": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "The paper explicitly acknowledges conflicting findings: Wang et al. found traditional metrics outperformed LLM-as-a-Judge for code summarization while Wu et al. found the opposite, and calls this a key limitation.",
    172           "source": "haiku"
    173         },
    174         "quality_assessment_of_sources": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of the reviewed papers is applied; all 16 papers are treated as equally credible.",
    178           "source": "haiku"
    179         },
    180         "publication_bias_discussed": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "Publication bias is not discussed; the survey does not acknowledge that positive results in LLM-as-a-Judge research are more likely to be published.",
    184           "source": "haiku"
    185         },
    186         "quantitative_synthesis_present": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "The synthesis is entirely narrative; no meta-analysis, effect size aggregation, or vote counting is performed across the reviewed studies.",
    190           "source": "haiku"
    191         },
    192         "recommendations_supported_by_evidence": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "Recommendations (e.g., 'use structured interviews and think-aloud protocols', 'integrate static analyzers') are generic and not directly derived from evidence in the reviewed papers.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "LLMs offer a promising solution to evaluate LLM-generated software artifacts as cost-effective surrogates for human evaluators.",
    204       "evidence": "Cites studies showing LLMs have strong coding ability and RLHF-based alignment with human judgment, plus 16 reviewed studies using LLMs for evaluation tasks.",
    205       "supported": "weak"
    206     },
    207     {
    208       "claim": "Current LLM-as-a-Judge research in SE relies on small-scale datasets (e.g., 450 and 420 samples) insufficient for generalizability.",
    209       "evidence": "Directly cites Wang et al. [65] using 450 samples across three SE tasks and Ahmed et al. [1] using 420 samples for code summarization.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Empirical findings on LLM-as-a-Judge for code summarization are inconsistent across studies.",
    214       "evidence": "Wang et al. found traditional metrics outperformed LLM-as-a-Judge; Wu et al. found the opposite — two directly cited studies with opposing conclusions.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "LLM-as-a-Judge systems in SE are susceptible to position bias, verbosity bias, and egocentric bias.",
    219       "evidence": "Cited from NLP literature [28, 36, 76]; explicitly acknowledged as a gap in the SE context with no SE-specific evidence provided.",
    220       "supported": "weak"
    221     },
    222     {
    223       "claim": "By 2030, LLM-as-a-Judge systems can serve as reliable, robust, and scalable human surrogates for evaluating software artifacts.",
    224       "evidence": "No empirical evidence or trajectory analysis provided; purely a forward-looking projection.",
    225       "supported": "unsupported"
    226     },
    227     {
    228       "claim": "Adversarial threats to LLM-as-a-Judge in SE (code obfuscation, misleading comments) are under-explored.",
    229       "evidence": "No SE-specific studies of adversarial attacks on LLM-as-a-Judge are cited; the authors note this as a gap and rely on general adversarial ML literature.",
    230       "supported": "moderate"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "qualitative"
    235   ],
    236   "key_findings": "This informal forward-looking survey reviews 16 primary studies on LLM-as-a-Judge in software engineering without a systematic search protocol, identifying six key limitations in existing work: lack of large-scale human-annotated benchmarks, inconsistent empirical findings across studies, insufficient bias research in SE contexts, inadequate SE domain expertise in LLMs, over-reliance on internal evaluation mechanisms, and under-explored adversarial threats. The paper proposes a research roadmap toward making LLM-as-a-Judge systems reliable and scalable by 2030, but presents no new empirical findings of its own. The review of 16 studies is not systematically conducted — no search terms, databases, inclusion criteria, or PRISMA protocol are documented, undermining the 'comprehensive review' framing.",
    237   "red_flags": [
    238     {
    239       "flag": "No systematic search methodology",
    240       "detail": "The paper reviews '16 primary studies' with no description of how they were found, selected, or screened — calling this a 'comprehensive review' is misleading for a survey paper."
    241     },
    242     {
    243       "flag": "Speculative 2030 vision without empirical grounding",
    244       "detail": "The paper projects that LLMs will be 'reliable, robust, and scalable human surrogates' by 2030 with no empirical trajectory analysis or quantitative baseline to support this claim."
    245     },
    246     {
    247       "flag": "No limitations section for the paper itself",
    248       "detail": "Section 4 discusses limitations of prior work; the paper has no self-assessment of its own methodological limitations as a survey."
    249     },
    250     {
    251       "flag": "No funding or competing interests disclosure",
    252       "detail": "No acknowledgments, funding sources, or competing interests are disclosed despite multiple institutional affiliations."
    253     },
    254     {
    255       "flag": "Overclaims from thin evidence base",
    256       "detail": "Broad claims about LLMs as 'the new software judges' are generalized from 16 informally selected studies, many with datasets of only hundreds of samples."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    262       "relevance": "Foundational paper establishing the LLM-as-a-Judge paradigm that this survey is built upon."
    263     },
    264     {
    265       "title": "Can LLMs Replace Human Evaluators? An Empirical Study of LLM-as-a-Judge in Software Engineering",
    266       "relevance": "Key primary study comparing LLM-as-a-Judge with traditional metrics across three SE tasks; produced conflicting findings with Wu et al."
    267     },
    268     {
    269       "title": "Can Large Language Models Serve as Evaluators for Code Summarization?",
    270       "relevance": "Introduced CODERPE multi-role framework; found LLM-as-a-Judge outperformed traditional metrics — conflicting with Wang et al."
    271     },
    272     {
    273       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    274       "relevance": "Proposed taxonomy-guided LLM evaluation of code generation using error classification; primary study reviewed."
    275     },
    276     {
    277       "title": "Can LLMs replace manual annotation of software engineering artifacts?",
    278       "relevance": "Evaluated LLMs for multiple SE annotation tasks including code summarization and variable naming; primary study reviewed."
    279     },
    280     {
    281       "title": "LLMs-as-judges: a comprehensive survey on LLM-based evaluation methods",
    282       "relevance": "Broader NLP survey of LLM-as-judge that this SE-specific paper builds upon, refines, and applies to the SE domain."
    283     },
    284     {
    285       "title": "CodeJudge-Eval: Can Large Language Models be Good Judges in Code Understanding?",
    286       "relevance": "Empirically evaluated 12 LLMs on code generation evaluation tasks; primary study reviewed."
    287     },
    288     {
    289       "title": "AIME: AI System Optimization via Multiple LLM Evaluators",
    290       "relevance": "Proposed multi-LLM evaluation framework combining evaluators for correctness, readability, and performance — representative of the future the roadmap envisions."
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 2,
    296       "justification": "Directly relevant to practitioners building LLM-based SE tools who need scalable evaluation methods, though the paper offers no immediately usable tools or benchmarks."
    297     },
    298     "surprise_contrarian": {
    299       "score": 1,
    300       "justification": "Surfacing conflicting results across studies is mildly interesting but the overall message — 'LLMs are promising but need more work' — is not surprising."
    301     },
    302     "fear_safety": {
    303       "score": 1,
    304       "justification": "Adversarial attacks on LLM-as-a-judge systems (code obfuscation to manipulate evaluation outcomes) are briefly raised as an under-explored threat."
    305     },
    306     "drama_conflict": {
    307       "score": 1,
    308       "justification": "Directly conflicting empirical results between Wang et al. and Wu et al. on LLM-as-a-judge effectiveness provide mild tension."
    309     },
    310     "demo_ability": {
    311       "score": 0,
    312       "justification": "Pure roadmap paper with no implemented system, benchmark, dataset, or demo for readers to try."
    313     },
    314     "brand_recognition": {
    315       "score": 1,
    316       "justification": "Mentions GitHub Copilot and Cursor as motivating examples; no results from flagship AI labs."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [
    321       {
    322         "hn_id": "43978357",
    323         "title": "Type-constrained code generation with language models",
    324         "points": 257,
    325         "comments": 127,
    326         "url": "https://news.ycombinator.com/item?id=43978357",
    327         "created_at": "2025-05-13T22:15:30Z"
    328       },
    329       {
    330         "hn_id": "45141762",
    331         "title": "Fantastic pretraining optimizers and where to find them",
    332         "points": 42,
    333         "comments": 4,
    334         "url": "https://news.ycombinator.com/item?id=45141762",
    335         "created_at": "2025-09-05T18:15:42Z"
    336       },
    337       {
    338         "hn_id": "30665928",
    339         "title": "PERCEPT: Online change-point detection using topological data analysis",
    340         "points": 8,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=30665928",
    343         "created_at": "2022-03-13T21:31:04Z"
    344       },
    345       {
    346         "hn_id": "43997113",
    347         "title": "An Empirical Study on the Performance and Energy Usage of Compiled Python Code",
    348         "points": 3,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=43997113",
    351         "created_at": "2025-05-15T17:12:36Z"
    352       },
    353       {
    354         "hn_id": "39686242",
    355         "title": "Random Networks are not Random Functions",
    356         "points": 3,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=39686242",
    359         "created_at": "2024-03-12T23:39:00Z"
    360       },
    361       {
    362         "hn_id": "44461553",
    363         "title": "SegmentAnyMuscle: A muscle segmentation model across different locations in MRI",
    364         "points": 2,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=44461553",
    367         "created_at": "2025-07-04T06:01:44Z"
    368       },
    369       {
    370         "hn_id": "43926603",
    371         "title": "Pearch.ai beat LinkedIn's AI search in a head-to-head benchmark",
    372         "points": 1,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=43926603",
    375         "created_at": "2025-05-08T14:50:43Z"
    376       },
    377       {
    378         "hn_id": "43908546",
    379         "title": "Performance and Energy Usage of Compiled Python",
    380         "points": 1,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=43908546",
    383         "created_at": "2025-05-06T19:03:58Z"
    384       }
    385     ],
    386     "top_points": 257,
    387     "total_points": 317,
    388     "total_comments": 131
    389   }
    390 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs