scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22352B)
      1 {
      2   "paper": {
      3     "title": "From Code to Courtroom: LLMs as the New Software Judges",
      4     "authors": [
      5       "Junda He",
      6       "Jieke Shi",
      7       "Terry Yue Zhuo",
      8       "Christoph Treude",
      9       "Jiamou Sun",
     10       "Zhenchang Xing",
     11       "Xiaoning Du",
     12       "David Lo"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2503.02246",
     17     "doi": "10.48550/arXiv.2503.02246"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["survey_methodology"],
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No code or analysis scripts are released. No repository URL is mentioned anywhere in the paper."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The list of 16 reviewed papers is in Table 1, but no structured dataset, extraction forms, or supplementary materials are released."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment or tooling specification is provided. A survey could release its analysis environment but this one does not."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No instructions for reproducing the literature review are provided. The search methodology is not documented well enough to replicate."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "This is a qualitative literature review with no statistical analysis or experiments."
     49       },
     50       "significance_tests": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "No experiments or statistical comparisons are performed."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No quantitative analysis is conducted; the review is entirely qualitative."
     59       },
     60       "sample_size_justified": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "No experiments or quantitative analysis. The paper reviews literature without statistical aggregation."
     64       },
     65       "variance_reported": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No experiments with multiple runs. This is a qualitative survey paper."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper does not compare its review against prior surveys or reviews on LLM-as-a-Judge. While it references Li et al. [33] and Wang et al. [65] for definitional contrast, it does not systematically compare its coverage or findings against other reviews."
     76       },
     77       "baselines_contemporary": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "No experiments requiring baselines. This is a survey/vision paper."
     81       },
     82       "ablation_study": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No system components to ablate. This is a literature review."
     86       },
     87       "multiple_metrics": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No experiments with metrics. This is a qualitative literature review."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No system outputs to evaluate. This is a survey paper."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "No experiments requiring train/test splits."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 1 organizes the 16 reviewed studies by SE task (code generation, code summarization, bug report summarization, code translation, question answering, requirements causality extraction, code patches generation). Sections 3.1-3.4 discuss each category."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4 identifies six specific limitations of current LLM-as-a-Judge approaches: lack of benchmarks, inconsistent findings, unexplored biases, inadequate domain expertise, reliance on internal evaluation only, and insufficient adversarial research."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper explicitly discusses conflicting findings in the literature. Limitation 2 (Section 4.1) notes that Wang et al. [65] found traditional metrics outperformed LLM-as-a-Judge for code summarization, while Wu et al. [68] found the opposite."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims are supported: the paper does review 16 studies (Section 3, Table 1), analyzes limitations (Section 4 with 6 labeled limitations), and proposes a research roadmap with specific directions."
    123       },
    124       "causal_claims_justified": {
    125         "applies": false,
    126         "answer": false,
    127         "justification": "The paper makes no causal claims. It describes the state of the field and proposes future research directions without claiming causal relationships."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper title ('LLMs as the New Software Judges') and vision claims are sweeping relative to the evidence base of 16 papers. The abstract hedges with 'While not intended to be a definitive guide' but the roadmap and 2030 vision make broad claims about the entire SE community's trajectory based on a small literature set."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This is a pure survey and vision paper that presents no empirical results of its own. Alternative explanations are not applicable."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper makes no measurements of its own. It is a literature review and roadmap with no proxy/outcome gap to address."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "No models are used. This is a literature review."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No prompting is used in this paper."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "No experiments are conducted."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not describe how the 16 primary studies were identified. No search queries, databases, date ranges, or inclusion/exclusion criteria are documented. The paper states it reviews '16 primary studies' without explaining the selection methodology."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Section 4 discusses six limitations of the LLM-as-a-Judge FIELD, but there is no section discussing limitations of this review itself — its methodology, potential selection bias, or gaps in coverage."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No threats to validity of the survey methodology are discussed. The paper does not address whether its 16-paper sample is representative, whether its search was comprehensive, or whether the roadmap reflects author bias."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The abstract says 'While not intended to be a definitive guide' but does not state specific scope boundaries — what SE domains are excluded, what paper types were filtered out, or what the review does NOT cover. The closest thing to a scope boundary is the formal definition in Section 2, which excludes embedding-based methods."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data or extraction tables are available. The review findings are presented narratively without structured data that could be independently verified."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The paper does not describe how the 16 primary studies were found. No search databases, queries, date ranges, or selection criteria are documented."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants in this study."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No pipeline from literature search to final review is documented. The paper jumps from stating it reviews 16 papers to discussing their content, with no description of how the set was assembled."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Singapore Management University, Monash University, CSIRO's Data61, and Australian National University."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of funding."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial disclosure statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a literature review."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No model evaluation on benchmarks is performed."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No benchmark evaluation is conducted in this paper."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "This is a survey/vision paper with no computational experiments."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "This is a survey/vision paper with no computational experiments."
    297       }
    298     },
    299     "survey_methodology": {
    300       "prisma_or_structured_protocol": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No PRISMA flow diagram, no systematic protocol, no documented search strategy. The paper does not describe how the 16 primary studies were identified or selected. The review appears to be ad-hoc rather than following a structured review protocol."
    304       },
    305       "quality_assessment_of_sources": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper does not assess the methodological quality of its 16 reviewed studies. All studies are treated equally regardless of rigor. No quality scoring rubric or risk-of-bias assessment is applied."
    309       },
    310       "publication_bias_discussed": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No discussion of publication bias. The paper does not consider whether its sources are biased toward positive results about LLM-as-a-Judge, nor does it discuss whether negative findings about LLM-as-a-Judge are underrepresented."
    314       }
    315     }
    316   },
    317   "claims": [
    318     {
    319       "claim": "LLM-as-a-Judge systems offer a promising solution to address the limitations of both costly human evaluation and traditional automated metrics in SE.",
    320       "evidence": "Section 1 and Section 2 argue this based on LLMs' coding abilities, human-like reasoning, RLHF alignment, and lack of fatigue. No original empirical evidence is provided; the claim rests on cited studies.",
    321       "supported": "weak"
    322     },
    323     {
    324       "claim": "The LLM-as-a-Judge field in SE is in its early stages, with only 16 primary studies identified.",
    325       "evidence": "Table 1 lists 16 studies across 7 SE tasks. Section 3 reviews each. The small number itself is the evidence for the claim.",
    326       "supported": "moderate"
    327     },
    328     {
    329       "claim": "Existing LLM-as-a-Judge benchmarks rely on small-scale datasets, often only a few hundred samples.",
    330       "evidence": "Section 4.1 cites Wang et al. [65] using 450 samples across 3 tasks and Ahmed et al. [1] using 420 samples for code summarization.",
    331       "supported": "moderate"
    332     },
    333     {
    334       "claim": "Empirical findings on LLM-as-a-Judge in SE are inconsistent, with conflicting conclusions across studies.",
    335       "evidence": "Section 4.1 Limitation 2: Wang et al. [65] found traditional metrics outperformed LLM-as-a-Judge for code summarization, while Wu et al. [68] found the opposite.",
    336       "supported": "moderate"
    337     },
    338     {
    339       "claim": "LLM-as-a-Judge systems can achieve reliable, robust, and scalable evaluation of software artifacts by 2030.",
    340       "evidence": "Section 4 presents a roadmap with opportunities but no empirical support for this timeline or feasibility. This is a vision statement, not an evidence-based prediction.",
    341       "supported": "unsupported"
    342     }
    343   ],
    344   "methodology_tags": ["meta-analysis"],
    345   "key_findings": "This paper reviews 16 studies on LLM-as-a-Judge for software engineering evaluation and identifies six key limitations: lack of large-scale human-annotated benchmarks, inconsistent empirical findings, unexplored biases, inadequate SE domain expertise in LLMs, reliance on internal evaluation mechanisms only, and insufficient adversarial threat research. The paper proposes a research roadmap toward reliable, robust, and scalable LLM-as-a-Judge systems by 2030, including better benchmarks, domain-specific training, tool integration, human-in-the-loop approaches, and adversarial defenses. The review covers code generation, code summarization, code changes, and other SE tasks but is a qualitative narrative review rather than a systematic review with structured methodology.",
    346   "red_flags": [
    347     {
    348       "flag": "No systematic review methodology",
    349       "detail": "The paper reviews 16 studies but provides no search methodology — no databases searched, no search queries, no inclusion/exclusion criteria, no PRISMA diagram. It is impossible to determine whether the 16 papers are a representative or comprehensive sample. The review appears ad-hoc."
    350     },
    351     {
    352       "flag": "Sweeping vision claims from thin evidence base",
    353       "detail": "The paper makes broad claims about what LLM-as-a-Judge can achieve by 2030 based on a review of only 16 papers, without quality assessment of those papers. The roadmap is aspirational rather than evidence-derived."
    354     },
    355     {
    356       "flag": "No quality assessment of reviewed studies",
    357       "detail": "All 16 reviewed papers are treated equally. No quality scoring, risk-of-bias assessment, or structured evaluation is applied. This is exactly the 'laundering weak results' problem: findings from rigorous and weak papers are given equal weight in shaping the roadmap."
    358     },
    359     {
    360       "flag": "Self-citation pattern",
    361       "detail": "Several of this paper's authors are also authors of the studies it reviews (e.g., Terry Yue Zhuo is an author of both this paper and reviewed study [88]; Jieke Shi, Junda He, and David Lo appear in several reviewed papers [55-57, 67, 72-75]). The overlap between reviewers and reviewed is not discussed."
    362     }
    363   ],
    364   "cited_papers": [
    365     {
    366       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    367       "authors": ["Terry Yue Zhuo"],
    368       "year": 2023,
    369       "arxiv_id": "2304.14317",
    370       "relevance": "Directly uses LLMs (GPT-3.5) to evaluate code generation quality — core to the LLM-as-a-Judge paradigm being surveyed."
    371     },
    372     {
    373       "title": "Can LLMs Replace Human Evaluators? An Empirical Study of LLM-as-a-Judge in Software Engineering",
    374       "authors": ["Ruiqi Wang", "Jiyu Guo", "Cuiyun Gao", "Guodong Fan", "Chun Yong Chong", "Xin Xia"],
    375       "year": 2025,
    376       "arxiv_id": "2502.06193",
    377       "relevance": "Empirical study comparing LLM-as-a-Judge with traditional metrics for SE tasks including code generation, summarization, and translation."
    378     },
    379     {
    380       "title": "Can Large Language Models Serve as Evaluators for Code Summarization?",
    381       "authors": ["Yang Wu", "Yao Wan", "Zhaoyang Chu", "Wenting Zhao", "Ye Liu", "Hongyu Zhang", "Xuanhua Shi", "Philip S Yu"],
    382       "year": 2024,
    383       "arxiv_id": "2412.01333",
    384       "relevance": "Introduces CODERPE framework using multi-role LLM evaluation for code summaries, directly relevant to LLM-as-a-Judge methodology."
    385     },
    386     {
    387       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    388       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    389       "year": 2024,
    390       "relevance": "Foundational LLM-as-a-Judge paper establishing evaluation methodology with MT-Bench and Chatbot Arena."
    391     },
    392     {
    393       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    394       "authors": ["Weixi Tong", "Tianyi Zhang"],
    395       "year": 2024,
    396       "relevance": "Uses LLMs with a taxonomy of programming errors to evaluate generated code, a specialized LLM-as-a-Judge approach."
    397     },
    398     {
    399       "title": "Can LLMs replace manual annotation of software engineering artifacts?",
    400       "authors": ["Toufique Ahmed", "Premkumar Devanbu", "Christoph Treude", "Michael Pradel"],
    401       "year": 2024,
    402       "arxiv_id": "2408.05534",
    403       "relevance": "Investigates LLM capability to replace human annotation across multiple SE tasks including code summarization and function similarity."
    404     },
    405     {
    406       "title": "AIME: AI System Optimization via Multiple LLM Evaluators",
    407       "authors": ["Bhrij Patel", "Souradip Chakraborty", "Wesley A Suttle", "Mengdi Wang", "Amrit Singh Bedi", "Dinesh Manocha"],
    408       "year": 2024,
    409       "arxiv_id": "2410.03131",
    410       "relevance": "Proposes multi-LLM evaluation framework for code quality assessment including correctness, readability, and runtime performance."
    411     },
    412     {
    413       "title": "Evaluating large language models trained on code",
    414       "authors": ["Mark Chen", "Jerry Tworek"],
    415       "year": 2021,
    416       "arxiv_id": "2107.03374",
    417       "relevance": "Introduces HumanEval benchmark and Pass@k metric, foundational to code generation evaluation."
    418     },
    419     {
    420       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    421       "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"],
    422       "year": 2024,
    423       "arxiv_id": "2406.15877",
    424       "relevance": "Major code generation benchmark with diverse function calls, relevant to the evaluation infrastructure for LLM code capabilities."
    425     },
    426     {
    427       "title": "LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods",
    428       "authors": ["Haitao Li", "Qian Dong", "Junjie Chen"],
    429       "year": 2024,
    430       "arxiv_id": "2412.05579",
    431       "relevance": "Comprehensive survey on LLM-based evaluation methods that provided the definitional framework adopted in this paper."
    432     },
    433     {
    434       "title": "CodeJudge-Eval: Can Large Language Models be Good Judges in Code Understanding?",
    435       "authors": ["Yuwei Zhao", "Ziyang Luo", "Yuchen Tian"],
    436       "year": 2025,
    437       "relevance": "Empirically evaluates 12 LLMs as code generation judges, providing evidence on which models are effective evaluators."
    438     },
    439     {
    440       "title": "G-Eval: NLG Evaluation Using GPT-4 with Better Human Alignment",
    441       "authors": ["Yang Liu", "Dan Iter", "Yichong Xu", "Shuohang Wang", "Ruochen Xu", "Chenguang Zhu"],
    442       "year": 2023,
    443       "arxiv_id": "2303.16634",
    444       "relevance": "General-purpose LLM-as-a-Judge method tested in SE contexts, represents cross-pollination from NLP to SE evaluation."
    445     }
    446   ]
    447 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs