scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31888B)
      1 {
      2   "paper": {
      3     "title": "On the Effectiveness of LLM-as-a-judge for Code Generation and Summarization",
      4     "authors": [
      5       "Giuseppe Crupi",
      6       "Rosalia Tufano",
      7       "Alejandro Velasco",
      8       "Antonio Mastropaolo",
      9       "Denys Poshyvanyk",
     10       "Gabriele Bavota"
     11     ],
     12     "year": 2025,
     13     "venue": "IEEE Transactions on Software Engineering",
     14     "arxiv_id": "2507.16587",
     15     "doi": "10.1109/TSE.2025.3586082"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper references a replication package at [1] with a GitHub URL (https://github.com/crupig/LLMs-as-a-judge-for-SE-tse_RP) mentioned in the references section and cited throughout the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper states 'we built (and make publicly available [1]) our own dataset that features human judgments of 1,163 summaries' (Section 2.2.2). CoderEval is also a public benchmark. All data is in the replication package."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specification (requirements.txt, Dockerfile, library versions) is mentioned. The paper mentions using Hugging Face inference endpoints and ChatGPT APIs but provides no reproducible environment details."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are included in the paper. The replication package is referenced but no README with commands or a 'Reproducing Results' section is described."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results (Cohen's Kappa, Krippendorff's alpha, accuracy, bias coefficients) are reported as point estimates with no confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Mann-Whitney U tests with Benjamini-Hochberg correction are used for the self-bias analysis (Section 2.4, Tables 3 and 6). Adjusted p-values are reported at multiple significance levels."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Cliff's delta effect sizes are reported alongside statistical tests (Tables 3 and 6), with interpretation thresholds defined: negligible, small, medium, large (Section 2.4)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No power analysis or justification for sample sizes. The 184 Java and 190 Python code generation problems, and 198 functions for summarization, are used without justifying adequacy."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance or standard deviation reported across experimental runs. LLM outputs are stochastic, but all results appear from single runs with no spread measures."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Eight LLMs of varying sizes are compared against each other and against ground truth (test execution for code generation, human judgments for summarization). Four different prompting strategies are compared (Table 2, Table 5)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "For a paper published in July 2025, the models are dated. GPT-4-turbo was the strongest model tested, but GPT-4o (May 2024), Claude 3/3.5, Llama 3, and other 2024-2025 models are absent. No justification is given for the model selection timeframe."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper ablates along multiple dimensions: four prompt strategies (Table 2, Table 5), self-contained vs. non-self-contained functions (Section 3.1), and the mutant injection + semantic equivalence study (Section 3.1.2, Fig. 2)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics used: Cohen's Kappa, confusion matrices (TP/TN/FP/FN rates, accuracy), bias coefficient for code generation; Krippendorff's alpha and scatterplots for summarization."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Nine human judges independently evaluated 1,163 code summaries across three quality criteria (content adequacy, conciseness, fluency & understandability) on a 1-5 scale, with each summary assessed by three judges (Section 2.2.2)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Prompts were developed using 'toy examples' separate from the evaluation data. The main evaluation was conducted on CoderEval (code generation) and the purpose-built summarization dataset, which were not used for prompt tuning (Section 2.3.1)."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by language (Java/Python), by model, by model family, by prompt strategy, and by function type (self-contained vs. dependent). Confusion matrices per model (Fig. 1) and per-criterion scores for summarization (Table 5)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 3.1.2 provides detailed manual analysis of false positives (37% uncaught wrong behavior, 32% coding context, 27% ambiguous requirements) and false negatives (33% hallucination, 19% code misunderstanding). Table 1 documents judging failures."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The main findings are largely negative: smaller LLMs completely fail at judging, GPT-4-turbo misjudges 50% of wrong Java implementations. The paper explicitly reports what doesn't work. DeepSeek Coder family was excluded from summarization entirely due to inability to perform the task."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: GPT-4-turbo as best judge (Tables 2, 5), smaller LLMs struggling (Tables 1, 2, 5), frequent misjudgments of code correctness (Fig. 1, 50% FP rate), moderate agreement on summarization (Table 5, α=0.58-0.63 for content adequacy)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper primarily makes comparative/descriptive claims about judging effectiveness rather than strong causal claims. Where causal reasoning is used (e.g., coding context as a factor in misjudgments), it is tested empirically by analyzing self-contained functions separately (Section 3.1)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 4 explicitly states: 'the generalizability of our findings is capped by (i) the two code-related tasks subject of the study and (ii) the focus on the Java and Python programming languages. Differentiated replications can help to corroborate/contradict our findings.'"
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 4 discusses construct validity (tests as proxy for correctness), prompt impact, and manual analysis subjectivity. Section 3.1 investigates coding context as an alternative explanation for misjudgments. Section 3.1.2 systematically analyzes reasons behind false judgments."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 4 explicitly acknowledges: 'Using tests as a proxy for code correctness is a limitation of our study.' They also document cases where the proxy fails (unreliable test suites, Section 2.2.1) and excluded problematic cases."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Models are named as 'GPT-3.5-turbo' and 'GPT-4-turbo' without snapshot dates or API versions. DeepSeek Coder and CodeLlama sizes are specified (1.3B, 6.7B, 33B; 7B, 13B, 34B) but exact checkpoint versions are not given."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Full prompt text is provided for the zero-shot code generation prompt (Section 2.3.1), the automated CoT prompts (Section 2.3.1), and the zero-shot summarization prompt (Section 2.3.2). Additional prompts are in the replication package [1]."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any model. These settings significantly affect LLM output and are essential for reproducibility."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. LLMs are prompted directly via API calls or Hugging Face inference endpoints."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Detailed quality assurance pipeline for CoderEval documented in Section 2.2.1: 230 Java → 210 (excluded 20 failing targets) → 201 (excluded 9 empty-body passes) → 184 (excluded 17 dummy-function passes). Similar filtering for Python: 230 → 191 → 190. Summarization dataset construction also documented."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4 'Threats to Validity' is a dedicated section covering construct, internal, and external validity threats."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4 discusses specific threats: test suites as imperfect proxies for correctness (with mitigation), subjectivity in manual analysis (mitigated by multiple evaluators), prompt sensitivity (tested with four variants), and language/task scope limitations."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 4 explicitly states what the results do NOT show: 'the generalizability of our findings is capped by (i) the two code-related tasks subject of the study and (ii) the focus on the Java and Python programming languages.' Future work specifies additional tasks and fine-tuning as unexplored directions."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The replication package [1] is referenced as containing all data: prompts, judgments, generated code, human evaluations. The summarization dataset is explicitly stated as 'publicly available [1]' (Section 2.2.2)."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Data collection is described in detail: CoderEval benchmark selection and quality filtering (Section 2.2.1), summarization dataset construction from top-100 longest functions per language (Section 2.2.2), LLM output extraction using lizard code analyzer (Section 2.3.1)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The 9 human judges are characterized by qualifications (Master's/PhD, years of experience) but how they were recruited is not described. No information on recruitment channels or whether they were lab members, students, or external participants."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The full pipeline is documented: CoderEval quality filtering with counts at each stage (Section 2.2.1), code generation and extraction (Section 2.3.1), judgment collection with manual verification (Section 2.3.1), and total counts (80,556 code gen judgments, 22,304 summarization judgments)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding information or acknowledgments section is present in the paper. A 6-author paper from two universities likely received institutional funding but this is not disclosed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: SEART @ Software Institute, Università della Svizzera italiana (Crupi, Tufano, Bavota) and W&M (Velasco, Mastropaolo, Poshyvanyk). None of the authors are affiliated with the companies whose models are evaluated."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not establish that no funding exists."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the eight LLMs. This is relevant because CoderEval benchmark problems could have appeared in the training data of GPT-4-turbo, GPT-3.5-turbo, and other models."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether CoderEval problems or their solutions appeared in any model's training data. CoderEval was published at ICSE'24 and its code/solutions are publicly available."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Benchmark contamination is not addressed. CoderEval problems are from open-source repositories and were publicly available before GPT-4-turbo's training cutoff, creating contamination risk for both the generation and judging tasks."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No pre-registration is mentioned for the human evaluation study with 9 judges."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No IRB or ethics board approval is mentioned for the human evaluation study."
    258       },
    259       "demographics_reported": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 2.2.2 reports: 'All nine judges have a Master's degree in Informatics or Computer Science, four of them have a Ph.D. in Software Engineering. On average, they have 5.8 years of experience (min=1, max=17) in Java programming and 6.9 in Python programming (min=4, max=10).'"
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "The judges are described as having 'code summarization background' but no formal inclusion/exclusion criteria are stated. No screening process is described."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "This is a rating/evaluation study, not an experimental study with treatment and control conditions. The human judges are evaluators, not experimental participants assigned to conditions."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This is an evaluation/rating study rather than a randomized experiment. While blinding judges to summary source (human vs. LLM) would have been methodologically valuable, the schema marks this as NA for non-experimental studies."
    278       },
    279       "attrition_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No attrition or dropout information is reported. The paper states 9 judges and 3,489 total judgments expected but does not confirm whether all judgments were completed or if any judges dropped out."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost or latency is reported despite running 80,556 code generation judgments and 22,304 summarization judgments across eight LLMs, including commercial API calls to GPT-3.5-turbo and GPT-4-turbo."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget is stated. The paper used Hugging Face inference endpoints for open models and ChatGPT APIs for GPT models, but neither API costs nor GPU hours are reported."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds or runs. LLM outputs are stochastic, but results appear to be from single runs without seed sensitivity analysis."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is not stated. It appears each judgment was obtained from a single LLM call, but this is never explicitly confirmed."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search budget is reported. Four prompts are compared, but the number of prompt variants tested during the trial-and-error development phase is not disclosed."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section 2.4 explicitly justifies the selection criterion: 'Since for both tasks there was one judge LLM which was the clear winner independently from the used prompt (i.e., GPT-4-turbo), we selected as best-performing prompt the one ensuring the best performance on it.' Results for all four prompts are shown (Tables 2, 5)."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Benjamini-Hochberg correction is applied to adjust p-values for multiple comparisons in the self-bias analysis (Section 2.4, Tables 3, 6)."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper does not discuss author-evaluation bias. The prompts, extraction scripts, and experimental design were developed by the authors, but no acknowledgment of potential bias in their own evaluation methodology is made."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No analysis of performance as a function of compute budget. Larger models (GPT-4-turbo with estimated >1.5T parameters) are compared with smaller ones (1.3B) without discussing compute cost implications, despite the paper noting that 'the larger the language model, the higher its inference (judgment) cost.'"
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Section 2.2.1 extensively analyzes CoderEval's construct validity: verifying target functions pass tests, checking for trivial passing implementations (empty functions, dummy returns), and excluding 86 of 460 problems with unreliable test suites. Section 4 discusses tests as proxy for correctness."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. LLMs are prompted directly without agentic scaffolding or tool use."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. CoderEval functions are from open-source projects that could have been in training data for GPT-4-turbo and other models. The temporal relationship between benchmark creation and model training is not discussed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the judging prompts (description + signature + candidate) provide information that could trigger memorized solutions from training data."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether CoderEval problems or their solutions overlap with LLM training data. The benchmark draws from open-source repositories that are likely in LLM training corpora."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination procedures are mentioned."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "GPT-4-turbo is the best judge among the eight tested LLMs for both code generation and code summarization tasks.",
    370       "evidence": "Table 2 shows GPT-4 achieves the highest Cohen's Kappa for code generation (0.21 Java, 0.10 Python with automated CoT). Table 5 shows GPT-4 achieves highest Krippendorff's alpha for code summarization (0.58 Java, 0.63 Python for content adequacy). Section 3.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Smaller LLMs (1-13B parameters) cannot effectively act as judges for code generation or summarization.",
    375       "evidence": "Table 2: DeepSeek Coder 1.3B, 6.7B and CodeLlama 7B achieve negative or near-zero Kappa scores. Table 5: CodeLlama 7B, 13B, 34B all achieve negative Krippendorff's alpha for most criteria. DeepSeek Coder family excluded entirely from summarization study due to inability to perform the task.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Even GPT-4-turbo frequently misjudges code correctness, especially failing to identify incorrect implementations — 50% of wrong Java implementations are misjudged as correct.",
    380       "evidence": "Fig. 1 confusion matrices show GPT-4 correctly classifies 72% of correct Java implementations but misjudges 50% of wrong ones as correct (50% false positive rate for wrong code).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "GPT-4-turbo achieves moderate agreement with humans for judging code summary quality, particularly content adequacy.",
    385       "evidence": "Table 5: Krippendorff's alpha for content adequacy is 0.58 (Java) and 0.63 (Python), which falls in the moderate agreement range. Conciseness alpha: 0.40 Java, 0.36 Python.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLMs are not significantly self-biased in code generation judging, with the exception of GPT-4 showing negligible self-bias.",
    390       "evidence": "Table 3: Mann-Whitney tests show no significant difference for most LLMs when judging own vs. others' code. GPT-4 shows significant but negligible effect size (p<0.01, Cliff's delta: negligible).",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "LLMs systematically underestimate the correctness of human-written code compared to LLM-generated code.",
    395       "evidence": "Table 3: Human-written code has average bias coefficient of -0.37 (underestimated), while all LLM-generated code has positive coefficients (overestimated). All statistical tests show significant difference with large effect size for human vs. own code.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Artificial hallucination is the most common cause of false negative judgments (33%), with LLMs commenting on non-existent bugs or missing requirements that were actually implemented.",
    400       "evidence": "Section 3.1.2: Manual analysis of false negatives found 33% were due to artificial hallucination, 19% due to misunderstanding of code statements.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Prompt choice has some impact on judging performance but does not substantially change overall findings or the identification of the best model.",
    405       "evidence": "Tables 2 and 5 show results across four prompting strategies. Kappa and Krippendorff's alpha values are relatively stable across prompts for each model, and GPT-4 remains the best performer across all prompts.",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": ["benchmark-eval", "qualitative"],
    410   "key_findings": "GPT-4-turbo is the best LLM judge for both code generation and summarization among eight tested models, but still frequently misjudges code correctness (50% of wrong Java implementations judged as correct). Smaller LLMs (1.3B-34B parameters) are largely unable to perform judging tasks. For code summarization, GPT-4-turbo achieves moderate agreement with human judgments (Krippendorff's alpha 0.58-0.63 for content adequacy). LLMs systematically underestimate human-written code quality while overestimating LLM-generated code quality, with large effect sizes.",
    411   "red_flags": [
    412     {
    413       "flag": "Outdated model selection",
    414       "detail": "Published July 2025 but tests only models available in 2023-early 2024 (GPT-4-turbo, CodeLlama, DeepSeek Coder v1). Major 2024-2025 models (GPT-4o, Claude 3/3.5, Llama 3, DeepSeek V2/V3, Gemini) are completely absent, significantly limiting the relevance of findings."
    415     },
    416     {
    417       "flag": "No hyperparameters reported",
    418       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the eight LLMs. These settings substantially affect output quality and are essential for reproducibility of the 100,000+ judgments collected."
    419     },
    420     {
    421       "flag": "Single-run stochastic results",
    422       "detail": "LLM outputs are stochastic, but no multiple runs, seed sensitivity, or variance across runs is reported. A different random seed could yield different judgments, especially for borderline cases."
    423     },
    424     {
    425       "flag": "No contamination analysis",
    426       "detail": "CoderEval functions are from open-source projects likely in LLM training data. Models may have memorized solutions, affecting both code generation and judging accuracy. This confound is not discussed."
    427     },
    428     {
    429       "flag": "Small convenience sample of human judges",
    430       "detail": "Only 9 human judges with unspecified recruitment method. Likely convenience sample from the authors' institutions. No IRB approval mentioned despite involving human participants."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    436       "authors": ["H. Yu", "B. Shen", "D. Ran", "J. Zhang", "Q. Zhang", "Y. Ma", "G. Liang", "Y. Li", "Q. Wang", "T. Xie"],
    437       "year": 2024,
    438       "relevance": "Primary benchmark used for evaluating LLM code generation and judging capabilities; ICSE'24 paper with 460 code generation problems in Java and Python."
    439     },
    440     {
    441       "title": "Evaluating large language models trained on code",
    442       "authors": ["M. Chen"],
    443       "year": 2021,
    444       "arxiv_id": "2107.03374",
    445       "relevance": "Introduced the HumanEval benchmark and pass@k metric, foundational for LLM code generation evaluation."
    446     },
    447     {
    448       "title": "Code Llama: Open Foundation Models for Code",
    449       "authors": ["B. Rozière"],
    450       "year": 2024,
    451       "arxiv_id": "2308.12950",
    452       "relevance": "Open-source code LLM family (7B-34B) evaluated as both code generators and judges in this study."
    453     },
    454     {
    455       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    456       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    457       "year": 2024,
    458       "arxiv_id": "2401.14196",
    459       "relevance": "Code-focused LLM family (1.3B-33B) evaluated as judges, found unable to perform summarization judging tasks."
    460     },
    461     {
    462       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    463       "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng"],
    464       "year": 2023,
    465       "arxiv_id": "2306.05685",
    466       "relevance": "Foundational work on LLM-as-a-judge paradigm identifying positional bias, verbosity bias, self-enhancement bias, and limited reasoning ability."
    467     },
    468     {
    469       "title": "ICE-score: Instructing Large Language Models to Evaluate Code",
    470       "authors": ["T. Y. Zhuo"],
    471       "year": 2024,
    472       "relevance": "Prior work using GPT-3.5-turbo as a judge for code implementations on HumanEval-X; reports weak-to-moderate correlations with test outcomes."
    473     },
    474     {
    475       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    476       "authors": ["W. Tong", "T. Zhang"],
    477       "year": 2024,
    478       "relevance": "GPT-3.5-based code correctness judge using a slow-thinking prompt strategy; reports 54-57% accuracy on complex benchmarks, aligned with this paper's findings."
    479     },
    480     {
    481       "title": "CodeUltraFeedback: An LLM-as-a-Judge Dataset for Aligning Large Language Models to Coding Preferences",
    482       "authors": ["M. Weyssow", "A. Kamanda", "H. Sahraoui"],
    483       "year": 2024,
    484       "relevance": "Proposes using LLM-as-a-judge for evaluating non-functional code requirements; reports LLMs agree that GPT-based models produce superior solutions."
    485     },
    486     {
    487       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    488       "authors": ["T. Y. Zhuo", "M. C. Vu", "J. Chim"],
    489       "year": 2024,
    490       "relevance": "Comprehensive code generation benchmark used in CodeJudge evaluation; features complex coding problems beyond HumanEval difficulty."
    491     },
    492     {
    493       "title": "Reassessing Automatic Evaluation Metrics for Code Summarization Tasks",
    494       "authors": ["D. Roy", "S. Fakhoury", "V. Arnaoudova"],
    495       "year": 2021,
    496       "relevance": "Large-scale dataset of human code summary evaluations (6,253 evaluations by 226 developers) showing low inter-rater agreement; motivates the need for better evaluation methods."
    497     },
    498     {
    499       "title": "On the Limitations of Fine-tuned Judge Models for LLM Evaluation",
    500       "authors": ["H. Huang", "Y. Qu", "H. Zhou"],
    501       "year": 2024,
    502       "arxiv_id": "2403.02839",
    503       "relevance": "Studies fine-tuning LLMs specifically for judging tasks, showing limitations in generalizability, fairness, and scalability of specialized judge models."
    504     },
    505     {
    506       "title": "Evaluating Language Models for Generating and Judging Programming Feedback",
    507       "authors": ["C. Koutcheme", "N. Dainese", "A. Hellas"],
    508       "year": 2024,
    509       "arxiv_id": "2407.04873",
    510       "relevance": "Evaluates LLMs for judging programming assignments at beginner level; identifies hallucination as a key failure mode in code judging."
    511     }
    512   ]
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs