scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24169B)
      1 {
      2   "paper": {
      3     "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
      4     "authors": ["Weixi Tong", "Tianyi Zhang"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2410.02184"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'Our code and datasets are available on GitHub https://github.com/VichyTong/CodeJudge' in both the abstract and Section 5 (Conclusion)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets (HumanEval-X, CoNaLa, APPS, BigCodeBench) and states code and datasets are available at their GitHub repository. The datasets themselves are standard public benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using 'eight A100-80GB GPUs' for local models and the GPT-3.5-Turbo-1106 API, but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the code is released on GitHub, the paper itself does not include step-by-step reproduction instructions, a 'Reproducing Results' section, or specific commands to run the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Standard deviations are reported across three runs for open-source models in the appendix tables (Tables 11-15), e.g., '0.515±0.04'. However, GPT-3.5 experiments were only run once (temperature 0), so no variance is reported for those."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CODEJUDGE 'significantly outperformed existing methods' but does not report any statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons are based solely on comparing correlation coefficient numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context, e.g., 'CODEJUDGE achieves significantly higher correlations (12.1%-41.8%) than existing methods' and provides full numeric baselines in Tables 3-6 with absolute values to compute differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is provided for why 100 competition-level tasks were sampled from APPS, or why the specific dataset sizes were chosen. No power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations are reported for open-source model experiments across three runs in the appendix tables (e.g., Table 11: '0.625±0.00'). However, GPT-3.5 experiments were only run once due to cost, so no variance is available for those results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against nine existing methods: six token-based (BLEU, ROUGE-L, METEOR, chrF, CodeBLEU, RUBY), one embedding-based (CodeBERTScore), one LLM-based (ICE-Score), and a VANILLA baseline (Section 4.3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "ICE-Score (Zhuo, 2024) is the state-of-the-art LLM-based code evaluation method, and CodeBERTScore (Zhou et al., 2023) is also recent. BigCodeBench (Zhuo et al., 2024), which serves as an evaluation dataset rather than a baseline, is likewise recent. The baselines represent the field well."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper ablates multiple components: with vs. without reference code, different prompt designs (CoT, few-shot, combinations thereof in Appendix A / Table 7), and compares the two CODEJUDGE methods (Analyze-then-Summarize vs. Taxonomy-Guided Fault Localization)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses Kendall's tau coefficient, Spearman's rank correlation, and accuracy as evaluation metrics (Section 4.2)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper does not include human evaluation of CODEJUDGE's outputs. It uses existing test case results and human-annotated usefulness scores from CoNaLa as ground truth, but no humans independently evaluate the quality of CODEJUDGE's judgments beyond automated metrics."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The penalty score settings for the fault localization method were tuned on a small validation set of 32 tasks (20% of HumanEval, Appendix B), with results reported on the remaining tasks and separate datasets (CoNaLa, APPS, BigCodeBench)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by programming language (Table 5), by dataset (Tables 3-4), and by LLM evaluator (Table 6). The appendix provides per-language results for all models (Tables 11-15)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.5 includes a failure case analysis of 600 cases where CODEJUDGE made wrong predictions, identifying three failure patterns: wrong analysis of code logic (52.83%), wrong identification of task requirements (26.42%), and requirements of error handling (20.75%)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that CoT and few-shot learning methods do not improve CODEJUDGE's performance (Table 7, Appendix A), that providing reference code hurts performance on CoNaLa, and that all methods suffer significant performance drops on APPS and BigCodeBench."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims CODEJUDGE 'significantly outperformed existing methods in most settings' and 'achieved better results even when using a much smaller model, Llama-3-8B-Instruct.' Both are supported by Tables 3 and 6."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's main causal claim is that 'slow thinking' prompting improves evaluation quality. This is supported by controlled comparisons: CODEJUDGE vs. VANILLA (which uses the same LLMs without the structured reasoning pipeline), constituting adequate single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper tests across five programming languages, four datasets, and four LLMs. Claims are generally scoped to the tested settings. The paper notes performance drops on challenging benchmarks and acknowledges language-specific variation (Table 5)."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the improvement over ICE-Score might be due to longer prompts giving the model more tokens to reason over (rather than the specific 'slow thinking' structure), or whether the taxonomy design biases results toward certain error types."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'GPT-3.5-Turbo-1106' (Section 4.4), 'CodeLlama-Instruct (34B)', 'Llama-3-Instruct (8B)', and 'Llama-3-Instruct (70B)'. The GPT version includes a snapshot date. Open-source model sizes are specified."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompts for all methods are provided in the appendix: VANILLA prompts (Tables 16-17), CODEJUDGE Analyze-then-Summarize prompt (Table 18), and Fault Localization prompt (Table 19). These include the actual text used with placeholders clearly marked."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.4 states: 'we set the temperature to 0 and top_p to 1' for GPT-3.5, and 'temperature to 0.4 and top_p to 0.9' for open-source models. Penalty weights for fault localization are specified in Appendix B."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The two-step scaffolding (analysis then summarization) is described in detail in Sections 3.1 and 3.2, with a full workflow diagram in Figure 2. The taxonomy of inconsistencies is provided in Table 2, and the scoring formula is given in Appendix B."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix C describes data sources: official dataset versions are used, code generation for HumanEval-X and APPS uses MultiPL-E, BigCodeBench uses official pre-generated samples. For APPS, 100 competition-level tasks were randomly sampled. CoNaLa human annotations are from Evtikhiev et al. (2023)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 is titled 'Limitations' and discusses that LLMs can generate incorrect judgments on complex cases like APPS, and that the framework's quality depends on the backbone model."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations section is brief (one paragraph) and only mentions that 'LLMs can generate incorrect judgments or fail to completely follow system prompts when evaluating challenging and complex cases such as the APPS benchmark.' This is more of a known limitation than a threat to validity. No specific threats like construct validity of the correlation metrics, external validity across unseen languages/tasks, or internal validity concerns are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what it does NOT show or claim. It does not bound its findings to specific languages, task types, or model families, despite testing only on specific sets of each. The title and framing ('Evaluating Code Generation with Large Language Models') are broader than the tested settings."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper uses publicly available datasets (HumanEval-X, CoNaLa, APPS, BigCodeBench) and states that code and datasets are released at https://github.com/VichyTong/CodeJudge, enabling verification of raw results."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes each dataset's origin: HumanEval-X from CodeGeeX (164 tasks, 5 languages), CoNaLa from StackOverflow (472 tasks), APPS from code competitions (100 randomly sampled competition-level tasks), and BigCodeBench (1,140 tasks). Appendix C provides additional detail on code generation procedures."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited for this study. The study uses existing benchmarks and model-generated code."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from datasets to evaluation is documented: datasets are sourced from official releases, code is generated using specified tools (MultiPL-E, official BigCodeBench samples), evaluation is performed using scipy correlation metrics, and postprocessing is described in Appendix B."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Weixi Tong at Huazhong University of Science and Technology, Tianyi Zhang at Purdue University. These are academic institutions without obvious conflicts."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence of funder cannot be assessed. The absence of a funding disclosure means this criterion is not satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state the training data cutoff dates for any of the four LLMs used (GPT-3.5-Turbo-1106, CodeLlama-34B, Llama-3-8B, Llama-3-70B). This matters because the LLM evaluators may have seen the benchmark tasks during training."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper does not discuss whether the LLM evaluators (GPT-3.5, Llama-3, CodeLlama) may have seen HumanEval, CoNaLa, or APPS problems during training. HumanEval was published in 2021 and is widely known — models trained after 2021 almost certainly encountered it."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval was published in 2021, CoNaLa in 2018, and APPS in 2021 — all well before the training cutoffs of GPT-3.5 and Llama-3. The LLM evaluator could have memorized these problems and their solutions, which would inflate its ability to judge code correctness. This is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study. The paper evaluates automated methods using existing benchmarks."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 10 (Appendix D) reports average execution times per evaluation for all four models (e.g., GPT-3.5-Turbo: 2.36s for A.S., 1.14s for F.L.). The paper notes costs were high enough that GPT-3.5 experiments were only run once."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper mentions using eight A100-80GB GPUs for local models but does not state total GPU hours, API spend, or overall computational budget for the experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CODEJUDGE achieves significantly higher correlations (12.1%-41.8%) than existing methods in most settings when using GPT-3.5-Turbo.",
    286       "evidence": "Table 3 shows CODEJUDGE achieves 0.612 Kendall's tau on HumanEval-X vs. 0.475 for ICE-Score (the best existing LLM-based method), and similar advantages on other datasets.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Even with a smaller model (Llama-3-8B-Instruct), CODEJUDGE outperforms ICE-Score which uses GPT-3.5-Turbo.",
    291       "evidence": "Table 6 shows Llama-3-8B CODEJUDGE achieves 0.523 tau on CoNaLa and 0.480 on HumanEval-X, vs. ICE-Score with GPT-3.5 at 0.253/0.475 on the same datasets.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "CODEJUDGE without reference code outperforms all existing methods that use reference code in most settings.",
    296       "evidence": "Table 3: CODEJUDGE w/o REF achieves 0.502 tau on HumanEval-X vs. 0.372 for CodeBERTScoreF3, the best reference-based method. On APPS and BigCodeBench, the advantage is smaller or absent.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "CODEJUDGE achieves 80.56% average accuracy on binary code correctness prediction on HumanEval-X.",
    301       "evidence": "Table 4 shows average accuracies of 80.56% on HumanEval-X, 68.33% on APPS, and 74.56% on BigCodeBench using GPT-3.5-Turbo.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Chain-of-Thought and few-shot prompting do not improve CODEJUDGE's performance.",
    306       "evidence": "Table 7 (Appendix A) shows that all CoT and few-shot variants achieve lower accuracy than the base CODEJUDGE method (81.63% base vs. 77.65% CoT, 78.22% CoT+few-shot, etc.).",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CodeJudge is an LLM-based code evaluation framework that uses 'slow thinking' prompting strategies to assess code correctness without test cases. It reports substantially higher correlations with ground truth than nine existing methods across four datasets and five programming languages, although no statistical significance tests are provided. The framework works with multiple LLMs and can operate without reference code while still outperforming reference-dependent methods. Performance degrades substantially on competition-level problems (APPS), and failure analysis reveals that wrong analysis of code logic accounts for 52.83% of errors.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical significance tests",
    315       "detail": "The paper claims CODEJUDGE 'significantly outperformed existing methods' but provides no statistical significance tests. Differences are based solely on comparing correlation coefficients without testing whether differences are statistically significant."
    316     },
    317     {
    318       "flag": "Benchmark contamination risk",
    319       "detail": "HumanEval (2021), CoNaLa (2018), and APPS (2021) were all published before the training cutoffs of the LLM evaluators used. The LLMs may have memorized these problems and their solutions, artificially inflating their ability to judge code correctness. This is never discussed."
    320     },
    321     {
    322       "flag": "Single run for main results",
    323       "detail": "The main GPT-3.5-Turbo results (Tables 3-5) are based on a single run due to cost, with temperature 0. While this reduces randomness, it means no variance can be reported for the primary model, and any API non-determinism at temperature 0 is not accounted for."
    324     },
    325     {
    326       "flag": "Hyperparameters tuned on a validation split of the same benchmark",
    327       "detail": "The penalty weights for fault localization were tuned on 20% of HumanEval (32 tasks, Appendix B), but HumanEval-X results are still reported. While separate tasks may have been used, this creates a risk of indirect data leakage within the same benchmark distribution."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating large language models trained on code",
    333       "authors": ["Mark Chen", "Jerry Tworek"],
    334       "year": 2021,
    335       "arxiv_id": "2107.03374",
    336       "relevance": "Introduces HumanEval and pass@k, foundational benchmark and metric for LLM code generation evaluation."
    337     },
    338     {
    339       "title": "ICE-score: Instructing large language models to evaluate code",
    340       "authors": ["Terry Yue Zhuo"],
    341       "year": 2024,
    342       "relevance": "Primary baseline; the state-of-the-art LLM-based code evaluation method that CODEJUDGE improves upon."
    343     },
    344     {
    345       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    346       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    347       "year": 2023,
    348       "relevance": "Introduces EvalPlus for augmenting test cases in code generation evaluation, addressing test suite insufficiency."
    349     },
    350     {
    351       "title": "Out of the BLEU: How should we assess quality of the code generation models?",
    352       "authors": ["Mikhail Evtikhiev", "Egor Bogomolov"],
    353       "year": 2023,
    354       "relevance": "Shows statistically significant disagreement between human judges and token-based metrics for code evaluation."
    355     },
    356     {
    357       "title": "CodeBERTScore: Evaluating code generation with pretrained models of code",
    358       "authors": ["Shuyan Zhou", "Uri Alon"],
    359       "year": 2023,
    360       "relevance": "Embedding-based code evaluation metric using CodeBERT, serving as a baseline in this study."
    361     },
    362     {
    363       "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena",
    364       "authors": ["Lianmin Zheng"],
    365       "year": 2023,
    366       "relevance": "Foundational work on using LLMs as evaluators, directly motivating the LLM-as-code-judge approach."
    367     },
    368     {
    369       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    370       "authors": ["Terry Yue Zhuo"],
    371       "year": 2024,
    372       "arxiv_id": "2406.15877",
    373       "relevance": "Recent code generation benchmark with complex multi-API tasks, used as an evaluation dataset."
    374     },
    375     {
    376       "title": "Measuring coding challenge competence with APPS",
    377       "authors": ["Dan Hendrycks", "Steven Basart"],
    378       "year": 2021,
    379       "relevance": "Competition-level code generation benchmark used to test CODEJUDGE on challenging problems."
    380     },
    381     {
    382       "title": "G-eval: NLG evaluation using GPT-4 with better human alignment",
    383       "authors": ["Yang Liu"],
    384       "year": 2023,
    385       "relevance": "LLM-based evaluation framework for text generation that inspired LLM-as-judge approaches for code."
    386     },
    387     {
    388       "title": "ChatEval: Towards better LLM-based evaluators through multi-agent debate",
    389       "authors": ["Chi-Min Chan", "Weize Chen"],
    390       "year": 2024,
    391       "relevance": "Multi-agent LLM evaluation approach relevant to understanding LLM-as-judge capabilities."
    392     },
    393     {
    394       "title": "CodeGeeX: A pre-trained model for code generation with multilingual benchmarking on HumanEval-X",
    395       "authors": ["Qinkai Zheng", "Xiao Xia"],
    396       "year": 2023,
    397       "relevance": "Introduces HumanEval-X multilingual benchmark, the primary evaluation dataset used in this study."
    398     },
    399     {
    400       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    401       "authors": ["Jason Wei"],
    402       "year": 2022,
    403       "relevance": "Chain-of-thought prompting technique tested as a prompt design variant for CODEJUDGE."
    404     }
    405   ]
    406 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs