scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33065B)
      1 {
      2   "paper": {
      3     "title": "DeepCRCEval: Revisiting the Evaluation of Code Review Comment Generation",
      4     "authors": [
      5       "Junyi Lu",
      6       "Xiaojia Li",
      7       "Zihan Hua",
      8       "Lei Yu",
      9       "Shiqi Cheng",
     10       "Li Yang",
     11       "Fengjun Zhang",
     12       "Chun Zuo"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2412.18291",
     17     "doi": "10.48550/arXiv.2412.18291"
     18   },
     19   "scan_version": 3,
     20   "active_modules": [
     21     "experimental_rigor",
     22     "data_leakage"
     23   ],
     24   "methodology_tags": [
     25     "benchmark-eval",
     26     "qualitative"
     27   ],
     28   "key_findings": "Less than 10% of benchmark comments in major code review datasets (3% in Tufano, 8% in CodeReviewer) meet all quality, category, tone, and context criteria for automation references. The proposed DeepCRCEval framework using 9 domain-specific criteria provides better discrimination than text similarity metrics. LLM evaluators reduce evaluation time by 88.78% and cost by 90.32% compared to human evaluators while maintaining commendable reliability. A training-free GPT-4-based LLM-Reviewer substantially outperforms all existing trained CRCGs (Tufano et al., CommentFinder, CodeReviewer, AUGER, CCT5) on the proposed criteria.",
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper states 'Materials publicly available at https://zenodo.org/records/10511726' (Section 1) and mentions the scoring tool and test set are 'accessible in our open-source repository.'"
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The test set of 1,000 code cases is described as 'accessible in our open-source repository' (Section 5.4). The Zenodo archive contains study materials."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper mentions using the 'OpenAI Python library' with temperature 0.1 and 8192 token limit, but does not provide a requirements.txt, Dockerfile, or detailed environment specifications with library versions."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No step-by-step reproduction instructions are provided in the paper. While prompt templates and the test set are released, there is no guide on how to replicate the full experimental pipeline."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The main results in Tables 6 and 7 report only average scores and rankings without confidence intervals or error bars. The paper mentions '95% confidence level' for sample size justification in Section 4.2 but does not report CIs on the comparative results."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper claims LLM-Reviewer outperforms other CRCGs but provides no statistical significance tests (no p-values, t-tests, or similar). Rankings and scores are compared by raw numbers only."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper reports percentage improvements with baseline context: '88.78% and 90.32%' reductions in time and cost (Table 4), with absolute values provided for both human ($0.62) and LLM ($0.06) per case. Scoring differences are shown in Table 6 with absolute values for all methods."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Section 4.2 explicitly justifies the 100-sample size: 'According to the average reliability of 93% for humans in Table 6, the margin of error for 95% confidence level of 100 sample size is within 5%.' Interview saturation is noted after 3-4 interviews."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "Tables 6 and 7 report only average scores without standard deviations or variance across evaluators. ICC values in Table 5 indicate agreement but do not substitute for reporting variance in the main results."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper compares against 5 existing CRCGs: Tufano et al., CommentFinder, CodeReviewer, AUGER, and CCT5 (Section 5.2), plus introduces LLM-Reviewer as a new baseline."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Baselines include CodeReviewer (2022), AUGER (2022), CCT5 (2023), CommentFinder (2022), and Tufano et al. (2022). For a 2024 paper, these are 1-2 years old and represent the state of the art in CRCGs."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No ablation study is conducted. The paper does not test variants of DeepCRCEval (e.g., removing criteria) or LLM-Reviewer (e.g., different numbers of few-shot examples, different prompt components). The system has multiple components (criteria, CoT, ranking) that could be ablated."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper evaluates across 9 quality criteria (C1-C9: Readability, Relevance, Explanation Clarity, Problem Identification, Actionability, Completeness, Specificity, Contextual Adequacy, Brevity) plus ranking (Table 7)."
     99       },
    100       "human_evaluation": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Human evaluators are central to the study. Five graduate students scored comments using a QT-based tool (Sections 4.2, 5.1). Five industry developers provided feedback in a user study (Section 7.1, Figure 5)."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "A separate test set of 1,000 code cases was created for evaluation, distinct from the training sets used by baseline CRCGs. 'For baselines, we utilized their respective training sets' (Section 5.4)."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 6 provides per-criterion (C1-C9) breakdowns for all methods with both human and LLM evaluator scores. Table 3 shows per-category distributions. The Venn diagram (Figure 3) shows overlap analysis."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 8 provides two detailed case studies showing failure modes of existing CRCGs — generic, irrelevant, or meaningless comments (e.g., 'Why is this needed?', 'Remove this line'). Section 7.2 discusses these qualitatively."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The core finding is negative: only 3-8% of benchmark comments are suitable references, and existing SOTA CRCGs perform poorly on the proposed criteria. Section 6.1 also reports areas where LLM evaluators diverge from humans (Readability, Brevity)."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Abstract claims are supported: 'less than 10% of benchmark comments are high quality' matches Venn diagram results (3% Tufano, 8% CodeReviewer in Figure 3); '88.78% and 90.32%' time/cost reduction matches Table 4; LLM-Reviewer superiority matches Tables 6-7."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper claims LLM-Reviewer's superiority is 'attributed to LLM-Reviewer's direct alignment with the objectives' and that baselines' failures 'stem from their reliance on indirect text similarity metrics.' These are causal claims without controlled experiments isolating the alignment factor from GPT-4's inherent capability advantage."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The title 'Revisiting the Evaluation of Code Review Comment Generation' is general, but the study is limited to Java only. Section 7.3 acknowledges 'the focus on the Java programming language' but the abstract and main claims do not bound conclusions to Java."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Section 7.3 discusses threats (GPT-4 selection, Java focus, student evaluators, LLM-evaluating-LLM bias) but does not consider the most important alternative explanation: LLM-Reviewer's advantage may stem from GPT-4's superior language modeling rather than the criteria-guided prompt design. The comparison is confounded by model capability differences."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The entire paper is built around distinguishing proxy (text similarity) from actual outcome (code review quality). The authors explicitly argue that BLEU/ROUGE are indirect proxies that fail to capture defect detection and code improvement goals, and propose direct criteria-based evaluation."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper states 'GPT-4' without specifying a version or snapshot date (e.g., gpt-4-0613 vs gpt-4-1106-preview). Section 5.4 only says 'OpenAI Python library' without version. For baseline models, specific pre-trained checkpoints are not specified."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Full prompt templates are provided in Tables 9 and 10 in the appendix. The evaluator prompt (Table 9) includes task description, guidelines, evaluation objects, and generation format. The LLM-Reviewer prompt (Table 10) includes task description, guidelines, and demonstration format."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 5.4 states 'a temperature setting of 0.1 and a token limit of 8192.' Few-shot k=3 is stated in Section 5.3. For baselines, they mention using publicly available models and provided scripts."
    168       },
    169       "scaffolding_described": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "No agentic scaffolding is used. LLM-Reviewer is a simple prompt-in, comment-out pipeline (Figure 4) without tool use, retry logic, or multi-step workflows."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The test set creation is vaguely described: '1,000 code cases with typical issues' that were 'processed by humans to enhance simplicity' with ROUGE-L deduplication (Section 5.4). The actual human processing criteria, selection criteria for 'typical issues,' and how many candidates were filtered to reach 1,000 are not documented."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 7.3 'Threats to Validity' provides substantive discussion of multiple limitations including model selection, language scope, evaluator proxies, sample size, and LLM-evaluating-LLM bias."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 7.3 discusses threats specific to this study: GPT-4 as deliberate choice for its 'advanced capabilities,' Java focus as 'most commonly used language in prior research,' graduate students as 'proxies for actual developers' with 'significant programming experience,' and cost constraints limiting manual sample size."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "While threats to validity are discussed, the paper does not explicitly state what the results do NOT show or what populations/settings are excluded. There are no explicit statements like 'we do not claim this generalizes to other programming languages' — the bounds are implicit in the threats discussion."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Materials including the test set and scoring tools are available at Zenodo (https://zenodo.org/records/10511726) and the open-source repository, allowing independent verification of the evaluation data."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The paper describes its data sources: 100 comments sampled from each of the Tufano and CodeReviewer datasets (Section 4.2) and 1,000 code test cases selected for typical issues (Section 5.4). The interview methodology is described in Appendix B and the scoring methodology in Appendix C."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "Participant characteristics are listed (e.g., '7 industry developers, each with over five years of experience,' '5 master's and doctoral students with over six years of programming experience') but the recruitment channels and methods are not described. How were the developers and students selected? Were they convenience samples?"
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "The pipeline from raw benchmark datasets to the final 1,000 test cases lacks documentation of intermediate steps and counts. How many candidate code cases were considered before arriving at 1,000? How many were removed by ROUGE-L dedup? The human processing step lacks criteria documentation."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding sources or acknowledgments section is present in the paper. The authors are affiliated with the Chinese Academy of Sciences and industry companies but no grants or funding are disclosed."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "All author affiliations are listed: Institute of Software, Chinese Academy of Sciences; University of Chinese Academy of Sciences; Kuaishou Technology; and Sinosoft Company Limited."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No funding information is disclosed, so independence of funding cannot be verified. One author is from Kuaishou Technology (a tech company that could benefit from code review automation), but no funding relationship is stated."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests or financial interests statement is present in the paper."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The paper uses GPT-4 and several pre-trained models but does not state any training data cutoff dates. The cutoff for GPT-4 is not mentioned, nor are the training data periods for the baseline CRCGs."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "The paper mentions test cases were 'processed by humans to enhance simplicity, and thus to reduce the risk of data leakage' (Section 5.4) but does not analyze whether GPT-4 or baseline models may have seen similar code in their training data."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "The test set code snippets could plausibly appear in GPT-4's training data (drawn from OSS projects). The paper uses ROUGE-L dedup and human simplification as mitigation but does not systematically address whether the benchmark was contaminated."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No pre-registration mentioned for any component of the study (interviews, evaluations, or user study)."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No IRB or ethics board approval is mentioned despite involving human participants in interviews, scoring tasks, and a user study."
    268       },
    269       "demographics_reported": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "Professional demographics are reported: interviewees have 'over five years of experience and familiarity with machine learning tools' (Section B.2); scorers are 'master's and doctoral students in computer science, each with over six years of programming experience' (Section C.1); user study involves '5 industry developers' (Section 7.1)."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Inclusion criteria are stated: interviewees required 'over five years of experience and familiarity with machine learning tools in software engineering' (Section B.2); scorers required computer science graduate student status with 'at least six years of experience' (Section C.2)."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Not an experimental study with group assignment. All evaluators rated all methods in a within-subjects design. No treatment/control randomization is applicable."
    283       },
    284       "blinding_described": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No description of whether human evaluators were blinded to which method produced which comment. The LLM evaluator prompt says 'Models' names should not influence your judgment' (Table 9) suggesting names were visible. Order bias mitigation (ascending/descending) was applied for LLM evaluators but blinding is not addressed."
    288       },
    289       "attrition_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No reporting of participant dropout or attrition. The paper does not state how many evaluators started vs. finished, or whether any interview participants or scorers withdrew."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Table 4 reports per-case costs: humans $0.62 vs LLMs $0.06 for single comment evaluation, humans $2.09 vs LLMs $0.17 for performance comparison. API pricing ($0.03/1K input tokens, $0.06/1K output tokens) is stated in Section 5.4."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No total computational budget is stated. Per-case costs are given but total API spend across all experiments, total time for all evaluations, or hardware used are not reported."
    305       }
    306     },
    307     "experimental_rigor": {
    308       "seed_sensitivity_reported": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No sensitivity analysis across random seeds. GPT-4 was used with temperature 0.1 (near-deterministic) but no experiments with different seeds or temperature settings are reported."
    312       },
    313       "number_of_runs_stated": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "For LLM evaluators: 'evaluations were performed twice for each case, once in descending and once in ascending order' (Section 5.1). The number of runs is explicitly stated."
    317       },
    318       "hyperparameter_search_budget": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No hyperparameter search is described. Temperature 0.1 and k=3 for few-shot were chosen without reporting any search or comparison of alternatives."
    322       },
    323       "best_config_selection_justified": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The choice of k=3 for few-shot demonstrations is justified only with 'balancing input length with informative content' (Section 5.3). Temperature 0.1 is stated without justification. No systematic comparison of configurations is provided."
    327       },
    328       "multiple_comparison_correction": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable."
    332       },
    333       "self_comparison_bias_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The authors evaluate their own LLM-Reviewer using their own DeepCRCEval framework. While they acknowledge 'using LLMs to evaluate LLMs still potentially introduces bias' (Section 7.3), they do not address the broader self-comparison bias of evaluating their system with their own evaluation criteria."
    337       },
    338       "compute_budget_vs_performance": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "LLM-Reviewer uses GPT-4 while baselines are much smaller trained models (T5-based). The massive compute/capability difference between GPT-4 and the baseline models is not discussed or controlled for."
    342       },
    343       "benchmark_construct_validity": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "This is the paper's central contribution. It extensively argues that text similarity benchmarks (BLEU, ROUGE) lack construct validity for code review evaluation, and proposes criteria-based evaluation (Section 4, RQ1) with empirical evidence that only 3-8% of benchmark comments are suitable references."
    347       },
    348       "scaffold_confound_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "No scaffolding is involved. LLM-Reviewer is a simple prompt-based approach without agentic scaffolding."
    352       }
    353     },
    354     "data_leakage": {
    355       "temporal_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether GPT-4's training data includes code from the OSS projects used to construct the test set. The 1,000 test cases come from open-source projects that likely predate GPT-4's training cutoff."
    359       },
    360       "feature_leakage_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Not discussed. The LLM-Reviewer prompt includes the evaluation criteria (the 9 quality dimensions), which are the same criteria used by the evaluator — this creates a circular advantage not addressed in the paper."
    364       },
    365       "non_independence_addressed": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "ROUGE-L was used to remove duplicates within the test set, but independence between training data of baseline models and the test set is not analyzed."
    369       },
    370       "leakage_detection_method": {
    371         "applies": true,
    372         "answer": false,
    373         "justification": "ROUGE-L deduplication and human simplification are used as basic prevention, but no formal leakage detection method (canary strings, membership inference, n-gram overlap analysis) is applied."
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "Less than 10% of benchmark comments are high quality for automation (3% in Tufano dataset, 8% in CodeReviewer dataset)",
    380       "evidence": "Venn diagram analysis (Figure 3) of 100 sampled comments from each dataset across quality, category, tone, and context dimensions (Section 4.3).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "DeepCRCEval provides higher discrimination than text similarity metrics for evaluating code review comments",
    385       "evidence": "Section 6.1 argues text similarity metrics show negligible differences (< 1% BLEU) while DeepCRCEval's 9 criteria show larger scoring spreads (Table 6). No direct quantitative comparison of discrimination power is provided.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "LLM evaluators reduce evaluation time by 88.78% and cost by 90.32% compared to human evaluators",
    390       "evidence": "Table 4 compares per-case averages: humans 224.45s/$0.62 vs LLMs 25.18s/$0.06 for single comments; humans 752.65s/$2.09 vs LLMs 68.69s/$0.17 for comparisons.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "LLM-Reviewer outperforms all existing SOTA CRCGs across almost all evaluation aspects",
    395       "evidence": "Table 6 shows LLM-Reviewer scoring 9+ on most criteria while baselines score 1-5. Table 7 shows LLM-Reviewer ranked 1st by both human and LLM evaluators.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "LLM evaluators achieve high concordance with human evaluators (above 0.75 ICC) in most evaluation aspects",
    400       "evidence": "Table 5 shows ICC values: 0.83 (Explanation Clarity), 0.80 (Problem Identification), 0.81 (Actionability), 0.79 (Specificity), 0.78 (Contextual Adequacy). Lower for Readability (0.62) and Brevity (0.62).",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Circular evaluation design",
    407       "detail": "LLM-Reviewer's prompt includes the same 9 quality criteria used by DeepCRCEval for evaluation (noted in Section 5.3: 'we use a similar prompt template to the one used for prompt evaluation'). This gives LLM-Reviewer a structural advantage — it is optimized to produce exactly what the evaluator rewards. The authors call this 'target-oriented' but do not acknowledge the circularity."
    408     },
    409     {
    410       "flag": "Unfair baseline comparison",
    411       "detail": "LLM-Reviewer uses GPT-4 (one of the most capable LLMs) while baseline CRCGs are much smaller models (T5-based, CodeReviewer). The performance gap could be due to GPT-4's general superiority rather than the criteria-guided approach. No comparison between GPT-4 with vs without criteria-guided prompts is conducted."
    412     },
    413     {
    414       "flag": "LLM evaluating LLM",
    415       "detail": "GPT-4 is used both as the backbone for LLM-Reviewer and as the LLM evaluator in DeepCRCEval. The paper acknowledges this risk (Section 7.3) but the mitigation (also using human evaluators) does not fully address it, especially given the small human evaluation sample."
    416     },
    417     {
    418       "flag": "No statistical significance tests",
    419       "detail": "Claims of superiority across all 9 criteria are based on comparing raw average scores without any statistical tests. With small human evaluation samples, observed differences may not be statistically significant."
    420     },
    421     {
    422       "flag": "Missing blinding in human evaluation",
    423       "detail": "It is unclear whether human evaluators knew which method generated which comment. Without blinding, evaluator expectations could bias results, especially given that LLM-Reviewer comments are qualitatively different (longer, more detailed) than baseline outputs."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Using pre-trained models to boost code review automation",
    429       "authors": [
    430         "R. Tufano",
    431         "S. Masiero",
    432         "A. Mastropaolo",
    433         "L. Pascarella",
    434         "D. Poshyvanyk",
    435         "G. Bavota"
    436       ],
    437       "year": 2022,
    438       "relevance": "Foundational work on DNN-based code review comment generation using T5, key baseline evaluated in this paper."
    439     },
    440     {
    441       "title": "Automating code review activities by large-scale pre-training",
    442       "authors": [
    443         "Z. Li",
    444         "S. Lu",
    445         "D. Guo",
    446         "N. Duan"
    447       ],
    448       "year": 2022,
    449       "relevance": "CodeReviewer model — major CRCG baseline with code-review-specific pre-training, evaluated and found lacking by DeepCRCEval."
    450     },
    451     {
    452       "title": "CCT5: A code-change-oriented pre-trained model",
    453       "authors": [
    454         "B. Lin",
    455         "S. Wang",
    456         "Z. Liu",
    457         "Y. Liu",
    458         "X. Xia",
    459         "X. Mao"
    460       ],
    461       "year": 2023,
    462       "relevance": "Code-change-oriented pre-trained model for review comment generation, baseline in this study."
    463     },
    464     {
    465       "title": "AUGER: automatically generating review comments with pre-training models",
    466       "authors": [
    467         "L. Li",
    468         "L. Yang",
    469         "H. Jiang",
    470         "J. Yan"
    471       ],
    472       "year": 2022,
    473       "relevance": "Pre-training-based code review comment generator using review tags, baseline in this study."
    474     },
    475     {
    476       "title": "CommentFinder: a simpler, faster, more accurate code review comments recommendation",
    477       "authors": [
    478         "Y. Hong",
    479         "C. Tantithamthavorn",
    480         "P. Thongtanunam",
    481         "A. Aleti"
    482       ],
    483       "year": 2022,
    484       "relevance": "Retrieval-based code review comment system, demonstrating alternative to generative approaches."
    485     },
    486     {
    487       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    488       "authors": [
    489         "L. Zheng",
    490         "W.L. Chiang",
    491         "Y. Sheng"
    492       ],
    493       "year": 2023,
    494       "relevance": "Foundational work on using LLMs as evaluators, showing GPT-4 agreement with humans surpasses inter-human agreement. Basis for DeepCRCEval's LLM evaluator design."
    495     },
    496     {
    497       "title": "LLaMA-Reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning",
    498       "authors": [
    499         "J. Lu",
    500         "L. Yu",
    501         "X. Li",
    502         "L. Yang",
    503         "C. Zuo"
    504       ],
    505       "year": 2023,
    506       "relevance": "First attempt at parameter-efficient fine-tuning of LLMs for code review tasks."
    507     },
    508     {
    509       "title": "Exploring the impact of code review factors on the code review comment generation",
    510       "authors": [
    511         "J. Lu",
    512         "Z. Li",
    513         "C. Shen",
    514         "L. Yang",
    515         "C. Zuo"
    516       ],
    517       "year": 2024,
    518       "relevance": "Investigates factors influencing code review for both pre-trained LMs and LLMs."
    519     },
    520     {
    521       "title": "EvaCRC: Evaluating code review comments",
    522       "authors": [
    523         "L. Yang",
    524         "J. Xu",
    525         "Y. Zhang",
    526         "H. Zhang",
    527         "A. Bacchelli"
    528       ],
    529       "year": 2023,
    530       "relevance": "BERT-based evaluation of code review comments across four dimensions, key related work on evaluation methodology."
    531     },
    532     {
    533       "title": "Expectations, outcomes, and challenges of modern code review",
    534       "authors": [
    535         "A. Bacchelli",
    536         "C. Bird"
    537       ],
    538       "year": 2013,
    539       "doi": "10.1109/ICSE.2013.6606617",
    540       "relevance": "Seminal study on code review expectations and challenges, provides the comment category taxonomy adopted by this paper."
    541     },
    542     {
    543       "title": "Code review quality: How developers see it",
    544       "authors": [
    545         "O. Kononenko",
    546         "O. Baysal",
    547         "M.W. Godfrey"
    548       ],
    549       "year": 2016,
    550       "doi": "10.1145/2884781.2884840",
    551       "relevance": "Defines developer perspectives on code review quality, directly informs the 9 evaluation criteria used in this paper."
    552     },
    553     {
    554       "title": "Towards automating code review activities",
    555       "authors": [
    556         "R. Tufano",
    557         "L. Pascarella",
    558         "M. Tufano",
    559         "D. Poshyvanyk",
    560         "G. Bavota"
    561       ],
    562       "year": 2021,
    563       "relevance": "Pioneering work on T5-based code review automation, predecessor to the Tufano et al. 2022 baseline."
    564     }
    565   ],
    566   "engagement_factors": {
    567     "practical_relevance": {
    568       "score": 2,
    569       "justification": "Proposes a usable evaluation framework and demonstrates GPT-4 as a training-free code reviewer, relevant to developers working on code review tooling."
    570     },
    571     "surprise_contrarian": {
    572       "score": 2,
    573       "justification": "The finding that less than 10% of benchmark comments are actually high quality challenges the foundation of how the field has been evaluating code review automation."
    574     },
    575     "fear_safety": {
    576       "score": 0,
    577       "justification": "No safety, security, or risk concerns are raised."
    578     },
    579     "drama_conflict": {
    580       "score": 1,
    581       "justification": "Mildly questions the validity of established benchmarks and metrics used by prior work, but doesn't target specific companies or make inflammatory claims."
    582     },
    583     "demo_ability": {
    584       "score": 1,
    585       "justification": "Materials are on Zenodo and a Gradio demo was built, but reproducing results requires GPT-4 API access and significant setup."
    586     },
    587     "brand_recognition": {
    588       "score": 0,
    589       "justification": "Authors are from the Chinese Academy of Sciences and lesser-known institutions, not prominent AI labs."
    590     }
    591   }
    592 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs