scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29009B)
      1 {
      2   "paper": {
      3     "title": "Retrieval-Augmented Code Review Comment Generation",
      4     "authors": [
      5       "Hyunsun Hong",
      6       "Jongmoon Baik"
      7     ],
      8     "year": 2025,
      9     "venue": "arXiv.org",
     10     "arxiv_id": "2506.11591",
     11     "doi": "10.48550/arXiv.2506.11591"
     12   },
     13   "scan_version": 3,
     14   "active_modules": [
     15     "experimental_rigor",
     16     "data_leakage"
     17   ],
     18   "methodology_tags": [
     19     "benchmark-eval"
     20   ],
     21   "key_findings": "RAG-Reviewer, a retrieval-augmented framework for code review comment generation, outperforms generation-only baselines by up to +1.67 percentage points EM (0.87% to 2.54%) and +4.25 percentage points BLEU (9.27% to 13.52%) on the Tufano et al. Java benchmark. Pair retrieval (code + comment) consistently outperforms singleton retrieval (comment only) despite fitting fewer exemplars within the token budget. The approach improves generation of low-frequency ground-truth tokens by up to 24.01% (a relative improvement), and performance scales with the number of retrieved exemplars, though with diminishing returns due to input length constraints.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper states: 'To support reproducibility and encourage future research, we publicly release our implementation on GitHub: https://github.com/RAG-Reviewer/RAG-Reviewer' (Section I, contribution 3)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper uses the publicly available Tufano et al. dataset (Section IV.B), and references all baseline replication packages with specific URLs (references [36]–[42])."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Section IV.E mentions 'NVIDIA Tesla V100-SXM2-32GB GPU' and training hyperparameters, but provides no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "While code is released on GitHub, the paper itself contains no step-by-step reproduction instructions, README-level commands, or scripts for replicating experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results in Tables III, IV, and Figures 6–7 are reported as point estimates (e.g., '2.90% EM', '12.98% BLEU') with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims improvements (e.g., 'RAG-Reviewer outperforms') based solely on comparing raw numbers. No statistical significance tests (t-tests, bootstrap, etc.) are reported anywhere."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper provides percentage improvements with baseline context throughout: '+1.67% EM' (from 0.87% to 2.54%), '+4.25% BLEU' (from 9.27% to 13.52%), and relative LFGT improvements (e.g., '24.01%' in Table IV)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is provided for the dataset size (16,780 test instances from Tufano et al.) or the 100-sample manual analysis. No power analysis or sample adequacy discussion."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No standard deviations, variance, or spread measures are reported. Results appear to be from single experimental runs with no indication of result stability."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Seven baselines are compared: two IR-based (CommentFinder, UniXCoder-IR) and five generation-based (Tufano T5, CodeT5, CodeT5+, CodeReviewer, AUGER), all presented in Table III."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include recent work: Kartal et al. 2024, CodeT5+ 2023, AUGER 2022, CodeReviewer 2022. These represent the current state of the art for this task."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper ablates the retrieval strategy (singleton vs. pair, Table III), the number of retrieved exemplars (0 to 8, Figure 6), and compares with/without retrieval augmentation (vanilla vs. RAG)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Two metrics are used throughout: BLEU-4 and Exact Match (EM), as stated in Section IV.C."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section VI.B reports manual analysis of 100 review comments classified into four categories (Exact Match, Semantically Equivalent, Alternative Solution, Incorrect) in Table V, following the methodology of Tufano et al. [9]."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "They use the standard train/valid/test split from Tufano et al.: 134,239/16,780/16,780 (Table II). 'The best model checkpoint was selected based on validation performance' (Section IV.E)."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by retrieval strategy (Table III), token frequency thresholds (Table IV: ≤20 through ≤100), code/review length intervals (Figure 7), and number of retrieved exemplars (Figure 6)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Figure 1 illustrates where generation-based and IR-based methods fail. The manual analysis (Table V) reports 50/100 incorrect outputs for RAG-Reviewer. Section VI.B discusses concrete success and failure examples (Figures 8, 9)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports that gains over IR baselines are small (e.g., Pair CodeT5 2.90% vs. CommentFinder 2.80% EM), that RAG-Reviewer had fewer exact matches than CommentFinder in manual analysis (2 vs. 4, Table V), and that CodeT5 showed only modest LFGT improvements (3.67%–6.09%, Table IV)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of '+1.67% higher exact match' (Tufano T5: 0.87%→2.54%, Table III), '+4.25% higher BLEU' (CodeReviewer: 9.27%→13.52%, Table III), and '24.01%' LFGT improvement (CodeReviewer ≤100, Table IV) are all supported by results."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims ('RAG improves generation') are supported by controlled comparisons: vanilla vs. RAG (same PLM backbone), singleton vs. pair retrieval, and varying k from 0 to 8. These ablation-style experiments isolate the retrieval augmentation component."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title 'Retrieval-Augmented Code Review Comment Generation' implies general applicability, but all experiments use a single Java dataset (Tufano et al.). While Section VIII acknowledges the Java-only limitation, the title and overall framing do not restrict the claims to Java or to this specific benchmark."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not discuss alternative explanations for why RAG helps. It does not consider whether improvements stem from simply having longer input context, memorization of training exemplars, or other confounds."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures EM and BLEU and frames these as 'review comment generation quality.' They acknowledge the proxy gap by conducting manual analysis (Section VI.B) noting that 'these metrics does not fully reflect the quality of generated review comments.'"
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Specific model variants are identified: 'CodeT5-base' (12 encoder/12 decoder layers), 'CodeT5p-220m', CodeReviewer, UniXcoder. Exact HuggingFace model URLs are provided in references [37]–[42]."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper does not use prompting. All models are fine-tuned encoder-decoder PLMs with structured input sequences (Equations 6–8), not prompt-based."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section IV.E provides detailed hyperparameters: AdamW optimizer (lr=3e-5, weight decay=0.01), gradient accumulation (3 steps), batch size 12, gradient clipping (max norm=1.0), beam size 10, up to 20 epochs with early stopping after 3 epochs without validation improvement, and input/output token limits (512/128)."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. RAG-Reviewer is a standard retrieval + generation pipeline without agents, tools, or iterative reasoning."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section IV.B documents the dataset source (Java open-source projects from GitHub/Gerrit), token length distributions (Figure 5, Table II), tokenization (CodeT5 tokenizer), and token truncation at 512/128. The retrieval database construction is described in Section III.B."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section VIII 'Threats to Validity' provides substantive discussion of both internal validity (reimplementation, hyperparameter choices) and external validity (Java-only evaluation)."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Specific threats include: reimplementation of Tufano T5 in PyTorch from TensorFlow 2.6.0 with converted pretrained weights, hyperparameters constrained by GPU memory limits, and evaluation limited to a single Java dataset."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section VIII explicitly states: 'Our evaluation is based solely on the Java dataset from Tufano et al. To improve generalizability, future work should evaluate across more languages and diverse software projects.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The Tufano et al. dataset is publicly available, and all model replication packages are referenced with URLs (refs [36]–[42]). The authors also release their implementation on GitHub."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section IV.B describes the dataset: 'constructed from large-scale Java open-source projects on GitHub and Gerrit. Each code is function-level granularity written in Java and paired with its corresponding review comment.' Statistics in Table II."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The data source is a standard public benchmark (Tufano et al.)."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline is documented: code encoding with UniXcoder (Eq. 1), retrieval database construction (Eq. 2), similarity computation (Eq. 4), top-k selection (Eq. 5), input augmentation (Eqs. 6–8), and training loss (Eq. 10). Token frequency distribution analysis provided in Table I."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information, acknowledgments section, or grant disclosures appear anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Both authors clearly list their affiliation as School of Computing, KAIST, Daejeon, Republic of Korea. They are not evaluating a product from their own organization."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence of the funder from the outcome cannot be assessed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial disclosure appears in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the pre-trained models used (CodeT5, CodeT5+, CodeReviewer, UniXcoder). The pre-training corpora included GitHub code which may overlap with the benchmark."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether the Tufano et al. test examples could appear in the pre-training data of models like CodeT5 (pre-trained on CodeSearchNet and GitHub repositories)."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The Tufano et al. benchmark was published in 2022, and models like CodeT5 were pre-trained on overlapping sources (GitHub, CodeSearchNet). No contamination analysis is performed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants were recruited. The manual analysis of 100 samples appears to have been performed by the authors themselves (the paper does not state who the annotators were), not by external human subjects."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in the study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference cost, latency, or per-example computation time is reported for RAG-Reviewer. The paper mentions CommentFinder's 49× speedup over generation baselines (from prior work) but does not measure RAG-Reviewer's own cost."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The GPU type is mentioned (V100-SXM2-32GB) and training hyperparameters are given, but total GPU hours, training wall-clock time, and total compute budget are not stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs producing the reported results is never stated."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Section VIII notes hyperparameters were 'selected based on prior work and GPU memory limits' but no search budget, number of configurations tried, or search method is reported."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Section IV.E states: 'The best model checkpoint was selected based on validation performance' with early stopping applied after 3 epochs of no improvement on the validation set."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed at all, so multiple comparison correction is moot. The absence of statistical testing is captured by significance_tests."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors reimplemented all baselines themselves ('We reproduced all the baselines,' Section IV.D) and do not acknowledge the well-documented tendency for authors' own reimplementations of competing methods to systematically underperform."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "RAG-Reviewer adds retrieval encoding and database lookup on top of generation, using more compute than vanilla generation baselines. This compute overhead is not discussed or compared."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper relies on EM and BLEU as its primary measures. It concedes that these metrics do not fully reflect review comment quality and adds a manual analysis (Section VI.B), which partially addresses the gap, but it does not go further and explicitly examine the construct validity of the automated metrics."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. RAG-Reviewer is a standard retrieval + generation pipeline, not an agentic scaffold."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the pre-trained models' training data includes code-review pairs from after the Tufano et al. dataset's collection period, or whether benchmark solutions were available during pre-training."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "Section III.A states: 'To prevent data leakage, the top-1 retrieved exemplar—which is identical to the input training instance—is excluded during training.' This addresses the primary feature leakage vector in the RAG evaluation setup."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether train and test examples share structural similarities such as originating from the same projects, same authors, or containing near-duplicate code patterns."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "The top-1 exclusion during training is a design choice to avoid trivial copying, not a leakage detection method in the sense of canary strings, membership inference, or decontamination pipelines. No systematic leakage detection is applied."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "RAG-Reviewer (Pair) outperforms generation-based baselines by up to +1.67% EM and +4.25% BLEU.",
    373       "evidence": "Table III shows Tufano T5 improves from 0.87% to 2.54% EM (+1.67%), and CodeReviewer improves from 9.27% to 13.52% BLEU (+4.25%). Consistent gains across all five PLMs.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Pair retrieval consistently outperforms singleton retrieval across all PLMs.",
    378       "evidence": "Table III shows pair retrieval achieves higher EM (+0.14% to +0.53%) and BLEU (+0.53% to +1.76%) than singleton retrieval for all five models tested.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "RAG-Reviewer improves generation of low-frequency ground-truth tokens by up to 24.01%.",
    383       "evidence": "Table IV shows relative improvements across all models, with CodeReviewer achieving 24.01% at the ≤100 threshold and Tufano T5 consistently showing 20–23% gains.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Performance improves as the number of retrieved exemplars increases.",
    388       "evidence": "Figure 6 shows EM improving from 1.39% (k=0) to 2.90% (k=8) for Pair CodeT5, with diminishing marginal gains at higher k values.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "RAG-Reviewer produces more semantically equivalent comments and fewer incorrect outputs than baselines in manual analysis.",
    393       "evidence": "Table V shows RAG-Reviewer achieves 39 semantically equivalent vs. 30 for CommentFinder and 36 for Tufano T5, with 50 incorrect vs. 57 and 54 respectively. Sample size is only 100.",
    394       "supported": "weak"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "No statistical testing on any comparison",
    400       "detail": "All claims of improvement are based on comparing single-run point estimates with no significance tests, confidence intervals, or variance measures. With margins as small as 0.10–0.21% EM between methods, these differences may not be statistically significant."
    401     },
    402     {
    403       "flag": "Single benchmark evaluation",
    404       "detail": "All experiments use only the Tufano et al. Java dataset. Generalizability to other languages, review styles, or project types is untested despite general framing in the title."
    405     },
    406     {
    407       "flag": "Marginal improvement over IR baselines",
    408       "detail": "RAG-Reviewer's best EM (3.01%) is only marginally better than CommentFinder (2.80%) and UniXCoder-IR (2.79%). Without statistical tests, it is unclear whether these gains are meaningful or within noise."
    409     },
    410     {
    411       "flag": "Self-reimplementation of all baselines",
    412       "detail": "The authors reimplemented all baselines ('We reproduced all the baselines') without acknowledging or mitigating the well-documented bias that authors' reimplementations of competitors tend to systematically underperform."
    413     },
    414     {
    415       "flag": "Small manual analysis without inter-annotator agreement",
    416       "detail": "The manual analysis uses only 100 samples with no mention of who performed the evaluation, how many annotators were involved, or inter-annotator agreement. Author self-evaluation of their own system's output is prone to bias."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Using pre-trained models to boost code review automation",
    422       "authors": ["R. Tufano", "S. Masiero", "A. Mastropaolo", "L. Pascarella", "D. Poshyvanyk", "G. Bavota"],
    423       "year": 2022,
    424       "relevance": "Provides the primary benchmark dataset and baseline (Tufano T5) for code review comment generation evaluation."
    425     },
    426     {
    427       "title": "CodeReviewer: Pre-training for automating code review activities",
    428       "authors": ["Z. Li", "S. Lu", "D. Guo", "N. Duan"],
    429       "year": 2022,
    430       "arxiv_id": "2203.09095",
    431       "relevance": "Proposes code review-specific pre-training and a benchmark dataset that is foundational for evaluating RCG methods."
    432     },
    433     {
    434       "title": "CommentFinder: a simpler, faster, more accurate code review comments recommendation",
    435       "authors": ["Y. Hong", "C. Tantithamthavorn", "P. Thongtanunam", "A. Aleti"],
    436       "year": 2022,
    437       "relevance": "First IR-based code review comment generation approach, reporting 32% improvement over generation-based methods with 49× speedup."
    438     },
    439     {
    440       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    441       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. Hoi"],
    442       "year": 2021,
    443       "arxiv_id": "2109.00859",
    444       "relevance": "Pre-trained code model used as a backbone generator in RAG-Reviewer and found to be a strong baseline for code review tasks."
    445     },
    446     {
    447       "title": "CodeT5+: Open code large language models for code understanding and generation",
    448       "authors": ["Y. Wang", "H. Le", "A. D. Gotmare", "N. D. Bui", "J. Li", "S. C. Hoi"],
    449       "year": 2023,
    450       "arxiv_id": "2305.07922",
    451       "relevance": "Extended code PLM with mixture of pre-training objectives, used as a generator backbone achieving the best overall RAG-Reviewer performance."
    452     },
    453     {
    454       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    455       "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
    456       "year": 2020,
    457       "relevance": "Foundational RAG paper that this work builds upon to integrate retrieval with generation for code review."
    458     },
    459     {
    460       "title": "Retrieval augmented code generation and summarization",
    461       "authors": ["M. R. Parvez", "W. U. Ahmad", "S. Chakraborty", "B. Ray", "K.-W. Chang"],
    462       "year": 2021,
    463       "arxiv_id": "2108.11601",
    464       "relevance": "Applies RAG to code summarization and generation with PLMs; provides the pair/singleton retrieval strategy methodology adopted by this work."
    465     },
    466     {
    467       "title": "UniXcoder: Unified cross-modal pre-training for code representation",
    468       "authors": ["D. Guo", "S. Lu", "N. Duan", "Y. Wang", "M. Zhou", "J. Yin"],
    469       "year": 2022,
    470       "arxiv_id": "2203.03850",
    471       "relevance": "Code encoder used for the retrieval module in RAG-Reviewer; shown to achieve state-of-the-art IR-based RCG performance."
    472     },
    473     {
    474       "title": "Generation-based code review automation: How far are we?",
    475       "authors": ["X. Zhou", "K. Kim", "B. Xu", "D. Han", "J. He", "D. Lo"],
    476       "year": 2023,
    477       "relevance": "Comparative study of PLMs for code review that found CodeT5 consistently outperformed other code-specific models."
    478     },
    479     {
    480       "title": "Llama-Reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning",
    481       "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"],
    482       "year": 2023,
    483       "relevance": "Applies LLM fine-tuning (LLaMA with PEFT) to code review automation, providing evidence on LLM vs PLM effectiveness for RCG."
    484     },
    485     {
    486       "title": "AUGER: Automatically generating review comments with pre-training models",
    487       "authors": ["L. Li", "L. Yang", "H. Jiang"],
    488       "year": 2022,
    489       "relevance": "T5-based review comment generator with review-line tagging, used as a baseline in this work."
    490     },
    491     {
    492       "title": "InferFix: End-to-end program repair with LLMs",
    493       "authors": ["M. Jin", "S. Shahriar", "M. Tufano"],
    494       "year": 2023,
    495       "relevance": "Applies RAG to LLM-based program repair by augmenting prompts with retrieved bug-fix pairs, demonstrating RAG effectiveness in SE tasks."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 1,
    501       "justification": "Code review automation is practically relevant, but this is a research framework evaluated on academic benchmarks, not a deployable tool."
    502     },
    503     "surprise_contrarian": {
    504       "score": 0,
    505       "justification": "RAG improving generation quality is an expected and well-established finding; the paper confirms rather than challenges conventional wisdom."
    506     },
    507     "fear_safety": {
    508       "score": 0,
    509       "justification": "No safety, security, or risk implications are raised by automated code review comment generation."
    510     },
    511     "drama_conflict": {
    512       "score": 0,
    513       "justification": "No controversy or conflict; straightforward application of RAG to a new task."
    514     },
    515     "demo_ability": {
    516       "score": 2,
    517       "justification": "Code is publicly released on GitHub (https://github.com/RAG-Reviewer/RAG-Reviewer), enabling reproduction though not a simple demo."
    518     },
    519     "brand_recognition": {
    520       "score": 0,
    521       "justification": "KAIST is a respected research university but not a brand that drives mainstream attention."
    522     }
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs