ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20779B)


      1 {
      2   "paper": {
      3     "title": "CRScore++: Reinforcement Learning with Verifiable Tool and AI Feedback for Code Review",
      4     "authors": ["Manav Nitin Kapadnis", "Atharva Naik", "Carolyn Rosé"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2506.00296"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive link is provided in the paper. The paper references external tools (Ruff, PyScent) but does not release its own training/evaluation code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses the publicly available CodeReviewer dataset (Apache 2.0 license, linked in Appendix B.3) and references public tools. The base dataset is publicly accessible."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Appendix A.4 specifies GPU type (NVIDIA A100 80GB), model versions (Qwen2.5-Coder-3B/7B-Instruct), flash attention, batch size, max sequence length, training epochs, and references HuggingFace alignment-handbook recipes. Sufficient to recreate environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described in text but there are no executable instructions."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 1-4 report only point estimates. No confidence intervals, error bars, or ± notation appear anywhere in the results."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims improvements (e.g., '56% relative improvement') but provides no statistical significance tests. Comparisons rest solely on differences between point estimates."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports relative improvements with baseline context (e.g., '56% relative improvement in comprehensiveness for 3B models' in Section 6, and absolute changes from zero-shot shown in parentheses in Tables 1-2)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 33 samples per language were chosen for human evaluation, or why 5,000 instances for DPO. The limitations section acknowledges the human eval sample is small but provides no power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported for any results. Single-run numbers only with no indication of variability across seeds or runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 includes multiple baselines: Zero Shot, CR dataset (direct training on CodeReviewer), and Tool Guided, compared against Stage 1 (SFT) and Stage 2 (DPO)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The baselines are all internal configurations (zero-shot, direct fine-tuning, tool-guided). No comparison against other recent code review generation methods such as Jaoua et al. (2025) despite discussing them in related work."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The staged approach (Zero Shot → Tool Guided → Stage 1 SFT → Stage 2 DPO) serves as an ablation, showing incremental contributions of each component. Table 3 separately evaluates tool utilization."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Three metrics are used: comprehensiveness, conciseness, and relevance (Tables 1-2, 4). Table 3 adds accuracy and coverage for tool utilization."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.4 describes a human evaluation study with 4 experienced software developers on 100 code changes (33 per language), rating on 5-point Likert scales. Results in Table 4."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses CodeReviewer test sets for evaluation, separate from the training data (20,888 SFT samples, 5,000 DPO samples). Cross-language evaluation uses Java and JavaScript test sets not seen during training."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by model size (3B vs 7B), training stage, and programming language (Python, Java, JavaScript) across Tables 1-4."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses the comprehensiveness-conciseness trade-off (Section 5.1), where DPO models sacrifice brevity. Also notes that models trained on CodeReviewer ground truth perform worse than zero-shot."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Models trained directly on CodeReviewer data perform worse than zero-shot (Table 1). DPO reduces conciseness scores. Table 3 shows DPO provides no accuracy improvements over SFT for tool utilization."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims about improving student models through SFT+RL and cross-language generalization are supported by Tables 1-2 and 4."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about CRScore++ improving performance are supported by controlled ablation: same base model with progressive addition of components (zero-shot → tool-guided → SFT → DPO). This is adequate single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Code Review' generally, but the paper only tests on Qwen2.5-Coder models with GPT-4o-mini as teacher, on CodeReviewer dataset, for 3 languages. The limitations section acknowledges single model family but the title/abstract do not bound claims to this setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its results. For example, the cross-language generalization could be due to the base model's pre-existing multilingual capabilities rather than CRScore++ training. No confounds are considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'GPT-4o Mini' and 'Qwen2.5-Coder-Instruct' but does not provide API snapshot dates or specific version identifiers for GPT-4o-mini; no dated snapshot identifier such as 'gpt-4o-mini-2024-07-18' is given."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt text is provided in Appendices A.2, A.3, B.1, and B.2 for the teacher model, SFT training, LLM-as-a-judge evaluation, and CoT evaluation prompts respectively."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.4 and Section 3 report learning rate (2e-5), batch size (2/8), epochs (2), β=0.1 for DPO, weight decay 0.01, max sequence length 1024, fp16, seed 42, eval steps 100, AdamW optimizer."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The approach is a standard SFT+DPO training pipeline with static analysis tool integration, not an agentic system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix B.3 documents dataset statistics: 20,888 Python examples for SFT, 5,000 subset for DPO, 20 candidates per sample scored by GPT-4o-mini, pairs with Δ≥2 selected. Static analysis tool pipeline described in Section 3.1."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 8 'Limitations' provides a dedicated, substantive discussion of multiple limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 8 discusses specific threats: small human eval sample (33 per language), GPT-4o-mini evaluation bias, limitation to Qwen model family, exclusion of LCoT models, limited static analysis toolkit coverage."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 explicitly states: limited to 3 programming languages, Qwen model family only, no LCoT models tested, static analysis toolkit doesn't cover test coverage or architectural consistency."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The generated SFT training data, DPO preference pairs, and evaluation outputs are not released. Only the base CodeReviewer dataset is public."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Appendix B.3 describes data collection: CodeReviewer dataset selection, annotation with Ruff and PyScent, GPT-4o-mini generation of reviews and topics, DPO pair creation with score differential ≥2."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The human evaluation uses '4 experienced software developers' but does not describe how they were recruited, their background, or potential selection bias."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented: CodeReviewer → static analysis annotation → GPT-4o-mini review generation (20,888 samples) → DPO subset (5,000) → 20 candidates per sample → scoring → pair selection (Δ≥2)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding sources or acknowledgments section mentioning grants or sponsors is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations with Carnegie Mellon University Language Technologies Institute are clearly stated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses GPT-4o-mini as teacher/judge and Qwen2.5-Coder as student but does not state training data cutoff dates for either model."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The CodeReviewer dataset is publicly available. No discussion of whether GPT-4o-mini or Qwen2.5-Coder may have seen CodeReviewer test data during pre-training."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "CodeReviewer was published in 2022 and is publicly available. Both GPT-4o-mini and Qwen2.5-Coder were trained after 2022, so contamination is plausible. This is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No mention of pre-registration for the human evaluation study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned. The Ethics Statement mentions informed consent and fair compensation but not ethics review."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "Annotators are described only as '4 experienced software developers' with no further demographic information (years of experience, etc.)."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria for selecting the 4 annotators are described."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "Not an experimental study with treatment/control assignment. All annotators rated the same reviews."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No mention of whether annotators knew which model configuration produced which review."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No mention of whether all 4 annotators completed all annotations or if there was any dropout."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost or latency per review is reported. The paper mentions API calls for evaluation but not the cost of generating a single review at inference time."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Appendix B.4 states: ~450 GPU hours on A100 hardware total, SFT ~8 hours, DPO ~6 hours, ~40,000 GPT-4o-mini API calls for data preparation, ~2,600 API calls for evaluation."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CRScore++ achieves 56% relative improvement in comprehensiveness for 3B models and 42% relative improvement in relevance compared to zero-shot baselines on Python.",
    286       "evidence": "Table 1: Qwen 3B zero-shot comprehensiveness 0.43 → Stage 2 0.67 (+0.24); relevance 0.45 → 0.64 (+0.19). Section 6.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Models trained exclusively on Python achieve nearly equivalent performance when reviewing Java and JavaScript code.",
    291       "evidence": "Table 2: Stage 2 3B Python comprehensiveness 0.67, Java 0.64, JavaScript 0.62. Gaps are small. Human evaluation (Table 4) shows similar patterns.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Models trained directly on CodeReviewer ground truth perform worse than zero-shot approaches.",
    296       "evidence": "Table 1: Qwen 3B CR dataset comprehensiveness 0.10 vs zero-shot 0.43; relevance 0.20 vs 0.45.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "DPO provides no accuracy improvements over SFT for tool utilization, indicating tool understanding is established during initial distillation.",
    301       "evidence": "Table 3: SFT models show accuracy scores higher than or comparable to those of DPO models across all languages. Section 5.3.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "CRScore++ combines verifiable static analysis signals (linters, code smell detectors) with LLM-based subjective feedback for RL training of code review models. The two-stage approach (SFT + DPO) improves comprehensiveness and relevance of generated reviews at the cost of conciseness. Models trained only on Python generalize well to Java and JavaScript without language-specific fine-tuning. Notably, models trained on scraped CodeReviewer ground truth perform worse than zero-shot baselines, motivating the use of teacher-generated demonstrations.",
    307   "red_flags": [
    308     {
    309       "flag": "No statistical significance tests",
    310       "detail": "All comparative claims are based on point estimate differences without any significance testing, error bars, or variance reporting. With LLM-as-a-judge evaluation, the variability of the judge itself is not quantified."
    311     },
    312     {
    313       "flag": "LLM-as-a-judge circularity",
    314       "detail": "GPT-4o-mini is used both as the teacher model for generating training data AND as the judge for evaluation. This creates a potential circularity where the evaluation metric favors outputs that match the teacher's style."
    315     },
    316     {
    317       "flag": "No external baselines",
    318       "detail": "Despite discussing Jaoua et al. (2025) in related work, no comparison is made against this or any other published code review generation method. All baselines are internal configurations."
    319     },
    320     {
    321       "flag": "Benchmark contamination risk unaddressed",
    322       "detail": "The CodeReviewer dataset (2022) is publicly available, so it is plausibly in the training data of both GPT-4o-mini and Qwen2.5-Coder. The paper does not discuss this."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "CRScore: Grounding Automated Evaluation of Code Review Comments in Code Claims and Smells",
    328       "authors": ["Atharva Naik", "Marcus Alenius", "Daniel Fried", "Carolyn Rose"],
    329       "year": 2024,
    330       "relevance": "Predecessor metric for evaluating code review quality that this paper extends into a training framework."
    331     },
    332     {
    333       "title": "Combining Large Language Models with Static Analyzers for Code Review Generation",
    334       "authors": ["Imen Jaoua", "Oussama Ben Sghaier", "Houari Sahraoui"],
    335       "year": 2025,
    336       "arxiv_id": "2502.06633",
    337       "relevance": "Closely related work on hybrid LLM+static analysis approaches for code review generation."
    338     },
    339     {
    340       "title": "Automating Code Review Activities by Large-Scale Pre-Training",
    341       "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"],
    342       "year": 2022,
    343       "relevance": "Source of the CodeReviewer dataset and baseline model used in this paper's experiments."
    344     },
    345     {
    346       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    347       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    348       "year": 2024,
    349       "relevance": "Core alignment method (DPO) used in the paper's preference optimization stage."
    350     },
    351     {
    352       "title": "Can LLMs Replace Human Evaluators? An Empirical Study of LLM-as-a-Judge in Software Engineering",
    353       "authors": ["Ruiqi Wang", "Jiyu Guo", "Cuiyun Gao"],
    354       "year": 2025,
    355       "arxiv_id": "2502.06193",
    356       "relevance": "Validates LLM-as-a-judge for SE evaluation tasks, the evaluation approach used in this paper."
    357     },
    358     {
    359       "title": "RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback",
    360       "authors": ["Harrison Lee", "Samrat Phatale", "Hassan Mansoor"],
    361       "year": 2024,
    362       "relevance": "Foundation work on RLAIF that this paper extends with verifiable tool signals."
    363     },
    364     {
    365       "title": "CodeReviewQA: The Code Review Comprehension Assessment for Large Language Models",
    366       "authors": ["Hong Yi Lin", "Chunhua Liu", "Haoyu Gao"],
    367       "year": 2025,
    368       "arxiv_id": "2503.16167",
    369       "relevance": "Benchmark for evaluating LLM code review comprehension abilities."
    370     },
    371     {
    372       "title": "ReTool: Reinforcement Learning for Strategic Tool Use in LLMs",
    373       "authors": ["Jiazhan Feng", "Shijue Huang", "Xingwei Qu"],
    374       "year": 2025,
    375       "relevance": "Motivates using RL over tool feedback to enhance reasoning and tool utilization in LLMs."
    376     },
    377     {
    378       "title": "A Critical Evaluation of AI Feedback for Aligning Large Language Models",
    379       "authors": ["Archit Sharma", "Sedrick Scott Keh", "Eric Mitchell"],
    380       "year": 2024,
    381       "relevance": "Informs the design decision to use the same teacher model for both SFT and PO stages."
    382     }
    383   ]
    384 }

Impressum · Datenschutz