scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24234B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Harnessing Large Language Models for Curated Code Reviews",
      6     "authors": [
      7       "O. Sghaier",
      8       "M. Weyssow",
      9       "H. Sahraoui"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE Working Conference on Mining Software Repositories",
     13     "arxiv_id": "2502.03425",
     14     "doi": "10.1109/MSR66628.2025.00039"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The 85.9% refactoring+bugfix figure matches Figure 3 (69.7%+16.2%), 46% BLEU improvement matches Table X (7.71→11.26), and 22% CodeBLEU improvement matches Table XII (0.36→0.44).",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The curation pipeline combines two steps (filtering + reformulation) with no ablation separating their effects; additionally, reformulating the target comments used for fine-tuning may mechanically inflate BLEU by making ground-truth targets more regular, yet this confound is not addressed.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Conclusions claim the curated dataset 'offers a valuable baseline for future research in code review' broadly, but evaluation is limited to one dataset (Li et al. 2022) and one model (DeepSeek-Coder-6.7B-Instruct) without bounding scope.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not consider that higher BLEU with curated targets may be mechanically inflated because cleaner, more uniform targets reduce n-gram entropy—not necessarily because the model learned better representations.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "BLEU and CodeBLEU are used interchangeably with 'accuracy' and 'quality' without discussing the limits of n-gram overlap as proxies for real-world comment usefulness or code correctness.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section VI 'Threats to Validity' is a dedicated section discussing potential limitations of the methodology.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The two threats discussed (noisy/non-English comments, LLM judge reliability) are generic; the section omits critical threats such as lack of ablation between filtering and reformulation, absolute performance ceiling, sample subset selection rationale, and single-model generalizability.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what the results do not show; there are no explicit scope boundaries such as 'results apply only to this dataset' or 'we cannot claim X about other models or languages.'",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source is mentioned anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations (Université de Montréal and Singapore Management University) are clearly stated in the paper header.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement appears in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms including 'code review,' 'curation pipeline,' 'LLM-as-a-Judge,' and the evaluation criteria (clarity, relevance, conciseness, civility, type, nature) are defined explicitly in Tables II and III and surrounding text.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The three contributions are explicitly enumerated in the introduction: an evaluation framework, an automated curation pipeline, and a comparative study demonstrating improvements.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section VII engages substantively with prior code review automation work and explicitly differentiates CuRev by noting that no prior work examined dataset quality or implemented curation preprocessing.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Replication package explicitly available at https://github.com/OussamaSghaier/CuREV, stated in the paper header and Data Availability section.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Dataset released at https://zenodo.org/records/14812107, stated in the paper header and Data Availability section.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Hardware is specified (4× NVIDIA RTX A5000 24GB) but no requirements.txt, Dockerfile, or software dependency list is provided in the paper.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The paper describes the methodology at a high level but provides no step-by-step instructions for reproducing results; full reproduction requires consulting the external GitHub repository.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All main results (BLEU 7.71→11.26, CodeBLEU 0.36→0.44, scoring improvements) are reported as point estimates with no confidence intervals or error bars.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are applied to the downstream task comparisons or dataset quality comparisons; only Cohen's kappa is reported for the human sanity check.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Relative improvements are reported (46% BLEU improvement, 22% CodeBLEU improvement) with baseline values provided for context.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 20,000-sample subset used for downstream tasks is not justified—no power analysis or rationale is given for why this size was chosen from the 176,613-sample dataset.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No standard deviation, variance, or confidence ranges are reported for any metric across the experiments.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The original uncurated dataset serves as the explicit baseline for all downstream comparisons in RQ3 and RQ4.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The original dataset (Li et al. 2022) is the most natural and direct comparison; no competing curation methods exist in the literature according to the related work.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The curation pipeline has two distinct steps (relevance filtering and comment reformulation) but no ablation study separates their individual contributions to downstream performance.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Code refinement is evaluated with both CodeBLEU and Exact Match; comment quality uses clarity, conciseness, relevance, type, nature, and civility metrics.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "The 100-sample human sanity check evaluates the LLM judge's annotation quality, not the quality of generated comments or refined code produced by the downstream systems.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "For comment generation (RQ3), the 20,000-sample subset is split 75/25 for training and evaluation on a held-out test set.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Tables V and VII provide detailed per-category breakdowns of scoring criteria for type, nature, and civility subcategories across original and curated datasets.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "The paper provides only successful reformulation examples (Tables IX and XI) and does not systematically discuss failure cases or cases where reformulation degraded comment quality.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "All results uniformly favor the curated dataset; no negative or null results are reported, including no analysis of where curation failed or hurt performance.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact model versions are specified: 'Llama-3.1-70B-Instruct' for judging/curation and 'DeepSeek-Coder-6.7B-Instruct' for downstream tasks.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Excerpts of both prompts (evaluation in Table IV, reformulation in Table VI) are provided with actual instructions; full prompts are stated to be available in the replication package.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Hyperparameters are reported: batch size 4, 5 epochs, LoRA with r=16, α=32, dropout=0.05.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used; the models are called directly for annotation, reformulation, and inference.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Data preprocessing is described: relevance threshold of 4 for filtering (removing 5,895 samples), followed by LLM-based reformulation of the remaining 170,718 comments.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Both the original dataset (cited as Li et al. 2022, publicly available) and the curated CuRev dataset (released at Zenodo) are publicly accessible.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The dataset selection and processing pipeline (LLM-as-judge annotation, relevance filtering, reformulation, re-evaluation) is described across Sections III and IV.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participant recruitment; the dataset is from automated extraction of public GitHub pull requests.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Figure 1 provides an overview of the full pipeline from dataset selection through annotation, filtering, reformulation, and downstream evaluation.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The training data cutoffs for Llama-3.1-70B-Instruct and DeepSeek-Coder-6.7B-Instruct are not stated, despite the code review dataset being sourced from public GitHub repositories commonly included in LLM pre-training corpora.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether the pre-trained base models (Llama-3.1-70B or DeepSeek-Coder) may have seen the GitHub pull request data used in the code review dataset during pre-training.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "The evaluation dataset is derived from public GitHub PRs, which are standard LLM pre-training data; neither contamination risk nor its potential inflation of fine-tuning results is discussed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human-participant study was conducted; the two-author sanity check is an internal quality check, not a registered human-subjects study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants requiring IRB review.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost or latency is reported for running Llama-3.1-70B over 176,613 samples, which would be a significant compute cost.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Hardware is mentioned (4× RTX A5000 24GB) but total GPU-hours for training or annotation are not stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Fine-tuning on the curated dataset achieves a 46% improvement in BLEU score for comment generation (7.71 → 11.26) compared to the original dataset.",
    373       "evidence": "Table X reports BLEU scores for DeepSeek-Coder-6.7B-Instruct trained on original (7.71) vs. curated (11.26) comments on a 25% held-out test split.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Curated comments improve code refinement by 22% on CodeBLEU (0.36 → 0.44) and increase Exact Match from 408 to 445.",
    378       "evidence": "Table XII reports CodeBLEU and EM for the same DeepSeek model given original vs. curated comments as context for direct inference.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Llama-3.1-70B achieves near-perfect agreement with human annotators for code review classification (Cohen's kappa 0.64–1.0).",
    383       "evidence": "Section III-D reports kappa values from a 100-sample sanity check: civility=1.0, type=0.88, nature=0.82, relevance=0.85, conciseness=0.76, clarity=0.64.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "The curation pipeline substantially improves clarity (6.89 → 8.96) and conciseness (7.71 → 8.05) while achieving 100% civil comments.",
    388       "evidence": "Table VII reports per-category improvements in the curated dataset using the same LLM-as-judge evaluation framework applied to the original dataset.",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "85.9% of code review samples address refactoring or bugfixes, and 62.6% are prescriptive in nature.",
    393       "evidence": "Figure 3 shows refactoring 69.7% + bugfix 16.2% = 85.9%, and nature distribution showing prescriptive at 62.6%.",
    394       "supported": "strong"
    395     }
    396   ],
    397   "methodology_tags": [
    398     "benchmark-eval",
    399     "empirical",
    400     "case-study"
    401   ],
    402   "key_findings": "The paper proposes CuRev, a curated version of the largest public code review dataset (176,613 samples), using Llama-3.1-70B as an automated judge to filter irrelevant comments (5,895 removed) and reformulate remaining comments for clarity, conciseness, and civility. A validation sanity check on 100 manually annotated samples shows Cohen's kappa of 0.64–1.0 between the LLM judge and human annotators. Fine-tuning DeepSeek-Coder-6.7B-Instruct on the curated dataset yields 46% higher BLEU for comment generation, and giving the same model curated rather than original comments as context yields 22% higher CodeBLEU for code refinement, though absolute performance remains low (BLEU 11.26, CodeBLEU 0.44). No ablation separates the contributions of filtering versus reformulation, and no significance tests are applied to the results.",
    403   "red_flags": [
    404     {
    405       "flag": "Target contamination of BLEU metric",
    406       "detail": "The curated dataset changes both training and test targets; higher BLEU may mechanically result from more uniform, regular targets rather than improved model learning. This confound is not discussed."
    407     },
    408     {
    409       "flag": "No ablation between pipeline steps",
    410       "detail": "The curation pipeline combines relevance filtering and LLM reformulation, but no experiment isolates their separate effects on downstream performance."
    411     },
    412     {
    413       "flag": "No statistical significance tests",
    414       "detail": "All comparative results (BLEU, CodeBLEU, Exact Match, scoring criteria improvements) are reported as point estimates with no significance testing or confidence intervals."
    415     },
    416     {
    417       "flag": "Low absolute performance obscured by relative framing",
    418       "detail": "A 46% improvement from BLEU 7.71 to 11.26 sounds impressive but both values indicate extremely poor comment generation; the practical significance of the improvement is not discussed."
    419     },
    420     {
    421       "flag": "20K sample subset unjustified",
    422       "detail": "The downstream evaluation uses only 20,000 of 176,613 samples with no justification for this choice or analysis of whether the subset is representative."
    423     },
    424     {
    425       "flag": "Single model generalizability",
    426       "detail": "All downstream task claims rely on a single model (DeepSeek-Coder-6.7B-Instruct); no validation on other architectures or model sizes is provided."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Automating code review activities by large-scale pre-training",
    432       "relevance": "The base dataset (176,613 samples) used for the entire study; also a key prior work in LLM-based code review automation."
    433     },
    434     {
    435       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    436       "relevance": "Core methodological justification for using LLM-as-a-Judge for annotation at scale."
    437     },
    438     {
    439       "title": "Towards automating code review activities",
    440       "relevance": "Foundational prior work (Tufano et al., ICSE 2021) on T5-based automated comment generation that CuRev aims to improve upon."
    441     },
    442     {
    443       "title": "DeepSeek-Coder: When the large language model meets programming",
    444       "relevance": "The model used for fine-tuning and evaluation in both downstream tasks."
    445     },
    446     {
    447       "title": "Improving the learning of code review successive tasks with cross-task knowledge distillation",
    448       "relevance": "Prior work by same first author (DISCOREV) that CuRev builds on for the code review task setting."
    449     },
    450     {
    451       "title": "LoRA: Low-rank adaptation of large language models",
    452       "relevance": "Parameter-efficient fine-tuning technique used for both downstream experiments."
    453     },
    454     {
    455       "title": "CodeUltraFeedback: An LLM-as-a-judge dataset for aligning large language models to coding preferences",
    456       "relevance": "Prior work by co-author on LLM-as-judge methodology applied to software engineering tasks."
    457     },
    458     {
    459       "title": "Do words have power? Understanding and fostering civility in code review discussion",
    460       "relevance": "Provides the civility classification framework adopted in the evaluation schema."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "Directly addresses a practical problem (noisy code review datasets) and releases a curated dataset practitioners and researchers can use for training code review models."
    467     },
    468     "surprise_contrarian": {
    469       "score": 1,
    470       "justification": "The finding that cleaner training data helps is intuitive; the surprise is the magnitude of improvement (46% BLEU) from relatively minor curation."
    471     },
    472     "fear_safety": {
    473       "score": 0,
    474       "justification": "No AI risk, safety, or harm concerns raised."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "Straightforward engineering paper with no controversy or competing claims."
    479     },
    480     "demo_ability": {
    481       "score": 2,
    482       "justification": "Dataset and code are publicly released on GitHub and Zenodo, enabling others to reproduce or build on the curation pipeline."
    483     },
    484     "brand_recognition": {
    485       "score": 1,
    486       "justification": "Uses Llama-3.1-70B and DeepSeek-Coder, recognizable models, but from an academic group without major brand recognition."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [],
    491     "top_points": 0,
    492     "total_points": 0,
    493     "total_comments": 0
    494   }
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs