ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25104B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Large Language Models for Code Review",
      6     "authors": [
      7       "Umut Cihan",
      8       "Arda Içöz",
      9       "Vahid Haratian",
     10       "Eray Tüzün"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2505.20206",
     15     "doi": "10.48550/arXiv.2505.20206"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims are supported by results section. Performance figures (68.50% GPT4o, 63.89% Gemini correctness; 67.83% and 54.26% correction ratios) match reported data. Code type effects confirmed via ground truth vs mixed dataset comparison.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Paper claims problem descriptions causally improve performance but only shows observational within-subjects comparison. No ablation studies or random assignment; cannot rule out confounds. Claims causal effect without proper experimental design.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Explicitly states 'our scope is limited to Python. Therefore our findings are only directly generalizable to Python' (p.8). Acknowledges HumanEval limitations and AI-generated code datasets. Bounds generalization appropriately.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Contradictory results (Gemini outperforms on ground truth, GPT4o on mixed) trigger 'raises questions about code type' but no systematic exploration. Doesn't discuss whether dataset contamination, model-specific biases, or task alignment explain findings.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Uses unit test passage as proxy for code quality without explicitly distinguishing measured outcome (unit test pass) from claimed outcome (code review quality). Acknowledges limitation in VI.D but doesn't resolve the distinction during analysis.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Dedicated Section VI 'THREATS TO VALIDITY' with four subsections covering Internal, External, Construct, and Conclusion validity. Substantial discussion, not a single sentence.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats with quantification: prompt sensitivity, YAML/indentation errors (4.08%, 1.08%), dataset limitations (simple questions vs real code), Python-only scope. Not generic disclaimers.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicitly bounds to Python, unit-tested code, simple problems, and pre-trained models. States 'unit testing is not always conducted' in real practice, limiting applicability.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment section or source disclosed. Appears unfunded academic work but lacks explicit statement or declaration of competing interests.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors listed as Bilkent University, Ankara. No affiliation with OpenAI, Google, or evaluated product vendors.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funder disclosed; likely unfunded independent academic work.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No statement of competing interests, patents, equity, or consulting relationships. Standard conflicts statement absent.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Code correctness defined as 'ability of code to perform intended functionality in all cases' (p.3). Correct/Incorrect operationalized via unit test passage. RQs clearly framed.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Stated goal: 'illuminate LLM capabilities in code reviews.' Two RQs on LLM assessment accuracy and suggestion effectiveness. Proposes Human-in-the-loop process as methodological contribution.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section II.C compares to Tufano, Rasheed, Tang et al. Explicitly states 'unlike prior work, our study examines LLMs as code approvers' and establishes benchmarking setup as differentiation.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Code and data available at Zenodo (https://doi.org/10.5281/zenodo.14962566). Explicitly stated with persistent identifier.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "HumanEval dataset publicly available. AI-generated code dataset and results shared via Zenodo. Both datasets fully accessible.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Only states Python without version or dependency specification. Model parameters described as 'default' without hyperparameter details. No requirements.txt or Dockerfile.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "States 'we share experiment setup and source code' but paper itself lacks step-by-step reproduction instructions. Code exists in Zenodo but not documented in paper.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Reports standard deviations from 3 runs (0.35%-1.61% range) but no confidence intervals in figures. Error bars absent from visualizations.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Chi-square test for variance consistency only; no statistical tests comparing GPT4o vs Gemini performance differences. Main performance contrasts lack significance testing.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Reports performance percentages (68.50%, 63.89%) with baseline context. Differences quantified (e.g., 'up to 22.87%'). Percentage improvements are effect sizes.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Uses 492+164 code blocks without justification or power analysis. Acknowledges dataset scarcity ('failed to find dataset') but doesn't justify final sample size.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Standard deviations reported in text across all metrics. Obtained by running each configuration 3 times. Variance explicitly quantified.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "Only two LLMs compared (GPT4o vs Gemini). No simpler heuristic baselines or older model versions for temporal comparison.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "GPT-4o (May 2024) and Gemini 2.0 Flash (Dec 2024) are state-of-the-art at evaluation time (May 2025).",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "Compares with/without problem descriptions, but this is a prompt configuration variant, not classical ablation removing model components.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Uses Correctness Accuracy, False Positive Rate, False Negative Rate, Correction Ratio, Regression Ratio. Five distinct metrics evaluating different aspects.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Evaluation is fully automated via unit tests. No human judges assess code quality or suggestion usefulness. No subjective code review evaluation.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "Evaluating pre-trained models, not training models. Entire dataset is test data. Train/test split not applicable.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": false,
    217           "justification": "Breaks down by dataset type and prompt configuration only. No breakdown by code difficulty, algorithm category, language feature, or error type.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Discusses YAML/indentation failures (4.08%, 1.08%), regression cases (up to 23.79%), and false positive scenarios. Negative outcomes explicitly reported.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Reports moderate accuracy (68.50% best case), high regression rates, and concludes full automation unreliable. Transparently discusses limitations.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specifies 'Gemini-2.0-Flash and gpt-4o-2024-11-20'. Exact versions with snapshot dates provided.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Figure 2 provides full prompt template with placeholders filled. Chain-of-thought structure explicit. Red text shows problem description inclusion.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "States 'default model parameters' without specifying temperature, top_p, frequency_penalty, or other settings.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding beyond simple prompting. Single-turn prompt-response, no multi-step reasoning framework.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "Uses HumanEval and Yetistiren et al. code as-is. No preprocessing steps (filtering, cleaning, deduplication) documented.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Zenodo repository (https://doi.org/10.5281/zenodo.14962566) contains raw code blocks and unit tests for independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Specifies 492 AI-generated code blocks from ChatGPT (9 Jan '23), CodeWhisperer (Jan '23), GitHub Copilot (v1.70.8099), and 164 canonical code blocks from HumanEval. Sources and dates clear.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; uses public benchmark data. Not applicable.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Figure 1 and methodology section describe: collect code blocks → prompt LLM → extract classification and code → run unit tests. Pipeline transparent.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Models used (GPT-4o Nov 2024, Gemini 2.0 Dec 2024) predate the May 2025 paper submission, but their training data cutoffs are not explicitly stated.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "HumanEval (2021 benchmark) likely in both model training and public knowledge. Potential contamination not discussed or addressed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "No discussion of whether HumanEval examples appeared in training data of GPT-4o or Gemini. Contamination risk not acknowledged.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants involved in study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human subjects; ethics approval not applicable.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Uses OpenAI and Google APIs but no cost or latency figures reported. No discussion of computational or financial budget.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Implicitly ~7,900 API calls (656 samples × 2 models × 2 configs × 3 runs = 7,872) but no total compute budget or cost analysis provided.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GPT-4o achieves 68.50% correctness classification accuracy on code review with problem descriptions",
    374       "evidence": "Results section, Figure 3: mixed dataset with problem descriptions",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Problem descriptions significantly improve LLM code review performance",
    379       "evidence": "Performance drops without descriptions across all metrics (Figures 3-7); differences up to 22.87% in correction ratios",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "GPT-4o corrects up to 67.83% of incorrect code with suggestions",
    384       "evidence": "Correction Ratio results, Figure 6, mixed dataset with descriptions",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "LLMs cause code regressions in 10-24% of correct code blocks",
    389       "evidence": "Regression Ratio results across configurations (Figures 7, 9)",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Code type (AI-generated vs canonical) affects model relative performance differently",
    394       "evidence": "Ground truth results contradict mixed dataset: Gemini 66.67% vs GPT-4o 42.07% (Figure 8)",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Full automation of LLM code review is unreliable and risky",
    399       "evidence": "Moderate accuracy (68.50% best), high error rates (44.44% approval error), regression risks (23.79%)",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Human-in-the-loop review process mitigates LLM code review risks",
    404       "evidence": "Process proposed (Figure 10) with human oversight layer; logically follows from error analysis",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "observational"
    411   ],
    412   "key_findings": "GPT-4o and Gemini 2.0 Flash achieve only moderate accuracy (68.50% and 63.89%) at code correctness classification and 67.83%/54.26% effectiveness at code correction, with performance strongly dependent on problem description availability. Performance varies dramatically across different code types (AI-generated vs canonical), suggesting no single model configuration generalizes. Error rates—including false positives merging broken code (up to 44.44%) and regressions corrupting correct code (up to 24.80%)—indicate full automation is unreliable. The authors propose a human-in-the-loop process where LLMs flag changes but humans make final merge decisions.",
    413   "red_flags": [
    414     {
    415       "flag": "No significance testing",
    416       "detail": "Performance differences between models not tested for statistical significance. Chi-square test checks consistency only, not comparative differences."
    417     },
    418     {
    419       "flag": "Limited baselines",
    420       "detail": "Only two LLMs compared; no baseline heuristics (e.g., AST-based checkers) or simpler models for reference."
    421     },
    422     {
    423       "flag": "Benchmark contamination unaddressed",
    424       "detail": "HumanEval (2021) likely in training data of 2024+ models; potential data leakage not discussed or mitigated."
    425     },
    426     {
    427       "flag": "Sample size unjustified",
    428       "detail": "656 total samples used without power analysis or justification. Acknowledged data scarcity but doesn't explain final sample choice."
    429     },
    430     {
    431       "flag": "Proxy outcome conflation",
    432       "detail": "Unit test passage used as proxy for code quality; actual code review criteria (readability, maintainability, architecture) not evaluated."
    433     },
    434     {
    435       "flag": "No human evaluation",
    436       "detail": "Fully automated evaluation via unit tests; no human judges assess whether suggestions are actually useful or realistic."
    437     },
    438     {
    439       "flag": "Narrow scope generalizability",
    440       "detail": "Python only, simple HumanEval problems, AI-generated code. Results may not transfer to complex, real-world codebases."
    441     },
    442     {
    443       "flag": "Hyperparameters underspecified",
    444       "detail": "Only 'default parameters' stated; temperature, top_p, and other settings not disclosed, limiting reproducibility."
    445     },
    446     {
    447       "flag": "Contradictory findings unexplained",
    448       "detail": "Ground truth performance contradicts mixed dataset findings (Gemini outperforms GPT-4o on canonical, underperforms on AI-generated). Root cause not investigated."
    449     }
    450   ],
    451   "cited_papers": [
    452     {
    453       "title": "Modern code review: A case study at Google",
    454       "relevance": "Foundational work on modern code review practices and motivation for automation"
    455     },
    456     {
    457       "title": "Expectations, outcomes, and challenges of modern code review",
    458       "relevance": "Establishes code review importance and time-consuming nature in practice"
    459     },
    460     {
    461       "title": "Code review automation: Strengths and weaknesses of the state of the art",
    462       "relevance": "Recent survey of prior code review automation attempts; comparison baseline for this work"
    463     },
    464     {
    465       "title": "Using pre-trained models to boost code review automation",
    466       "relevance": "Prior T5-based approach to automating code review; related work comparison"
    467     },
    468     {
    469       "title": "AI-powered code review with LLMs: Early results",
    470       "relevance": "Concurrent work on LLM-based code review agents"
    471     },
    472     {
    473       "title": "Evaluating large language models trained on code",
    474       "relevance": "HumanEval benchmark paper; foundational dataset used in evaluation"
    475     },
    476     {
    477       "title": "Impact of peer code review on peer impression formation: A survey",
    478       "relevance": "Empirical evidence on code review benefits; motivation for reliability concerns"
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 3,
    484       "justification": "Directly applicable to real-world code review workflows; tools like Qodo and CodeRabbit use evaluated models."
    485     },
    486     "surprise_contrarian": {
    487       "score": 1,
    488       "justification": "Results largely confirm intuition: LLMs help but are unreliable; human oversight needed. No surprising findings that challenge assumptions."
    489     },
    490     "fear_safety": {
    491       "score": 0,
    492       "justification": "Focuses on code quality and automation reliability, not AI safety or alignment risks. No safety-related concerns raised."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "No controversy, heated debate, or adversarial framing. Straightforward empirical evaluation."
    497     },
    498     "demo_ability": {
    499       "score": 2,
    500       "justification": "Code and datasets released on Zenodo; practitioners can run experiments on their own codebases, though setup requires effort."
    501     },
    502     "brand_recognition": {
    503       "score": 2,
    504       "justification": "Evaluates OpenAI (GPT-4o) and Google (Gemini) models. Authors from Bilkent University (regional tier, not top-tier)."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [
    509       {
    510         "hn_id": "45535425",
    511         "title": "Reasoning LLMs are wandering solution explorers",
    512         "points": 90,
    513         "comments": 98,
    514         "url": "https://news.ycombinator.com/item?id=45535425"
    515       },
    516       {
    517         "hn_id": "44778108",
    518         "title": "Agentic Web: Weaving the Next Web with AI Agents",
    519         "points": 3,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=44778108"
    522       },
    523       {
    524         "hn_id": "45275073",
    525         "title": "The Mathematician's Assistant: Integrating AI into Research Practice",
    526         "points": 2,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=45275073"
    529       },
    530       {
    531         "hn_id": "45155065",
    532         "title": "Reverse Designing Ferroelectric Capacitors with ML-Based Compact Modeling",
    533         "points": 2,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=45155065"
    536       },
    537       {
    538         "hn_id": "44831312",
    539         "title": "Meta Clip 2: Worldwide",
    540         "points": 2,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=44831312"
    543       },
    544       {
    545         "hn_id": "40561445",
    546         "title": "There and Back Again: The AI Alignment Paradox",
    547         "points": 2,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=40561445"
    550       },
    551       {
    552         "hn_id": "44853245",
    553         "title": "Agentic Web – Weaving the Next Web with AI Agents",
    554         "points": 1,
    555         "comments": 0,
    556         "url": "https://news.ycombinator.com/item?id=44853245"
    557       }
    558     ],
    559     "top_points": 90,
    560     "total_points": 102,
    561     "total_comments": 99
    562   }
    563 }

Impressum · Datenschutz