scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29876B)
      1 {
      2   "paper": {
      3     "title": "Evaluating Large Language Models for Code Review",
      4     "authors": [
      5       "Umut Cihan",
      6       "Arda İçöz",
      7       "Vahid Haratian",
      8       "Eray Tüzün"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2505.20206",
     13     "doi": "10.48550/arXiv.2505.20206"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "GPT-4o and Gemini 2.0 Flash achieved moderate accuracy (68.50% and 63.89%) at classifying code correctness on 492 AI-generated code blocks, with GPT-4o correcting 67.83% of incorrect code. Including problem descriptions in prompts consistently improved all metrics. Results differed significantly across datasets (mixed vs ground truth), with Gemini outperforming GPT-4o on the ground truth set, suggesting performance is code-type-dependent. The authors conclude LLMs are unreliable for fully automated code review and propose a human-in-the-loop process.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper provides a Zenodo replication package at https://doi.org/10.5281/zenodo.14962566 (Section VIII, footnote 1), described as containing 'data obtained from this study, as well as the code used.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The datasets used (HumanEval and AI-generated code blocks from Yetistiren et al.) are publicly available, and the paper's experimental data is shared in the Zenodo replication package (Section VIII)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper specifies model versions but does not describe the execution environment, library versions, requirements.txt, or any dependency specifications needed to run the experiments."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "While a replication package is shared on Zenodo, the paper itself does not include step-by-step reproduction instructions or describe how to run the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The main results (Figures 3-9) report point estimates without confidence intervals or error bars. Standard deviations are mentioned as ranges across all configurations (e.g., '0.35% to 1.61% for correctness accuracy') but are not attached to individual results."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "A chi-square test is used only to confirm within-configuration consistency across 3 runs (Section IV). No significance test is applied to between-model or between-condition comparisons, despite claims like 'GPT4o outperformed Gemini.'"
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports both absolute values for compared conditions (e.g., GPT-4o 68.50% vs Gemini 63.89%) and differences (e.g., 'up to 22.87%'), providing sufficient context to assess the magnitude of effects."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for the sample sizes of 492 mixed code blocks and 164 ground truth blocks. No power analysis is discussed. The choice of 3 experimental runs is also not justified beyond citing the stochastic nature of LLMs."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper reports standard deviation ranges across 3 runs per configuration: '0.35% to 1.61% for correctness accuracy, from 1.02% to 1.93% for false positive rates, from 0.65% to 1.07% for false negative rates, from 0% to 2.88% for regression ratios, and from 0.38% to 1.34% for correction ratios' (Section IV)."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study compares two models (GPT-4o and Gemini 2.0 Flash) against each other across multiple configurations, and uses the ground truth dataset as a control group. However, no comparison against prior code review methods or simple heuristic baselines (e.g., random classifier) is included."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both GPT-4o (November 2024) and Gemini 2.0 Flash (December 2024) are contemporary, state-of-the-art models at the time of the study."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The study systematically removes problem descriptions from the prompts and measures the impact on all metrics, functioning as an ablation of the contextual input component. Results show consistent degradation without descriptions (Section IV)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Five distinct metrics are reported: correctness accuracy, false positive rate, false negative rate, correction ratio, and regression ratio (Section III-C)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluation is automated via unit test pass/fail. No human evaluation of the LLM's review quality, suggestion usefulness, or output naturalness is performed."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The paper mentions 'We optimized our prompt using a chain-of-thought style' but does not describe using a separate validation set for prompt optimization versus a held-out test set for final evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by model (GPT-4o vs Gemini), prompt type (with/without descriptions), dataset (mixed vs ground truth), and error type (false positives vs false negatives), providing detailed per-category views."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The paper discusses aggregate error rates (false positives, regressions) and YAML/indentation parsing errors (Section VI-A) but does not show qualitative examples of specific failure cases or analyze why particular code blocks were misclassified."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports several negative findings: regression rates up to 24.80%, performance degradation without descriptions, GPT-4o's poor correctness accuracy (42.07%) on the ground truth dataset, and the overall conclusion that LLMs are unreliable for full automation."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract's specific claims (68.50%, 63.89% correctness accuracy; 67.83%, 54.26% correction ratios) are all supported by the results in Section IV. The qualitative conclusions about moderate reliability are consistent with the reported numbers."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The main causal claim — that including problem descriptions improves performance — is supported by a controlled comparison using the same code blocks with and without descriptions, which is an adequate design for this limited causal inference."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Evaluating Large Language Models for Code Review' is broad but the study only tests 2 models on Python code blocks from HumanEval. While the threats-to-validity section acknowledges 'our scope is limited to Python' and 'other LLMs may exhibit different behaviors,' the title and framing substantially over-generalize."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The threats-to-validity section discusses specific methodological concerns (prompt sensitivity, Python-only scope, stochastic LLM behavior) but does not discuss alternative explanations for the observed results, such as why GPT-4o outperforms Gemini on mixed data but not ground truth, or what factors drive the performance patterns."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper defines code correctness precisely via unit tests (Section III) and explicitly acknowledges the proxy gap in conclusion validity (Section VI-D): 'unit testing is not always conducted,' 'practitioners may apply different criteria for code approval,' and 'unit tests might not have enough coverage.'"
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "GPT-4o is versioned as 'gpt-4o-2024-11-20' (Section IV), but Gemini is specified only as 'Gemini-2.0-Flash' without a snapshot date or API version. The schema requires exact versions for all models; the Gemini specification is a marketing/API name without a pinned version."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The full prompt template is provided in Figure 2, showing the actual text used including instructions, output format requirements, and rules. The placeholder values (#code block, #problem description) come from publicly available datasets, allowing full reconstruction."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper states only 'default model parameters' (Section IV) without specifying what those defaults are (temperature, top-p, max tokens, etc.). These settings significantly affect LLM output and should be explicitly stated."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The setup sends single prompts to LLM APIs and parses the YAML response — a simple prompt-response pattern without tools, retry logic, or multi-step workflows."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper describes the data pipeline: datasets sourced from HumanEval and Yetistiren et al., code blocks categorized as correct/incorrect via unit tests (234 correct, 258 incorrect), YAML responses parsed, and error handling documented (94.70% clean execution, 4.08% indentation errors, 1.08% YAML format errors — Section VI-A)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section VI 'Threats to Validity' provides a detailed discussion with four subsections: internal, external, construct, and conclusion validity."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The threats are specific to this study: prompt sensitivity and YAML extraction errors (internal), Python-only scope and 3-run averaging (external), HumanEval simplicity and AI-generated dataset limitations (construct), unit test coverage limitations (conclusion)."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly states: 'our scope is limited to Python. Therefore our findings are only directly generalizable to Python' (Section VI-B), 'other LLMs may exhibit different behaviors' (Section VI-C), and that the HumanEval dataset 'consists of simple questions' not representative of real projects (Section VI-C)."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The Zenodo replication package (https://doi.org/10.5281/zenodo.14962566) contains the data and code from the study (Section VIII)."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The datasets are clearly described: 164 canonical HumanEval solutions, and 492 AI-generated code blocks from three tools (ChatGPT Jan '23, Amazon CodeWhisperer Jan '23, GitHub Copilot v1.70.8099) as collected by Yetistiren et al. (Section III-A)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are standard public benchmarks (HumanEval) and a prior published dataset."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is described in Figure 1 and Section III-B: code blocks are sent to LLMs via prompt, YAML responses are parsed, code suggestions are extracted, unit tests are run, and results are classified. Error rates at the parsing stage are documented (Section VI-A)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding sources, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All four authors are listed as affiliated with Bilkent University, Ankara, Turkey. They have no apparent affiliation with OpenAI or Google, the providers of the evaluated models."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement makes this unevaluable."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial interest disclosures appear in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The training data cutoff dates for GPT-4o and Gemini 2.0 Flash are not stated anywhere in the paper, despite using the HumanEval benchmark which was published in 2021."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether GPT-4o or Gemini 2.0 Flash may have seen HumanEval solutions or the AI-generated code blocks during training. This is a significant omission given that HumanEval is widely used and likely in training corpora."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "HumanEval was published in 2021 (Chen et al.) and is one of the most widely reproduced benchmarks. Both GPT-4o and Gemini 2.0 Flash were trained well after 2021 and almost certainly encountered HumanEval solutions. The paper does not discuss this contamination risk at all."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. The evaluation is entirely automated using unit tests on code blocks."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study evaluates LLMs on code blocks with automated unit tests."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No mention of API costs, tokens consumed, or inference latency for either GPT-4o or Gemini 2.0 Flash, despite running 656 code blocks × 2 models × 2 conditions × 3 runs."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget, API spend, or hardware information is provided."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Each configuration was run 3 times with results averaged, and standard deviations are reported (ranging from 0% to 2.88% across metrics). The chi-square test confirms these variations are not statistically significant (Section IV)."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "'To ensure reliability, we ran each experiment configuration three times and reported the average results' (Section IV)."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search is described. The paper states 'default model parameters' were used without documenting what those parameters are or whether alternatives were explored."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "All 8 experimental configurations (2 models × 2 prompt types × 2 datasets) are reported in full. The paper does not cherry-pick only the best configuration but presents all results transparently."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes numerous comparisons across 8 configurations and 5 metrics without applying any correction for multiple comparisons (Bonferroni, Holm, etc.). The chi-square test used is for within-configuration consistency, not between-condition comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "The authors evaluate third-party LLMs (GPT-4o, Gemini 2.0 Flash), not their own system. Self-comparison bias does not apply."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "GPT-4o and Gemini 2.0 Flash differ substantially in compute cost and model size, but the paper does not discuss compute budget, API cost, or latency differences when comparing their performance."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper acknowledges in threats to validity that 'The HumanEval dataset consists of simple questions' (Section VI-C) but does not substantively discuss whether unit-test-based correctness classification on simple coding problems is a valid proxy for real-world code review capability."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. Both models receive the same prompt through their respective APIs in a simple prompt-response setup."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "HumanEval was published in 2021, and the AI-generated code blocks from Yetistiren et al. were published in 2023. Both GPT-4o (2024) and Gemini 2.0 Flash (2024) were trained after these datasets were publicly available. The paper does not discuss this temporal leakage."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "If the models have memorized HumanEval solutions from training, their ability to assess code correctness may be inflated — they could be pattern-matching against known solutions rather than truly reviewing code. This is not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the training data of GPT-4o or Gemini includes HumanEval problems or their solutions, which would violate independence between training and test data."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection methods (canary strings, membership inference, n-gram overlap analysis, decontamination) are applied despite using a widely-known benchmark that is almost certainly in the training data."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "GPT-4o correctly classified code correctness 68.50% of the time with problem descriptions on the mixed dataset, compared to Gemini's 63.89%.",
    370       "evidence": "Figure 3 and Section IV-A report averaged results across 3 runs for both models with problem descriptions.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "GPT-4o corrected 67.83% of incorrect code blocks with problem descriptions, versus Gemini's 54.26%.",
    375       "evidence": "Figure 6 and Section IV-A show correction ratios across models and conditions.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Including problem descriptions in prompts consistently improved LLM performance across all metrics.",
    380       "evidence": "All results in Section IV show higher accuracy, correction ratios, and lower regression ratios when descriptions are included. Differences of up to 22.87% are observed.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Performance differs significantly across datasets: Gemini outperformed GPT-4o on the ground truth dataset (66.67% vs 42.07%) despite performing worse on the mixed dataset.",
    385       "evidence": "Figure 8 (Section IV-B) shows the reversal in correctness accuracy between datasets.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "LLM code reviews are unreliable for full automation, with regression rates up to 24.80% and inaccurate approval decisions up to 44.44%.",
    390       "evidence": "Section V-C cites regression rates of 23.79% and 44.44% inaccurate decisions from Gemini without problem descriptions.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No contamination analysis on a widely-known benchmark",
    397       "detail": "HumanEval was published in 2021 and is among the most reproduced LLM benchmarks. Both GPT-4o and Gemini 2.0 Flash were trained years later and almost certainly encountered HumanEval solutions during training. Models that have memorized correct solutions would have an unfair advantage at assessing code correctness. The paper does not discuss this at all."
    398     },
    399     {
    400       "flag": "Only 3 runs per configuration",
    401       "detail": "Three experimental runs per configuration is minimal for statistical inference. With n=3, standard deviation estimates are unreliable, and the chi-square test for consistency has very low power (df=2)."
    402     },
    403     {
    404       "flag": "No statistical significance tests between conditions",
    405       "detail": "Claims that 'GPT-4o outperformed Gemini' are based solely on comparing averaged percentages without any formal significance test. The observed differences (e.g., 68.50% vs 63.89%) may not be statistically significant."
    406     },
    407     {
    408       "flag": "Construct validity concerns",
    409       "detail": "Using unit test pass/fail on simple HumanEval-style problems as a proxy for code review capability is questionable. Real code review involves design, readability, security, maintainability, and context understanding — none of which are captured by this evaluation framework."
    410     },
    411     {
    412       "flag": "Only 2 models tested with broad title",
    413       "detail": "The title 'Evaluating Large Language Models for Code Review' implies a broad evaluation, but only GPT-4o and Gemini 2.0 Flash are tested. No open-source models, no Claude, no specialized code models are included."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduces the HumanEval benchmark and Codex model, which form the core dataset and motivation for this study's evaluation of LLM code review capabilities."
    423     },
    424     {
    425       "title": "Evaluating the code quality of ai-assisted code generation tools: An empirical study on github copilot, amazon codewhisperer, and chatgpt",
    426       "authors": ["Burak Yetistiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"],
    427       "year": 2023,
    428       "relevance": "Provides the 492 AI-generated code blocks (from Copilot, CodeWhisperer, ChatGPT) used as the mixed dataset in this study."
    429     },
    430     {
    431       "title": "Code review automation: Strengths and weaknesses of the state of the art",
    432       "authors": ["Rosalia Tufano", "Ozren Dabić", "Antonio Mastropaolo", "Matteo Ciniselli", "Gabriele Bavota"],
    433       "year": 2024,
    434       "relevance": "Qualitatively evaluates prior code review automation work alongside ChatGPT, finding it competitive for code-to-comment tasks."
    435     },
    436     {
    437       "title": "Using pre-trained models to boost code review automation",
    438       "authors": ["Rosalia Tufano", "Simone Masiero", "Antonio Mastropaolo", "Luca Pascarella", "Denys Poshyvanyk", "Gabriele Bavota"],
    439       "year": 2022,
    440       "arxiv_id": "2201.06850",
    441       "relevance": "Employs T5 model for code review automation, representing the pre-LLM era of neural code review approaches."
    442     },
    443     {
    444       "title": "AI-powered code review with LLMs: Early results",
    445       "authors": ["Zeeshan Rasheed", "Malik Abdul Sami", "Muhammad Waseem"],
    446       "year": 2024,
    447       "relevance": "Develops LLM agents to automate code review, directly related to the question of LLM reliability in code review."
    448     },
    449     {
    450       "title": "CodeAgent: Autonomous communicative agents for code review",
    451       "authors": ["Xunzhu Tang", "Kisub Kim", "Yewei Song"],
    452       "year": 2024,
    453       "relevance": "Presents an agentic approach to code review automation using communicative LLM agents."
    454     },
    455     {
    456       "title": "GPT-4 technical report",
    457       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    458       "year": 2023,
    459       "arxiv_id": "2303.08774",
    460       "relevance": "Technical report for GPT-4, the predecessor to GPT-4o which is one of the two models evaluated in this study."
    461     },
    462     {
    463       "title": "Automating code review activities by large-scale pre-training",
    464       "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"],
    465       "year": 2022,
    466       "relevance": "Explores automation of code review through large-scale pre-training on diverse code datasets."
    467     },
    468     {
    469       "title": "A prompt pattern catalog to enhance prompt engineering with chatgpt",
    470       "authors": ["Jules White", "Quchen Fu", "Sam Hays"],
    471       "year": 2023,
    472       "arxiv_id": "2302.11382",
    473       "relevance": "Provides the prompt engineering methodology foundation that informs the chain-of-thought prompting approach used in this study."
    474     },
    475     {
    476       "title": "Resolving code review comments with machine learning",
    477       "authors": ["Alexander Froemmgen", "Jacob Austin", "Peter Choy"],
    478       "year": 2024,
    479       "relevance": "Deployed ML tool at Google for automatically resolving code review comments, demonstrating practical industry application of automated code review."
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 2,
    485       "justification": "Proposes a human-in-the-loop code review process and provides a replication package for practitioners to benchmark LLMs on their own codebases."
    486     },
    487     "surprise_contrarian": {
    488       "score": 1,
    489       "justification": "The finding that LLMs have only moderate accuracy (~68%) at code review is mildly informative but unlikely to surprise practitioners who have used these tools."
    490     },
    491     "fear_safety": {
    492       "score": 0,
    493       "justification": "No safety, security, or risk implications discussed beyond the general point that faulty code review could introduce bugs."
    494     },
    495     "drama_conflict": {
    496       "score": 0,
    497       "justification": "No controversy, no strong claims against specific companies or products, and balanced reporting of results."
    498     },
    499     "demo_ability": {
    500       "score": 1,
    501       "justification": "Zenodo replication package is available but it is an experimental setup, not a ready-to-use tool or demo."
    502     },
    503     "brand_recognition": {
    504       "score": 1,
    505       "justification": "Evaluates well-known models (GPT-4o, Gemini 2.0 Flash) but comes from Bilkent University rather than a major AI lab."
    506     }
    507   }
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs