scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33609B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Large Language Models for Code Review",
      6     "authors": [
      7       "Umut Cihan",
      8       "Arda İçöz",
      9       "Vahid Haratian",
     10       "Eray Tüzün"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2505.20206",
     15     "doi": "10.48550/arXiv.2505.20206"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's specific claims (68.50%, 63.89% correctness accuracy; 67.83%, 54.26% correction ratios) are all supported by the results in Section IV. The qualitative conclusions about moderate reliability are consistent with the reported numbers.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The main causal claim — that including problem descriptions improves performance — is supported by a controlled comparison using the same code blocks with and without descriptions, which is an adequate design for this limited causal inference.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title 'Evaluating Large Language Models for Code Review' is broad but the study only tests 2 models on Python code blocks from HumanEval. While the threats-to-validity section acknowledges 'our scope is limited to Python' and 'other LLMs may exhibit different behaviors,' the title and framing substantially over-generalize.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The threats-to-validity section discusses specific methodological concerns (prompt sensitivity, Python-only scope, stochastic LLM behavior) but does not discuss alternative explanations for the observed results, such as why GPT-4o outperforms Gemini on mixed data but not ground truth, or what factors drive the performance patterns.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper defines code correctness precisely via unit tests (Section III) and explicitly acknowledges the proxy gap in conclusion validity (Section VI-D): 'unit testing is not always conducted,' 'practitioners may apply different criteria for code approval,' and 'unit tests might not have enough coverage.'",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section VI 'Threats to Validity' provides a detailed discussion with four subsections: internal, external, construct, and conclusion validity.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The threats are specific to this study: prompt sensitivity and YAML extraction errors (internal), Python-only scope and 3-run averaging (external), HumanEval simplicity and AI-generated dataset limitations (construct), unit test coverage limitations (conclusion).",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states: 'our scope is limited to Python. Therefore our findings are only directly generalizable to Python' (Section VI-B), 'other LLMs may exhibit different behaviors' (Section VI-C), and that the HumanEval dataset 'consists of simple questions' not representative of real projects (Section VI-C).",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources, acknowledgments section, or grant numbers are mentioned anywhere in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are listed as affiliated with Bilkent University, Ankara, Turkey. They have no apparent affiliation with OpenAI or Google, the providers of the evaluated models.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement makes this unevaluable.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial interest disclosures appear in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Code correctness defined as 'ability to perform intended functionality in all cases.' 'Correct'/'Incorrect' operationalized by unit test pass/fail. Code review, pull requests, and human-in-the-loop all explained.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Explicitly states goal: compare GPT-4o and Gemini on code review. Proposes 'Human-in-the-loop LLM Code Review' process and provides replicable experimental methodology.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section II.C surveys code review automation literature (reviewer recommendation, ML-based approaches, recent LLM agents). Clearly positions this work: 'Unlike prior work, our study examines LLMs as code approvers, responsible for merge decisions.'",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper provides a Zenodo replication package at https://doi.org/10.5281/zenodo.14962566 (Section VIII, footnote 1), described as containing 'data obtained from this study, as well as the code used.'",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The datasets used (HumanEval and AI-generated code blocks from Yetistiren et al.) are publicly available, and the paper's experimental data is shared in the Zenodo replication package (Section VIII).",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper specifies model versions but does not describe the execution environment, library versions, requirements.txt, or any dependency specifications needed to run the experiments.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "While a replication package is shared on Zenodo, the paper itself does not include step-by-step reproduction instructions or describe how to run the experiments.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "The main results (Figures 3-9) report point estimates without confidence intervals or error bars. Standard deviations are mentioned as ranges across all configurations (e.g., '0.35% to 1.61% for correctness accuracy') but are not attached to individual results.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "A chi-square test is used only to confirm within-configuration consistency across 3 runs (Section IV). No significance test is applied to between-model or between-condition comparisons, despite claims like 'GPT4o outperformed Gemini.'",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "The paper reports both absolute values for compared conditions (e.g., GPT-4o 68.50% vs Gemini 63.89%) and differences (e.g., 'up to 22.87%'), providing sufficient context to assess the magnitude of effects.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification is given for the sample sizes of 492 mixed code blocks and 164 ground truth blocks. No power analysis is discussed. The choice of 3 experimental runs is also not justified beyond citing the stochastic nature of LLMs.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "The paper reports standard deviation ranges across 3 runs per configuration: '0.35% to 1.61% for correctness accuracy, from 1.02% to 1.93% for false positive rates, from 0.65% to 1.07% for false negative rates, from 0% to 2.88% for regression ratios, and from 0.38% to 1.34% for correction ratios' (Section IV).",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The study compares two models (GPT-4o and Gemini 2.0 Flash) against each other across multiple configurations, and uses the ground truth dataset as a control group. However, no comparison against prior code review methods or simple heuristic baselines (e.g., random classifier) is included.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Both GPT-4o (November 2024) and Gemini 2.0 Flash (December 2024) are contemporary, state-of-the-art models at the time of the study.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "The study systematically removes problem descriptions from the prompts and measures the impact on all metrics, functioning as an ablation of the contextual input component. Results show consistent degradation without descriptions (Section IV).",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Five distinct metrics are reported: correctness accuracy, false positive rate, false negative rate, correction ratio, and regression ratio (Section III-C).",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "All evaluation is automated via unit test pass/fail. No human evaluation of the LLM's review quality, suggestion usefulness, or output naturalness is performed.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "The paper mentions 'We optimized our prompt using a chain-of-thought style' but does not describe using a separate validation set for prompt optimization versus a held-out test set for final evaluation.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by model (GPT-4o vs Gemini), prompt type (with/without descriptions), dataset (mixed vs ground truth), and error type (false positives vs false negatives), providing detailed per-category views.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "The paper discusses aggregate error rates (false positives, regressions) and YAML/indentation parsing errors (Section VI-A) but does not show qualitative examples of specific failure cases or analyze why particular code blocks were misclassified.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports several negative findings: regression rates up to 24.80%, performance degradation without descriptions, GPT-4o's poor correctness accuracy (42.07%) on the ground truth dataset, and the overall conclusion that LLMs are unreliable for full automation.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "GPT-4o is versioned as 'gpt-4o-2024-11-20' (Section IV), but Gemini is specified only as 'Gemini-2.0-Flash' without a snapshot date or API version. The schema requires exact versions for all models; the Gemini specification is a marketing/API name without a pinned version.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "The full prompt template is provided in Figure 2, showing the actual text used including instructions, output format requirements, and rules. The placeholder values (#code block, #problem description) come from publicly available datasets, allowing full reconstruction.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "The paper states only 'default model parameters' (Section IV) without specifying what those defaults are (temperature, top-p, max tokens, etc.). These settings significantly affect LLM output and should be explicitly stated.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The setup sends single prompts to LLM APIs and parses the YAML response — a simple prompt-response pattern without tools, retry logic, or multi-step workflows.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The paper describes the data pipeline: datasets sourced from HumanEval and Yetistiren et al., code blocks categorized as correct/incorrect via unit tests (234 correct, 258 incorrect), YAML responses parsed, and error handling documented (94.70% clean execution, 4.08% indentation errors, 1.08% YAML format errors — Section VI-A).",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The Zenodo replication package (https://doi.org/10.5281/zenodo.14962566) contains the data and code from the study (Section VIII).",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The datasets are clearly described: 164 canonical HumanEval solutions, and 492 AI-generated code blocks from three tools (ChatGPT Jan '23, Amazon CodeWhisperer Jan '23, GitHub Copilot v1.70.8099) as collected by Yetistiren et al. (Section III-A).",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. Data sources are standard public benchmarks (HumanEval) and a prior published dataset.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline is described in Figure 1 and Section III-B: code blocks are sent to LLMs via prompt, YAML responses are parsed, code suggestions are extracted, unit tests are run, and results are classified. Error rates at the parsing stage are documented (Section VI-A).",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The training data cutoff dates for GPT-4o and Gemini 2.0 Flash are not stated anywhere in the paper, despite using the HumanEval benchmark which was published in 2021.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether GPT-4o or Gemini 2.0 Flash may have seen HumanEval solutions or the AI-generated code blocks during training. This is a significant omission given that HumanEval is widely used and likely in training corpora.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "HumanEval was published in 2021 (Chen et al.) and is one of the most widely reproduced benchmarks. Both GPT-4o and Gemini 2.0 Flash were trained well after 2021 and almost certainly encountered HumanEval solutions. The paper does not discuss this contamination risk at all.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study. The evaluation is entirely automated using unit tests on code blocks.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants. The study evaluates LLMs on code blocks with automated unit tests.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No mention of API costs, tokens consumed, or inference latency for either GPT-4o or Gemini 2.0 Flash, despite running 656 code blocks × 2 models × 2 conditions × 3 runs.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget, API spend, or hardware information is provided.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": true,
    373           "justification": "Each configuration was run 3 times with results averaged, and standard deviations are reported (ranging from 0.35% to 2.88% across metrics). The chi-square test confirms these variations are not statistically significant (Section IV).",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": true,
    379           "justification": "'To ensure reliability, we ran each experiment configuration three times and reported the average results' (Section IV).",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search is described. The paper states 'default model parameters' were used without documenting what those parameters are or whether alternatives were explored.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": true,
    391           "justification": "All 8 experimental configurations (2 models × 2 prompt types × 2 datasets) are reported in full. The paper does not cherry-pick only the best configuration but presents all results transparently.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "The paper makes numerous comparisons across 8 configurations and 5 metrics without applying any correction for multiple comparisons (Bonferroni, Holm, etc.). The chi-square test used is for within-configuration consistency, not between-condition comparisons.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": false,
    402           "answer": false,
    403           "justification": "The authors evaluate third-party LLMs (GPT-4o, Gemini 2.0 Flash), not their own system. Self-comparison bias does not apply.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "GPT-4o and Gemini 2.0 Flash differ substantially in compute cost and model size, but the paper does not discuss compute budget, API cost, or latency differences when comparing their performance.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper acknowledges in threats to validity that 'The HumanEval dataset consists of simple questions' (Section VI-C) but does not substantively discuss whether unit-test-based correctness classification on simple coding problems is a valid proxy for real-world code review capability.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No scaffolding is used. Both models receive the same prompt through their respective APIs in a simple prompt-response setup.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "HumanEval was published in 2021, and the AI-generated code blocks from Yetistiren et al. were published in 2023. Both GPT-4o (2024) and Gemini 2.0 Flash (2024) were trained after these datasets were publicly available. The paper does not discuss this temporal leakage.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "If the models have memorized HumanEval solutions from training, their ability to assess code correctness may be inflated — they could be pattern-matching against known solutions rather than truly reviewing code. This is not discussed.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "No discussion of whether the training data of GPT-4o or Gemini includes HumanEval problems or their solutions, which would violate independence between training and test data.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection methods (canary strings, membership inference, n-gram overlap analysis, decontamination) are applied despite using a widely-known benchmark that is almost certainly in the training data.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "LLMs can evaluate code changes for approval/rejection with moderate accuracy (GPT-4o 68.50%, Gemini 63.89%)",
    456       "evidence": "Tested on 492 AI-generated code blocks with unit tests as ground truth. Correctness accuracy measured as proportion matching unit test results.",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "Problem descriptions significantly improve LLM performance on code review tasks",
    461       "evidence": "With descriptions: GPT-4o 68.50% accuracy vs without: drops to ~55% (implicit from figures). Regression ratios differ by up to 22.87% with/without descriptions.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "LLM code improvement suggestions have moderate effectiveness (GPT-4o 67.83% correction ratio)",
    466       "evidence": "Correction ratio measured as proportion of incorrect code blocks where suggested code passes all unit tests.",
    467       "supported": "moderate"
    468     },
    469     {
    470       "claim": "Model performance is highly dependent on code dataset type (mixed vs ground truth)",
    471       "evidence": "GPT-4o achieves 68.50% on mixed dataset but only 42.07% on ground truth. Gemini shows opposite trend (63.89% mixed, 66.67% ground truth).",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "High regression rates (10-24%) prevent safe full automation of code review",
    476       "evidence": "GPT-4o 10.43% regression on mixed dataset, 13.53% for Gemini. Up to 23.79% without problem descriptions. Regression defined as correct code made incorrect.",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "False negative errors are preferable to false positives in code review",
    481       "evidence": "Reasoning: false positives merge faulty code (quality risk); false negatives inconvenience author (minor issue). Sound logic but normative rather than empirically tested.",
    482       "supported": "moderate"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval",
    487     "case-study"
    488   ],
    489   "key_findings": "GPT-4o achieves 68.50% accuracy in classifying code correctness and 67.83% effectiveness in correcting code on a mixed dataset of 492 AI-generated blocks, outperforming Gemini 2.0 Flash. However, this pattern reverses on the ground truth HumanEval dataset (42.07% vs 66.67%), suggesting code type significantly affects model behavior. Omitting problem descriptions degrades performance, with metrics differing by up to 22.87% between the two prompt conditions, highlighting the value of clear PR/commit comments. Regression rates of 10–24% demonstrate that LLM code suggestions can corrupt previously correct code, making full automation unsafe. The authors propose a hybrid 'Human-in-the-loop LLM Code Review' process where humans make final merge decisions to balance efficiency gains with reliability.",
    490   "red_flags": [
    491     {
    492       "flag": "No significance testing for model comparisons",
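    493       "detail": "Standard deviations are reported and a chi-square test checks consistency across the 3 runs of each configuration, but no significance test (e.g., t-test or ANOVA) is applied to determine whether GPT-4o vs Gemini differences are statistically significant. Observed differences (68.50% vs 63.89%) could be within noise."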
    494     },
    495     {
    496       "flag": "Dataset size and composition concerns",
    497       "detail": "Only 656 total blocks (492 AI-generated blocks plus 164 canonical HumanEval solutions to very simple problems). No evaluation on production code. AI-generated dataset may not reflect real code review scenarios."
    498     },
    499     {
    500       "flag": "No contamination analysis for HumanEval",
    501       "detail": "HumanEval (2021 benchmark) is well-known; likely in training data of 2024 models. No discussion of train-test overlap risk."
    502     },
    503     {
    504       "flag": "Limited hyperparameter exploration",
    505       "detail": "Only 'default model parameters' used. Temperature, top-p, and other settings not specified or ablated. Results may not represent optimal LLM configuration."
    506     },
    507     {
    508       "flag": "No specific failure case analysis",
    509       "detail": "Quantifies regression and error rates but provides no examples of which code patterns fail or why. Limits actionability of findings."
    510     },
    511     {
    512       "flag": "Missing practical cost analysis",
    513       "detail": "No inference cost, latency, or token usage reported. Critical for adoption decisions but absent from the evaluation."
    514     },
    515     {
    516       "flag": "Dataset-dependent reversal raises reliability concerns",
    517       "detail": "Model ranking flips between datasets: GPT-4o leads Gemini on the mixed dataset (68.50% vs 63.89%) but trails it on the ground truth dataset (42.07% vs 66.67%), suggesting findings may be unstable across different code distributions."
    518     }
    519   ],
    520   "cited_papers": [
    521     {
    522       "title": "Evaluating Large Language Models Trained on Code",
    523       "authors": "Chen, M. et al.",
    524       "year": 2021,
    525       "relevance": "Introduces HumanEval benchmark directly used in this study for ground truth dataset."
    526     },
    527     {
    528       "title": "Modern Code Review: A Case Study at Google",
    529       "authors": "Sadowski, C. et al.",
    530       "year": 2018,
    531       "relevance": "Industry context for code review practices and motivation for automation."
    532     },
    533     {
    534       "title": "Expectations, Outcomes, and Challenges of Modern Code Review",
    535       "authors": "Bacchelli, A. & Bird, C.",
    536       "year": 2013,
    537       "relevance": "Foundational work on code review practice and knowledge sharing goals."
    538     },
    539     {
    540       "title": "Code Review Automation: Strengths and Weaknesses of the State of the Art",
    541       "authors": "Tufano, R. et al.",
    542       "year": 2024,
    543       "relevance": "Recent comprehensive survey of automated code review approaches and LLM applications."
    544     },
    545     {
    546       "title": "AI-Powered Code Review with LLMs: Early Results",
    547       "authors": "Rasheed, Z. et al.",
    548       "year": 2024,
    549       "relevance": "Concurrent work on LLM agents for code review automation."
    550     },
    551     {
    552       "title": "CodeAgent: Autonomous Communicative Agents for Code Review",
    553       "authors": "Tang, X. et al.",
    554       "year": 2024,
    555       "relevance": "Recent agent-based approach to automating code review tasks."
    556     },
    557     {
    558       "title": "Attention is All You Need",
    559       "authors": "Vaswani, A. et al.",
    560       "year": 2017,
    561       "relevance": "Foundational transformer architecture underlying GPT and Gemini models."
    562     }
    563   ],
    564   "engagement_factors": {
    565     "practical_relevance": {
    566       "score": 2,
    567       "justification": "Proposes a human-in-the-loop code review process and provides a replication package for practitioners to benchmark LLMs on their own codebases."
    568     },
    569     "surprise_contrarian": {
    570       "score": 1,
    571       "justification": "The finding that LLMs have only moderate accuracy (~68%) at code review is mildly informative but unlikely to surprise practitioners who have used these tools."
    572     },
    573     "fear_safety": {
    574       "score": 0,
    575       "justification": "No safety, security, or risk implications discussed beyond the general point that faulty code review could introduce bugs."
    576     },
    577     "drama_conflict": {
    578       "score": 0,
    579       "justification": "No controversy, no strong claims against specific companies or products, and balanced reporting of results."
    580     },
    581     "demo_ability": {
    582       "score": 1,
    583       "justification": "Zenodo replication package is available but it is an experimental setup, not a ready-to-use tool or demo."
    584     },
    585     "brand_recognition": {
    586       "score": 1,
    587       "justification": "Evaluates well-known models (GPT-4o, Gemini 2.0 Flash) but comes from Bilkent University rather than a major AI lab."
    588     }
    589   },
    590   "hn_data": {
    591     "threads": [
    592       {
    593         "hn_id": "45535425",
    594         "title": "Reasoning LLMs are wandering solution explorers",
    595         "points": 90,
    596         "comments": 98,
    597         "url": "https://news.ycombinator.com/item?id=45535425"
    598       },
    599       {
    600         "hn_id": "44778108",
    601         "title": "Agentic Web: Weaving the Next Web with AI Agents",
    602         "points": 3,
    603         "comments": 1,
    604         "url": "https://news.ycombinator.com/item?id=44778108"
    605       },
    606       {
    607         "hn_id": "45275073",
    608         "title": "The Mathematician's Assistant: Integrating AI into Research Practice",
    609         "points": 2,
    610         "comments": 0,
    611         "url": "https://news.ycombinator.com/item?id=45275073"
    612       },
    613       {
    614         "hn_id": "45155065",
    615         "title": "Reverse Designing Ferroelectric Capacitors with ML-Based Compact Modeling",
    616         "points": 2,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=45155065"
    619       },
    620       {
    621         "hn_id": "44831312",
    622         "title": "Meta Clip 2: Worldwide",
    623         "points": 2,
    624         "comments": 0,
    625         "url": "https://news.ycombinator.com/item?id=44831312"
    626       },
    627       {
    628         "hn_id": "40561445",
    629         "title": "There and Back Again: The AI Alignment Paradox",
    630         "points": 2,
    631         "comments": 0,
    632         "url": "https://news.ycombinator.com/item?id=40561445"
    633       },
    634       {
    635         "hn_id": "44853245",
    636         "title": "Agentic Web – Weaving the Next Web with AI Agents",
    637         "points": 1,
    638         "comments": 0,
    639         "url": "https://news.ycombinator.com/item?id=44853245"
    640       }
    641     ],
    642     "top_points": 90,
    643     "total_points": 102,
    644     "total_comments": 99
    645   }
    646 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs