scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33772B)
      1 {
      2   "paper": {
      3     "title": "DeepReview: Improving LLM-based Paper Review with Human-like Deep Thinking Process",
      4     "authors": [
      5       "Minjun Zhu",
      6       "Yixuan Weng",
      7       "Linyi Yang",
      8       "Yue Zhang"
      9     ],
     10     "year": 2025,
     11     "venue": "Annual Meeting of the Association for Computational Linguistics",
     12     "arxiv_id": "2503.08569",
     13     "doi": "10.48550/arXiv.2503.08569"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "DeepReview introduces a multi-stage review framework (novelty verification, multi-dimension evaluation, reliability verification) and trains DeepReviewer-14B on DeepReview-13K, a synthetic dataset with structured reasoning chains from ICLR 2024-2025 reviews. The 14B model achieves 44.80% lower Rating MSE than CycleReviewer-70B and 80-88% win rates against GPT-o1 and DeepSeek-R1 in LLM-as-judge evaluation. Test-time scaling through reasoning path depth (Fast/Standard/Best) and simulated reviewer count shows positive performance trends, and the model exhibits some robustness to adversarial prompt injection attacks.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The Resources section lists 'Code Repository: zhu-minjun/Researcher' and states 'The code, model, dataset and demo have be released in http://ai-researcher.net.' URLs are provided for models (DeepReviewer-7B, DeepReviewer-14B), dataset (DeepReview-13K), and a demo page."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The Resources section lists 'Dataset: DeepReview-13K' as released, and the paper states it will be publicly available. The underlying data is also sourced from the publicly accessible OpenReview platform."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions '8x H100 80G GPUs with DeepSpeed + ZeRO3' and training hyperparameters (Section 4.3), but provides no requirements.txt, Dockerfile, or detailed library version specifications needed to recreate the environment."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. The methodology sections describe the pipeline conceptually but do not include specific commands, scripts, or a README-style guide to reproduce experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 2, 3, and 4 report only point estimates (e.g., MSE, MAE, accuracy, win rates) with no confidence intervals, error bars, or ± notation anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes many comparative claims (e.g., '44.80% reduction in Rating MSE', '6.04% improvement in Rating Spearman') but no statistical significance tests (p-values, t-tests, bootstrap tests) are reported for any comparison."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper consistently reports improvements with baseline context, e.g., 'reduces Rating MSE by an average of 65.83%' (Section 5.2), 'improvements of 33.58% and 22.09%' in MSE and MAE vs CycleReviewer-70B (Section 5.2), and absolute numbers for both systems are visible in tables."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The test set is 10% of the dataset (1,286 samples) split by random sampling. No power analysis or explicit justification for why this sample size is adequate for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported anywhere. All results in Tables 2-4 and Figure 3 appear to be single-run point estimates."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Extensive baselines are compared: prompt-based methods (AI Scientist, AgentReview) with multiple backbone LLMs (GPT-o1, Claude-3.5-sonnet, Gemini-2.0-Flash-Thinking, DeepSeek-V3, DeepSeek-R1) and fine-tuned baselines (CycleReviewer-8B, CycleReviewer-70B)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include GPT-o1-2024-12-17, Claude-3.5-sonnet-20241022, Gemini-2.0-Flash-Thinking-01-21, DeepSeek-R1, and CycleReviewer (ICLR 2025 submission). All are state-of-the-art models from 2024-2025."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.5 presents test-time scaling analysis with Reasoning Path Scaling (Fast/Standard/Best modes, progressively adding reasoning stages z1/z2/z3) and Reviewer Scaling (R=1 to R=6), which function as ablations showing which components contribute to performance."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics are used: Rating MSE, Rating MAE, Decision Accuracy, Decision F1, Rating Spearman correlation, and Pairwise Rating Accuracy (Table 2). Qualitative evaluation adds Constructive Value, Analytical Depth, Plausibility, Technical Accuracy, and Overall Judgment (Table 4)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper uses LLM-as-a-judge evaluation with Gemini-2.0-Flash-Thinking (Section 3.2, Table 4) rather than human evaluation. The appendix includes a qualitative case study comparing DeepReviewer's meta-review to real human reviews for one paper, but no systematic human evaluation of DeepReviewer's output quality is performed."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3.2 states 'we randomly sampled 10% (1.2K) of the dataset to create DeepReview-Bench.' Table 1 shows separate ICLR 2024 Test (652) and ICLR 2025 Test (634) sets distinct from training data."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 3 breaks down results by Soundness, Presentation, and Contribution dimensions. Table 4 breaks down by Constructive Value, Analytical Depth, Plausibility, Technical Accuracy, and Overall Judgment. Results are also separated by ICLR 2024 vs ICLR 2025."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No qualitative error analysis or specific failure cases are shown. The paper mentions adversarial vulnerability (0.31-point rating increase under attack in Section 5.4) and Reviewer Scaling variability, but does not analyze specific examples where DeepReviewer produces poor or incorrect reviews."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "Every main comparison shows DeepReviewer winning. While Section 5.5 acknowledges performance variability in Reviewer Scaling when R≠4, this is presented as an expected artifact of training distribution, not a negative result. No failed approaches or configurations that underperformed are reported."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of 88.21% and 80.20% win rates against GPT-o1 and DeepSeek-R1 match Table 4 (ICLR 2024, Overall Judgment). The claim that DeepReviewer-14B 'outperforms CycleReviewer-70B with fewer tokens' is supported by Tables 2-3 and Section 5.5's token comparison."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The main causal claims about reasoning stages improving performance are supported by controlled ablation through Fast/Standard/Best modes (Section 5.5), which progressively add reasoning stages. The attribution of adversarial robustness to 'multi-stage reasoning framework' (Section 5.4) is more speculative but hedged with 'We attribute.'"
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The abstract claims to 'set a new benchmark for LLM-based paper review' broadly, but all experiments use only ICLR CS/ML papers. The title says 'Improving LLM-based Paper Review' without domain qualification. The limitations section mentions generalizability concerns but the main claims are not bounded to the tested setting."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations for the results are discussed. Potential confounds are unaddressed: the Phi-4 base model's contribution vs. the training framework, whether the LLM judge (Gemini) has systematic biases, whether training data overlap with test data inflates scores, or whether CycleReviewer baselines were run under optimal conditions."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper uses MSE/MAE against averaged ICLR review scores as a proxy for review quality and LLM-as-judge win rates as a proxy for review usefulness, without discussing the gap between these proxies and actual review quality. Whether matching aggregated human scores means a review is actually insightful or useful is not examined."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 5.1 specifies 'GPT-o1-2024-12-17, Claude-3.5-sonnet-20241022, Gemini-2.0-Flash-Thinking-01-21, DeepSeek-V3, and DeepSeek-R1.' Section 4.2 specifies 'Qwen-2.5-72B-Instruct', 'Qwen-2.5-3B-Instruct', and 'Phi-4 14B' for the base model."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix Figures 4, 5, 6, and 7 provide full system prompt text for the LLM-as-judge evaluation, review improvement, paper analysis, and reliability verification stages respectively. These are complete prompt texts, not just descriptions."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 5.1 reports 'temperature of 0.4 with maximum input and output lengths set to 100K and 16,384 tokens.' Section 4.3 reports training: 'batch size of 16 and a learning rate of 5e-6', '23,500 steps', '256K context window using LongRoPE, with a 40K context window during training.'"
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The multi-stage pipeline is described in detail in Section 4.2: Stage 1 uses Semantic Scholar API and OpenScholar for literature retrieval with ReRank for reordering; Stage 2 uses review reconstruction; Stage 3 uses Gemini for evidence analysis. The inference strategy (Section 4.3) describes how the three modes (Fast/Standard/Best) execute different subsets of the pipeline."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The paper describes collecting 18,976 submissions from OpenReview and converting with MinerU, with 'empty PDFs filtered during conversion' (footnote 2). The final dataset has 13,378 training + 1,286 test samples (14,664 total), so 4,312 submissions were removed along the way. The quality control mechanism is described conceptually but exact counts removed at each stage (PDF filtering, quality control failures) are not reported."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "A dedicated 'Limitations' section discusses three specific issues: synthetic data may not capture genuine human review nuances, computational intensity of 'Best' mode, and incomplete adversarial robustness."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The limitations are specific to this study: (1) 'synthetic data may not fully capture the complexities and nuances of genuine human paper review', (2) the 'Best' mode with 'complete reasoning chain and external knowledge retrieval, can be computationally intensive', (3) 'complete immunity is not yet achieved' for adversarial attacks."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound results to ICLR/CS/ML papers despite testing only on that domain. The limitations discuss general issues but do not specify excluded settings, untested populations, or claims the authors are not making."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The source data (ICLR reviews) is publicly available on OpenReview. The paper states DeepReview-13K will be released via ai-researcher.net (Resources section). The underlying ICLR review data is independently verifiable."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.1 describes: 'collected raw data from the OpenReview platform [and] arXiv repository, gathering 18,976 paper submissions spanning two ICLR conference cycles (2024-2025).' Reviews include 'textual assessments (Strengths, Weaknesses, and Questions), interactive discussions from the rebuttal phase, and standardized scores.'"
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants are involved in the experiments. Data comes from publicly available ICLR submissions on OpenReview. The LLM-as-judge evaluation uses an automated model, not human evaluators."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The pipeline stages are described (collect → convert → filter → construct reasoning chains → quality control) but exact counts at each stage are missing. Starting with 18,976 samples and ending with 14,664 (13,378 train + 1,286 test), 4,312 were removed in total, but the paper does not break down how many were lost at PDF filtering vs. quality control."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The footnote on page 1 states 'Supported by Research Center for Industries of the Future, Westlake University.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are listed: Zhejiang University, Westlake University, and University College London. The authors are not affiliated with any of the companies whose models they evaluate (OpenAI, Anthropic, Google, DeepSeek)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The funder (Westlake University's Research Center for Industries of the Future) is an academic institution with no apparent commercial stake in the outcome of automated paper review systems."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper. The absence of a disclosure statement means this criterion is not met."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not state the training data cutoff for Phi-4 (the base model for DeepReviewer-14B) or for any of the baseline models (GPT-o1, DeepSeek-R1, etc.). Since ICLR reviews are publicly available, these models may have seen the test data during pre-training."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper does not discuss whether Phi-4's pre-training data includes ICLR 2024-2025 reviews from OpenReview. The test set (DeepReview-Bench) is split from the same ICLR data, and potential overlap with base model pre-training is not analyzed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "ICLR 2024 reviews were publicly available on OpenReview before Phi-4 and the baseline models were released, and possibly before their training cutoffs, which the paper does not state. ICLR 2025 reviews may also have been available. The paper does not discuss this contamination risk."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants involved. All evaluations are automated (metrics computed on held-out data, LLM-as-judge)."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants involved in the experiments. The paper has an ethical considerations section but does not involve human subjects research."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants involved in the experiments."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants involved in the experiments."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants involved in the experiments."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants involved in the experiments."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants involved in the experiments."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Section 5.5 reports approximate output token counts per mode (Fast ~3,000, Standard ~8,000, Best ~14,500) but no actual inference cost in dollars, wall-clock latency, or tokens-per-second throughput is reported. The Best mode's external API calls (Semantic Scholar, OpenScholar) add unquantified latency."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Section 4.3 mentions '8x H100 80G GPUs' and '23,500 steps with a batch size of 16' but does not report total training time in GPU-hours, wall-clock time, or total API costs for baseline evaluations."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper does not state how many runs produced the results in Tables 2-4 or Figure 3. It is unclear whether results are from single runs or averaged."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Training hyperparameters (lr=5e-6, batch=16, 23,500 steps) are reported but there is no mention of how many configurations were tried, what search method was used, or whether any hyperparameter tuning was performed."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No discussion of how the final training configuration was selected. The paper presents one configuration without explaining whether alternatives were tried or how this particular setup was chosen."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes dozens of comparisons across multiple metrics, models, and datasets (Tables 2-4) but performs no statistical tests at all, let alone corrections for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implemented DeepReviewer and designed the evaluation framework (DeepReview-Bench) but do not acknowledge self-comparison bias. They also designed the data construction pipeline and the evaluation metrics, creating potential for systematic bias in their favor."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 5.5 and Figure 3 explicitly show performance as a function of inference token count (compute). The paper notes that 'DeepReviewer's Fast mode, with only half the output tokens (3000), outperformed the CycleReviewer model (6000 output tokens) across various metrics.'"
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper uses ICLR review score prediction as a benchmark for review quality without discussing whether predicting aggregated reviewer scores actually measures the ability to produce useful reviews. Whether lower MSE against averaged human scores means better review quality (vs. just matching the central tendency) is not examined."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "Different baselines use fundamentally different scaffolding: AI Scientist uses agentic prompting, AgentReview uses multi-agent simulation, CycleReviewer is direct fine-tuning, and DeepReviewer has a multi-stage pipeline with external retrieval. Performance differences are attributed to the model/method without controlling for scaffolding differences."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The base model Phi-4 and baseline models (GPT-o1, DeepSeek-R1) may have been pre-trained on ICLR 2024 review data from OpenReview, which predates their release. This temporal leakage risk is not discussed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. The training and test data are from the same conferences (ICLR 2024-2025) and the model is trained to predict scores from the same review platform."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Training and test data are randomly split from the same pool of ICLR 2024-2025 submissions. Papers by the same authors, on the same topics, or from the same research groups could appear in both splits. This non-independence is not discussed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention methods are employed. No canary strings, membership inference tests, decontamination pipelines, or overlap analysis between Phi-4's pre-training data and the ICLR test set."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "DeepReviewer-14B reduces Rating MSE by 44.80% compared to CycleReviewer-70B",
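    370       "evidence": "Table 2 shows DeepReviewer-14B achieves Rating MSE of 1.3137 (ICLR 2024) vs CycleReviewer-70B's 2.4870, and 1.3410 vs 2.4294 on ICLR 2025. The 44.80% reduction corresponds to the ICLR 2025 pair; the ICLR 2024 pair gives a larger reduction of about 47.2%.",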
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "DeepReviewer-14B achieves 88.21% win rate against GPT-o1 and 80.20% against DeepSeek-R1 in overall judgment",
    375       "evidence": "Table 4 (ICLR 2024) shows Overall Judgment win rates of 88.21% vs AI Scientist GPT-o1 and 80.20% vs AI Scientist DeepSeek-R1, using Gemini-2.0-Flash-Thinking as judge.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "DeepReviewer shows strong resilience to adversarial attacks with only 0.31 rating increase under attack",
    380       "evidence": "Figure 2 (Section 5.4) shows overall rating increase of 0.31 points (5.38→5.69) vs Gemini's 4.26 points (4.23→8.49). Attribution to multi-stage reasoning is speculative.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Test-time scaling through reasoning path and reviewer count improves performance",
    385       "evidence": "Figure 3 (Section 5.5) shows positive trends in Decision Accuracy, Rating MSE, Spearman correlation as tokens increase from Fast→Standard→Best modes, and from R=1 to R=6 reviewers.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "DeepReviewer's Fast mode outperforms CycleReviewer with fewer output tokens",
    390       "evidence": "Section 5.5 states Fast mode (3,000 tokens) outperforms CycleReviewer (6,000 tokens) on Decision Accuracy, Rating MSE, and fine-grained Spearman correlations, supported by Table 3 comparisons.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "DeepReviewer improves Rating Spearman by 6.04% over CycleReviewer-70B",
    395       "evidence": "Table 2 shows Rating Spearman of 0.3559 vs 0.3356 (ICLR 2024) and 0.4047 vs 0.2674 (ICLR 2025). The 6.04% improvement is for ICLR 2024; ICLR 2025 shows a much larger gap.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "LLM-as-judge circularity",
    402       "detail": "Gemini-2.0-Flash-Thinking serves as the evaluation judge (Table 4) but was also used in the data construction pipeline for paper analysis (Stage 1, Figure 6) and reliability verification (Stage 3, Figure 7). The judge may systematically favor outputs whose reasoning style it helped shape, creating circular validation."
    403     },
    404     {
    405       "flag": "No human evaluation of review quality",
    406       "detail": "For a paper about automating paper review, the absence of systematic human evaluation is notable. All quality assessments rely on automated metrics (MSE against aggregated scores) or an LLM judge. Whether DeepReviewer's reviews are actually useful to authors or area chairs is not tested."
    407     },
    408     {
    409       "flag": "No statistical significance testing",
    410       "detail": "All claims of superiority over baselines are based on comparing point estimates across Tables 2-4 without any significance tests, error bars, or variance measures. With many comparisons across models, metrics, and datasets, some apparent improvements could be noise."
    411     },
    412     {
    413       "flag": "Self-designed benchmark",
    414       "detail": "The authors created DeepReview-Bench from the same data distribution as their training set (ICLR 2024-2025 via OpenReview), designed the evaluation metrics, and trained the model — creating potential for optimization toward their own benchmark rather than genuine review capability."
    415     },
    416     {
    417       "flag": "Contamination risk from base model pre-training",
    418       "detail": "Phi-4 and all baseline models (GPT-o1, DeepSeek-R1, etc.) may have been pre-trained on ICLR reviews from OpenReview, which is the source of both training and test data. The paper does not investigate or discuss this overlap."
    419     },
    420     {
    421       "flag": "Overclaiming scope",
    422       "detail": "The abstract claims to 'set a new benchmark for LLM-based paper review' and the title says 'Improving LLM-based Paper Review' broadly, but all experiments are limited to ICLR CS/ML papers. Generalization to other venues, disciplines, or paper types is untested."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    428       "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    429       "year": 2024,
    430       "arxiv_id": "2408.06292",
    431       "relevance": "Foundational work on autonomous AI-driven scientific discovery using LLM agents, directly compared as a baseline."
    432     },
    433     {
    434       "title": "AgentReview: Exploring Peer Review Dynamics with LLM Agents",
    435       "authors": ["Yuxuan Jin", "Qinlin Zhao", "Yilun Wang"],
    436       "year": 2024,
    437       "relevance": "Multi-agent LLM framework simulating peer review dynamics, used as a prompt-based baseline in DeepReview's evaluation."
    438     },
    439     {
    440       "title": "CycleResearcher: Improving Automated Research via Automated Review",
    441       "authors": ["Yixuan Weng", "Minjun Zhu", "Guanghao Bao"],
    442       "year": 2025,
    443       "relevance": "Closely related work training LLM-based research and review models with reinforcement learning; CycleReviewer-70B is a primary baseline."
    444     },
    445     {
    446       "title": "OpenScholar: Synthesizing Scientific Literature with Retrieval-Augmented LMs",
    447       "authors": ["Akari Asai"],
    448       "year": 2024,
    449       "arxiv_id": "2411.14199",
    450       "relevance": "RAG system for scientific literature synthesis, used as a core component in DeepReview's novelty verification stage."
    451     },
    452     {
    453       "title": "Are We There Yet? Revealing the Risks of Utilizing Large Language Models in Scholarly Peer Review",
    454       "authors": ["Ruofan Ye"],
    455       "year": 2024,
    456       "arxiv_id": "2412.01708",
    457       "relevance": "Examines risks and vulnerabilities of LLM-based peer review including prompt engineering attacks, directly relevant to DeepReview's adversarial evaluation."
    458     },
    459     {
    460       "title": "LLMs Assist NLP Researchers: Critique Paper (Meta-)Reviewing",
    461       "authors": ["Jiangshu Du"],
    462       "year": 2024,
    463       "doi": "10.18653/v1/2024.emnlp-main.292",
    464       "relevance": "LLM-assisted paper review and meta-review generation, directly in the automated review domain."
    465     },
    466     {
    467       "title": "Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100+ NLP Researchers",
    468       "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"],
    469       "year": 2025,
    470       "relevance": "Large-scale evaluation of LLM capability for scientific idea generation with human expert assessment."
    471     },
    472     {
    473       "title": "The AI Review Lottery: Widespread AI-Assisted Peer Reviews Boost Paper Scores and Acceptance Rates",
    474       "authors": ["Gabriel R. Latona"],
    475       "year": 2024,
    476       "arxiv_id": "2405.02150",
    477       "relevance": "Empirical study showing AI-assisted reviews systematically boost scores, raising concerns about automated review reliability."
    478     },
    479     {
    480       "title": "Potential and Perils of Large Language Models as Judges of Unstructured Textual Data",
    481       "authors": ["Rewina Bedemariam"],
    482       "year": 2025,
    483       "arxiv_id": "2501.08167",
    484       "relevance": "Evaluates reliability and limitations of LLM-as-judge paradigm, directly relevant to DeepReview's evaluation methodology."
    485     },
    486     {
    487       "title": "Large Language Models for Automated Scholarly Paper Review: A Survey",
    488       "authors": ["Zhenzhen Zhuang"],
    489       "year": 2025,
    490       "relevance": "Comprehensive survey of the LLM-based paper review landscape that DeepReview contributes to."
    491     },
    492     {
    493       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    494       "authors": ["Aman Madaan"],
    495       "year": 2024,
    496       "relevance": "Self-refinement technique for LLMs that influences DeepReview's reliability verification stage design."
    497     },
    498     {
    499       "title": "Peer Review as a Multi-Turn and Long-Context Dialogue with Role-Based Interactions",
    500       "authors": ["Chao Tan"],
    501       "year": 2024,
    502       "arxiv_id": "2406.05688",
    503       "relevance": "Alternative approach to LLM-based review modeling as multi-turn dialogue, used as a comparison point for DeepReview."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "Released model and demo at ai-researcher.net/deepreviewer that researchers could use for self-assessment, though running the 14B model requires significant compute."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "A 14B model outperforming 70B models and GPT-o1 is mildly surprising, but the trend of smaller fine-tuned models beating larger general models is well-established."
    514     },
    515     "fear_safety": {
    516       "score": 1,
    517       "justification": "Raises concerns about LLMs automating peer review and potential manipulation, but the paper explicitly positions itself as augmenting rather than replacing human reviewers."
    518     },
    519     "drama_conflict": {
    520       "score": 1,
    521       "justification": "Touches on the controversial topic of LLMs in peer review (ICLR 2025 introduced LLM assistance), but the paper avoids taking an adversarial position."
    522     },
    523     "demo_ability": {
    524       "score": 2,
    525       "justification": "Live demo available at ai-researcher.net/deepreviewer with released models (DeepReviewer-7B, 14B) and code repository."
    526     },
    527     "brand_recognition": {
    528       "score": 1,
    529       "justification": "Westlake University and UCL are recognized institutions but not in the top tier of AI lab brand recognition; published at ACL."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs