scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29034B)
      1 {
      2   "paper": {
      3     "title": "MixRevDetect: Towards Detecting AI-Generated Content in Hybrid Peer Reviews",
      4     "authors": [
      5       "Sandeep Kumar",
      6       "Samarth Garg",
      7       "Sagnik Sengupta",
      8       "Tirthankar Ghosal",
      9       "Asif Ekbal"
     10     ],
     11     "year": 2025,
     12     "venue": "NAACL 2025 (Short Papers)",
     13     "doi": "10.18653/v1/2025.naacl-short.79"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "GitHub repository link provided in footnote 1: https://github.com/sandeep82945/AI-text-Points. Abstract states 'We make our dataset and code public.'"
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract states 'We make our dataset and code public' with the GitHub link. The dataset of human and AI-generated reviews from NeurIPS 2022 is released."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No requirements.txt, Dockerfile, conda environment, or library version details are provided in the paper. Only the models used (GPT-4o, LLaMA 70B) are named."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are included in the paper. The methodology is described at a conceptual level but there are no commands, scripts, or README-level instructions for replicating experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Tables 1 and 2 report only point estimates for precision, recall, and F1 with no confidence intervals, error bars, or ± notation."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims 'significantly outperforming' baselines and reports percentage improvements (27.5%, 31.5%, etc.) but no statistical significance tests (p-values, t-tests, bootstrap tests) are performed. Comparisons are based solely on comparing raw numbers."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 4.3 reports relative improvements over each baseline with full context: '27.5% improvement over the best-performing baseline model, FAST-DETECT GPT, which has an F1 score of 0.6968.' Absolute and relative differences are provided for all baselines."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "1,000 papers were collected from NeurIPS 2022 but no justification is given for why this number was chosen. No power analysis or sample size rationale is provided."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No standard deviations, variance, or spread measures are reported. Results appear to be from a single run. No mention of multiple experimental runs or seeds."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Four baselines are compared: RADAR (Hu et al., 2023b), LLMDET (Wu et al., 2023), DEEP-FAKE (Li et al., 2023b), and FAST-DETECT GPT (Bao et al., 2023a). Results are presented in Tables 1 and 2."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "All baselines first appeared in 2023 (RADAR from NeurIPS 2023, FAST-DETECT GPT as a 2023 preprint later published at ICLR 2024, LLMDET from EMNLP 2023, DEEP-FAKE from 2023), which is reasonably contemporary for a 2025 paper."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The system has multiple components (tail pruning, GPT-4o completion, BERTScore similarity, classification) but no ablation study removes or replaces individual components to measure their contribution. The tail pruning ratio experiment (Section 4.4) varies a hyperparameter, not a component."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Precision, recall, and F1 score are reported in Tables 1 and 2 for all methods."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Section 4.7 mentions 'human analyses to understand when and why our models fail' but this is qualitative error analysis of failure cases, not a systematic human evaluation of the system's detection outputs."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 4.1 states 'The dataset is split into training (70%), validation (10%), and test (20%) sets.' Results appear to be reported on the test set."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Only aggregate precision, recall, and F1 are reported. No breakdown by review type, aspect category, paper topic, or review length is provided."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 4.7 and Appendix C provide error analysis with specific failure categories: 'Formality and Abstraction' and 'Conciseness.' Concrete examples with true labels, predictions, and error causes are given."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 4.4 shows that tail pruning ratio of 0.5 yields a 'sharp drop in performance, with an F1 score of 0.721.' Section 4.5 shows performance drops under paraphrasing. These are configurations/conditions that hurt performance."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims an F1 score of 88.86% and 'significantly outperforming existing AI text detection methods.' Table 1 confirms F1 of 0.8886 and shows it outperforms all four baselines."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper implicitly claims that the tail pruning + completion + similarity approach causes superior detection performance. Language like 'our proposed method achieves' and the architecture framing imply causality. Without component-level ablations, the causal contribution of each component is unsubstantiated."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title claims 'Detecting AI-Generated Content in Hybrid Peer Reviews' broadly, but the method is only tested on NeurIPS 2022 reviews with GPT-4o-generated AI content. The Limitations section acknowledges GPT-4o reliance but does not bound generalization to this specific venue, time period, or generation model."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not consider alternative explanations for the high BERTScore similarity of AI completions. For example, GPT-4o may have been trained on NeurIPS 2022 reviews, inflating similarity. No discussion of confounds or robustness to the completion model choice."
    132       },
    133       "proxy_outcome_distinction": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper measures P/R/F1 for detecting AI-generated review points and frames its claims at the same granularity. The proxy (classification accuracy on constructed data) and the outcome (detecting AI content in peer reviews) are distinguished appropriately, and the Limitations section acknowledges the gap to real-world deployment."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper uses 'GPT-4o model' (Section 3.2) and 'LLaMA 70B' (Section 4.5) without specifying snapshot dates, API versions, or exact model identifiers. 'GPT-4o' is a marketing name without a version specifier."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The completion prompt is provided in full in Appendix D with the exact template used. The paraphrasing prompt is given verbatim in Section 4.5: 'Paraphrase the review comment below such that it looks like it is human written.'"
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The classifier is trained for '100 epochs' and has 'three hidden layers' (Section 4.2), but no temperature, top-p, max_tokens, or other sampling parameters for GPT-4o or LLaMA 70B are reported. These sampling parameters significantly affect output quality."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used. The method is a sequential pipeline (pruning → completion → similarity → classification) without agent loops, tool use, or feedback mechanisms."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The paper does not describe how reviews are split into individual review points/comments, which is a critical preprocessing step. The AI review generation process is also vaguely described ('we also generated AI-written reviews' in Appendix E). The tail pruning ratio is described but the upstream parsing steps are missing."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "A dedicated 'Limitations' section is present, discussing the reliance on GPT-4o and recommending future researchers match their LLM to the target corpus."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The Limitations section identifies a threat specific to this study: 'This study mainly relied on GPT-4o for generating AI-generated texts' and notes this may not capture usage patterns of other LLMs. This is a concrete, study-specific limitation."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound results to NeurIPS reviews, English-language reviews, or specific review styles. The limitation about GPT-4o is a practical note, not a scope boundary statement."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The dataset is made publicly available via the GitHub repository (footnote 1). The abstract states 'We make our dataset and code public.'"
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 4.1 and Appendix E describe collecting 1,000 papers and reviews from NeurIPS 2022 via OpenReview, choosing pre-ChatGPT reviews to avoid AI influence. The rationale for the time period selection is clear."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants were recruited. The study uses existing published peer reviews from NeurIPS 2022 obtained via OpenReview."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The pipeline from raw reviews to individual review points is not documented. How reviews are segmented into individual comments (R1, R2, ..., Rn) is unexplained. The total number of review points in the dataset is not stated. How AI reviews were generated from the papers is only vaguely described."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Acknowledgement section discloses Prime Minister Research Fellowship (PMRF) program and Google's 'Gemma Academic Program GCP Credit Award' for cloud credits."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "All author affiliations are clearly listed: IIT Patna, IIITM Gwalior, Manipal Institute of Technology, Oak Ridge National Laboratory, and IIT Jodhpur."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "PMRF is a government fellowship and Google Cloud credits are for compute infrastructure. Neither funder has a financial interest in the specific outcome of AI-generated review detection."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "GPT-4o is used as a core component for generating completions, but its training data cutoff date is never stated. The paper notes NeurIPS 2022 reviews predate ChatGPT's release but does not address GPT-4o's training data."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No discussion of whether GPT-4o's training data includes NeurIPS 2022 reviews from OpenReview. Since these reviews are publicly available, GPT-4o may have seen them, which could affect the quality and similarity of generated completions."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "NeurIPS 2022 reviews are publicly available on OpenReview and were published well before GPT-4o's training. The model may have seen both the papers and the reviews used in this study. This contamination risk is not discussed."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study. The paper analyzes existing peer reviews and generates synthetic AI reviews."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. The Ethics Statement discusses privacy and data concerns but this is not a human subjects study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The method requires calling GPT-4o for every tail-pruned review point, which is expensive at scale. No API costs, tokens consumed, or latency per review are reported."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "Google Cloud credits are acknowledged in the funding section but no quantification of total compute budget, GPU hours, or API spend is provided."
    291       }
    292     },
    293     "experimental_rigor": {
    294       "seed_sensitivity_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No mention of multiple random seeds. The classifier training and results appear to be from a single run."
    298       },
    299       "number_of_runs_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The number of experimental runs is never stated. Results are presented as single point estimates without indication of how many runs produced them."
    303       },
    304       "hyperparameter_search_budget": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The tail pruning ratio is varied across values (Figure 2) but no formal hyperparameter search budget is reported. The classifier's hidden layer count and epoch count appear chosen without documented search."
    308       },
    309       "best_config_selection_justified": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Tail pruning ratio of 0.7 is reported as yielding the highest F1 (Figure 2), but it is unclear whether this was selected on the validation set or the test set. No explicit selection procedure is described."
    313       },
    314       "multiple_comparison_correction": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Four baseline comparisons are made across two experimental conditions (standard and paraphrased) without any statistical tests, let alone multiple comparison correction."
    318       },
    319       "self_comparison_bias_addressed": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The authors evaluate their own system against baselines without acknowledging the inherent bias. No discussion of whether baselines were run under optimal conditions or with the same tuning effort."
    323       },
    324       "compute_budget_vs_performance": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "MixRevDetect requires GPT-4o API calls per review point, which is vastly more expensive than zero-shot methods like FAST-DETECT GPT or statistical methods like LLMDET. This cost disparity is never acknowledged or discussed."
    328       },
    329       "benchmark_construct_validity": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The paper does not discuss whether detecting GPT-4o-generated reviews on NeurIPS 2022 data actually measures the ability to detect AI-generated content in real-world hybrid reviews. The synthetic construction may not reflect how reviewers actually mix AI and human content."
    333       },
    334       "scaffold_confound_addressed": {
    335         "applies": false,
    336         "answer": false,
    337         "justification": "No agentic scaffolding is involved. The method is a pipeline without scaffold comparisons."
    338       }
    339     },
    340     "data_leakage": {
    341       "temporal_leakage_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "NeurIPS 2022 reviews are publicly available and predate GPT-4o's training. GPT-4o may have seen these exact reviews and papers during training, which would affect completion quality. This is not discussed."
    345       },
    346       "feature_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of whether the evaluation setup leaks information. The paper content is provided to GPT-4o during completion, which is by design, but no analysis of whether this introduces systematic biases."
    350       },
    351       "non_independence_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "The 70/10/20 train/validation/test split is described, but no discussion of whether review points from the same paper or reviewer might appear across splits, violating independence."
    355       },
    356       "leakage_detection_method": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or temporal splits are applied."
    360       }
    361     }
    362   },
    363   "scan_version": 3,
    364   "active_modules": ["experimental_rigor", "data_leakage"],
    365   "claims": [
    366     {
    367       "claim": "MixRevDetect achieves an F1 score of 88.86%, representing a 27.5% improvement over the best-performing baseline (FAST-DETECT GPT at 69.68%).",
    368       "evidence": "Table 1 shows precision 0.8799, recall 0.8982, F1 0.8886 compared to four baselines. Section 4.3 details percentage improvements.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "MixRevDetect is robust to paraphrasing, showing only a 6.34% drop in F1 versus 37-48% drops for three of the four baselines (RADAR drops a comparable 6.92%).",
    373       "evidence": "Table 2 shows F1 drops from paraphrasing: DEEP-FAKE 47.77%, FAST-DETECT GPT 38.17%, LLMDET 37.00%, RADAR 6.92%, MixRevDetect 6.34%. Section 4.5.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "A tail pruning ratio of 0.7 yields the highest F1 score at 0.884.",
    378       "evidence": "Figure 2 shows F1 scores across tail pruning ratios from 0.1 to 0.9. Section 4.4 describes the results.",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "BERTScore effectively differentiates AI-generated and human-written review completions because AI completions show higher similarity to pruned tails.",
    383       "evidence": "Section 4.6 and Appendix B provide qualitative examples showing high BERTScore for AI completions and lower scores for human completions.",
    384       "supported": "weak"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "MixRevDetect proposes a tail-pruning and completion approach to detect AI-generated points within hybrid peer reviews, achieving 88.86% F1 on a constructed dataset of NeurIPS 2022 reviews. The method significantly outperforms existing AI text detectors (RADAR, LLMDET, DEEP-FAKE, FAST-DETECT GPT) and shows strong robustness to paraphrasing with only a 6.34% F1 drop. The core insight is that GPT-4o completions of pruned AI-generated text show higher BERTScore similarity to the original text than completions of pruned human-written text.",
    389   "red_flags": [
    390     {
    391       "flag": "No error bars or uncertainty quantification",
    392       "detail": "All results in Tables 1 and 2 are single-point estimates. No standard deviations, confidence intervals, or multiple-run results are reported despite the classifier training being stochastic."
    393     },
    394     {
    395       "flag": "No statistical significance tests",
    396       "detail": "The paper claims 'significantly outperforming' baselines but never performs statistical tests. All comparisons are raw numerical differences."
    397     },
    398     {
    399       "flag": "Possible test-set selection of tail pruning ratio",
    400       "detail": "The optimal tail pruning ratio of 0.7 is identified in Figure 2, but it is unclear whether this was selected using the validation set or the test set. If selected on the test set, reported results are inflated."
    401     },
    402     {
    403       "flag": "Contamination risk from GPT-4o training data",
    404       "detail": "NeurIPS 2022 reviews are publicly available on OpenReview and were likely in GPT-4o's training data. If GPT-4o has memorized these reviews, its completions of pruned human-written points could reproduce the original tails, artificially raising their BERTScore similarity and distorting the human/AI separation the classifier relies on. The paper never measures the size or direction of this effect."
    405     },
    406     {
    407       "flag": "Unfair cost comparison with baselines",
    408       "detail": "MixRevDetect requires a GPT-4o API call for every review point, which is orders of magnitude more expensive than zero-shot or statistical baselines. This cost disparity is never discussed, making the comparison misleading for practical deployment."
    409     },
    410     {
    411       "flag": "Synthetic evaluation may not reflect real-world hybrid reviews",
    412       "detail": "The dataset constructs hybrid reviews by combining separate human and AI-generated reviews. In practice, reviewers may interleave AI suggestions with their own writing in more complex ways, and the evaluation does not test this realistic scenario."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Monitoring AI-modified content at scale: A case study on the impact of ChatGPT on AI conference peer reviews",
    418       "authors": ["Weixin Liang", "Zachary Izzo", "Yaohui Zhang"],
    419       "year": 2024,
    420       "relevance": "Estimates 6.5-16.9% of peer review text at AI conferences was influenced by LLMs, directly motivating the need for AI review detection."
    421     },
    422     {
    423       "title": "Quis custodiet ipsos custodes? Who will watch the watchmen? On detecting AI-generated peer-reviews",
    424       "authors": ["Sandeep Kumar", "Mohit Sahu", "Vardhan Gacche", "Tirthankar Ghosal", "Asif Ekbal"],
    425       "year": 2024,
    426       "arxiv_id": "2410.09770",
    427       "relevance": "Prior work by same group on detecting fully AI-generated peer reviews, which this paper extends to hybrid review detection."
    428     },
    429     {
    430       "title": "DetectGPT: Zero-shot machine-generated text detection using negative curvature in probability space",
    431       "authors": ["Eric Mitchell"],
    432       "year": 2023,
    433       "relevance": "Foundational zero-shot AI text detection method using log probability curvature, baseline approach for detecting AI-generated content."
    434     },
    435     {
    436       "title": "Fast-DetectGPT: Efficient zero-shot detection of machine-generated text via conditional probability curvature",
    437       "authors": ["Guangsheng Bao", "Yanbin Zhao", "Zhiyang Teng"],
    438       "year": 2023,
    439       "arxiv_id": "2310.05130",
    440       "relevance": "Best-performing baseline in this study; efficient zero-shot AI text detection method used for comparison."
    441     },
    442     {
    443       "title": "RADAR: Robust AI-text detection via adversarial learning",
    444       "authors": ["Xiaomeng Hu", "Pin-Yu Chen", "Tsung-Yi Ho"],
    445       "year": 2023,
    446       "relevance": "Adversarial training-based AI text detector, one of the main baselines evaluated for detecting AI-generated peer review content."
    447     },
    448     {
    449       "title": "LLMDet: A third party large language models generated text detection tool",
    450       "authors": ["Kangxi Wu", "Liang Pang", "Huawei Shen"],
    451       "year": 2023,
    452       "relevance": "Perplexity-based AI text detection tool, evaluated as a baseline for detecting AI-generated review points."
    453     },
    454     {
    455       "title": "Deepfake text detection in the wild",
    456       "authors": ["Yafu Li", "Qintong Li", "Leyang Cui"],
    457       "year": 2023,
    458       "arxiv_id": "2305.13242",
    459       "relevance": "AI-generated text detection framework with progressive difficulty settings, evaluated as a baseline in this study."
    460     },
    461     {
    462       "title": "GPT-Sentinel: A robust approach to AI-generated text detection",
    463       "authors": ["Ji Chen"],
    464       "year": 2023,
    465       "relevance": "Trained RoBERTa and T5 classifiers for AI-generated text detection, related approach to detecting LLM outputs."
    466     },
    467     {
    468       "title": "BERTScore: Evaluating text generation with BERT",
    469       "authors": ["Tianyi Zhang", "Varsha Kishore", "Felix Wu"],
    470       "year": 2019,
    471       "arxiv_id": "1904.09675",
    472       "relevance": "Core similarity metric used in MixRevDetect to compare AI completions with original review text for detection."
    473     },
    474     {
    475       "title": "LLaMA: Open and efficient foundation language models",
    476       "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"],
    477       "year": 2023,
    478       "arxiv_id": "2302.13971",
    479       "relevance": "LLaMA 70B used for paraphrasing experiments to test robustness of AI text detection against evasion."
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 2,
    485       "justification": "Could be integrated into editorial workflows for screening peer reviews, though the GPT-4o API dependency limits immediate practicality."
    486     },
    487     "surprise_contrarian": {
    488       "score": 1,
    489       "justification": "Novel framing of hybrid (mixed human+AI) review detection rather than whole-review classification, but the core approach (completion similarity) is intuitive rather than surprising."
    490     },
    491     "fear_safety": {
    492       "score": 1,
    493       "justification": "Raises concerns about AI-assisted peer review integrity but does not present alarming new attack vectors or existential concerns."
    494     },
    495     "drama_conflict": {
    496       "score": 1,
    497       "justification": "Touches on the controversial topic of AI use in peer review, citing Springer retractions, but presents itself as a constructive tool rather than exposing wrongdoing."
    498     },
    499     "demo_ability": {
    500       "score": 2,
    501       "justification": "Code and dataset are released publicly on GitHub, allowing others to run the system, though it requires GPT-4o API access."
    502     },
    503     "brand_recognition": {
    504       "score": 0,
    505       "justification": "From Indian academic institutions (IIT Patna, IIT Jodhpur) and Oak Ridge National Lab, not major AI labs or widely recognized product brands."
    506     }
    507   }
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs