scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27565B)
      1 {
      2   "paper": {
      3     "title": "AI-Driven Scholarly Peer Review via Persistent Workflow Prompting, Meta-Prompting, and Meta-Reasoning",
      4     "authors": ["Evgeny Markhasin"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2505.03332",
      8     "doi": "10.48550/arXiv.2505.03332"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The full PeerReviewPrompt.md and other prompt files are provided via an OSF repository (https://osf.io/nq68y/) and as PDF attachments. While there is no traditional code, the prompt artifacts constitute the core reproducible artifact of this work."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The test paper used for demonstrations is provided via a view-only OSF link (Supporting Information Section B), and shared AI chat logs are linked throughout the paper (e.g., references [84], [87], [88], [90], [91], [101], [102], [103])."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper states the primary target model is 'Gemini Advanced 2.5 Pro' and mentions 'ChatGPT Plus o1 & o3' and 'SuperGrok Grok 3 Think', but no version snapshots, API versions, or dates of access beyond 'as of Apr 2025' are provided. No environment specifications (software versions, dependencies) are given since the approach uses standard chat interfaces."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Section C of the Supporting Information provides a 'Demo Usage Protocol for PeerReviewPrompt' with step-by-step instructions: submit the prompt as Message 1, then submit a specific analysis request with the manuscript attached as Message 2, plus sample prompts."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "The paper presents only qualitative demonstrations with no quantitative results or metrics, so confidence intervals are structurally inapplicable."
     38       },
     39       "significance_tests": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "No comparative quantitative claims are made that would require significance tests. The paper is a qualitative proof-of-concept."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No quantitative experiments or measured effects are reported. The evaluation is entirely qualitative."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper uses a single test case (one chemistry paper) for development and demonstration. Section 3.4 acknowledges this as a limitation ('Single Test Case') but provides no justification for why a single paper is sufficient, nor any power analysis. The limitation is merely acknowledged, not justified."
     53       },
     54       "variance_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No quantitative experiments with multiple runs are reported. The paper mentions that 'specific details and phrasing of the analyses varied between models and even between different runs' (Section 3) but does not quantify this variance."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper mentions 'a model might identify potential flaws in an improvised experimental setup but still conclude the experiment was successful based solely on the manuscript's claim' as an example of naive analysis behavior, and mentions a 'Gemini - Baseline' demonstration in the appendix. However, no systematic comparison against baseline prompting techniques is presented. Section 3.4 explicitly acknowledges: 'No quantitative benchmark was constructed for systematic evaluation against ground truth or objective metrics' and the evaluation 'lacks comparison to defined baselines.'"
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No baselines are systematically compared against, so contemporaneity cannot be assessed. The paper cites many related AI peer review systems but does not compare PWP against any of them."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The PWP prompt has multiple components (persona engineering, workflow library, hierarchical structure, bias mitigation), but no ablation study tests the contribution of individual components to the overall performance."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No quantitative evaluation metrics are used at all. The evaluation is purely qualitative and observational, as acknowledged in Section 3.4: 'No quantitative benchmark was constructed for systematic evaluation against ground truth or objective metrics.'"
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper's evaluation consists only of the author's own informal assessment of the LLM outputs. Section 3.4 states: 'Performance-related statements are based solely on the author's conventional (human-driven) evaluation of the generated LLM analyses, which introduces subjectivity.' This is not a structured human evaluation with defined criteria or multiple evaluators."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The single test paper was used for both developing and evaluating the prompt. Section 3.4 states: 'The PeerReviewPrompt was developed and primarily tested using a single publication.' There is no held-out test set."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No per-category or per-task breakdown of performance is provided. Results are discussed as qualitative anecdotes from demonstration analyses."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 3.4 discusses several limitations and failure modes: LLM hallucinations, inconsistent context recall, platform compatibility issues (Qwen rejecting the prompt due to input limits), and the general unreliability of multimodal analysis in some models. Section 3.1 notes that 'verifying the extent of such capabilities in ChatGPT models can be challenging.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that early simpler versions of the prompt exhibited positive input bias (Section 3.2), that ChatGPT models did not yield convincing multimodal analysis (Section 3.1), and that the Qwen chat interface rejected the prompt entirely due to input limits (Section 3.4). These are things that did not work."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The abstract claims the PWP prompt 'equips the LLM with persistent workflows triggered by subsequent queries, guiding modern reasoning LLMs through systematic, multimodal evaluations' and that 'Demonstrations show the PWP-guided LLM identifying major methodological flaws.' These claims are supported only by qualitative demonstrations on a single test case chosen specifically for its known flaws, not by systematic evaluation. The abstract presents the results more strongly than the evidence warrants."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes causal claims such as 'the negative bias conditioning implemented in the current PeerReviewPrompt successfully and reliably suppressed the observed positive input bias' (Section 3.2) and that PWP 'effectively directs LLM reasoning towards critical evaluation points' (Section 3). These causal claims are based on informal observation of a single test case without controlled experiments to isolate the effect of the prompt engineering techniques."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 1.3 explicitly states 'Our investigation is deliberately constrained to using frontier LLMs accessible via standard chat interfaces.' Section 3.4 acknowledges limitations including single test case, limited prompt scope, and qualitative non-benchmarked evaluation. The paper consistently frames itself as a 'proof-of-concept' throughout."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 3.3 discusses alternative explanations for why PWP-guided LLMs can identify flaws, suggesting it may arise 'not from deep reasoning in the human sense, but from a sophisticated comparison of manuscript-specific details against learned representations of scientific norms and consensus.' This is a substantive discussion of an alternative mechanism."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper uses 'Gemini Advanced 2.5 Pro', 'ChatGPT Plus o1', 'ChatGPT Plus o3', and 'SuperGrok Grok 3 Think'. These are marketing names without specific version snapshots or API version identifiers. The only temporal qualifier is 'as of Apr 2025' in the feature highlights appendix."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The full PeerReviewPrompt text is provided in Appendix A (approximately 10 pages of detailed prompt text), and is also available as a Markdown file via the OSF repository. Additional meta-prompts are provided as well. This is exemplary prompt transparency."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No temperature, top-p, max tokens, or other LLM hyperparameters are reported anywhere in the paper. The paper uses standard chat interfaces, but these interfaces still have default settings that affect output."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The approach uses no agentic scaffolding — it operates entirely within standard LLM chat interfaces with no API calls, tool use, or external systems. The prompt itself is the entire system."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 2.2.1 states: 'the input material used was the manuscript file combined with its corresponding supporting information, taken exactly as provided by the publisher without structural modification or reformatting.' This documents that no preprocessing was applied."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.4 is titled 'Study Limitations' and contains five detailed, numbered limitation points covering single test case, limited prompt scope, qualitative evaluation, prompt size constraints, and uncharacterized LLM reliability."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 3.4 discusses specific threats: (1) the single test case was chosen for its known flaws, which limits generalizability to sound papers; (2) the evaluation is 'based solely on the author's conventional (human-driven) evaluation of the generated LLM analyses, which introduces subjectivity'; (3) the prompt exceeds input limits on some platforms (specific example: Qwen's 10,000-character limit). These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 1.3 explicitly states scope boundaries: 'deliberately constrained to using frontier LLMs accessible via standard chat interfaces, without relying on APIs, coding, or specialized tools.' Section 3.4 identifies what was NOT tested: 'testing on a broader range of manuscripts remains future work', evaluation on methodologically sound papers, and quantitative benchmarking."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The shared AI chat logs (linked throughout as references [84], [87], [88], [90], [91], [95], [96], [101], [102], [103]) constitute the raw data of this study, and they are publicly accessible via shared links to Gemini, ChatGPT, and Grok conversations."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The paper describes the iterative development process in detail across Sections 2.1-2.3, including which meta-prompting techniques were used, which models were involved, and the sequence of refinement. The demonstration protocol is specified in Supporting Information Section C."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were involved. The study uses LLMs and a single test paper selected by the author."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline from prompt development (meta-prompting) through demonstration (submitting prompt + paper to LLM) to evaluation (author assessment of outputs) is described in the methodology and results sections. The process is straightforward: submit prompt, submit paper, receive analysis."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source or acknowledgment of funding is disclosed. The Acknowledgments section mentions only the use of generative AI in writing. For a university-affiliated author, the absence of any funding disclosure (even 'no funding') is notable."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The author's affiliation with Lobachevsky State University of Nizhny Novgorod is clearly stated, along with ORCID and LinkedIn links. The paper evaluates no specific commercial product, so there is no product-affiliation conflict."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence cannot be assessed. The paper does not state whether it is unfunded."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or financial interests declaration is present anywhere in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate model capability on a benchmark. It tests whether a prompt engineering technique can guide LLMs to produce useful peer review output, not whether models have memorized specific content."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No benchmark evaluation is conducted. The paper tests a prompt engineering methodology, not model knowledge on a standard benchmark."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark is used. The paper evaluates LLM-generated analysis quality qualitatively on a single test paper."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper does not report inference cost, token consumption, or wall-clock time for running the PWP prompt. Given that the prompt exceeds 30 kB of text and is used with subscription-based services (Gemini Advanced, ChatGPT Plus), practical cost information would be relevant but is absent."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. The paper uses commercial chat interfaces (Gemini Advanced, ChatGPT Plus, SuperGrok) but does not quantify the total resources consumed."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "PWP-guided LLMs can identify major methodological flaws in scientific manuscripts while mitigating LLM input bias.",
    287       "evidence": "Section 3.1 describes demonstrations where multiple frontier LLMs guided by the PeerReviewPrompt 'relatively reliably identified major methodological flaws within the test paper and converged on the conclusion that its central claim was highly dubious.' Section 3.2 discusses bias mitigation. Evidence is based on qualitative analysis of a single test paper with known flaws.",
    288       "supported": "weak"
    289     },
    290     {
    291       "claim": "The PWP prompt successfully counteracts default positive input biases in LLMs, enabling more critical and objective analysis.",
    292       "evidence": "Section 3.2 states 'the negative bias conditioning implemented in the current PeerReviewPrompt successfully and reliably suppressed the observed positive input bias when applied to the test paper using the target models.' However, this is demonstrated only qualitatively on one paper, with no systematic measurement of bias reduction.",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "PWP-guided LLMs identified issues initially overlooked during manual human review, including the glass-wool-packed condenser criticism and boiling point inconsistencies.",
    297       "evidence": "Section 3.1 describes these findings, noting 'the LLM analyses highlighted at least two potentially significant issues not initially noted by the author during manual review.' The author acknowledges that these issues fall outside the author's own direct expertise and that preliminary external checks suggest validity.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The consistency of flaw identification across different model architectures suggests PWP effectively directs LLM reasoning toward critical evaluation points.",
    302       "evidence": "Section 3 states 'all tested models, when guided by the PeerReviewPrompt, relatively reliably identified major methodological flaws.' However, no quantitative consistency metrics are reported, and the test paper was selected specifically for having known flaws.",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "PWP is a broadly applicable prompt engineering methodology that can be adapted to complex analytical tasks beyond chemistry peer review.",
    307       "evidence": "The paper argues this throughout (Sections 1.2, 3.5, and 4) but provides no evidence beyond the single chemistry demonstration. This claim is aspirational rather than empirically supported.",
    308       "supported": "unsupported"
    309     }
    310   ],
    311   "methodology_tags": ["case-study", "qualitative"],
    312   "key_findings": "The paper introduces Persistent Workflow Prompting (PWP), a prompt engineering methodology that uses a large, hierarchically structured prompt as a persistent workflow library for LLM chat interfaces. Applied to experimental chemistry peer review, the approach demonstrated that multiple frontier LLMs (Gemini 2.5 Pro, ChatGPT o1/o3, Grok 3) could identify known methodological flaws in a test paper and that persona engineering with negative bias conditioning could counteract LLMs' default tendency toward positive input bias. However, all findings are based on qualitative analysis of a single deliberately chosen flawed test case, with no quantitative evaluation, no baselines, and no systematic benchmarking.",
    313   "red_flags": [
    314     {
    315       "flag": "Single test case used for both development and evaluation",
    316       "detail": "The PeerReviewPrompt was developed and evaluated using the same single paper (Prasad et al. 2011), which was deliberately chosen for its known methodological flaws. This means the prompt was essentially optimized to find flaws in a paper known to have them, providing no evidence of generalizability to methodologically sound papers, papers with subtle flaws, or papers in different chemistry subdisciplines."
    317     },
    318     {
    319       "flag": "No quantitative evaluation or benchmarking",
    320       "detail": "Section 3.4 explicitly acknowledges: 'No quantitative benchmark was constructed for systematic evaluation against ground truth or objective metrics. Performance-related statements are based solely on the author's conventional (human-driven) evaluation of the generated LLM analyses, which introduces subjectivity and lacks comparison to defined baselines.' Despite the 105-reference bibliography, the paper's empirical contribution rests on anecdotal observations."
    321     },
    322     {
    323       "flag": "Circular evaluation",
    324       "detail": "The author selected a paper with known flaws, designed a prompt to find those types of flaws, then evaluated success by whether the LLM found those flaws. The author's own assessment is the only evaluation criterion, creating a circular validation loop. No independent evaluator was involved."
    325     },
    326     {
    327       "flag": "Broad generalizability claims from single proof-of-concept",
    328       "detail": "The paper claims PWP is 'intended for broader applicability across various complex analytical tasks' and discusses adaptation to 'physics, biology, materials science, computer science' (Section 3.5). These claims have zero empirical support beyond one chemistry test case."
    329     },
    330     {
    331       "flag": "No comparison to simpler prompting approaches",
    332       "detail": "The paper does not systematically compare the 30+ kB PWP prompt against simpler alternatives (e.g., a one-paragraph instruction to critically review the paper, chain-of-thought prompting, or existing peer review prompts). Without such comparisons, the marginal value of the elaborate prompt architecture is unknown."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "AI-Driven Review Systems: Evaluating LLMs in Scalable and Bias-Aware Academic Reviews",
    338       "authors": ["K. Tyser", "B. Segev", "G. Longhitano"],
    339       "year": 2024,
    340       "arxiv_id": "2408.10365",
    341       "relevance": "Directly relevant as a system for AI-driven academic peer review, addressing scalability and bias in LLM reviews."
    342     },
    343     {
    344       "title": "Are We There Yet? Revealing the Risks of Utilizing Large Language Models in Scholarly Peer Review",
    345       "authors": ["R. Ye", "X. Pang", "J. Chai"],
    346       "year": 2024,
    347       "arxiv_id": "2412.01708",
    348       "relevance": "Evaluates risks and limitations of using LLMs for peer review, directly relevant to assessing AI-assisted review quality."
    349     },
    350     {
    351       "title": "Can large language models provide useful feedback on research papers? A large-scale empirical analysis",
    352       "authors": ["W. Liang", "Y. Zhang", "H. Cao"],
    353       "year": 2023,
    354       "arxiv_id": "2310.01783",
    355       "relevance": "Large-scale empirical study of LLM feedback on research papers, relevant to evaluating LLM review capabilities."
    356     },
    357     {
    358       "title": "ReviewerGPT? An Exploratory Study on Using Large Language Models for Paper Reviewing",
    359       "authors": ["R. Liu", "N.B. Shah"],
    360       "year": 2023,
    361       "arxiv_id": "2306.00622",
    362       "relevance": "Exploratory study on LLMs for paper reviewing, directly relevant to AI-assisted peer review quality assessment."
    363     },
    364     {
    365       "title": "Meta-Prompting: Enhancing Language Models with Task-Agnostic Scaffolding",
    366       "authors": ["M. Suzgun", "A.T. Kalai"],
    367       "year": 2024,
    368       "arxiv_id": "2401.12954",
    369       "relevance": "Foundational work on meta-prompting techniques that this paper builds upon for prompt engineering methodology."
    370     },
    371     {
    372       "title": "The Prompt Report: A Systematic Survey of Prompt Engineering Techniques",
    373       "authors": ["S. Schulhoff", "M. Ilie", "N. Balepur"],
    374       "year": 2025,
    375       "arxiv_id": "2406.06608",
    376       "relevance": "Comprehensive survey of prompt engineering techniques, providing methodological context for the PWP approach."
    377     },
    378     {
    379       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    380       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    381       "year": 2023,
    382       "arxiv_id": "2201.11903",
    383       "relevance": "Foundational prompting technique that PWP builds upon for guiding LLM reasoning through structured steps."
    384     },
    385     {
    386       "title": "DeepReview: Improving LLM-based Paper Review with Human-like Deep Thinking Process",
    387       "authors": ["M. Zhu", "Y. Weng", "L. Yang"],
    388       "year": 2025,
    389       "arxiv_id": "2503.08569",
    390       "relevance": "Recent work on improving LLM-based paper review through structured reasoning, directly comparable to PWP approach."
    391     },
    392     {
    393       "title": "A Peek into Token Bias: Large Language Models Are Not Yet Genuine Reasoners",
    394       "authors": ["B. Jiang", "Y. Xie", "Z. Hao"],
    395       "year": 2024,
    396       "arxiv_id": "2406.11050",
    397       "relevance": "Relevant to understanding LLM reasoning biases that PWP aims to mitigate through prompt engineering."
    398     },
    399     {
    400       "title": "Automatically Evaluating the Paper Reviewing Capability of Large Language Models",
    401       "authors": ["H. Shin", "J. Tang", "Y. Lee"],
    402       "year": 2025,
    403       "arxiv_id": "2502.17086",
    404       "relevance": "Provides methods for evaluating LLM reviewing capability, relevant to benchmarking AI peer review systems."
    405     },
    406     {
    407       "title": "CycleResearcher: Improving Automated Research via Automated Review",
    408       "authors": ["Y. Weng", "M. Zhu", "G. Bao"],
    409       "year": 2025,
    410       "arxiv_id": "2411.00816",
    411       "relevance": "Develops automated review systems with training datasets, relevant to AI-driven scholarly review infrastructure."
    412     }
    413   ]
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs