ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24926B)


      1 {
      2   "paper": {
      3     "title": "AI as Cognitive Amplifier: Rethinking Human Judgment in the Age of Generative AI",
      4     "authors": ["Tao An"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.10961"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The author lists a GitHub URL (https://github.com/tao-hpu) on the first page as a personal profile link, but no repository or code specific to this paper is released. There is no analysis code, data processing scripts, or supplementary materials."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper references field observations of 580 professionals but releases no data — no observation logs, training session records, performance measurements, or anonymized datasets. All claims rest on unreleased and unverifiable field observations."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a position paper with no computational experiments. There is no software environment to specify."
     25       },
     26       "reproduction_instructions": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a position paper with no experiments to reproduce. The field observations are not described with enough specificity to constitute a reproducible protocol."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a position paper. While it presents quantitative-looking figures (e.g., '+45% performance improvement'), these are described as coming from 'systematic field observations' without any statistical analysis framework. The paper does not run formal experiments, so statistical methodology criteria do not structurally apply."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No formal experiments or statistical comparisons are conducted. The paper is a position paper synthesizing observations and literature."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No formal experiments are conducted. The percentages in Figure 3 (e.g., +45%, +35%) are presented as field observations without statistical methodology."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No formal study is conducted. The paper mentions observing 580 professionals but does not treat this as a formal sample requiring justification."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No formal experiments with multiple runs are conducted. This is a position paper."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a position paper proposing a theoretical framework. It does not conduct experiments that would require baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No experimental evaluation is conducted; baselines are structurally inapplicable."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a position paper with no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No formal evaluation is conducted. The paper proposes a conceptual framework, not a measured system."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "The paper does not present a system whose outputs need to be evaluated. It is a position paper proposing a framework."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No datasets or test sets are used. This is a position paper."
     89       },
     90       "per_category_breakdown": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No formal evaluation is conducted that would benefit from per-category breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses how novice users fail with AI tools in detail — Section 3.4 describes the 'sycophancy problem' as a failure mode, and Sections 3.1-3.2 describe how novice users produce low-quality outputs by accepting AI responses uncritically. The paper also discusses how AI 'amplifies misconceptions' for users lacking domain expertise."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper cites the METR RCT finding [5] that AI tool usage 'actually increased completion time despite developers estimating they were faster' — a counter-intuitive negative result. The paper also reports that teaching prompt engineering to domain novices 'yields minimal improvement in output quality.'"
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims the paper 'demonstrates that domain knowledge, quality judgment, and iterative refinement capabilities create substantial performance gaps between users' through 'analysis of empirical studies and systematic observations.' However, the paper provides no quantified, verifiable evidence for these performance gaps from its own observations. The percentages in Figure 3 (e.g., +45%) appear without any methodology for how they were measured. The 'demonstration' rests on anecdotes and unverifiable field observations rather than rigorous evidence."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims — e.g., 'domain knowledge, quality judgment, and iterative refinement capabilities CREATE substantial performance gaps,' and the framework positions domain expertise as the CAUSE of differential AI effectiveness. Yet the paper explicitly acknowledges in its limitations that 'field observations... lack the controlled experimental design needed for causal claims.' The observational design cannot establish causation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The limitations section (5.2) explicitly bounds generalizations: 'The observations derive primarily from corporate training in knowledge work domains (writing, software development, data analysis), which may not generalize to all professional contexts or educational settings.' The paper also acknowledges it 'focuses primarily on individual-level expertise and judgment, giving less attention to organizational and systemic factors.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed expert-novice performance gap. For example, it does not consider that the gap might be due to general intelligence differences, motivation differences, familiarity with digital tools, or selection effects in who attends training. The paper assumes domain expertise is the primary driver without ruling out confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "The paper does not conduct experiments using specific AI models. It is a position paper that references AI tools generically. The acknowledgments mention 'Claude, ChatGPT' were used for literature review and drafting but these are not part of an experimental setup."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No experiments involving prompting are conducted. The paper discusses prompting conceptually but does not use prompts as part of a research methodology."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments are conducted that involve model hyperparameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a position paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper claims 'systematic observations from professional training contexts' involving 580 professionals across multiple industries (Figure 7), but provides no documentation of how observations were collected, recorded, categorized, or analyzed. There is no description of data preprocessing or analysis methodology."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.2 is titled 'Limitations' and contains two substantive paragraphs discussing the limitations of the work."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section identifies specific threats: (1) field observations 'lack the controlled experimental design needed for causal claims,' (2) observations 'derive primarily from corporate training in knowledge work domains (writing, software development, data analysis), which may not generalize to all professional contexts,' and (3) the paper 'focuses primarily on individual-level expertise and judgment, giving less attention to organizational and systemic factors.' These are specific to this study, not generic disclaimers."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.2 explicitly states scope boundaries: the observations are limited to corporate training in knowledge work domains, may not generalize to 'all professional contexts or educational settings,' and the analysis 'focuses primarily on individual-level expertise and judgment.' The paper also calls for future 'controlled studies' to validate its framework."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is available. The paper claims observations from 580 professionals but provides no data, observation logs, or any verifiable evidence. The percentages in Figure 3 cannot be independently verified."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The data collection procedure is extremely vague. The paper mentions 'systematic observations from professional training contexts' and Figure 7 shows a timeline, but there is no description of what was observed, how observations were recorded, what instruments were used, or what 'performance improvement' means or how it was measured."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper describes participants as professionals in 'corporate training programs' across tech, finance, healthcare, and education sectors (Figure 3, Figure 7), but provides no information about how participants were recruited, who selected them, whether participation was voluntary, or what selection biases might exist."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "There is no documentation of how field observations were transformed into the quantitative claims in the paper. Figure 3 presents specific percentages (+45%, +35%, etc.) but there is no description of how raw observations became these numbers."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed. The acknowledgments section thanks training participants and mentions AI assistance for drafting but does not address funding. There is no statement about whether the work was funded or unfunded."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation (Hawaii Pacific University) is clearly listed on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed. The author conducts corporate AI training programs — a commercial activity that benefits from the narrative that AI amplifies expertise (making training more valuable). This potential conflict is not acknowledged."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present. The author runs corporate AI training programs (described throughout the paper), which represents a financial interest in the conclusion that domain expertise determines AI effectiveness — a finding that directly supports demand for the author's training services."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It is a position paper."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No benchmark evaluation is conducted. Contamination is structurally inapplicable."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is conducted. Contamination is structurally inapplicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper describes observations of 580 professionals in training programs, which constitutes human subjects research. There is no mention of pre-registration."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper collects and reports observations from human participants in training sessions but does not mention IRB or ethics board approval."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Figure 3 reports participant demographics by role category: Technical Experts (120), Middle Management (180), General Employees (150), Students (80), Executives (50). Figure 7 notes industries covered: Tech, Finance, Healthcare, Education. However, no individual-level demographics (gender, age, years of experience) are reported."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria are stated for which training participants' observations were included in the analysis. The paper does not describe who was eligible for training or how participants were selected."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is an observational study of training sessions, not an experimental study with treatment/control conditions. Randomization is not applicable."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "This is an observational study of training sessions, not an experimental study. Blinding is not applicable."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "The paper reports a growing participant total (50 → 150 → 300 → 450 → 580) in Figure 7 but does not discuss whether any participants dropped out of training sessions, were excluded from observations, or otherwise failed to complete the observed activities."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a position paper. It does not propose a method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a position paper. No computational experiments are conducted."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AI tools function as cognitive amplifiers that magnify existing human capabilities rather than substituting for them.",
    286       "evidence": "The paper synthesizes Engelbart's intelligence amplification framework (Section 2.1), augmented cognition research [2], and the author's field observations from training 580 professionals (Figures 3, 7). No controlled experiment or quantified evidence from the author's own observations is provided.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "Domain expertise, not technical AI proficiency, is the primary determinant of AI tool effectiveness.",
    291       "evidence": "The paper cites Sun et al. [4] showing expert-novice differences in AI-assisted conversational agent design, and the METR RCT [5] showing AI increased completion time for experienced developers. The author's own evidence is anecdotal field observations. Figure 3 shows performance improvement percentages by role but provides no methodology for how these were measured.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Technical experts show the highest average performance improvement (+45%) when using AI tools, while general employees show the lowest (+20%).",
    296       "evidence": "Figure 3 presents these numbers attributed to 'data from systematic observations of 580 professionals.' No methodology is described for how 'performance improvement' was defined, measured, or calculated. The data underlying these claims is not available.",
    297       "supported": "unsupported"
    298     },
    299     {
    300       "claim": "Teaching prompt engineering to domain novices yields minimal improvement in output quality.",
    301       "evidence": "Section 4.1, Recommendation 1 states this based on 'field experience' across 'multiple organizations.' No controlled comparison is provided, and no data is shown to support this claim.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "AI sycophancy creates a dangerous feedback cycle that amplifies misconceptions for novice users.",
    306       "evidence": "Section 3.4 cites research on LLM sycophancy [12, 13, 14] showing near-100% compliance with flawed requests and overweighting of contradictory feedback. The application to the amplification framework is theoretical/observational rather than empirically tested by the author.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["theoretical", "qualitative"],
    311   "key_findings": "This position paper argues that AI tools function as 'cognitive amplifiers' that magnify existing human capabilities, with output quality determined primarily by domain expertise rather than technical AI proficiency. The author proposes a three-layer model of human contribution (problem definition, quality evaluation, iterative refinement) and a three-level model of AI engagement (passive acceptance, iterative collaboration, cognitive direction). Based on field observations of 580 professionals in corporate training and synthesis of existing literature, the paper claims expert users achieve substantially better results than novices with the same AI tools, and that this gap widens with task complexity. The paper identifies AI sycophancy as a mechanism that amplifies both expertise and misconceptions.",
    312   "red_flags": [
    313     {
    314       "flag": "Unverifiable field observations presented as evidence",
    315       "detail": "The paper's core empirical evidence comes from the author's observations of 580 professionals in corporate training sessions. No raw data, observation protocols, measurement instruments, or analysis methodology are described or released. Figure 3 presents specific percentages (+45%, +35%, etc.) that cannot be independently verified or replicated."
    316     },
    317     {
    318       "flag": "Undisclosed conflict of interest",
    319       "detail": "The author conducts corporate AI training programs — a commercial activity that directly benefits from the paper's central conclusion that domain expertise (which training develops) determines AI effectiveness. This financial interest in the outcome is not disclosed or acknowledged."
    320     },
    321     {
    322       "flag": "No ethics review for human subjects observations",
    323       "detail": "The paper reports observations from 580 professionals in training sessions without mentioning IRB or ethics board approval. Even observational research involving human participants typically requires ethics review."
    324     },
    325     {
    326       "flag": "Causal claims from observational data",
    327       "detail": "The paper makes causal claims (domain expertise 'creates' performance gaps, AI 'amplifies' expertise) while acknowledging in the limitations that observations 'lack the controlled experimental design needed for causal claims.' Yet the framework and recommendations are presented as if the causal mechanism is established."
    328     },
    329     {
    330       "flag": "Weak reference quality",
    331       "detail": "Several references are to blog posts, Wikipedia articles, and community websites (CircleCI blog, MLOps Community, Wikipedia) rather than peer-reviewed research. Reference [2] is a Wikipedia article on 'Augmented Cognition' used to support a key theoretical claim about mathematical formalization."
    332     },
    333     {
    334       "flag": "Quantitative figures without methodology",
    335       "detail": "Figures 3 and 5 present specific quantitative values (performance percentages, capability levels) labeled as from 'systematic field observations' and 'relative capabilities based on systematic field observations,' but no measurement methodology, scales, or operationalization of these quantities is described anywhere in the paper."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Comparing Experts and Novices for AI Data Work",
    341       "authors": ["Lining Sun", "Yiren Liu", "Geena Joseph", "Zeyu Yu", "Haiyi Zhu", "Steven P Dow"],
    342       "year": 2022,
    343       "relevance": "Empirical study on expert-novice performance differences in AI-assisted tasks, directly relevant to understanding human factors in AI tool effectiveness."
    344     },
    345     {
    346       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    347       "authors": ["METR"],
    348       "year": 2025,
    349       "relevance": "Randomized controlled trial finding AI tools increased completion time for experienced developers, a key counter-intuitive result about AI productivity claims."
    350     },
    351     {
    352       "title": "Sycophancy in Large Language Models: Causes and Mitigations",
    353       "year": 2024,
    354       "arxiv_id": "2411.15287",
    355       "relevance": "Research on LLM sycophancy bias relevant to understanding AI safety and reliability in human-AI interaction."
    356     },
    357     {
    358       "title": "When Helpfulness Backfires: LLMs and the Risk of False Medical Information Due to Sycophantic Behavior",
    359       "year": 2025,
    360       "relevance": "npj Digital Medicine study on LLM compliance with flawed requests, relevant to AI safety and reliability evaluation."
    361     },
    362     {
    363       "title": "How Overconfidence in Initial Choices and Underconfidence Under Criticism Modulate Change of Mind in Large Language Models",
    364       "year": 2025,
    365       "arxiv_id": "2507.03120",
    366       "relevance": "Study on LLM overconfidence and sensitivity to user feedback, relevant to understanding model behavior in interactive settings."
    367     },
    368     {
    369       "title": "Does Prompt Formatting Have Any Impact on LLM Performance?",
    370       "year": 2024,
    371       "arxiv_id": "2411.10541",
    372       "relevance": "Empirical study on how prompt format affects LLM outputs, relevant to evaluation of prompt engineering practices."
    373     },
    374     {
    375       "title": "ChatGPT in Higher Education: Considerations for Academic Integrity and Student Learning",
    376       "authors": ["Debby RE Cotton", "Peter A Cotton", "J Reuben Shipway"],
    377       "year": 2023,
    378       "relevance": "Study on ChatGPT use patterns in higher education, relevant to understanding AI adoption and its effects on learning."
    379     }
    380   ]
    381 }

Impressum · Datenschutz