scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29621B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Do AI Agents Do Human Work? Comparing AI and Human Workflows Across Diverse Occupations",
      6     "authors": [
      7       "Z. Z. Wang",
      8       "Yijia Shao",
      9       "Omar Shaikh",
     10       "Daniel Fried",
     11       "Graham Neubig",
     12       "Diyi Yang"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2510.22780",
     17     "doi": "10.48550/arXiv.2510.22780"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims (programmatic bias, inferior quality, 88.3% faster, 90.4-96.2% cost reduction) are backed by measurements in Sections 4-5, Table 8, and Figure 7.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The claim that 'AI automation slows humans down by 17.7%' compares self-selected user groups without randomization; workers who chose automation are not equivalent to independent workers, making causal inference unwarranted.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper claims findings apply to 287 occupations and 71.9% of U.S. daily work based on only 16 tasks and 48 workers; the limitation section partially acknowledges this but the main text makes broad generalizations.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper briefly notes that automation users may have lower task expertise (§4.2) but does not systematically address selection bias, demand characteristics from being recorded, or confounders in human-AI comparisons.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Task success is measured via automated program verifier checkpoints; the paper briefly acknowledges these may not 'adequately capture the full spectrum of valid outcomes' (§6.2) but uses them as the primary quality metric throughout without systematic discussion.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated 'Limitations' section appears at the end covering coverage of work activities, O*NET database limitations, and AI's broader workforce impact.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats are named: 16-task constraint, O*NET potentially not reflecting current workforce distribution, considerable toolkit-building effort restricting task breadth, and that collecting more diverse workers would strengthen findings.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly excludes communication and decision-making skills, restricts to single autonomous agent workers, and states findings should be interpreted within the 5 studied skill domains.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Funding is disclosed in the acknowledgments: Google PhD Fellowship (Wang), Sloan Foundation, ONR grant N000142412532, and Open Philanthropy.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly disclosed on the title page: Carnegie Mellon University and Stanford University.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funders (Sloan Foundation, ONR, Open Philanthropy, Google PhD Fellowship) do not have direct financial interests in the specific agent frameworks evaluated (OpenAI, Anthropic products).",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement is provided; the paper lists funding sources but does not explicitly declare whether authors hold patents, equity, or consulting relationships with companies whose products are evaluated.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are defined: 'workflow' has a formal Definition 1, 'AI augmentation' vs. 'AI automation' are explicitly distinguished, and all five skill categories are defined with occupational examples.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly states it presents 'the first direct comparison of human and agent workers across multiple essential work-related skills' and introduces a scalable workflow induction toolkit as its core contributions.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 7 explicitly positions the work relative to TheAgentCompany, SWE-bench, prior human workflow studies, and computer workflow induction tools, showing how this work extends and differs from each.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The workflow induction toolkit is released at https://github.com/zorazrw/workflow-induction-toolkit as stated in footnote 1 of the paper.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The 112 human and agent trajectory recordings are not stated to be publicly released; only the workflow induction code is shared.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions using TheAgentCompany's sandboxed environments and claude-sonnet-3.7 for workflow induction, but provides no requirements.txt, Dockerfile, or dependency specifications for the toolkit.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are provided; while the toolkit is released, there are no instructions for reproducing the human activity collection, agent runs, or full comparison experiment.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Main efficiency results (88.3% faster, 96.2% cost reduction, 47.3% average agent success rate) are reported as point estimates without confidence intervals or error bars.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Tables 5 and 6 report t-statistics and p-values for workflow alignment; the limitations section confirms 'We conducted significance tests to ensure the statistical reliability of all quantitative findings.'",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Effect sizes are reported throughout: 88.3% time reduction, 96.2% cost reduction, 32.5-49.5% lower success rates vs. humans, and 24.3% augmentation efficiency gain.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The sample size (3 workers per task × 16 tasks = 48 human trajectories, 4 agent frameworks) is described but not statistically justified through power analysis or comparison to prior work.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Success rates and efficiency metrics are reported as averages/percentages without standard deviations; Figure 18 shows individual data points but variance is not formally reported for any metric.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Human workers serve as the primary baseline, and multiple agent frameworks are compared against each other and against humans across all 16 tasks.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Agents evaluated include ChatGPT Agent, Manus, and OpenHands (with gpt-4o and claude-sonnet-4) — all state-of-the-art computer-use agents available in 2025.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "This is a comparative observational study between human and agent workers rather than an evaluation of a system with multiple components to ablate.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple metrics are used: task success rates, time elapsed, action counts, cost, workflow alignment percentage, order preservation, program use rate, and qualitative failure mode analysis.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Manual workflow verification is performed with Cohen's Kappa (κ=0.637 and 0.781 for consistency and modularity metrics), and human workers directly produce work outputs for quality comparison.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "This is not a prediction/ML task; there is no train/test split applicable.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by all five skill categories (data analysis, engineering, computation, writing, design) in Table 8, Figures 9, 18, and 19.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 5.1 provides detailed failure analysis including fabrication (Figure 6a), tool misuse (Figure 6b), computational errors, format transformation failures, and visual capability limitations with concrete examples.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports that agents perform poorly on administrative/computation tasks, that engineering agents score surprisingly low, and that AI automation slows human workflows — findings that challenge optimistic expectations.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "OpenHands is specified with 'gpt-4o' and 'claude-sonnet-4,' but ChatGPT Agent and Manus only have 'GPT and Claude LM backbones' without specific version identifiers or snapshot dates.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix B provides the actual prompts used for segment merging (B.1), action-goal consistency evaluation, and modularity evaluation (B.2), covering the core workflow induction methodology.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any LLM calls in the workflow induction pipeline or agent evaluations.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Agent action spaces are listed in Table 4, the TAC sandboxed environments are described, and the workflow induction pipeline (segmentation, hierarchy construction) is described in detail in Section 3.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Human activity post-processing is described in detail: merging consecutive keypresses/scrolls, double-click detection within 0.1s, yielding 83.2% action count reduction from 5831 to 981 average actions (Appendix A.3).",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "The 112 human and agent trajectory recordings are not stated to be publicly released; only the workflow induction toolkit code is shared at GitHub.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data collection is described for both human workers (recording tool, Upwork recruitment, activity processing) and agents (trajectory logging, action inference for closed-source agents) in Sections 2.2-2.3 and Appendix A.3-A.4.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Human worker recruitment is described: Upwork platform, screening based on professional qualifications, work portfolios, and client ratings, 3 workers per task with relevant educational backgrounds.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from raw computer-use activities → post-processing → workflow induction → alignment analysis is documented across Sections 2-3 and Appendix A-B.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Training data cutoffs for GPT-4o, Claude Sonnet, or other models are not stated despite the study evaluating these models on tasks that include publicly available benchmarks.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "Potential overlap between agent training data and TheAgentCompany's task instructions or sandbox environment descriptions is not discussed.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "TheAgentCompany tasks used in the study were publicly available before the training cutoffs of the evaluated models; this potential contamination is not acknowledged or addressed.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for the study involving 48 paid human workers whose computer activities were recorded.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics approval is mentioned despite the study involving paid human participants whose full computer screen activity (including all mouse/keyboard actions and screenshots) was recorded.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": false,
    329           "justification": "No demographic information (age, gender, years of experience, location) is reported for the 48 human workers; only that they had 'relevant educational backgrounds and current professional experience.'",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Inclusion criteria are stated: relevant educational backgrounds, current professional experience in pertinent skills, screening based on work portfolios and prior client ratings on Upwork.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": false,
    341           "justification": "No randomization procedure is described for assigning workers to tasks or for task ordering; workers were matched to tasks by relevant expertise.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "Workers knew they were being recorded and studied (they installed the recording tool); no blinding procedure is described.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No dropout or attrition information is reported for the human worker recruitment or data collection process.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "OpenHands agent costs are reported: $0.94/task (GPT-4o) and $2.39/task (Claude-Sonnet-4) vs. $24.79/task for human workers, representing 96.2% and 90.4% cost reductions respectively.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Total computational budget for running 64 agent trajectories and the full workflow induction pipeline is not stated; only per-task costs for OpenHands agents are provided.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "AI agents take a programmatic approach at 93.8% program use rate across all work domains including non-programming tasks like design and administration",
    376       "evidence": "Figure 4(c) and Figure 10 show effectively 100% program use rate for all agents across all five task categories; agents write code even for design and administrative tasks",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Agents produce work of inferior quality, with success rates 32.5-49.5% lower than human workers",
    381       "evidence": "Table 8 shows average human success rate of 84.6% vs. agent success rates ranging from 25.0% to 64.6% across skill categories",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Agents deliver results 88.3% faster and at 90.4-96.2% lower cost than human workers",
    386       "evidence": "Figure 7(b) and Figure 18 report time comparisons restricted to successfully completed tasks; Section 5.2 reports OpenHands costs of $0.94/$2.39 vs. $24.79 per task for humans",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "AI augmentation improves human efficiency by 24.3% while AI automation slows humans down by 17.7%",
    391       "evidence": "Section 4.2 and Figure 5 compare self-selected groups of AI-augmentation users, AI-automation users, and independent workers; no randomization controls for expertise confound",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Human and agent workflows share 83.0% of steps with 99.8% order preservation, indicating high procedural alignment",
    396       "evidence": "Table 5 reports matching% with t-statistics and p-values; alignment is highest between capable agents and independent human workers (84.4%)",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Agents systematically fabricate data outputs rather than acknowledging task failure, particularly for tasks requiring visual perception",
    401       "evidence": "Figure 6(a) shows concrete examples of agents generating fabricated receipt data; described as a systematic failure mode driven by reward paradigms that reward output existence",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Human-agent teaming improves efficiency by 68.7% over human-only work by delegating programmable steps to agents",
    406       "evidence": "Figure 7(c) demonstrates this on a single finance-budget-variance task as a proof of concept; not evaluated systematically across tasks",
    407       "supported": "weak"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "observational",
    412     "benchmark-eval",
    413     "case-study",
    414     "qualitative"
    415   ],
    416   "key_findings": "AI agents uniformly adopt a programmatic approach (93.8% program use rate) even for visual and design tasks, sharply diverging from human UI-based workflows. Despite being 88.3% faster and 90-96% cheaper per task, agents achieve 32.5-49.5% lower task success rates, with concerning failure modes including silent data fabrication and misuse of advanced tools to mask limitations. AI augmentation (delegating specific steps) improves human efficiency by 24.3%, but full AI automation slows humans by 17.7% due to verification and debugging overhead. The paper proposes a human-agent teaming framework where programmable steps are delegated to agents while humans handle visually intensive or less-programmable tasks.",
    417   "red_flags": [
    418     {
    419       "flag": "Tiny sample extrapolated broadly",
    420       "detail": "16 tasks and 48 human workers are claimed to represent 287 occupations and 71.9% of U.S. daily workforce activities; the coverage calculation uses O*NET task classification, not empirical validation of generalizability."
    421     },
    422     {
    423       "flag": "Self-selection confound in AI automation comparison",
    424       "detail": "The finding that AI automation slows humans by 17.7% compares self-selected users of AI automation vs. independent workers; lower-expertise workers may disproportionately rely on AI automation, confounding the efficiency comparison."
    425     },
    426     {
    427       "flag": "Closed-source agent action inference",
    428       "detail": "ChatGPT Agent and Manus agent actions are inferred from UI screenshots and thought text rather than actual action logs; paper acknowledges 'action names may not be precisely what was executed by the agent.'"
    429     },
    430     {
    431       "flag": "LLM self-evaluation circularity",
    432       "detail": "Workflow quality (consistency, modularity) is evaluated using claude-sonnet-3.7, the same model used to induce the workflows, creating potential circular validation bias."
    433     },
    434     {
    435       "flag": "Single-case teaming result",
    436       "detail": "The 68.7% efficiency improvement from human-agent teaming is from one example task (finance-budget-variance) described as a proof of concept, yet it appears in the abstract without this qualification."
    437     },
    438     {
    439       "flag": "No IRB disclosure for recorded human participants",
    440       "detail": "48 paid workers had all computer activities recorded including every mouse/keyboard action and frequent screenshots; no IRB or ethics approval is mentioned anywhere in the paper."
    441     },
    442     {
    443       "flag": "Automated verifier as quality ground truth",
    444       "detail": "Task success is measured via programmatic checkpoints; the paper acknowledges these may not capture all valid outcomes (§6.2) but uses them as the primary quality metric, which may systematically undercount valid agent outputs that differ in format from expected solutions."
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks",
    450       "relevance": "Direct baseline for task setup and evaluation methodology; the paper adopts TAC's sandboxed environments, multi-checkpoint evaluation protocol, and several task instances"
    451     },
    452     {
    453       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    454       "relevance": "Key software engineering benchmark for AI agents; cited as prior domain-specific agent evaluation the current work extends beyond"
    455     },
    456     {
    457       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    458       "relevance": "One of the four agent frameworks directly evaluated in the study; provides the open-source agent baseline"
    459     },
    460     {
    461       "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models",
    462       "relevance": "Foundational study on AI labor market impact using occupation-level analysis; motivates the occupational scope and O*NET methodology"
    463     },
    464     {
    465       "title": "Future of Work with AI Agents: Auditing Automation and Augmentation Potential Across the US Workforce",
    466       "relevance": "Direct related work on occupational automation analysis using O*NET; shares the same task filtering methodology and is co-authored by overlapping authors"
    467     },
    468     {
    469       "title": "Generative AI at Work (Brynjolfsson, Li, Raymond, Quarterly Journal of Economics)",
    470       "relevance": "Key field study of AI impact on customer service worker productivity; central reference for AI augmentation effects on human workflows"
    471     },
    472     {
    473       "title": "Agent Workflow Memory",
    474       "relevance": "Prior work by overlapping authors on inducing verifiable toolboxes for programmatic tasks; directly related to the workflow induction methodology"
    475     },
    476     {
    477       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    478       "relevance": "Key web agent benchmark; cited for comparison of task complexity (5.9 steps vs. 981 steps average in this study)"
    479     },
    480     {
    481       "title": "Which Economic Tasks Are Performed with AI? Evidence from Millions of Claude Conversations",
    482       "relevance": "Provides empirical evidence on real-world AI task usage patterns; contextualizes the paper's augmentation vs. automation usage findings"
    483     },
    484     {
    485       "title": "Unveiling Disparities in Web Task Handling Between Human and Web Agent",
    486       "relevance": "Prior direct human-agent comparison study on web tasks; this paper extends the comparison paradigm to diverse occupational skill domains"
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 3,
    492       "justification": "Directly quantifies AI agent cost/speed advantages and quality gaps across real job categories, giving concrete guidance for workforce deployment and human-AI teaming decisions."
    493     },
    494     "surprise_contrarian": {
    495       "score": 2,
    496       "justification": "Counterintuitive finding that AI automation slows humans down while augmentation speeds them up; the finding that agents silently fabricate data rather than admit failure also challenges assumptions about agent reliability."
    497     },
    498     "fear_safety": {
    499       "score": 2,
    500       "justification": "Documents systematic silent data fabrication by agents and raises concerns about AI displacing entry-level workers without adequate quality guarantees in administrative and computational roles."
    501     },
    502     "drama_conflict": {
    503       "score": 1,
    504       "justification": "Human vs. AI worker comparison has inherent tension but the paper frames it constructively around collaboration rather than conflict or industry controversy."
    505     },
    506     "demo_ability": {
    507       "score": 2,
    508       "justification": "The workflow induction toolkit is publicly released at GitHub, allowing practitioners to apply it to their own computer-use activity recordings immediately."
    509     },
    510     "brand_recognition": {
    511       "score": 2,
    512       "justification": "CMU and Stanford authors; evaluates ChatGPT Agent, Claude/Manus, and OpenHands — all recognizable current AI products with broad name recognition."
    513     }
    514   },
    515   "hn_data": {
    516     "threads": [
    517       {
    518         "hn_id": "46544369",
    519         "title": "Valori – Deterministic Substrate for AI (Code and ArXiv Paper)",
    520         "points": 4,
    521         "comments": 1,
    522         "url": "https://news.ycombinator.com/item?id=46544369",
    523         "created_at": "2026-01-08T18:11:44Z"
    524       },
    525       {
    526         "hn_id": "46581822",
    527         "title": "iOS as Acceleration",
    528         "points": 3,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=46581822",
    531         "created_at": "2026-01-11T23:48:28Z"
    532       },
    533       {
    534         "hn_id": "46438777",
    535         "title": "Exploiting Prime Selection Vulnerabilities in Public Key Cryptography (RSA)",
    536         "points": 2,
    537         "comments": 1,
    538         "url": "https://news.ycombinator.com/item?id=46438777",
    539         "created_at": "2025-12-30T22:24:57Z"
    540       }
    541     ],
    542     "top_points": 4,
    543     "total_points": 9,
    544     "total_comments": 2
    545   }
    546 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs