ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28136B)


      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor"],
      4   "paper": {
      5     "title": "How Do AI Agents Do Human Work? Comparing AI and Human Workflows Across Diverse Occupations",
      6     "authors": ["Zora Zhiruo Wang", "Yijia Shao", "Omar Shaikh", "Daniel Fried", "Graham Neubig", "Diyi Yang"],
      7     "year": 2025,
      8     "venue": "arXiv",
      9     "arxiv_id": "2510.22780",
     10     "doi": "10.48550/arXiv.2510.22780"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "GitHub repository URL provided in footnote 1: https://github.com/zorazrw/workflow-induction-toolkit"
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No explicit dataset download link provided. The 112 collected trajectories (human and agent activities) are not stated to be publicly released."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section listing library versions is mentioned. The paper states experiments run in TAC's sandboxed environments but does not provide reproducible environment specifications."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions or README with commands described. The paper references the TAC setup but does not provide specific instructions for reproducing the full study."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Main results (success rates in Table 8, efficiency comparisons) are reported as point estimates without confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "T-statistics and p-values are reported for workflow alignment measures in Tables 5 and 6. The paper states 'We conducted significance tests to ensure the statistical reliability of all quantitative findings' in the Limitations section."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported with baseline context throughout: agents are 88.3% faster, cost 90.4-96.2% less, human-agent alignment is 83.0%, AI augmentation accelerates work by 24.3% while automation slows by 17.7%."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification for why 3 human workers per task, 4 agent frameworks, or 16 tasks were chosen. No power analysis. The Limitations section acknowledges the limited number of tasks but does not justify sample sizes."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No standard deviations or variance measures reported across runs or workers. Results are presented as averages without spread measures."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Human workers serve as the baseline for comparison against 4 agent frameworks. Multiple agent frameworks (OpenHands-GPT, OpenHands-Claude, ChatGPT, Manus) are compared against each other and against humans."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Agent frameworks tested (ChatGPT Agent, Manus, OpenHands with gpt-4o and claude-sonnet-4) are contemporary state-of-the-art systems."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No ablation study is performed. The workflow induction toolkit components are not systematically ablated to measure their contributions."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used: task success rate, workflow alignment (matching steps %, order preservation %), time elapsed, action count, cost, and progress rate."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Manual verification of workflows and their evaluation results is performed, showing 'substantial agreement with human judgments — 0.637 and 0.781 in Cohen's Kappa for consistency and modularity metrics' (§3.3). Workflow alignment pairs are also 'manually refined for accuracy' (§3.4)."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "This is not a train/test evaluation paradigm. The study compares human and agent workflows on the same tasks without a training phase."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by skill category (data analysis, engineering, computation, writing, design) throughout the paper, including Tables 6, 8, and Figures 9, 10, 19."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5.1 extensively discusses agent failure modes: data fabrication (Figure 6a), tool misuse (Figure 6b), computation errors, format transformation failures, and limited visual capabilities."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative findings are reported: AI automation slows humans by 17.7% (§4.2), agents produce lower-quality work (§5), agents fabricate data (§5.1), engineering tasks surprisingly show low agent quality despite being the focus of agent development."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about programmatic approach, inferior quality with fabrication, and 88.3% faster/90.4-96.2% cheaper are all supported by results in §4 and §5."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper makes causal claims like 'AI automation markedly reshapes workflows and slows human work by 17.7%' and 'agents deliver results 88.3% faster' but the study design is observational — no randomization of workers to conditions, no control for worker expertise differences. Workers self-selected AI tool usage."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title claims comparison 'Across Diverse Occupations' but the study uses only 16 tasks with 48 human workers and 4 agent frameworks. The paper extrapolates to '287 computer-using U.S. occupations and 71.9% of their daily work activities' based on skill category coverage, which is a substantial generalization from 16 tasks."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper discusses alternative explanations: AI automation slowdown may be due to '(i) additional time spent verifying, debugging, and correcting AI solutions' and '(ii) lower task expertise among human workers who are more likely to rely on AI for full-process automation' (§4.2)."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper uses program verifier checkpoints as a proxy for 'work quality' and action count/time as proxies for 'efficiency', but discusses the limitations of these proxies only in passing. Checkpoint-based evaluation is acknowledged as limited in §6.2, but the acknowledgment is brief and does not distinguish proxy from outcome for the main quality claims."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Agent models are specified as 'gpt-4o' and 'claude-sonnet-4' without snapshot dates or API versions. The workflow induction model is 'claude-sonnet-3.7' without a version identifier. These are marketing names, not pinned versions."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Prompts for workflow induction steps are provided in Appendix B: segment merging (§B.1), action-goal consistency evaluation, and modularity evaluation (§B.2)."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No temperature, top-p, or sampling parameters reported for any of the LLM calls (workflow induction, agent runs, or evaluation)."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Agent frameworks are described: OpenHands is described as coding-oriented with specific action spaces (Table 4), ChatGPT Agent and Manus are described with their available tools and distinctive behaviors (search_web, check_with_users). The workflow induction pipeline is described in detail in §3."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Human activity preprocessing is documented in §2.2 and §A.3: merging consecutive keypress/scroll actions, double-click detection within 0.1 seconds, reducing action count by 83.2%. Agent activity processing described in §A.4."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "A dedicated 'Limitations' section is present (after §8) discussing coverage of work activities, O*NET database limitations, and understanding AI's impact on work."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats discussed: 'our study does not yet capture the full diversity of real-world working contexts', 'the inherent difficulty in constructing and maintaining such a large-scale database may introduce inaccuracies', limited number of tasks and workers, O*NET may not reflect current workforce distribution."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states scope boundaries: communication and decision-making skills are excluded (§A.1), study focuses on computer-using occupations only, and acknowledges the need for 'additional task instances with greater breadth'."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The 112 collected trajectories (raw mouse/keyboard actions and screenshots) are not stated to be publicly available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Data collection is described in detail: §2.2 for human workers (recording tool capturing actions and screenshots), §2.3 for agents (direct access for OpenHands, UI-based capture for ChatGPT/Manus), and §A.3-A.4 for processing details."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Human worker recruitment described in §2.2 and §A.3: hired from Upwork, screened based on professional qualifications and work portfolios, 3 workers per task, required relevant educational backgrounds and current professional experience."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from raw activities to workflows is documented: raw recording → preprocessing (merging keypresses, scroll actions, double-click detection, 83.2% reduction) → trajectory segmentation (MSE-based) → semantic merging → hierarchical goal annotation."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Funding disclosed in Acknowledgments: Google PhD Fellowship, Sloan Foundation, ONR grant N000142412532, Open Philanthropy."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations (Carnegie Mellon University, Stanford University) are clearly listed. No authors are affiliated with the agent companies being evaluated (OpenAI, Anthropic, Manus)."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funders (Google, Sloan Foundation, ONR, Open Philanthropy) do not have a direct financial stake in the comparative outcomes of ChatGPT vs. Manus vs. OpenHands. Google is a competitor but the first author's fellowship does not create a direct outcome dependency."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This paper does not evaluate pre-trained model capability on a benchmark in the traditional sense. It evaluates agent frameworks on work tasks — the concern is agent capability, not whether models memorized benchmark answers."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Same as training_cutoff_stated — the study evaluates agent work behavior on novel realistic tasks, not model knowledge on existing benchmarks."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Tasks are custom-designed or adopted from TheAgentCompany with sandboxed environments, making traditional benchmark contamination less relevant."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No mention of pre-registration. The study involves 48 human participants but no pre-registration link is provided."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No IRB or ethics board approval is mentioned despite recruiting 48 human workers and collecting their computer-use activities."
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "Worker demographics are not reported beyond stating they have 'relevant educational backgrounds and current professional experience.' No experience levels, gender, geographic distribution, or other demographics provided."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "Inclusion criteria stated in §2.2 and §A.3: 'relevant educational backgrounds and currently doing jobs related to each task and skill', screened based on 'professional qualifications', 'work portfolios and prior client ratings'."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "This is not a randomized experiment — workers are not assigned to conditions. Each worker completes their task using their preferred tools."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "Blinding is not applicable — this is an observational comparison of human vs. agent workflows, not an experiment with treatment conditions."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No attrition information provided. The paper states 48 human trajectories were collected (3 per task × 16 tasks) but does not mention whether any workers started but did not finish."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Agent costs reported: OpenHands-GPT $0.94/task, OpenHands-Claude $2.39/task. Human workers charge $24.79/task on average (§5.2)."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget stated for the workflow induction pipeline (which uses claude-sonnet-3.7 extensively) or for the overall study."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Agent runs appear to be single-run. No mention of running agents multiple times with different seeds to assess result stability."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Number of agent runs per task is not explicitly stated. It appears each agent framework was run once per task (64 agent trajectories = 4 agents × 16 tasks)."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No hyperparameter tuning is performed — agents are used as-is with their default configurations."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No configuration selection is performed — agents are evaluated with their default settings."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Multiple t-tests are reported across skill categories (Tables 5, 6) without mention of multiple comparison correction (Bonferroni, etc.)."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "The authors evaluate third-party agent systems (ChatGPT, Manus, OpenHands), not their own system. Self-comparison bias is not applicable."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "Compute differences between agents are negligible relative to the study's goals — the focus is on workflow comparison, not compute-matched performance."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper explicitly discusses construct validity: tasks are designed to cover 70.1-95.2% of employment per skill category using O*NET data (§2.1, Figure 2). The Limitations section discusses O*NET database limitations and coverage gaps."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "The paper explicitly compares the same model backbone (GPT, Claude) across different scaffolds (OpenHands vs ChatGPT/Manus), and discusses how different frameworks lead to different behaviors despite shared backbones (§2.3, §4.1)."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "Contamination module not activated — see contamination section rationale."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "Contamination module not activated."
    347       },
    348       "non_independence_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "Contamination module not activated."
    352       },
    353       "leakage_detection_method": {
    354         "applies": false,
    355         "answer": false,
    356         "justification": "Contamination module not activated."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Agents take an overwhelmingly programmatic approach across all work domains, with 93.8% program-use rate, even for open-ended visual tasks like design.",
    363       "evidence": "Figure 4(c) and Figure 10 show program-use rates across tasks. Agents exhibit 27.8% stronger alignment with program-using human steps than non-programmatic ones (Table 7, §4.1).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Human workflows are substantially altered by AI automation but not by augmentation. AI augmentation accelerates work by 24.3%, while AI automation slows humans by 17.7%.",
    368       "evidence": "§4.2 and Figure 5. Workflow alignment with independent humans: 40.3% for automation scenarios vs 76.8% for augmentation scenarios.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Agents produce work of inferior quality, with success rates 32.5-49.5% lower than humans, including concerning behaviors like data fabrication.",
    373       "evidence": "Table 8 and Figure 7(a). Concrete fabrication examples in Figure 6(a) where agent creates fake receipt data instead of parsing actual bill images.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Agents deliver results 88.3% faster and at 90.4-96.2% lower cost than human workers.",
    378       "evidence": "§5.2: restricted to tasks completed by both, agents take 88.3% less time and 96.4% fewer actions. OpenHands-GPT costs $0.94/task vs human $24.79/task.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Human-agent teaming at the workflow step level improves efficiency by 68.7% while maintaining task correctness.",
    383       "evidence": "§5.3 and Figure 7(c): demonstrated on data analysis tasks where human handles file navigation and agent performs analysis. Single example demonstrated.",
    384       "supported": "weak"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval", "observational", "qualitative"],
    388   "key_findings": "AI agents approach human work through a programmatic lens (93.8% program use rate), even for visual tasks, diverging from humans' UI-centric workflows. Despite high workflow alignment (83% matching steps), agents produce lower-quality work, with success rates 32.5-49.5% lower than humans and concerning behaviors like data fabrication. However, agents are 88.3% faster and 90.4-96.2% cheaper, suggesting value in step-level delegation where humans handle navigation and verification while agents handle programmable steps.",
    389   "red_flags": [
    390     {
    391       "flag": "Very small sample size for broad claims",
    392       "detail": "Only 16 tasks, 48 human workers (3 per task), and 4 agent frameworks are used to make claims about '287 computer-using U.S. occupations and 71.9% of their daily work activities.' The coverage argument via skill categories is indirect."
    393     },
    394     {
    395       "flag": "Single-run agent evaluation",
    396       "detail": "Each agent appears to be run only once per task. Given the stochasticity of LLM outputs, single-run results may not be representative of typical agent performance."
    397     },
    398     {
    399       "flag": "No IRB for human subjects research",
    400       "detail": "The study recruits 48 human workers from Upwork and records their screen activities, but no IRB or ethics approval is mentioned."
    401     },
    402     {
    403       "flag": "Teaming claim based on single example",
    404       "detail": "The human-agent teaming benefit (68.7% efficiency gain) is demonstrated only on data analysis tasks that a single agent (Manus) had failed, which is a very limited demonstration for a headline finding."
    405     },
    406     {
    407       "flag": "LLM-as-judge without calibration",
    408       "detail": "Workflow quality evaluation and alignment are performed using claude-sonnet-3.7 as an evaluator. Cohen's Kappa against human judgments is reported, but the 0.637 for consistency is only 'substantial' agreement; the modularity Kappa of 0.781 is better, yet the evaluation still relies heavily on an LLM judge."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "SWE-bench: Can language models resolve real-world github issues?",
    414       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"],
    415       "year": 2024,
    416       "relevance": "Key benchmark for evaluating LLM agents on software engineering tasks, directly relevant to agentic coding evaluation."
    417     },
    418     {
    419       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    420       "authors": ["X. Wang", "B. Li", "Y. Song"],
    421       "year": 2025,
    422       "relevance": "One of the agent frameworks evaluated in this study; open-source platform for AI coding agents."
    423     },
    424     {
    425       "title": "TheAgentCompany: benchmarking LLM agents on consequential real world tasks",
    426       "authors": ["F. F. Xu", "Y. Song", "B. Li"],
    427       "year": 2024,
    428       "relevance": "Benchmark environment used in this study for evaluating agents on work-related tasks across occupations."
    429     },
    430     {
    431       "title": "GPTs are GPTs: An early look at the labor market impact potential of large language models",
    432       "authors": ["T. Eloundou", "S. Manning", "P. Mishkin", "D. Rock"],
    433       "year": 2023,
    434       "arxiv_id": "2303.10130",
    435       "relevance": "Foundational study on LLM labor market impact potential, directly relevant to AI workforce effects."
    436     },
    437     {
    438       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    439       "authors": ["J. Becker", "N. Rush", "E. Barnes", "D. Rein"],
    440       "year": 2025,
    441       "relevance": "Studies AI impact on developer productivity, finding verification overhead can negate efficiency gains — corroborates this paper's automation slowdown finding."
    442     },
    443     {
    444       "title": "Which economic tasks are performed with AI? Evidence from millions of Claude conversations",
    445       "authors": ["K. Handa", "A. Tamkin"],
    446       "year": 2025,
    447       "relevance": "Large-scale analysis of AI use in economic tasks, relevant to understanding AI's role in human work."
    448     },
    449     {
    450       "title": "Code with me or for me? How increasing AI automation transforms developer workflows",
    451       "authors": ["V. Chen", "A. Talwalkar", "R. Brennan", "G. Neubig"],
    452       "year": 2025,
    453       "relevance": "Directly studies how AI automation vs augmentation changes developer workflows — complementary to this paper."
    454     },
    455     {
    456       "title": "Navigating the jagged technological frontier: Field experimental evidence of the effects of AI on knowledge worker productivity and quality",
    457       "authors": ["F. Dell'Acqua"],
    458       "year": 2023,
    459       "relevance": "Field experiment on AI effects on knowledge worker productivity, finds AI helps for some tasks but hurts for others."
    460     },
    461     {
    462       "title": "R2e-gym: Procedural environment generation and hybrid verifiers for scaling open-weights SWE agents",
    463       "authors": ["N. Jain", "J. Singh"],
    464       "year": 2025,
    465       "relevance": "Training environment for SWE agents, relevant to understanding agent development for software engineering tasks."
    466     },
    467     {
    468       "title": "Future of work with AI agents: Auditing automation and augmentation potential across the US workforce",
    469       "authors": ["Y. Shao", "H. Zope"],
    470       "year": 2025,
    471       "relevance": "Audits AI automation/augmentation potential across US occupations using same O*NET framework as this study."
    472     },
    473     {
    474       "title": "The AI Scientist: Towards fully automated open-ended scientific discovery",
    475       "authors": ["C. Lu", "C. Lu", "R. T. Lange"],
    476       "year": 2024,
    477       "arxiv_id": "2408.06292",
    478       "relevance": "Attempts fully automated scientific research with AI agents, relevant to understanding agent capabilities and limitations."
    479     },
    480     {
    481       "title": "Re-bench: Evaluating frontier AI R&D capabilities of language model agents against human experts",
    482       "authors": ["H. Wijk", "T. Lin", "J. Becker"],
    483       "year": 2024,
    484       "relevance": "Directly compares AI agents against human experts on R&D tasks, complementary human-agent comparison study."
    485     }
    486   ]
    487 }

Impressum · Datenschutz