scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30290B)
      1 {
      2   "paper": {
      3     "title": "Ambig-SWE: Interactive Agents to Overcome Underspecificity in Software Engineering",
      4     "authors": [
      5       "Sanidhya Vijayvargiya",
      6       "Xuhui Zhou",
      7       "Akhila Yerukola",
      8       "Maarten Sap",
      9       "Graham Neubig"
     10     ],
     11     "year": 2025,
     12     "venue": "ICLR 2026",
     13     "arxiv_id": "2502.13069"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a GitHub link: https://github.com/sani903/InteractiveSWEAgents (footnote 1, page 1). The reproducibility statement also confirms: 'We have also attached the code with the steps to reproduce and the experimental data.'"
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The dataset is built on the publicly available SWE-Bench Verified dataset (500 issues). The paper states 'We have also attached the code with the steps to reproduce and the experimental data' in the Reproducibility Statement. Additionally, 'LLM annotations for underspecification are provided in the supplementary materials' (footnote 2)."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions using OpenHands agent framework (v0.60 for Qwen 3 Coder and Claude Sonnet 4, Appendix A.7), Docker containers from SWE-Bench, and 16 workers in Remote Runtime (beta). However, no requirements.txt, Dockerfile, or detailed dependency specification is provided in the paper text itself."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The Reproducibility Statement states: 'All key components of the proposed framework are described with the intention of enabling replication by an independent research group. The experimental setup is detailed in §2 and full prompts are provided in the Appendix §A. We have also attached the code with the steps to reproduce and the experimental data.'"
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper reports resolve rates as point estimates (e.g., percentages in Figure 3, Table 1, Table 2) without confidence intervals or error bars. No ± notation or CI notation is present."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper uses Wilcoxon Signed-Rank Tests (described in Appendix A.3.1) with a significance level of 0.05 to compare Hidden vs. Interaction and Interaction vs. Full settings. Table 4 reports p-values for all six models across both comparisons."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports percentage improvements with baseline context. For example, 'up to 74% over the non-interactive settings' (abstract), and relative performance recovery percentages: 'Claude Sonnet 3.5 models and Haiku 3.5 recover up to 80% of the performance in the Full setting' (§3.2). Resolve rates are reported across all settings (Figure 3) enabling effect magnitude assessment."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper uses all 500 issues from SWE-Bench Verified for most models, but Claude Sonnet 4 is evaluated on only 100/500 instances in the Hidden setting due to cost (footnote 4). No power analysis is provided, and the choice of 500 issues is inherited from SWE-Bench Verified without independent justification."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Results are reported as single-run resolve rates. No standard deviation, variance across runs, or repeated experiment results are provided. The only std dev reported is for the summarization quality metrics (Table 3: ROUGE/BERTScore), not for the main experimental results."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper includes three settings as comparisons: Full (fully specified, no interaction), Hidden (underspecified, no interaction), and Interaction (underspecified, with interaction). The Hidden setting serves as the baseline for measuring interaction benefits, and the Full setting serves as the upper bound."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The evaluated models include contemporary models: Claude Sonnet 4 (2025), Qwen 3 Coder 480B, Claude Sonnet 3.5, Claude Haiku 3.5, Deepseek-v2, and Llama 3.1 70B. These represent current state-of-the-art proprietary and open-weight models."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper ablates across multiple dimensions: (1) three experimental settings (Full, Hidden, Interaction) isolating the effect of information and interaction; (2) analysis of navigational vs. informational details (Table 1); (3) three prompt encouragement levels for detection (Table 2); (4) question quality decomposition (§5). These systematically isolate individual factors."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper uses multiple metrics: resolve rate (primary), accuracy/FPR/FNR for detection (Table 2), cosine distance for information gain (Figure 5), LLM-as-judge scores (Figure 6), average number of questions (Table 6), and ROUGE/BERTScore for summarization quality (Table 3)."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper uses automated evaluation throughout: test suite pass/fail for resolve rates, LLM-as-judge (GPT-4o) for question quality scoring, and embedding-based cosine distance for information gain. No human evaluation of agent outputs, question quality, or interaction behavior is included. Given that claims are made about question quality and interaction effectiveness, human evaluation would strengthen the findings."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The evaluation uses SWE-Bench Verified's existing test cases to verify solutions. The test cases are separate from the issues provided to agents. There is no tuning/dev split concern since the paper evaluates models without fine-tuning."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper provides per-model breakdowns across all settings (Figure 3), per-model detection performance across three prompt types (Table 2), per-model analysis of navigational vs. informational information usage (Table 1), and qualitative per-model question strategy analysis (Table 7)."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses multiple failure patterns: Qwen 3 Coder's complete failure to interact (100% FNR, §4.2), Llama 3.1's poor question quality and vague phrasing (§5.2-5.3), Deepseek's counterintuitive degradation with stronger prompting (§4.2), and Qwen 3 Coder's rigid protocol-following that worsens performance with navigational information (§3.3, Table 1)."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Several negative results are reported: interaction does not yield efficiency gains (§3.2), Qwen 3 Coder's resolve rate worsens with navigational information (Table 1), stronger prompting degrades Deepseek's detection (Table 2), and 'prompt engineering offers limited improvement' for detection (§4.2)."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims are supported: 'up to 74% over non-interactive settings' is supported by Figure 3 data; 'models struggle to distinguish between well-specified and underspecified instructions' is supported by Table 2; 'significant improvements in performance' is supported by the Wilcoxon tests in Table 4."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The study design uses controlled comparisons: the same 500 issues are evaluated across Full, Hidden, and Interaction settings with only the information/interaction variable changed. This controlled single-variable manipulation supports causal claims about interaction improving performance. The authors also acknowledge the simulated user proxy limitation and note they 'did not evaluate on naturally underspecified SWE-Bench examples because they lack the paired ground truth necessary for causal measurement of interaction impact' (§2.1)."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title 'Interactive Agents to Overcome Underspecificity in Software Engineering' is broader than what was tested. The study is limited to Python repositories from SWE-Bench Verified, synthetic underspecification generated by GPT-4o, and a simulated user proxy (GPT-4o). While the conclusion mentions 'While we focus on software engineering, the methods and insights can extend to other complex, real-world agentic tasks,' the paper does not sufficiently bound its generalizations to the Python/SWE-Bench setting in the title or abstract."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper discusses several alternative explanations: data leakage as a potential factor for Qwen 3 Coder's Hidden performance (§3.2), model's 'superior programming acumen' vs. interaction ability (§3.2), instruction-following capability vs. interaction capability for Sonnet models (§4.3), and rigid protocol-following as an alternative explanation for Qwen 3 Coder's behavior (§3.3)."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses marketing names without specific API versions or snapshot dates: 'Claude Sonnet 3.5', 'Claude Sonnet 4', 'Claude Haiku 3.5', 'GPT-4o' (for user proxy and summarization), 'Llama 3.1 70B-Instruct', 'Deepseek-v2', 'Qwen 3 Coder 480B'. No API snapshot dates or version identifiers (e.g., 'claude-3-5-sonnet-20241022') are provided."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Full prompts are provided in the Appendix: Full Setting prompt (A.2.1), Interaction Setting prompt with mandatory interaction (A.2.2), User Proxy prompt (A.2.2), Hidden Setting prompt (A.2.3), Summarization prompt (A.2.3), LLM Underspecification Analysis prompt (A.2.3), and three detection prompts (A.5). These are complete prompt texts, not just descriptions."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No temperature, top-p, max tokens, or other sampling hyperparameters are reported for any of the models used (coding agents, user proxy GPT-4o, summarization GPT-4o, or embedding model). Only the max turn counts are mentioned (30 for most models, 100 for Claude Sonnet 4 and Qwen 3 Coder)."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The OpenHands agentic framework is described in detail (§2.2 and Appendix A.1): tools available (bash terminal, file system, code execution, browsing disabled during evaluation), execution environment (Docker container), agent capabilities (edit files, execute scripts, debug), and completion mechanism (FinishAction, git_patch extraction). The interaction mechanism with user proxy is also fully described."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The data creation pipeline is documented: starting from 500 SWE-Bench Verified issues, GPT-4o generates underspecified summaries using a provided prompt (A.2.3), the quality is validated with distributional difference analysis and quantitative metrics (Table 3, ROUGE/BERTScore), and LLM annotations of missing information are provided in supplementary materials."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 7 is titled 'Conclusion, Limitations, and Future Work' and contains substantive discussion of limitations spanning an entire paragraph before the findings summary."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The paper discusses specific threats: 'Underspecificity detection is measured only within the first three turns, as models rarely recover if they fail to engage early' (§7), 'Question quality is approximated via latent vector changes that weigh all information equally, though models may prioritize details differently' (§7), and 'our simulated user proxy may be more cooperative than real users' (§7)."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "While the paper mentions 'While we focus on software engineering' in the conclusion, it does not explicitly state what the results do NOT show. It does not bound its findings to Python repositories, SWE-Bench-style tasks, or synthetic underspecification. The conclusion instead extends claims: 'the methods and insights can extend to other complex, real-world agentic tasks.'"
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The reproducibility statement indicates 'We have also attached the code with the steps to reproduce and the experimental data.' The underlying SWE-Bench Verified dataset is publicly available. LLM annotations are provided in supplementary materials."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The data collection is well described: SWE-Bench Verified provides the 500 base issues (§2.1), GPT-4o generates underspecified summaries using a specified prompt (§A.2.3), and the quality is validated with distributional difference analysis and quantitative metrics (Table 3)."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The data source is the standard SWE-Bench Verified benchmark."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline is documented: SWE-Bench Verified (500 issues) → GPT-4o summarization → distributional difference validation → three experimental settings (Full, Hidden, Interaction) → agent execution in OpenHands Docker containers → git_patch extraction → test verification. The process is clearly traceable."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper contains neither a funding statement nor an acknowledgments section listing grants or sponsors."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are disclosed as being from 'Language Technologies Institute, Carnegie Mellon University, Pittsburgh, USA.' The paper evaluates models from Anthropic, Meta, DeepSeek, and Alibaba, and uses OpenAI's GPT-4o as the user proxy, summarizer, and LLM judge. CMU is an academic institution with no direct product stake in the evaluated models."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so independence cannot be verified. The paper depends on models from commercial vendors (Claude, Llama, Deepseek, and Qwen as evaluated agents; GPT-4o as user proxy and judge) without disclosing whether any of those companies provided funding, credits, or API access."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper. Graham Neubig is known to be involved with OpenHands/All Hands AI, which is the framework used in this evaluation, but this is not disclosed in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No training data cutoff dates are stated for any of the evaluated models. This is relevant because SWE-Bench issues are from public GitHub repositories, and models trained after their publication could have seen the solutions."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The paper acknowledges potential data leakage: 'Some models achieve higher resolve rates in the Hidden setting likely due to their superior programming acumen, or data leakage' (§3.2). For Qwen 3 Coder specifically, the paper notes 'correct assumptions potentially inflate its performance in this setting' (§3.2, Appendix A.7) and discusses the model's use of internal RFC knowledge."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "While data leakage is mentioned as a possibility (§3.2), the paper does not systematically address benchmark contamination. SWE-Bench has been publicly available since 2023, and most evaluated models were trained after this date. No temporal analysis, contamination checks, or canary strings are used."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study. All evaluations use LLM agents and automated metrics."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No inference costs, API costs, or per-instance costs are reported. The paper mentions cost concerns only qualitatively: 'The model compensates for the lack of information with increased exploration and solution attempts leading to substantially higher evaluation costs' (footnote 4) but does not quantify them."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper mentions using '16 workers in the Remote Runtime (beta) provided in OpenHands' (Appendix A.3.2) but does not state total API spend, GPU hours, or total computational budget for the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "Interactivity can boost performance on underspecified inputs by up to 74% over non-interactive settings.",
    292       "evidence": "Figure 3 shows resolve rates across Hidden, Interaction, and Full settings for all six models. This scan did not trace the 74% figure to a specific model; it appears to be the largest relative improvement of the Interaction setting over the Hidden setting (possibly Claude Haiku 3.5). Wilcoxon tests in Table 4 confirm statistical significance for all models.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Claude Sonnet 4 achieves the highest relative performance recovery (89%) in the Interaction setting compared to the Full setting.",
    297       "evidence": "Stated in §3.2 with resolve rate data from Figure 3. Claude Sonnet 4 demonstrates the strongest ability to integrate information from interaction.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Models default to non-interactive behavior unless explicitly prompted, and even with strong encouragement, most models struggle to distinguish well-specified from underspecified inputs.",
    302       "evidence": "Table 2 shows detection accuracy, FPR, and FNR across three prompt levels for all models. Qwen 3 Coder has 100% FNR across all prompts. Only Claude Sonnet 4 with strong encouragement achieves 89% accuracy.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Qwen 3 Coder completely fails to interact under any prompting condition (100% FNR across all prompts).",
    307       "evidence": "Table 2 shows FNR = 1.00 for Qwen 3 Coder across Neutral, Moderate, and Strong encouragement conditions. Discussed in §4.2.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Claude Sonnet 4 achieves comparable information gain to Qwen 3 Coder (0.171 vs 0.179 cosine distance) with 50% fewer questions (4.03 vs 6.02).",
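    312       "evidence": "Figure 5 shows cosine distance values, and Table 6 shows average question counts. Discussed in §5.2. Note that 4.03 vs 6.02 questions is roughly a 33% reduction (equivalently, Qwen 3 Coder asks about 49% more), so '50% fewer' overstates the gap.",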
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Interaction improves task success but does not yield efficiency gains (no reduction in action steps).",
    317       "evidence": "§3.2 states that Qwen 3 Coder uses roughly the same number of steps (65) in both settings, while Claude Sonnet 4 increases from 65 to 75 steps with interaction.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "As model capability scales (e.g., from Claude Sonnet 3.5 to Claude Sonnet 4), substantial performance gains occur in both interactive and non-interactive settings.",
    322       "evidence": "Figure 3 data comparing resolve rates across Claude model generations. §3.2 discusses this scaling trend.",
    323       "supported": "moderate"
    324     }
    325   ],
    326   "methodology_tags": [
    327     "benchmark-eval"
    328   ],
    329   "key_findings": "The paper introduces Ambig-SWE, a benchmark for evaluating LLM agents' ability to handle underspecified software engineering tasks through interaction. Across six models, interaction recovers significant performance lost to underspecification (up to 74% improvement), with Claude Sonnet 4 achieving the best recovery rate (89% of full-specification performance). Most models default to non-interactive behavior and struggle to detect underspecification, with Qwen 3 Coder completely failing to interact under any prompting condition. Effective interaction strategies prioritize exploration-first approaches and targeted questions over high question volume.",
    330   "red_flags": [
    331     {
    332       "flag": "Synthetic underspecification may not represent real-world patterns",
    333       "detail": "Underspecified issues are generated by GPT-4o, not collected from actual user interactions. The paper acknowledges that natural underspecified issues differ (having more concrete technical details, reproducibility information, external references, and conversational fragments). The synthetic approach uses 'more aggressive information removal' which may not match real user behavior."
    334     },
    335     {
    336       "flag": "Simulated user proxy instead of real users",
    337       "detail": "GPT-4o serves as the user proxy, which may be more cooperative, consistent, and predictable than real users. The paper acknowledges this limitation but does not quantify the gap. Real users might provide noisy, incomplete, or misleading responses."
    338     },
    339     {
    340       "flag": "No confidence intervals or variance across runs",
    341       "detail": "All resolve rates are reported as single-run point estimates without error bars, confidence intervals, or repeated runs. LLM outputs are stochastic, so single-run results may not be stable. Only p-values from Wilcoxon tests are provided."
    342     },
    343     {
    344       "flag": "Missing hyperparameters",
    345       "detail": "Temperature, top-p, and other sampling parameters are not reported for any model. These significantly affect LLM output behavior and reproducibility."
    346     },
    347     {
    348       "flag": "Unequal evaluation conditions across models",
    349       "detail": "Claude Sonnet 4 was evaluated on only 100/500 instances in the Hidden setting due to cost (footnote 4), while all other models were evaluated on all 500. Claude Sonnet 4 and Qwen 3 Coder received 100 turns vs. 30 for other models. Qwen 3 Coder received a modified prompt with mandatory clarification phase. These asymmetries complicate cross-model comparison."
    350     },
    351     {
    352       "flag": "Undisclosed potential conflict of interest",
    353       "detail": "Graham Neubig (senior author) is associated with OpenHands/All Hands AI, the agent framework used for all experiments. This relationship is not disclosed in the paper, though it could influence framework selection and configuration choices."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    359       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    360       "year": 2024,
    361       "arxiv_id": "2310.06770",
    362       "relevance": "Foundational benchmark for evaluating LLM agents on real-world software engineering tasks, directly used as the base dataset in this work."
    363     },
    364     {
    365       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    366       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    367       "year": 2024,
    368       "arxiv_id": "2407.16741",
    369       "relevance": "The agentic framework used for all experiments in this paper, evaluating LLM capabilities in software engineering tasks."
    370     },
    371     {
    372       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    373       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    374       "year": 2023,
    375       "arxiv_id": "2302.06590",
    376       "relevance": "Empirical study on AI-assisted programming productivity, relevant to understanding the real-world impact of code generation tools."
    377     },
    378     {
    379       "title": "Generative AI at work",
    380       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey R Raymond"],
    381       "year": 2023,
    382       "relevance": "Foundational study on productivity effects of generative AI in workplace settings."
    383     },
    384     {
    385       "title": "TheAgentCompany: Benchmarking LLM agents on consequential real world tasks",
    386       "authors": ["Frank F. Xu", "Yufan Song", "Boxuan Li"],
    387       "year": 2024,
    388       "arxiv_id": "2412.14161",
    389       "relevance": "Benchmark for evaluating LLM agents on real-world tasks beyond code generation, including interactive scenarios."
    390     },
    391     {
    392       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    393       "authors": ["Dong Huang", "Jie M. Zhang", "Michael Luck"],
    394       "year": 2024,
    395       "arxiv_id": "2312.13010",
    396       "relevance": "Multi-agent approach to code generation with testing feedback, relevant to agentic software engineering evaluation."
    397     },
    398     {
    399       "title": "ClarifyGPT: Empowering LLM-based code generation with intention clarification",
    400       "authors": ["Fangwen Mu", "Lin Shi", "Song Wang"],
    401       "year": 2023,
    402       "arxiv_id": "2310.10996",
    403       "relevance": "Directly addresses clarification in LLM code generation, a core topic of this paper's evaluation."
    404     },
    405     {
    406       "title": "LLM-based test-driven interactive code generation: User study and empirical evaluation",
    407       "authors": ["Sarah Fakhoury", "Aaditya Naik", "Georgios Sakkas", "Saikat Chakraborty", "Shuvendu K. Lahiri"],
    408       "year": 2024,
    409       "doi": "10.1109/tse.2024.3428972",
    410       "relevance": "Empirical study of interactive test-driven code generation with LLMs including user study, relevant to understanding human-AI interaction in coding."
    411     },
    412     {
    413       "title": "The AI Scientist: Towards fully automated open-ended scientific discovery",
    414       "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    415       "year": 2024,
    416       "arxiv_id": "2408.06292",
    417       "relevance": "Agentic AI system for scientific research, relevant to understanding capabilities and limitations of autonomous AI agents."
    418     },
    419     {
    420       "title": "Aligning language models to explicitly handle ambiguity",
    421       "authors": ["Hyuhng Joon Kim", "Youna Kim", "Cheonbok Park"],
    422       "year": 2024,
    423       "arxiv_id": "2404.11972",
    424       "relevance": "Addresses LLM handling of ambiguous inputs, directly related to the underspecificity challenges studied in this paper."
    425     },
    426     {
    427       "title": "Sotopia: Interactive evaluation for social intelligence in language agents",
    428       "authors": ["Xuhui Zhou", "Hao Zhu", "Leena Mathur"],
    429       "year": 2024,
    430       "arxiv_id": "2310.11667",
    431       "relevance": "Framework for evaluating interactive capabilities of language agents in social settings, related to the interaction evaluation methodology."
    432     },
    433     {
    434       "title": "Learning to ask: When LLMs meet unclear instruction",
    435       "authors": ["Wenxuan Wang", "Juluan Shi", "Chaozheng Wang"],
    436       "year": 2024,
    437       "arxiv_id": "2409.00557",
    438       "relevance": "Evaluates LLM behavior on ambiguous tool-use instructions, directly relevant to understanding how models handle underspecification."
    439     }
    440   ]
    441 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs