scan.json (28142B)
1 { 2 "paper": { 3 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 4 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 5 "year": 2024, 6 "venue": "NeurIPS 2024", 7 "arxiv_id": "2405.15793" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": true, 16 "justification": "The paper states 'Data, code, and leaderboard at swe-agent.com' and references open-source GitHub repositories. Section E.2 explicitly states 'we open source all of our resources.'" 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "They use the publicly available SWE-bench dataset and HumanEvalFix benchmark. Section E.2 states 'We also open source all inference and evaluation artifacts (e.g., trajectories, code generations, evaluation execution traces, analysis notebooks).'" 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": true, 26 "justification": "Section A.2 describes the Docker-based environment setup. The system uses Docker containers for reproducible execution, and the configuration system is documented in Section A.3." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": true, 31 "justification": "Section E.2 states 'Practitioners should be able to easily recover our findings by running the agent with simple scripts. We provide extensive text and video documentation describing how to run and modify different parts of the codebase.'" 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": true, 38 "justification": "Table 10 reports mean ± std across 6 runs: '17.94±0.49'. Figure 4 shows pass@k performance across 6 runs." 39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "No statistical significance tests are reported. Comparisons between SWE-agent and baselines (e.g., Shell-only 11.0% vs SWE-agent 18.0%) are made without any formal statistical testing." 44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper reports effect sizes in context: '10.7 percentage points more instances than the baseline agent' (Section 1), '64% relative increase compared to Shell-only' (Section 5), and detailed percentage improvements in Table 3 ablations." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "No justification is given for why 300 instances (SWE-bench Lite) is sufficient for ablations, or why 37 instances were used for the hyperparameter sweep. The dev set of 37 instances is particularly small." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Table 10 reports variance across 6 runs with std dev (17.94±0.49). Figure 4 shows pass@k across 6 runs. The paper notes 'average performance variance is relatively low, but per-instance resolution can change considerably.'" 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "Table 1 compares against RAG baselines (from Jimenez et al.) and a Shell-only interactive baseline adapted from Yang et al. (InterCode)." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": true, 70 "justification": "The RAG baseline is from the original SWE-bench paper (2024), and the Shell-only baseline uses the same underlying models (GPT-4 Turbo, Claude 3 Opus). These were contemporary at submission time." 71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": true, 75 "justification": "Table 3 provides detailed ablations of individual ACI components: editor (with/without linting, no edit), search (summarized, iterative, no search), file viewer window size (30, 100, full), and context management (last 5 obs, full history, without demo)." 76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper reports % Resolved (pass@1), $ Avg. Cost, pass@k for k∈{1-6}, and file localization F1 score. Table 2 also reports Pass@1 across three languages for HumanEvalFix." 81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": true, 85 "justification": "The failure mode analysis uses both automated LM categorization and a hand-labeled validation set of 15 instances, with 87% agreement reported (Section 5.2, Figure 8). The qualitative analysis in Section D provides detailed manual trajectory walkthroughs." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "The paper reports main results on SWE-bench test set (2,294 instances) and SWE-bench Lite test set (300 instances), separate from the dev split used for design decisions. Section B.1 uses a 37-instance dev split for hyperparameter sweeps." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "Table 6 provides per-repository performance breakdown across 12 repositories. Table 7 provides temporal breakdown by year. Figure 8 provides failure mode distribution. Table 8 provides per-turn action pattern analysis." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 5.2 and Figure 8 provide a detailed failure mode taxonomy (9 categories) with distribution analysis. Section D provides two full qualitative failure case studies. Section B.3.3 discusses editing failure patterns." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper reports that iterative search performed worse than no search at all (12.0% vs 15.7%, Table 3). It also reports that showing the full file hurts performance (12.7% vs 18.0%). Failed experiments with Llama 3 and DeepSeek Coder are mentioned in Section 4." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Abstract claims of 12.5% on SWE-bench and 87.7% on HumanEvalFix are supported by Tables 1 and 2. The claim of 'state-of-the-art' and 'far exceeding the previous state-of-the-art' is supported by the comparison to prior 3.8% rate." 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper's causal claim that ACI design improves performance is supported by controlled ablation studies (Table 3) where individual components are removed/modified while holding the LM fixed. The study design (fixed LM, varying interface) is explicitly stated in Section 2." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper introduces the concept of ACI as a general framework but tests only on Python tasks (SWE-bench) with only two LMs (GPT-4 Turbo, Claude 3 Opus). The title claims 'Automated Software Engineering' broadly. Section E.3 acknowledges scope is 'exclusively focused on programmatic tasks' but the framing throughout suggests broader generality." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper does not discuss alternative explanations for why the ACI improves performance. For example, it does not consider whether the improvement is primarily from the demonstrations/prompts rather than the interface design, or whether the ACI simply provides more tokens of context about the task." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper clearly defines % Resolved as 'the proportion of instances for which all tests pass successfully after the model generated patch is applied' (Section 4). It does not overclaim this as general SE capability — it frames results specifically in terms of SWE-bench task resolution." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 4 specifies exact model versions: 'gpt-4-1106-preview' and 'claude-3-opus-20240229'." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "The full prompt templates are provided in Appendix C (Figures 27-32), including the system prompt, instance template, demonstration template, next step templates, and error message template. Section A.3 describes the configuration system." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Table 5 reports the hyperparameter sweep results including temperature (0.0, 0.2), window size (100, 200), and history processing method. The final configuration uses temperature 0.0, window 100, last 5 observations." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The ACI scaffolding is described in extensive detail across Sections 2-3 and Appendix A. All custom commands are documented in Table 4. The file viewer, editor, search system, and context management are described with implementation details." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 4 describes the datasets, their splits, and how they are used. For HumanEvalFix, Section B.7 describes how the dataset was adapted for the SWE-agent setting. SWE-bench's data collection procedure is referenced from Jimenez et al." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section E.3 is titled 'Limitations & Future Work' and discusses specific limitations. Section 7 (Discussion) also acknowledges scope constraints." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section E.3 discusses specific limitations: the small toolkit, manual ACI development process, exclusive focus on Python/programmatic tasks. Section E.1 discusses specific security risks of executing LM-generated code." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section E.3 explicitly states 'the scope of SWE-agent is exclusively focused on programmatic tasks like software engineering and code generation' and discusses what was NOT tested (other domains, other programming languages beyond the benchmarks)." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section E.2 states 'We also open source all inference and evaluation artifacts (e.g., trajectories, code generations, evaluation execution traces, analysis notebooks). The results presented in the main and supplementary parts of this paper can be fully rendered from the data.'" 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "The data comes from established benchmarks (SWE-bench, HumanEvalFix) whose collection procedures are documented in their respective papers. The paper references these sources clearly." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants. The study uses automated benchmarks (SWE-bench, HumanEvalFix) as data sources." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline from benchmark task instance to evaluation is documented: task instances are loaded, Docker containers are set up, the agent interacts, patches are generated, and evaluation runs unit tests. Section A.2 describes the implementation pipeline." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The Acknowledgements section states 'We acknowledge support from an Oracle Collaborative Research award and the National Science Foundation under Grant No. 2239363.'" 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "All authors are affiliated with Princeton Language and Intelligence, Princeton University. No commercial affiliations with the evaluated products (OpenAI, Anthropic)." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": true, 215 "justification": "Funding from Oracle and NSF. Neither has a direct financial interest in SWE-agent's performance on SWE-bench. The research is from an academic institution." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement is included in the paper. While the authors are academic researchers, the absence of a formal declaration means this criterion is not satisfied." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper does not state the training data cutoff dates for GPT-4 Turbo or Claude 3 Opus, even though it tests them on benchmarks that could overlap with training data." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": true, 232 "justification": "Section B.2 (Temporal Analysis, Table 7) analyzes whether performance correlates with issue creation year, stating 'There is no clear correlation between a task instance's creation year and its resolution rate' — this addresses potential test pollution from training data." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "While the temporal analysis in B.2 controls for 'possible test pollution,' the paper does not directly address whether SWE-bench task instances or their solutions appeared in GPT-4 Turbo's or Claude 3 Opus's training data. HumanEvalFix contamination is not discussed." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants. This is a benchmark evaluation study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Table 1 reports $ Avg. Cost per resolved instance for all configurations. Section 4 states a $4 per-instance budget. Table 15 shows cost distributions for resolved vs unresolved instances." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "The per-instance budget of $4 is stated in Section 4. Total costs can be inferred from the per-instance costs and number of instances. The paper uses API-based models so GPU hours are not applicable." 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Table 10 reports results across 6 separate runs with mean and standard deviation (17.94±0.49). Figure 4 shows pass@k across these 6 runs." 294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "The paper explicitly states 6 runs for variance analysis (Table 10, Section B.5). Main results are reported as pass@1 from individual runs." 299 }, 300 "hyperparameter_search_budget": { 301 "applies": true, 302 "answer": true, 303 "justification": "Table 5 shows the full hyperparameter sweep: 2 temperatures × 2 window sizes × 2 history settings × 2 models = 16 configurations, each averaged over 5 samples on 37 dev instances." 304 }, 305 "best_config_selection_justified": { 306 "applies": true, 307 "answer": true, 308 "justification": "Section B.1 describes the sweep on dev split instances, and the best configuration (temperature 0.0, window 100, last 5 observations) is selected based on dev performance before testing. All configurations are reported in Table 5." 309 }, 310 "multiple_comparison_correction": { 311 "applies": true, 312 "answer": false, 313 "justification": "Multiple ablations and comparisons are reported (Table 3 has ~10 comparisons) without any correction for multiple comparisons." 314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": false, 318 "justification": "The authors implement both SWE-agent and the Shell-only baseline. They do not discuss the bias of implementing and tuning their own system vs. baselines. The Shell-only baseline is described as 'adapted from' InterCode but may not have received the same development effort." 319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": true, 323 "justification": "Table 1 reports both % Resolved and $ Avg. Cost together. The paper notes SWE-agent is '8-13x more costly but yields a 6.7-fold improved % Resolved rate' compared to RAG. Figure 15 shows cost vs resolution distributions." 324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": true, 328 "justification": "Section 6.1 discusses why SWE-bench is a suitable benchmark, noting it 'unites many separate SE tasks' and uses 'rigorous, execution-based evaluation with human-written unit tests.' The paper also evaluates on HumanEvalFix to test a different aspect (code editing)." 329 }, 330 "scaffold_confound_addressed": { 331 "applies": true, 332 "answer": true, 333 "justification": "The scaffold IS the primary variable under study. The paper explicitly holds the LM fixed and varies the ACI (Section 2: 'we assume a fixed LM and focus on designing the ACI'). Table 1 shows the same models under different scaffolds (RAG, Shell-only, SWE-agent)." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "Section B.2 provides a temporal analysis (Table 7) showing performance by task instance creation year, controlling for potential temporal leakage. The paper notes 'no clear correlation' between issue age and resolution rate." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "The paper does not discuss whether the issue descriptions or test file paths in SWE-bench could leak information about the solution. The agent receives the full issue text which may contain hints." 346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "SWE-bench instances come from only 12 repositories. The paper does not discuss whether training data for GPT-4/Claude included code from these same repositories, creating non-independence between train and test data." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": false, 355 "justification": "No concrete leakage detection method is applied. The temporal analysis is correlational but not a detection method for data contamination." 356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "SWE-agent with GPT-4 Turbo solves 12.47% of SWE-bench test instances, substantially outperforming prior non-interactive 3.8% state-of-the-art.", 362 "evidence": "Table 1 shows 12.47% (286/2,294) resolved on full SWE-bench test set vs 3.79% for RAG with Claude 3 Opus.", 363 "supported": "strong" 364 }, 365 { 366 "claim": "SWE-agent's ACI yields a 64% relative increase over the Shell-only baseline (18.0% vs 11.0% on SWE-bench Lite).", 367 "evidence": "Table 1 and Section 5 report these numbers. The Shell-only baseline uses the same GPT-4 Turbo model.", 368 "supported": "strong" 369 }, 370 { 371 "claim": "The ACI concept is portable across LMs: SWE-agent with Claude 3 Opus achieves 10.46% on SWE-bench.", 372 "evidence": "Table 1 shows Claude 3 Opus achieves 10.46% on full SWE-bench, demonstrating the ACI works with a different model.", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "SWE-agent achieves 87.7% pass@1 on HumanEvalFix Python, far exceeding prior methods.", 377 "evidence": "Table 2 shows 87.7% vs next-best 57.9% (WaveCoder-DS-6.7B). Results also reported for JS (89.7%) and Java (87.9%).", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Editing guardrails (linting) improve performance by 3 percentage points (18.0% vs 15.0%).", 382 "evidence": "Table 3 ablation study shows edit with linting at 18.0% vs edit without linting at 15.0%.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Iterative search (UI-inspired) actually hurts performance compared to no search tools (12.0% vs 15.7%).", 387 "evidence": "Table 3 shows iterative search at 12.0% vs no search at 15.7%. Section 5.1 explains agents exhaustively iterate through results.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Successful runs complete earlier and cheaper than unsuccessful ones (median $1.21/12 steps vs $2.52/21 steps).", 392 "evidence": "Section 5.2, Table 15, and Figure 15 show the distribution differences.", 393 "supported": "strong" 394 } 395 ], 396 "methodology_tags": ["benchmark-eval"], 397 "key_findings": "SWE-agent introduces the agent-computer interface (ACI) concept, demonstrating that LM agents benefit from custom-designed interfaces tailored to their capabilities. With GPT-4 Turbo, SWE-agent resolves 12.47% of SWE-bench tasks, a 3.3x improvement over prior non-interactive approaches. Detailed ablations show each ACI component contributes meaningfully: the file editor alone accounts for 7.7 percentage points, and linting guardrails add 3 points. Interestingly, a UI-inspired iterative search interface actually hurts performance because agents exhaustively iterate through results rather than refining queries.", 398 "red_flags": [ 399 { 400 "flag": "Small ablation sample", 401 "detail": "Ablations in Table 3 are performed on SWE-bench Lite (300 instances) with single runs, while the hyperparameter sweep uses only 37 dev instances averaged over 5 samples. Small samples make ablation differences potentially unreliable." 402 }, 403 { 404 "flag": "No significance tests for ablation claims", 405 "detail": "Table 3 reports percentage differences (e.g., 3.0 point improvement from linting) without any significance testing. With 300 instances, some differences may not be statistically significant." 406 }, 407 { 408 "flag": "Self-comparison bias", 409 "detail": "The authors designed and tuned SWE-agent while using their own implementation of the Shell-only baseline. The baseline may not have received comparable development effort." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "SWE-bench: Can language models resolve real-world github issues?", 415 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"], 416 "year": 2024, 417 "relevance": "Defines the SWE-bench benchmark used as primary evaluation, establishing the task formulation for real-world software engineering." 418 }, 419 { 420 "title": "ReAct: Synergizing reasoning and acting in language models", 421 "authors": ["S. Yao", "J. Zhao", "D. Yu"], 422 "year": 2023, 423 "relevance": "Foundational agentic reasoning framework (thought+action) used by SWE-agent's interaction loop." 424 }, 425 { 426 "title": "Reflexion: Language agents with verbal reinforcement learning", 427 "authors": ["N. Shinn", "F. Cassano", "E. Berman"], 428 "year": 2023, 429 "relevance": "Demonstrates LM agents using execution feedback for code generation, motivating SWE-agent's interactive approach." 430 }, 431 { 432 "title": "InterCode: Standardizing and benchmarking interactive coding with execution feedback", 433 "authors": ["J. Yang", "A. Prabhakar", "K. R. Narasimhan", "S. Yao"], 434 "year": 2023, 435 "relevance": "Provides the interactive coding framework that SWE-agent's Shell-only baseline and environment are built upon." 436 }, 437 { 438 "title": "Evaluating large language models trained on code", 439 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 440 "year": 2021, 441 "relevance": "Introduces HumanEval/Codex, the foundational code generation benchmark against which many systems are measured." 442 }, 443 { 444 "title": "Automated repair of programs from large language models", 445 "authors": ["Z. Fan", "X. Gao", "M. Mirchev", "A. Roychoudhury", "S. H. Tan"], 446 "year": 2023, 447 "relevance": "Prior work on LLM-based automated program repair, one of the SE tasks unified under SWE-bench." 448 }, 449 { 450 "title": "LLM agents can autonomously hack websites", 451 "authors": ["R. Fang", "R. Bindu", "A. Gupta", "Q. Zhan", "D. Kang"], 452 "year": 2024, 453 "relevance": "Demonstrates LM agents applied to offensive security, showing agentic capabilities extend to adversarial domains." 454 }, 455 { 456 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 457 "authors": ["S. Hong", "M. Zhuge", "J. Chen"], 458 "year": 2023, 459 "relevance": "Multi-agent framework for software engineering, representing an alternative architectural approach to SWE-agent." 460 }, 461 { 462 "title": "Lost in the middle: How language models use long contexts", 463 "authors": ["N. F. Liu", "K. Lin", "J. Hewitt"], 464 "year": 2023, 465 "relevance": "Demonstrates that distracting context harms LM performance, motivating SWE-agent's context management design." 466 }, 467 { 468 "title": "OctoPack: Instruction tuning code large language models", 469 "authors": ["N. Muennighoff", "Q. Liu", "A. R. Zebaze"], 470 "year": 2024, 471 "relevance": "Introduces HumanEvalFix benchmark used as secondary evaluation in SWE-agent." 472 }, 473 { 474 "title": "Cognitive architectures for language agents", 475 "authors": ["T. Sumers", "S. Yao", "K. Narasimhan", "T. L. Griffiths"], 476 "year": 2023, 477 "relevance": "Theoretical framework for understanding language agents, relevant to ACI design principles." 478 }, 479 { 480 "title": "Language Agent Tree Search unifies reasoning acting and planning in language models", 481 "authors": ["A. Zhou", "K. Yan", "M. Shlapentokh-Rothman"], 482 "year": 2023, 483 "relevance": "Alternative agentic search strategy achieving high HumanEval performance (94.4%), showing benchmark saturation." 484 } 485 ] 486 }