scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25617B)
      1 {
      2   "paper": {
      3     "title": "HyperAgent: Generalist Software Engineering Agents to Solve Coding Tasks at Scale",
      4     "authors": ["Huy N. Phan", "Tien N. Nguyen", "Phong X. Nguyen", "Nghi D. Q. Bui"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2409.16299",
      8     "doi": "10.48550/arXiv.2409.16299"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "HyperAgent is a multi-agent system with four specialized agents (Planner, Navigator, Editor, Executor) that achieves competitive or state-of-the-art results across SWE-Bench (33.00% Verified, 26.00% Lite), RepoExec (53.33% Pass@5), and Defects4J (59.70% Acc@1 fault localization, 22.9% correct repair rate). Ablation studies show each agent role contributes meaningfully, with Navigator removal causing the largest performance drop. The system demonstrates cross-task and cross-language generality (Python and Java) with a single framework.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub link provided: https://github.com/FSoft-AI4Code/HyperAgent (stated on page 1)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks: SWE-Bench, RepoExec, and Defects4J. No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is described in the paper. Only model names are mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repo is linked but the paper itself does not include reproduction steps."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., 33.00%, 26.00%) with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims HyperAgent 'outperforms' or 'surpasses' baselines based solely on comparing raw numbers without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results are reported with baseline context (e.g., 'surpassing AutoFL by 8.7 percentage points (51.00%)' in Section 5.3.2), providing enough context to assess magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for sample sizes. The benchmark sizes are taken as given (SWE-Bench Lite 300, Verified 500, RepoExec 355, Defects4J 353/395/440) without discussing whether these are sufficient for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be single-run."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines compared across all three tasks: SWE-Agent, AutoCodeRover, Agentless for SWE-Bench; RAG baselines and CodeLlama for RepoExec; DeepFL, AutoFL, Grace, DStar, Ochiai, RepairAgent, SelfAPR, ITER for Defects4J."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include contemporary methods like SWE-Agent (2024), Agentless (2024), AutoCodeRover (2024), and RepairAgent (2024)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 6.1 presents ablation studies on agent roles (removing Navigator, Editor, Executor individually) and Section 6.2 presents ablation on tool design choices, both on SWE-bench Tiny."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics used: resolved percentage, average time cost, average dollar cost for SWE-Bench; pass@1 and pass@5 for RepoExec; acc@1 for fault localization; plausible and correct fixes for program repair."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of system outputs. All evaluations are automated (test suite pass/fail, AST matching for repair correctness)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Uses established held-out benchmarks: SWE-Bench Verified (manually validated by professional annotators), SWE-Bench Lite, RepoExec, and Defects4J."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 10 (Appendix A.6) provides per-project breakdown for Defects4J repair across 13 projects. Multiple HyperAgent configurations are compared separately."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.4 provides error analysis with categorized failure types (edit failed loop, early exit, hallucination, timeout) shown in Figure 3."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "HyperAgent-Lite-2 and Full-3 (open-source Llama configurations) show substantially lower performance (11-18%), honestly reported. Ablations show performance drops. Section 6.4 discusses hallucination problems."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 'state-of-the-art results' and 'surpassing strong baselines' are supported by Tables 1-4 showing competitive or best results across benchmarks."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Ablation studies (Section 6.1, Table 5) use controlled single-variable manipulation to support causal claims about component contributions. Each agent role is removed individually to measure its impact."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Generalist Software Engineering Agents' but testing is limited to three benchmarks in Python and Java. The abstract claims 'broad spectrum of SE tasks across multiple programming languages' but only two languages are tested. The paper claims to be 'the first system designed to work off-the-shelf across diverse SE tasks' which overstates what two languages and three task types demonstrate."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the observed results. For example, the paper does not discuss whether performance gains come from the multi-agent architecture or from the specific LLMs used, or whether better prompting of a single agent could achieve similar results."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper frames benchmark performance (SWE-Bench pass rates, Defects4J repair rates) as evidence of 'generalist software engineering' capability without discussing the gap between benchmark performance and actual real-world SE ability."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Table 7 lists 'Claude-3-Sonnet', 'Claude-3-Haiku', 'GPT-4o', 'Llama-3-70B', 'Llama-3-8B' without specific version snapshots or API dates. 'LLaMa-3.1-8B-Instruct' is mentioned for the summarizer but other models lack version specificity."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full system prompts for all four agents (Planner, Navigator, Editor, Executor) are provided in Appendix A.7, including task templates in Appendix A.1."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max tokens, or other LLM hyperparameters are reported anywhere in the paper."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent architecture is described in detail in Section 3, including agent roles, communication via message queues, tool descriptions (Section 3.3, Tables 8-9, Appendix A.3), and workflow diagrams (Figures 1-2)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data setup for each benchmark is documented: SWE-Bench Lite/Verified filtering criteria, RepoExec context exclusion rationale, Defects4J version selection. Section 5 and Appendix A.4 describe the evaluation setup."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations or threats-to-validity section. The conclusion mentions future work directions but does not discuss current limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The error analysis in Section 6.4 discusses system failures but not threats to the validity of the evaluation methodology."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to the tested languages (Python, Java) or task types."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (trajectories, logs, per-instance results) is made available for independent verification. Only aggregate results in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Benchmark datasets are well-described: SWE-Bench sourcing from 12 Python repositories (Section 5.1.1), RepoExec's 355 samples with 96.25% test coverage (Section 5.2.1), Defects4J's 353 active bugs (Section 5.3.1)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The paper uses standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The evaluation pipeline is documented: how benchmarks are set up, how patches are applied, how tests are run, and how correctness is verified (e.g., AST matching for Defects4J in Section 5.3.1)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: FPT Software AI Center and University of Texas at Dallas."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Authors are from FPT Software AI Center, a commercial entity that could benefit from demonstrating strong AI coding capabilities. No discussion of independence. No funding disclosure to assess."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement. Authors are from FPT Software, which has a commercial interest in AI coding tools."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates stated for any of the LLMs used (Claude-3, GPT-4o, Llama-3)."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the LLMs may have seen SWE-Bench, RepoExec, or Defects4J data during training."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
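    240         "justification": "SWE-Bench and Defects4J are publicly available benchmarks that predate the models' training, so their issues, bugs, and reference fixes may be present in the training data. The paper does not discuss or mitigate this contamination risk."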
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Average cost per instance reported in Tables 1-3 (e.g., $0.45 for HyperAgent-Lite-1, $1.82 for Full-1 on SWE-Bench; $0.18 for RepoExec and fault localization)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget stated (total API spend, GPU hours for running all experiments). Only per-instance costs are reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. Results are presented without indicating whether they are from single or multiple runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported. Five configurations are tested (Table 7) but no search process is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "All five configurations are reported transparently in Table 1, not just the best one. The paper does not select a single 'best' — it presents all results."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No acknowledgment of author-evaluation bias. The authors evaluate their own system against baselines without discussing this potential bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Tables 1-3 report both performance and cost (time and dollar) per configuration, allowing readers to compare performance-cost tradeoffs across systems."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether SWE-Bench, RepoExec, or Defects4J actually measure 'generalist software engineering' capability as claimed."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper compares HyperAgent (its own scaffold) against SWE-Agent (different scaffold), AutoCodeRover (different scaffold), and Agentless (a fixed, non-agentic pipeline), but does not discuss the scaffold confound. Because the compared systems differ in both scaffold and underlying model, performance differences cannot be attributed to the multi-agent architecture alone."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. SWE-Bench issues and Defects4J bugs may have been seen during model training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether benchmark examples share structural similarities with training data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "HyperAgent-Full-1 achieves 33.00% on SWE-Bench Verified and 26.00% on SWE-Bench Lite, outperforming Agentless+GPT-4o (24.30% Lite) and SWE-Agent+Claude 3.5 Sonnet (23.00% Lite).",
    365       "evidence": "Table 1, Section 5.1.2. Direct comparison of resolved instance percentages.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "HyperAgent-Lite-3 achieves the highest Pass@5 (53.33%) on RepoExec without provided context, outperforming all RAG baselines and full-context models.",
    370       "evidence": "Table 2, Section 5.2.2. Comparison against RAG baselines and CodeLlama with full context.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "HyperAgent achieves 59.70% Acc@1 on Defects4J fault localization, surpassing AutoFL (51.00%) by 8.7 percentage points.",
    375       "evidence": "Table 3, Section 5.3.2.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "HyperAgent achieves 192 correct fixes (22.9%) on Defects4J, outperforming RepairAgent (164, 19.64%), SelfAPR (110, 13.17%), and ITER (57, 6.82%).",
    380       "evidence": "Table 4 and Table 10 (Appendix A.6). Per-project breakdown provided.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "HyperAgent is the first system designed to work off-the-shelf across diverse SE tasks in multiple programming languages without task-specific adaptations.",
    385       "evidence": "Stated in Section 1 contributions. Demonstrated by evaluating on Python (SWE-Bench, RepoExec) and Java (Defects4J) benchmarks.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Removing the Navigator agent causes the most substantial performance drop in ablation studies.",
    390       "evidence": "Table 5, Section 6.1. Full-1 drops from 27% to 19% without Navigator; Lite-1 drops from 24% to 9%.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No variance or multiple runs reported",
    397       "detail": "All results are single point estimates with no error bars, confidence intervals, or multi-run statistics. LLM outputs are stochastic, so single-run results may not be reproducible."
    398     },
    399     {
    400       "flag": "Overclaiming generality",
    401       "detail": "The paper claims 'generalist' capability across 'multiple programming languages' but tests only on Python and Java, and only on three task types. The title and abstract overstate the scope of the evaluation."
    402     },
    403     {
    404       "flag": "No limitations section",
    405       "detail": "The paper has no dedicated limitations or threats-to-validity section, which is unusual for a systems paper making broad claims."
    406     },
    407     {
    408       "flag": "Scaffold confound in comparisons",
    409       "detail": "Comparisons against SWE-Agent, AutoCodeRover, and Agentless conflate model differences with scaffold differences. It's unclear whether gains come from the multi-agent architecture or the underlying models."
    410     },
    411     {
    412       "flag": "Company affiliation without conflict disclosure",
    413       "detail": "Authors are from FPT Software AI Center, which has commercial interest in AI coding tools, but no conflicts of interest are disclosed."
    414     },
    415     {
    416       "flag": "No contamination analysis",
    417       "detail": "Public benchmarks (SWE-Bench, Defects4J) predate the models used. No analysis of whether models may have seen benchmark data during training."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    423       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"],
    424       "year": 2023,
    425       "relevance": "Primary benchmark for evaluating GitHub issue resolution by coding agents."
    426     },
    427     {
    428       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    429       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    430       "year": 2024,
    431       "arxiv_id": "2405.15793",
    432       "relevance": "Key baseline agent system for SWE-Bench, demonstrates importance of agent-computer interface design."
    433     },
    434     {
    435       "title": "AutoCodeRover: Autonomous Program Improvement",
    436       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    437       "year": 2024,
    438       "relevance": "Two-stage agent pipeline for bug fixing, key baseline in SWE-Bench evaluation."
    439     },
    440     {
    441       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    442       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    443       "year": 2024,
    444       "arxiv_id": "2407.01489",
    445       "relevance": "Simplified non-agent approach that outperforms complex agent systems, important baseline."
    446     },
    447     {
    448       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    449       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    450       "year": 2023,
    451       "arxiv_id": "2308.00352",
    452       "relevance": "Multi-agent system for complex software generation from requirements."
    453     },
    454     {
    455       "title": "ChatDev: Communicative Agents for Software Development",
    456       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    457       "year": 2024,
    458       "relevance": "Multi-agent software development system using communicative agents."
    459     },
    460     {
    461       "title": "AgileCoder: Dynamic Collaborative Agents for Software Development Based on Agile Methodology",
    462       "authors": ["Minh Huynh Nguyen", "Thang Phan Chau", "Phong X. Nguyen", "Nghi D. Q. Bui"],
    463       "year": 2024,
    464       "arxiv_id": "2406.11912",
    465       "relevance": "Collaborative multi-agent coding system by same research group."
    466     },
    467     {
    468       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    469       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    470       "year": 2024,
    471       "arxiv_id": "2403.17134",
    472       "relevance": "Autonomous LLM-based agent for program repair, key baseline for Defects4J evaluation."
    473     },
    474     {
    475       "title": "Evaluating Large Language Models Trained on Code",
    476       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    477       "year": 2021,
    478       "arxiv_id": "2107.03374",
    479       "relevance": "Introduces HumanEval benchmark and Codex, foundational work in LLM code generation evaluation."
    480     },
    481     {
    482       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    483       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    484       "year": 2024,
    485       "relevance": "Self-reflection mechanism for language agents, relevant to agentic coding workflows."
    486     },
    487     {
    488       "title": "RepoExec: Evaluate Code Generation with a Repository-Level Executable Benchmark",
    489       "authors": ["Nam Le Hai", "Dung Manh Nguyen", "Nghi D. Q. Bui"],
    490       "year": 2024,
    491       "arxiv_id": "2406.11927",
    492       "relevance": "Repository-level code generation benchmark used for evaluation, emphasizes executability and correctness."
    493     },
    494     {
    495       "title": "MASAI: Modular Architecture for Software-Engineering AI Agents",
    496       "authors": ["Daman Arora", "Atharv Sonwane", "Nalin Wadhwa"],
    497       "year": 2024,
    498       "arxiv_id": "2406.11638",
    499       "relevance": "Modular multi-agent architecture for SE tasks, directly comparable approach."
    500     }
    501   ]
    502 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs