scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28281B)
      1 {
      2   "paper": {
      3     "title": "AGENTLESS: Demystifying LLM-based Software Engineering Agents",
      4     "authors": [
      5       "Chunqiu Steven Xia",
      6       "Yinlin Deng",
      7       "Soren Dunn",
      8       "Lingming Zhang"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv",
     12     "arxiv_id": "2407.01489"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'We have open-sourced AGENTLESS at: https://github.com/OpenAutoCoder/Agentless' in the abstract. A working GitHub URL is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation uses the publicly available SWE-bench Lite benchmark (300 problems), which is a standard public dataset. No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The implementation section mentions LlamaIndex, OpenAI's text-embedding-3-small, and Python's ast library, but no requirements.txt, Dockerfile, or conda environment file with pinned library versions is described in the paper. Mentioning specific libraries without version pinning is insufficient."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper describes the approach's steps in detail but does not include step-by-step reproduction instructions (e.g., commands to run the experiments). These may exist in the GitHub repository but are not provided in the paper itself."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates only (e.g., '96 (32.00%)' resolved). No confidence intervals or error bars are provided for any of the main results."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims AGENTLESS outperforms all open-source agent-based approaches but makes no use of statistical significance tests. Comparisons are based solely on raw percentage differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes (e.g., Cohen's d, odds ratios) are reported. Improvements are given as raw differences in solve rates without statistical context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses the SWE-bench Lite benchmark with 300 problems without justifying why this sample size is sufficient for the statistical power needed to support its comparative claims."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results represent single-run outcomes. The paper does not report variance, standard deviation, or results across multiple runs. Figure 6 shows performance vs. number of samples but does not report variance across repeated runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 1 compares AGENTLESS against 26 agent-based approaches including both open-source tools (SWE-agent, AutoCodeRover, Moatless, Aider, etc.) and closed-source commercial tools. A RAG agentless baseline is also included."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines include contemporary tools from 2024 such as CodeStory Aide, Bytedance MarsCode, SWE-agent with Claude 3.5 Sonnet, and AutoCodeRover-v2, representing the state-of-the-art at the time of submission."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.2 contains an extensive ablation study covering localization (Table 2), repair (Table 3), and patch validation (Table 4) components. Individual design choices like skeleton format, embedding retrieval, and majority voting are each evaluated separately."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports multiple metrics: % Resolved, Avg. $ Cost, Avg. # Tokens, and % Correct Location at line/function/file granularities. Additionally, Section 5.1.3 evaluates reproduction test quality."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Section 6.1 describes manual classification of SWE-bench Lite problems, but the schema explicitly states 'manual classification of the benchmark or dataset itself does not count.' There is no human evaluation of AGENTLESS's outputs (generated patches, localizations, or reproduction tests). The evaluation of system outputs is entirely automated via test suites."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "SWE-bench Lite is used as a fixed test set. The paper does not use any of the benchmark examples for tuning AGENTLESS. The approach is evaluated directly on the benchmark without a separate validation set being used for parameter selection."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 9 shows solve rates broken down by problem category (description quality, solution type, and location information) for multiple tools. Table 2 provides per-step localization breakdowns, and Table 3 shows per-repair-configuration results."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.2 discusses that closed-source agent tools outperform AGENTLESS on problems where no location clue is provided, explicitly identifying where AGENTLESS fails. Section 5.1.3 discusses the reproduction test drop-off from 213 selected to 94 plausible tests."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The ablation study reports configurations that hurt performance. For example, Table 2 shows that 'direct from file-level' localization achieves lower performance (47.00%) than the default (56.33%). Table 3 shows that merged multi-samples (28.33%) underperform multi-samples (32.00%)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims AGENTLESS achieves 'the highest performance (32.00%, 96 correct fixes)' among open-source approaches with low cost ($0.70), which is directly supported by Table 1. The claim about OpenAI adoption is also discussed in Section 5.1.4."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation studies (Section 5.2) test causal claims about individual components. Each ablation varies one component at a time while holding others constant, making controlled causal comparisons for claims like 'embedding-based retrieval improves file localization from 78.67% to 81.67% when combined with prompting-based localization.'"
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper evaluates only on Python repositories in SWE-bench Lite but the title 'Demystifying LLM-based Software Engineering Agents' implies broad applicability. Section 7 notes the external threat that 'performance might not generalize to other datasets' but the framing throughout overstates generality beyond the tested setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section 7 mentions data leakage (contamination) as a threat but does not discuss alternative explanations for why AGENTLESS outperforms agent-based tools. For instance, it does not consider whether the advantage comes from the specific benchmark characteristics (Python-only, well-formed issues) rather than the agentless approach itself."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The implementation section explicitly states 'GPT-4o (gpt-4o-2024-05-13)' and 'text-embedding-3-small' for the embedding model. Specific model version identifiers are provided."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper describes what prompts do in natural language (e.g., 'we prompt the LLM to localize and rank the top N most suspicious files') and shows example outputs, but does not provide the actual prompt text. No appendix with full prompts is included in the paper."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 4 (Implementation) specifies: greedy decoding by default, a temperature of 0.8 when sampling, chunk size of 512 and overlap of 0 for embeddings, top 3 suspicious files, 4 samples of edit locations, 10 patches per location set (1 greedy + 9 samples), 40 total patches, 40 reproduction test samples."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The three-phase pipeline (localization, repair, patch validation) is described in detail in Section 3 with a workflow diagram (Figure 1). Each step's inputs and outputs are explicitly described, including the hierarchical localization, search/replace diff format, reproduction test generation, and majority voting re-ranking."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 6.1 describes the manual classification procedure for SWE-bench Lite problems with explicit categories and criteria. Section 6.2 describes the filtering criteria used to produce SWE-bench Lite-S (removing problems with exact patches, misleading solutions, or insufficient information)."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 ('Threats to Validity') provides dedicated discussion of both internal and external threats to validity, covering data leakage and generalization to other datasets."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The threats are specific to this study: the internal threat identifies GPT-4o training data leakage as the specific mechanism, and Section 6.2 identifies that closed-source tools outperform AGENTLESS specifically on problems with no location clues. These are not generic disclaimers."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 7 acknowledges that results may not generalize to other datasets, but the paper does not explicitly state what the results do NOT show (e.g., it never states that the results are restricted to Python repositories, to self-contained issues, or to a specific issue-type distribution)."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The SWE-bench Lite benchmark is publicly available, meaning the raw evaluation data (issues and ground truth patches) can be independently verified. The paper also open-sources AGENTLESS, allowing reproduction of the submission patches."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The benchmark data originates from SWE-bench (cited [44]), whose collection methodology is in the original paper. For the manual classification in Section 6.1, the classification dimensions and categories are described in detail."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants were recruited. The evaluation uses a standard public benchmark (SWE-bench Lite) and the manual classification was performed by the authors themselves. This criterion does not apply."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The pipeline from SWE-bench Lite (300 problems) to SWE-bench Lite-S (249 problems) is documented with explicit filtering criteria (removing exact patches, misleading solutions, insufficient information). The count of excluded problems (51) can be inferred from 300 - 249."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The acknowledgments section thanks two individuals and mentions a bike but does not disclose any funding source (grants, corporate sponsors, or funding agencies). No funding information is present in the paper."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "All four authors are listed with their University of Illinois Urbana-Champaign affiliation. AGENTLESS is built on GPT-4o (OpenAI), and some of the compared baselines use Claude 3.5 Sonnet (Anthropic); none of the authors are affiliated with those companies."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source is disclosed. The authors are at a major research university using substantial API credits, so funding likely exists but is not disclosed. Cannot confirm funder independence without knowing the funder."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "There is no competing interests statement in the paper. The authors may or may not have financial interests (e.g., via commercialization of AGENTLESS or the GitHub repository), but no disclosure is made."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Section 7 acknowledges the threat from GPT-4o training data leakage but does not state GPT-4o's specific training cutoff date. The paper says 'GPT-4o is a closed-source model, we do not have access to the training data.'"
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section 7 explicitly addresses this as an internal threat: 'One threat to validity comes from the data leakage of ground truth developer patches in SWE-bench Lite being part of the training data for GPT-4o.' The paper also cites the SWE-bench authors' comparison of resolve rates before and after GPT-4's knowledge cutoff date as mitigating evidence."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 7 explicitly discusses this risk and cites the SWE-bench authors' finding that there was 'no significant difference' in resolve rates between issues collected before and after GPT-4's knowledge cutoff. This is at least an attempt to address the concern, though incomplete for GPT-4o."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "This is a benchmark evaluation paper with no human participants. The manual classification in Section 6.1 is author analysis of a public benchmark, not a human subjects study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved. The evaluation is on a public software engineering benchmark."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Table 1 reports the average cost per issue as $0.70 for AGENTLESS. Additional costs are reported per component in the ablation tables (e.g., $0.02 for prompting-based file localization, $0.25 for reproduction tests)."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "While average per-problem API costs are reported, the total compute budget (total API spend for running all 300 benchmark problems, GPU/compute hardware, wall-clock time) is not explicitly stated. The paper reports average per-problem costs but not the total experimental budget."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "AGENTLESS achieves the highest performance (32.00%, 96 correct fixes) among all open-source approaches on SWE-bench Lite while costing only $0.70 on average.",
    291       "evidence": "Table 1 in Section 5.1 compares AGENTLESS against 26 agent-based approaches. Among open-source tools, AGENTLESS scores 32.00% while the next best open-source tool (AutoCodeRover-v2) scores 30.67%. Cost is $0.70 vs. $1.62 for SWE-agent (Claude 3.5 S).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The simplistic agentless approach can match or exceed the performance of complex autonomous agents for software development issue resolution.",
    296       "evidence": "Table 1 shows AGENTLESS (32.00%) outperforms all open-source agents. However, 10 closed-source/commercial tools outperform it, with the best (CodeStory Aide) reaching 43.00%. The claim is partially qualified in the paper text (Section 5.1).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "AGENTLESS has an upper bound of 42.0% (126/300) issues solvable if an oracle patch selection method were used.",
    301       "evidence": "Figure 6 in Section 5.2.2 shows that when considering all 40 candidate patches per issue (not just the selected one), 126 issues (42.0%) have at least one correct patch.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "SWE-bench Lite contains problematic problems including 4.3% with exact ground truth patches in the description, 10.0% with insufficient information, and 5.0% with misleading solutions.",
    306       "evidence": "Section 6.1 describes the manual classification of all 300 problems. Figure 8b shows the breakdown: 4.3% exact patch, 9.7% complete steps, 5.0% misleading. The classification criteria are described qualitatively.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Combining prompting-based and embedding-based file localization outperforms either method alone (81.67% vs. 78.67% and 67.67% respectively).",
    311       "evidence": "Table 2 in Section 5.2.1 shows the combined method achieves 81.67% ground-truth file localization vs. 78.67% for prompting-only and 67.67% for embedding-only. These are point estimates without variance.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Adding reproduction tests for patch selection improves performance from 81 to 96 issues resolved (from 27.00% to 32.00%).",
    316       "evidence": "Table 4 in Section 5.2.3 shows that regression-test-only selection gives 81 (27.00%) while adding reproduction test filtering gives 96 (32.00%). This is a controlled ablation.",
    317       "supported": "strong"
    318     }
    319   ],
    320   "methodology_tags": [
    321     "benchmark-eval"
    322   ],
    323   "key_findings": "AGENTLESS proposes a simple three-phase (localize, repair, validate) non-agentic approach to autonomous software engineering that achieves 32.00% (96/300) on SWE-bench Lite, outperforming all open-source agent-based approaches at an average cost of $0.70 per issue. The paper demonstrates through ablation studies that hierarchical localization, multi-sample patch generation, and LLM-generated reproduction tests each contribute meaningfully to performance. A manual classification of SWE-bench Lite problems reveals quality issues including problems with exact patches in descriptions (4.3%), misleading solutions (5.0%), and insufficient information (10.0%), leading to the construction of SWE-bench Lite-S as a cleaner evaluation benchmark. Despite its simplicity, AGENTLESS was adopted by OpenAI as the default showcase approach for GPT-4o and o1 models on SWE-bench.",
    324   "red_flags": [
    325     {
    326       "flag": "No statistical uncertainty quantification",
    327       "detail": "All performance comparisons are point estimates with no confidence intervals, error bars, or statistical significance tests. Given that a single-run difference of ~1-2% can affect rankings (e.g., AGENTLESS at 32.00% vs. AutoCodeRover-v2 at 30.67%), the lack of variance reporting makes it impossible to assess whether observed differences are reliable."
    328     },
    329     {
    330       "flag": "No funding disclosure",
    331       "detail": "The acknowledgments section thanks two named individuals and a bicycle but discloses no funding sources. For academic research using substantial cloud API compute (running 40 patches x 300 problems on GPT-4o), the funding source is relevant for assessing potential conflicts of interest."
    332     },
    333     {
    334       "flag": "Prompts not provided",
    335       "detail": "The paper describes what prompts ask the LLM to do in natural language but does not provide the actual prompt text used in experiments. Without the prompts, exact reproduction requires reverse-engineering from the GitHub repository, and the paper cannot be independently evaluated from the text alone."
    336     },
    337     {
    338       "flag": "Single-model evaluation",
    339       "detail": "All AGENTLESS experiments use GPT-4o (gpt-4o-2024-05-13) only. The claim that agentless approaches can match complex agents is tested with only one LLM backbone, while many competing approaches use a variety of models including Claude 3.5 Sonnet. Results may not generalize across LLMs."
    340     },
    341     {
    342       "flag": "Manual benchmark classification by authors",
    343       "detail": "The manual classification of all 300 SWE-bench Lite problems (Section 6.1) that motivated the construction of SWE-bench Lite-S was performed by the same authors who built and evaluated AGENTLESS. No inter-rater reliability is reported and no independent verifier is mentioned, creating potential for motivated reasoning in which problems are deemed 'problematic.'"
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    349       "authors": [
    350         "Carlos E Jimenez",
    351         "John Yang",
    352         "Alexander Wettig",
    353         "Shunyu Yao",
    354         "Kexin Pei",
    355         "Ofir Press",
    356         "Karthik R Narasimhan"
    357       ],
    358       "year": 2024,
    359       "relevance": "Primary benchmark used for evaluation; SWE-bench is the foundational evaluation framework for LLM-based software engineering agents."
    360     },
    361     {
    362       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    363       "authors": [
    364         "John Yang",
    365         "Carlos E Jimenez",
    366         "Alexander Wettig",
    367         "Kilian Lieret",
    368         "Shunyu Yao",
    369         "Karthik Narasimhan",
    370         "Ofir Press"
    371       ],
    372       "year": 2024,
    373       "arxiv_id": "2405.15793",
    374       "relevance": "Key agent-based baseline; SWE-agent represents the paradigm of complex LLM agent interfaces that AGENTLESS is compared against."
    375     },
    376     {
    377       "title": "AutoCodeRover: Autonomous Program Improvement",
    378       "authors": [
    379         "Yuntong Zhang",
    380         "Haifeng Ruan",
    381         "Zhiyu Fan",
    382         "Abhik Roychoudhury"
    383       ],
    384       "year": 2024,
    385       "arxiv_id": "2404.05427",
    386       "relevance": "Agent-based baseline providing LLMs with code-search APIs for iterative bug localization and repair."
    387     },
    388     {
    389       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    390       "authors": [
    391         "Junwei Liu",
    392         "Kaixin Wang",
    393         "Yixuan Chen",
    394         "Xin Peng",
    395         "Zhenpeng Chen",
    396         "Lingming Zhang",
    397         "Yiling Lou"
    398       ],
    399       "year": 2024,
    400       "arxiv_id": "2409.02977",
    401       "relevance": "Survey of LLM-based software engineering agents, directly relevant to the survey scope."
    402     },
    403     {
    404       "title": "CodeR: Issue Resolving with Multi-Agent and Task Graphs",
    405       "authors": [
    406         "Dong Chen",
    407         "Shaoxin Lin",
    408         "Muhan Zeng",
    409         "Daoguang Zan"
    410       ],
    411       "year": 2024,
    412       "arxiv_id": "2406.01304",
    413       "relevance": "Multi-agent approach for software issue resolution used as a baseline in AGENTLESS evaluation."
    414     },
    415     {
    416       "title": "SpecRover: Code Intent Extraction via LLMs",
    417       "authors": [
    418         "Haifeng Ruan",
    419         "Yuntong Zhang",
    420         "Abhik Roychoudhury"
    421       ],
    422       "year": 2024,
    423       "arxiv_id": "2408.02232",
    424       "relevance": "Agent-based approach that generates reproduction tests and function summaries; used as a baseline and compared for test generation strategy."
    425     },
    426     {
    427       "title": "MASAI: Modular Architecture for Software-engineering AI Agents",
    428       "authors": [
    429         "Daman Arora",
    430         "Atharv Sonwane",
    431         "Nalin Wadhwa"
    432       ],
    433       "year": 2024,
    434       "arxiv_id": "2406.11638",
    435       "relevance": "Modular agent architecture for software engineering used as a competitive baseline on SWE-bench Lite."
    436     },
    437     {
    438       "title": "MarsCode Agent: AI-native Automated Bug Fixing",
    439       "authors": [
    440         "Yizhou Liu",
    441         "Pengfei Gao",
    442         "Xinchen Wang",
    443         "Chao Peng",
    444         "Zhao Zhang"
    445       ],
    446       "year": 2024,
    447       "arxiv_id": "2409.00899",
    448       "relevance": "Commercial agent system for bug fixing partly inspired by AGENTLESS; competitive baseline on SWE-bench Lite."
    449     },
    450     {
    451       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    452       "authors": [
    453         "Chunqiu Steven Xia",
    454         "Yuxiang Wei",
    455         "Lingming Zhang"
    456       ],
    457       "year": 2023,
    458       "relevance": "Prior work on LLM-based automated program repair that motivates AGENTLESS's patch sampling approach."
    459     },
    460     {
    461       "title": "How to Understand Whole Software Repository?",
    462       "authors": [
    463         "Yingwei Ma",
    464         "Qingping Yang",
    465         "Rongyu Cao",
    466         "Binhua Li",
    467         "Fei Huang",
    468         "Yongbin Li"
    469       ],
    470       "year": 2024,
    471       "arxiv_id": "2406.01422",
    472       "relevance": "Repository-level understanding approach for software agents, used as a baseline (RepoUnderstander) in the evaluation."
    473     },
    474     {
    475       "title": "A Systematic Literature Review on Large Language Models for Automated Program Repair",
    476       "authors": [
    477         "Quanjun Zhang",
    478         "Chunrong Fang",
    479         "Yang Xie"
    480       ],
    481       "year": 2024,
    482       "arxiv_id": "2405.01466",
    483       "relevance": "Systematic review of LLM-based program repair, directly relevant to the survey scope on methodology quality in AI-assisted software engineering."
    484     },
    485     {
    486       "title": "The rise and potential of large language model based agents: A survey",
    487       "authors": [
    488         "Zhiheng Xi",
    489         "Wenxiang Chen",
    490         "Xin Guo"
    491       ],
    492       "year": 2023,
    493       "arxiv_id": "2309.07864",
    494       "relevance": "Survey of LLM-based agent frameworks that contextualizes the agent-based approaches AGENTLESS is designed to challenge."
    495     }
    496   ]
    497 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs