ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30574B)


      1 {
      2   "paper": {
      3     "title": "EXPEREPAIR: Dual-Memory Enhanced LLM-based Repository-Level Program Repair",
      4     "authors": [
      5       "Fangwen Mu",
      6       "Junjie Wang",
      7       "Lin Shi",
      8       "Song Wang",
      9       "Shoubin Li",
     10       "Qing Wang"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2506.10484",
     15     "doi": "10.48550/arXiv.2506.10484"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "EXPEREPAIR introduces a dual-memory system (episodic + semantic) for LLM-based automated program repair, achieving 47.7% pass@1 with Claude 3.5 Sonnet V2 and 49.3% with Claude 3.7 Sonnet on SWE-Bench Lite, outperforming all reported open-source baselines. Ablation studies show removing the full experience module drops resolved rate from 47.7% to 41.3%, with episodic demonstrations contributing more than semantic insights. The approach is substantially cheaper than the comparable DARS method ($2.07 vs $12.24 per instance).",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Section 7 states 'We release our code and data to support further research [10]' and reference [10] provides a GitHub URL: https://github.com/ExpeRepair/ExpeRepair."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The evaluation uses SWE-Bench Lite, a publicly available benchmark (reference [17]). The paper also claims to release data alongside the code at the GitHub repository."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper. The implementation details (Appendix A.1) describe algorithmic parameters but not the software environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are included in the paper. The appendix provides implementation details but no commands or scripts to replicate experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 1 and 2 report only point estimates (e.g., 47.7% resolved) with no confidence intervals, error bars, or uncertainty quantification."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are reported. Claims that EXPEREPAIR 'outperforms' baselines are based solely on comparing point estimate percentages (e.g., 47.7% vs 47.0%)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Results are reported as absolute percentages with baseline context (Table 1: EXPEREPAIR 47.7% vs DARS 47.0%, Agentless 40.7%, etc.), and the ablation (Table 2) shows the effect of each component removal (e.g., 47.7% → 41.3% without experience module)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The evaluation uses SWE-Bench Lite's fixed 300 issues. No justification is provided for why this sample size is adequate, nor is a power analysis discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be from single runs with no indication of result stability across multiple executions."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Table 1 compares EXPEREPAIR against 8 open-source baselines including SWE-Agent, Agentless, AutoCodeRover, OpenHands, PatchPilot, DARS, and Moatless Tools."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include recent methods from 2024-2025: DARS (2025), PatchPilot (2025), OpenHands (2024), and several with the same LLM (Claude 3.5 Sonnet V2)."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 2 presents a systematic ablation removing three components: the full experience module, demonstrations only, and insights only, measuring resolved rate, ESR, and RSR for each."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Four metrics are used: % Resolved (pass@1), Average Cost ($), ESR (Execution Success Rate), and RSR (Reproduction Success Rate). The first two appear in Table 1, all four in Table 2."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 4 states RSR is 'manually verified by human annotators' — human annotators evaluate whether the system's reproduction scripts successfully reproduce the target issue."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are reported on SWE-Bench Lite, a standard held-out benchmark of 300 issues. The seed issues used to populate memory are part of the benchmark (evaluated without memory), not a separate tuning set."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No per-project or per-category breakdown is provided. SWE-Bench Lite contains 12 projects but results are reported only as aggregate percentages. The intersection analysis (Figure 2) shows overlap with baselines but not per-category performance."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "No error analysis or specific failure cases are discussed. Section 6 (Limitations) discusses the methodological limitation of bug localization but does not analyze specific issues the system failed to resolve."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "No negative results are reported. Every ablation variant shows the full system outperforms reduced versions. No mention of approaches tried and abandoned, or configurations that failed."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 49.3% pass@1 with Claude 3.7 Sonnet (supported by Figure 3) and 'outperforming all state-of-the-art open-source methods' (supported by Table 1 comparisons). All abstract claims are backed by results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The ablation study (Table 2) makes causal claims about each component's contribution through controlled single-variable removal. Each variant removes one component from the full system, isolating its effect."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Repository-Level Program Repair' broadly, but evaluation is only on SWE-Bench Lite (300 Python issues from 12 GitHub projects). No discussion of generalization to other languages, issue types, or project scales."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations are discussed. The improvement could be partly due to the additional DeepSeek-R1 reviewer (used for patch selection but not by all baselines), the extra compute from memory retrieval, or the multi-sampling strategy. None of these confounds are addressed."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The primary claims are stated in terms of pass@1 on SWE-Bench Lite, which directly measures issue resolution. The paper does not significantly overframe these results — the main claims stay close to 'resolved X% of issues on this benchmark.'"
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper specifies 'Claude-3.5-Sonnet V2', 'Claude 3.7 Sonnet', 'DeepSeek-R1', and 'o1-mini' — these are versioned model names that identify specific model releases."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Figures 4-8 in the appendix provide the actual prompt texts used for reproduction script generation, validation test generation, patch generation, patch refinement, and insight summarization."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Algorithmic parameters are reported (iteration limits of 3, 4 candidate patches, top-5 retrieval, max 15 insights). However, key LLM generation parameters are missing — the paper only says 'high temperature' without specifying the value, and omits top-p and max tokens."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The agentic scaffolding is described in detail: two-phase workflow (Section 3.1), test agent and patch agent with ReAct-based iteration (Section 3.2), episodic and semantic memory modules with retrieval and update mechanisms (Section 3.3), and a review agent for patch selection."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The benchmark (SWE-Bench Lite) is used as-is without modification. The memory construction pipeline is documented in Section 3.3 with clear extraction and update procedures. Appendix A.1 describes implementation details."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6 'Limitations' provides a dedicated discussion of the approach's weakness in bug localization, explaining why memory-based localization is difficult and left for future work."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6 discusses a specific limitation: bug localization lacks an automated oracle, making it hard to selectively accumulate localization experiences. They explain why a conservative strategy would limit memory diversity and coverage."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The limitations section discusses only one technical limitation (bug localization). No explicit statements about what the results do NOT show — no mention of generalization limits to non-Python projects, different issue types, or other benchmarks."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The paper states 'We release our code and data to support further research' with a GitHub repository link. SWE-Bench Lite is also publicly available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4 describes the benchmark: 'SWE-Bench Lite benchmark consists of 300 GitHub issues drawn from 12 diverse real-world software projects written in Python, each accompanied by an issue report and the corresponding codebase.'"
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants in this study. The data source is a standard public benchmark (SWE-Bench Lite)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented in Sections 3.1-3.3 and Appendix A.1: issues are processed through test generation (3 iterations), patch generation (3 iterations, 4 candidates each), patch validation with additional tests, and memory updates after each issue."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgment section is present in the paper. The authors are from Chinese Academy of Sciences, Beihang University, and York University, which implies government/academic funding, but none is disclosed."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Chinese Academy of Sciences, University of Chinese Academy of Sciences, Beihang University, and York University. Authors are not affiliated with the companies whose models they evaluate (Anthropic, OpenAI, DeepSeek)."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Authors are from academic institutions (Chinese Academy of Sciences, universities). Their implied funders (government/academic) have no financial stake in whether EXPEREPAIR outperforms competing APR methods."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the models used (Claude 3.5 Sonnet V2, Claude 3.7 Sonnet, DeepSeek-R1, o1-mini). This is critical because SWE-Bench Lite issues predate these models."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether SWE-Bench Lite issues or their solutions appeared in the training data of the models. The GitHub issues in SWE-Bench Lite were public before model training."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "SWE-Bench was published in 2023 (reference [17]) and the GitHub issues are older. All models used were trained after this, yet no contamination analysis is performed or discussed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. Human annotators verified RSR as evaluators, not as study subjects."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. The study is a benchmark evaluation of an automated program repair system."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. The study evaluates automated methods on a software benchmark."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants. Evaluation is on a fixed public benchmark (SWE-Bench Lite)."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants and no experimental conditions requiring randomization of subjects."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants. Automated benchmark evaluation does not involve blinding."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Table 1 reports average inference cost per instance: EXPEREPAIR costs $2.07 per issue on average. Costs are also reported for all baselines."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Only average per-instance cost is reported ($2.07). Total API spend for all 300 issues, cost of memory construction, and hardware used are not stated."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No multiple random seed results are reported. All results appear to be from a single run of the system. LLM sampling introduces randomness that is not quantified."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not explicitly stated. Results are presented as single numbers (e.g., 47.7%) without stating how many runs produced them."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. Values like k=5 for retrieval, max 3 iterations, and 4 candidate patches appear chosen without documented search or justification."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No justification for how the configuration was selected. Key parameters (iteration limits, number of candidates, retrieval k) are stated without explaining the selection process or whether a validation set was used."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper compares against 8+ baselines using only point estimates."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors do not acknowledge evaluation bias. Baseline results are taken from official papers/leaderboards (Table 1 note), which avoids re-implementation bias but still involves self-evaluation of their own system."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Table 1 reports both resolved rate and average cost per instance, enabling compute-performance comparison. DARS achieves 47.0% at $12.24 while EXPEREPAIR achieves 47.7% at $2.07."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether SWE-Bench Lite adequately measures repository-level program repair capability. The benchmark is used without questioning its construct validity or comparing against alternative benchmarks."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The main comparison (Table 1) compares different systems each with their own scaffolding. The improvement could be from the scaffold design rather than the dual-memory mechanism. While the ablation isolates memory effects, the cross-system comparison does not control for scaffold differences."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of temporal leakage. SWE-Bench Lite issues were created before the models' training periods, meaning models may have seen the issues and their solutions during training."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the models have been trained on the GitHub repositories underlying SWE-Bench Lite, which would leak feature information about code structure and issue context."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Not addressed. The seed issues used to populate memory are from the same SWE-Bench Lite benchmark as the inference issues, creating within-benchmark information sharing that is not discussed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention methods are applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "EXPEREPAIR achieves 47.7% pass@1 with Claude 3.5 Sonnet V2 on SWE-Bench Lite, outperforming all state-of-the-art open-source baselines.",
    372       "evidence": "Table 1 (Section 5.1) shows EXPEREPAIR at 47.7% vs next-best DARS at 47.0%, PatchPilot at 45.3%, and other baselines below 42%.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "EXPEREPAIR achieves 49.3% pass@1 with Claude 3.7 Sonnet on SWE-Bench Lite.",
    377       "evidence": "Figure 3 (Section 5.3) shows results across four LLMs, with Claude 3.7 achieving the highest resolved rate at 49.3%.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "The dual-memory mechanism is critical for performance — removing the full experience module drops resolved rate from 47.7% to 41.3%.",
    382       "evidence": "Table 2 (Section 5.2) ablation study shows w/o Experience Module achieves 41.3% resolved, 63.0% ESR, 58.7% RSR vs full system at 47.7%, 82.7%, 79.3%.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Episodic demonstrations contribute more than semantic insights to repair performance.",
    387       "evidence": "Table 2: removing demonstrations drops resolved to 43.7% while removing insights drops it to 46.0%, showing a larger impact from demonstrations (4.0pp vs 1.7pp).",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "EXPEREPAIR uniquely resolves 9 issues that no other open-source method can fix.",
    392       "evidence": "Figure 2 (Section 5.1) intersection analysis comparing EXPEREPAIR against four leading baselines (DARS, OpenHands, Agentless, Moatless).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "EXPEREPAIR is significantly more cost-efficient than the comparably performing DARS method.",
    397       "evidence": "Table 1: EXPEREPAIR costs $2.07/instance for 47.7% resolved vs DARS at $12.24/instance for 47.0% — nearly 6x cheaper for comparable performance.",
    398       "supported": "strong"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No error bars or variance reporting",
    404       "detail": "All results are reported as single point estimates with no confidence intervals, standard deviation, or multiple-run variance. Given LLM sampling stochasticity, the 0.7pp difference over DARS (47.7% vs 47.0%) could easily be within noise."
    405     },
    406     {
    407       "flag": "No statistical significance tests",
    408       "detail": "Claims of outperforming baselines are based solely on comparing raw percentages. With 300 issues, a 0.7pp difference (2 issues) is unlikely to be statistically significant."
    409     },
    410     {
    411       "flag": "No contamination analysis",
    412       "detail": "SWE-Bench Lite issues were public on GitHub before the models were trained. The LLMs may have seen both the issues and their solutions during training. No contamination detection or mitigation is attempted."
    413     },
    414     {
    415       "flag": "Confounded comparison with baselines",
    416       "detail": "EXPEREPAIR uses DeepSeek-R1 as a reviewer for final patch selection (Section 4), following DARS. Most other baselines (SWE-Agent, Agentless, AutoCodeRover, etc.) likely do not use a separate reasoning model for review. This additional model introduces a confound that inflates results relative to those baselines."
    417     },
    418     {
    419       "flag": "Within-benchmark information leakage",
    420       "detail": "Seed issues from SWE-Bench Lite are used to build the memory, which then helps resolve other SWE-Bench Lite issues. This means information from some test instances informs predictions on others — a form of data leakage that is not discussed or mitigated."
    421     },
    422     {
    423       "flag": "Missing per-project breakdown",
    424       "detail": "SWE-Bench Lite spans 12 projects, but results are only reported as aggregates. Performance could vary dramatically across projects, and the memory mechanism might only help on certain projects with recurring patterns."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "SWE-bench: Can language models resolve real-world github issues?",
    430       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    431       "year": 2023,
    432       "arxiv_id": "2310.06770",
    433       "relevance": "The benchmark used for evaluation — defines the SWE-Bench Lite dataset of 300 real-world GitHub issues for repository-level program repair."
    434     },
    435     {
    436       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    437       "authors": ["John Yang", "Carlos Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    438       "year": 2024,
    439       "relevance": "Major agentic APR baseline; introduces agent-computer interface design for autonomous code repair and is compared against in Table 1."
    440     },
    441     {
    442       "title": "Agentless: Demystifying LLM-based software engineering agents",
    443       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    444       "year": 2024,
    445       "arxiv_id": "2407.01489",
    446       "relevance": "Procedural APR baseline that follows predefined pipelines without autonomous agent planning; compared against in Table 1."
    447     },
    448     {
    449       "title": "AutoCodeRover: Autonomous program improvement",
    450       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    451       "year": 2024,
    452       "relevance": "Agent-based APR system that uses LLMs for planning and code navigation; baseline in Table 1."
    453     },
    454     {
    455       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    456       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    457       "year": 2024,
    458       "relevance": "Open-source platform for AI coding agents used as a baseline, achieving 41.7% on SWE-Bench Lite."
    459     },
    460     {
    461       "title": "PatchPilot: A stable and cost-efficient agentic patching framework",
    462       "authors": ["Hongwei Li", "Yuheng Tang", "Shiqi Wang", "Wenbo Guo"],
    463       "year": 2025,
    464       "arxiv_id": "2502.02747",
    465       "relevance": "Procedural APR method achieving 45.3% on SWE-Bench Lite; one of the top open-source baselines compared against."
    466     },
    467     {
    468       "title": "DARS: Dynamic action re-sampling to enhance coding agent performance by adaptive tree traversal",
    469       "authors": ["Vaibhav Aggarwal", "Ojasv Kamal", "Abhinav Japesh", "Zhijing Jin", "Bernhard Schölkopf"],
    470       "year": 2025,
    471       "arxiv_id": "2503.14269",
    472       "relevance": "Closest-performing baseline at 47.0% on SWE-Bench Lite but at 6x the cost; also uses DeepSeek-R1 as reviewer."
    473     },
    474     {
    475       "title": "RepairAgent: An autonomous, LLM-based agent for program repair",
    476       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    477       "year": 2024,
    478       "arxiv_id": "2403.17134",
    479       "relevance": "LLM-based autonomous agent for program repair, representative of the agentic APR paradigm."
    480     },
    481     {
    482       "title": "ReAct: Synergizing reasoning and acting in language models",
    483       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
    484       "year": 2023,
    485       "relevance": "Core reasoning framework used by EXPEREPAIR's agents for iterative processing — foundational work on LLM agent reasoning."
    486     },
    487     {
    488       "title": "ExpeL: LLM agents are experiential learners",
    489       "authors": ["Andrew Zhao", "Daniel Huang", "Quentin Xu", "Matthieu Lin", "Yong-Jin Liu", "Gao Huang"],
    490       "year": 2024,
    491       "relevance": "Directly inspirational work on experience-based learning for LLM agents; the dual-memory concept in EXPEREPAIR extends this to program repair."
    492     },
    493     {
    494       "title": "RepairLlama: Efficient representations and fine-tuned adapters for program repair",
    495       "authors": ["André Silva", "Sen Fang", "Martin Monperrus"],
    496       "year": 2023,
    497       "arxiv_id": "2312.15698",
    498       "relevance": "Fine-tuned LLM approach for program repair using parameter-efficient adapters, representing the fine-tuning paradigm that EXPEREPAIR aims to avoid."
    499     },
    500     {
    501       "title": "LLMs as continuous learners: Improving the reproduction of defective code in software issues",
    502       "authors": ["Yalan Lin", "Yingwei Ma", "Rongyu Cao"],
    503       "year": 2024,
    504       "arxiv_id": "2411.13941",
    505       "relevance": "Related work on continuous learning for LLMs in bug reproduction, sharing the theme of learning from past repair experiences."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "The tool addresses a real practitioner need (automated bug fixing) and code is released, but requires significant LLM API costs and setup to use."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "The dual-memory idea is interesting but not contrarian — augmenting LLM agents with memory is a well-explored direction."
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No safety or security concerns raised; this is a program repair tool with no adversarial implications."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "No controversial claims or conflicts; straightforward benchmark improvement paper."
    524     },
    525     "demo_ability": {
    526       "score": 2,
    527       "justification": "Code is released on GitHub and the system runs on SWE-Bench Lite, but requires API keys for Claude/DeepSeek models."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "Uses Claude 3.5/3.7 Sonnet (Anthropic) which adds some recognition, but from an academic lab not a major AI company."
    532     }
    533   }
    534 }

Impressum · Datenschutz