ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29816B)


      1 {
      2   "paper": {
      3     "title": "SemAgent: A Semantics Aware Program Repair Agent",
      4     "authors": [
      5       "Anvith Pabba",
      6       "Alex Mathai",
      7       "Anindya Chakraborty",
      8       "Baishakhi Ray"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2506.16650",
     13     "doi": "10.48550/arXiv.2506.16650"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "SemAgent achieves 44.66% on SWE-Bench Lite, the highest among workflow-based APR approaches, through a pipeline leveraging execution, issue, and code semantics. Ablation studies show incremental value of each component (baseline 37% → +Repair Stage 42% → +Reviewer 44.66%), though the issue semantics ablation is limited to 50 issues. The system is restricted to single-file fixes and costs approximately $6.9 per issue using Claude Sonnet 3.7 at temperature 0.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No source code repository URL is provided anywhere in the paper. No mention of code release on GitHub, Zenodo, or any other platform."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses SWE-Bench Lite (Jimenez et al., 2024), a publicly available benchmark of 300 real-world GitHub issues across 11 Python repositories."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions Claude Sonnet 3.7 and temperature 0 but does not specify the full software environment needed to reproduce the system."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a conceptual level (§4) but without concrete commands or scripts to replicate the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No confidence intervals or error bars are reported. All results are point estimates (e.g., '44.66%'). Section 6.1 explicitly states 'All results are from a single run.'"
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are used. Claims like '7.66 percentage point increase in absolute resolution rate' (§6.1) are based solely on comparing two numbers without any test of statistical significance."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Effect sizes are provided with baseline context: '44.66% vs 37% baseline, outperforming it by 23 additional resolved issues with a 7.66 percentage point increase' (§6.1). Tables 2-4 similarly provide absolute percentages and differences."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification for sample sizes. The issue semantics ablation (Table 3) uses only 50 issues ('49 randomly selected low-cost issues along with the motivating example') with no justification for why 50 is adequate for the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Section 6.1 explicitly states 'All results are from a single run.' No variance, standard deviation, or spread measures are reported."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 1 compares against multiple baselines: AutoCodeRover, SpecRover, Agentless, OpenHands, CodeStory Aide, DARS Agent, and Globant Code Fixer Agent."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include contemporary systems: DARS Agent (Aggarwal et al., 2025), OpenHands (Wang et al., 2025), Agentless (Xia et al., 2024), and SpecRover (Ruan et al., 2024), all recent state-of-the-art systems."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "RQ2 (§6.2) ablates four variants: (1) baseline, (2) +Repair Stage with both semantics, (3) Repair Stage with only code semantics, (4) full system with reviewer. RQ3 ablates execution semantics for localization (Table 4)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Only a single metric — solve rate (percentage of issues resolved) — is reported throughout all experiments. No secondary metrics such as patch quality, precision of changes, or partial resolution are provided."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Evaluation is entirely automated via SWE-Bench test suite pass/fail. No human evaluation of patch quality, readability, or alignment with developer intent is conducted, despite claims about 'developer-aligned repairs.'"
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "SWE-Bench Lite is a standard fixed benchmark of 300 issues. The paper reports results on the full benchmark and does not use any subset for tuning. §5.2 states the system 'performs a single pass@1 attempt.'"
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No systematic per-repository or per-category breakdown is provided. There are informal mentions of Django, Matplotlib, and scikit-learn in §6.3, but no table showing performance across the 11 repositories or issue categories."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6.2 discusses failures: 'SemAgent adds extra fixes that can potentially break existing functionality or go above and beyond a simple fix.' Section 8 discusses inability to handle multi-file issues. The motivating example (§3) shows incomplete and overfitted patches."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6.2 reports that the Repair Stage without the Reviewer generates patches that 'break existing functionality,' motivating the Reviewer component. The ablation without issue semantics (Table 3) shows a 10 percentage point degradation (60% → 50% on the 50-issue subset)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of 44.66% on SWE-Bench Lite are supported in Table 1. The 7.66 percentage point improvement over the baseline is supported by Table 1 (44.66% vs 37%). The 51.33% multi-agent potential is supported in Figure 3, though appropriately hedged as 'potential of achieving around 51.33%.'"
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims ('incorporating issue and code semantics... improve performance') are supported by controlled ablation studies in Tables 2 and 3, where individual components are added/removed while holding others constant. However, the issue semantics ablation sample is small (n=50)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper tests only on SWE-Bench Lite (300 Python issues from 11 repos) but the title and abstract frame results broadly as 'Program Repair' without language or domain qualifiers. §8 acknowledges single-file limitation but doesn't bound claims to Python or the tested repositories."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations for the improvements are discussed. For example, the use of Amazon Q Developer reproduction tests (not available to all baselines) as a confound is not addressed. No discussion of whether improvements stem from the semantic components versus simply additional LLM calls."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures test suite pass/fail but claims patches are 'complete and consistent' and 'aligned with developer expectations' (Abstract, §1). The gap between passing automated tests and actual developer-aligned completeness is not discussed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "§5.4 states 'Claude Sonnet 3.7' without a snapshot date, API version, or specific model ID. This is a marketing name; model behavior can change across versions."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix A.4 provides the full prompt text for all major components: issue semantics generation, workflow generation, context retrieval, fix generation, and the reviewer agent. While placeholders like '{issue statement}' are used, the actual prompt templates and their fill semantics are documented."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "§5.4 reports: 'temperature to zero,' 'up to 3 pipeline retries, 15 localization rounds, and 10 reproducer attempts per issue.' Model cost of '$3 per one million tokens' is also stated."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The full pipeline is described in §4 with a detailed workflow diagram (Figure 2). Components include execution semantics module, repair stage, reviewer agent, and patch aggregator, with clear descriptions of each component's role and interactions."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "§5.1-5.2 describe the evaluation setup: SWE-Bench Lite benchmark with 300 issues, pre-loaded Amazon Q Developer reproduction tests, and the pipeline from issue to patch generation. The input processing is adequately documented."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8 'Limitations' provides substantive discussion of single-file restriction, cost ($6.9 per issue), and scalability challenges."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "§8 identifies specific threats: 'restricted to single-file fixes, which limits its applicability to issues that span multiple files' and 'may miss relevant code outside the immediate scope of the identified file.' Cost limitations are quantified at '$6.9 per issue.'"
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "§8 explicitly states scope boundaries: 'restricted to single-file fixes,' 'Scaling the approach to operate effectively across these larger scopes remains an open challenge.' Section 9 discusses potential risks of the approach."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw output data is released — no patch files, intermediate results, or per-issue breakdowns are available for independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "§5.1 describes the data source: SWE-Bench Lite, 300 real-world GitHub issues across 11 Python repositories. The benchmark's construction is referenced via Jimenez et al. (2024)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data comes from a standard public benchmark (SWE-Bench Lite)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The full pipeline from input (issue, codebase, regression tests) to output (patch) is documented in §4 and Figure 2. §5.2 describes the evaluation flow. However, no filtering or exclusion of benchmark issues is mentioned."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding sources or acknowledgments section is present in the provided paper text."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly stated: Columbia University for three authors, and an independent affiliation for the fourth. No evaluated product is affiliated with the authors."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses Anthropic's Claude Sonnet 3.7 and Amazon Q Developer reproduction tests, but no financial relationship is declared."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial disclosure statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff date is stated for Claude Sonnet 3.7. The paper uses this model to evaluate on SWE-Bench Lite without discussing when the model's training data was collected."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether SWE-Bench Lite issues or their solutions appeared in Claude Sonnet 3.7's training data. SWE-Bench Lite issues are from public GitHub repositories that could be in the training set."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "SWE-Bench Lite was published well before Claude Sonnet 3.7 was released, and the paper does not state the model's training cutoff. The benchmark issues come from public GitHub repositories. No contamination analysis or discussion is provided."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. Evaluation is entirely automated via benchmark test suites."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "§6.1 reports: 'average cost of approximately $6.9 per issue,' 'median cost per issue is 4.87$,' 'costs can be reduced to 4.77$ an issue across multiple runs,' and 'takes on average 16 minutes to solve an issue.'"
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Per-issue costs and time are reported but total computational budget for the full benchmark evaluation is not explicitly stated. No hardware specifications or total API spend are provided."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "§6.1 states 'All results are from a single run.' Temperature is set to 0 for determinism (§5.4), but no seed sensitivity analysis is performed to verify stability."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "§6.1 explicitly states 'All results are from a single run,' clearly documenting the number of experimental runs."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Pipeline parameters (3 retries, 15 localization rounds, 10 reproducer attempts) are stated in §5.4 but no search budget or justification for how these values were selected is provided."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No explanation of how the pipeline configuration was selected. Parameters appear chosen without documented justification or validation-set tuning."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "SemAgent builds on SpecRover and compares against it as the primary baseline. The authors' implementation of the baseline and extensions are not independently validated, and author-evaluation bias is not acknowledged."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "SemAgent costs $6.9 per issue (§6.1) but no cost comparison with baselines is provided. Different approaches may use vastly different compute budgets, making raw accuracy comparisons potentially misleading."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether SWE-Bench Lite adequately measures the claimed capabilities (semantic understanding, patch completeness). The paper does not question the benchmark's construct validity."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "Table 1 compares systems using different scaffolds and models (SemAgent with Claude Sonnet 3.7 vs others with unspecified models). Performance differences are attributed to methodology rather than scaffold/model confounds."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether SWE-Bench Lite issues predate Claude Sonnet 3.7's training data, despite the benchmark being published well before the model."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The system uses 'pre-loaded Amazon Q Developer reproduction tests' (§5.2) as input, which provide additional signal beyond the standard SWE-Bench setup. No discussion of whether this constitutes feature leakage or unfair advantage."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether SWE-Bench Lite issues overlap with the model's training data or whether structural similarities between benchmark instances could inflate results."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, temporal splits, or decontamination analysis."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "SemAgent achieves 44.66% solve rate on SWE-Bench Lite, the highest among workflow-based methods.",
    370       "evidence": "Table 1 (§6.1) shows 134/300 issues resolved at 44.66%, compared to Agentless at 40.67% and SpecRover at 37%.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "SemAgent provides a 7.66 percentage point absolute improvement over the SpecRover baseline.",
    375       "evidence": "Table 1 shows SpecRover at 37% (111 issues) vs SemAgent at 44.66% (134 issues), a difference of 23 issues (§6.1).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The Repair Stage with semantics improves baseline from 37% to 42%.",
    380       "evidence": "Table 2 (§6.2) shows baseline at 37% and w/ Repair Stage at 42% (126 issues resolved).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Issue semantics improve performance from 50% to 60% on a random subset of issues.",
    385       "evidence": "Table 3 (§6.2) shows w/o Issue Semantics at 50% (25/50) vs w/ Issue Semantics at 60% (30/50) on 49 randomly selected issues plus one motivating example.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Execution semantics improve localization accuracy from 82% to 85.67%.",
    390       "evidence": "Table 4 (§6.3) shows localization accuracy with and without execution semantics.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "SemAgent-Multi could potentially resolve up to 51.33% (154/300) of issues.",
    395       "evidence": "Figure 3 (§6.1) shows a Venn diagram of ablation overlaps totaling 154 unique issues. This is a union of different configurations, not an actual running system.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "SemAgent performs particularly well on issues requiring multi-line reasoning and edge-case handling.",
    400       "evidence": "Claimed in the abstract and supported qualitatively by the motivating example (§3), but no systematic breakdown or quantitative evidence of multi-line vs single-line performance is provided.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "SemAgent-Multi is hypothetical",
    407       "detail": "The 51.33% claim (presented prominently in Table 1 and the abstract) comes from a Venn diagram union of different ablation variants (Figure 3), not an actual integrated system. Presenting it alongside real system results is misleading."
    408     },
    409     {
    410       "flag": "External reproduction tests not available to all baselines",
    411       "detail": "SemAgent uses 'pre-loaded Amazon Q Developer reproduction tests' (§5.2) as input, giving it additional signal that other baselines in Table 1 may not have. This confound is not discussed or controlled for."
    412     },
    413     {
    414       "flag": "Issue semantics ablation on tiny sample",
    415       "detail": "Table 3 evaluates issue semantics on only 50 issues (49 random + 1 motivating example), finding a 10 percentage point difference (25 vs 30 resolved). With n=50 and no significance test, this difference could be due to chance."
    416     },
    417     {
    418       "flag": "Single-run results with no uncertainty quantification",
    419       "detail": "All results are from a single run (§6.1) with temperature 0. While temperature 0 reduces variance, the pipeline has multiple stochastic components (reproducer generation, localization rounds, retries) that could produce different results."
    420     },
    421     {
    422       "flag": "No code release for verification",
    423       "detail": "Despite being a systems paper with specific implementation claims, no code is released. The SWE-Bench leaderboard results cannot be independently verified."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    429       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    430       "year": 2024,
    431       "arxiv_id": "2310.06770",
    432       "relevance": "Primary evaluation benchmark for repository-level program repair with LLMs."
    433     },
    434     {
    435       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    436       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    437       "year": 2024,
    438       "arxiv_id": "2405.15793",
    439       "relevance": "Agentic APR system providing LLM-friendly interfaces for code navigation and editing."
    440     },
    441     {
    442       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    443       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    444       "year": 2025,
    445       "arxiv_id": "2407.16741",
    446       "relevance": "Open-source agentic platform for software engineering, key baseline in Table 1."
    447     },
    448     {
    449       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    450       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    451       "year": 2024,
    452       "arxiv_id": "2407.01489",
    453       "relevance": "Workflow-based APR approach guiding agents through repository structure, key baseline."
    454     },
    455     {
    456       "title": "AutoCodeRover: Autonomous Program Improvement",
    457       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    458       "year": 2024,
    459       "arxiv_id": "2404.05427",
    460       "relevance": "Workflow-based APR using program analysis tools for context lookup."
    461     },
    462     {
    463       "title": "SpecRover: Code Intent Extraction via LLMs",
    464       "authors": ["Haifeng Ruan", "Yuntong Zhang", "Abhik Roychoudhury"],
    465       "year": 2024,
    466       "arxiv_id": "2408.02232",
    467       "relevance": "Direct architectural baseline for SemAgent; adds specification generation and patch review to APR."
    468     },
    469     {
    470       "title": "DARS: Dynamic Action Re-Sampling to Enhance Coding Agent Performance by Adaptive Tree Traversal",
    471       "authors": ["Vaibhav Aggarwal", "Ojasv Kamal", "Abhinav Japesh", "Zhijing Jin", "Bernhard Schölkopf"],
    472       "year": 2025,
    473       "arxiv_id": "2503.14269",
    474       "relevance": "Agentic APR system using adaptive tree search, top open-source baseline on SWE-Bench Lite."
    475     },
    476     {
    477       "title": "Evaluating Large Language Models Trained on Code",
    478       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    479       "year": 2021,
    480       "arxiv_id": "2107.03374",
    481       "relevance": "Foundational work on LLM code evaluation (Codex/HumanEval), establishes code generation benchmarks."
    482     },
    483     {
    484       "title": "PatchPilot: A Stable and Cost-Efficient Agentic Patching Framework",
    485       "authors": ["Hongwei Li", "Yuheng Tang", "Shiqi Wang", "Wenbo Guo"],
    486       "year": 2025,
    487       "relevance": "Workflow-based APR with patch refinement, directly related to the semantics-guided repair approach."
    488     },
    489     {
    490       "title": "Executable Code Actions Elicit Better LLM Agents",
    491       "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan"],
    492       "year": 2024,
    493       "arxiv_id": "2402.01030",
    494       "relevance": "CodeAct framework for LLM agent action spaces, used by OpenHands."
    495     },
    496     {
    497       "title": "KGym: A Platform and Dataset to Benchmark Large Language Models on Linux Kernel Crash Resolution",
    498       "authors": ["Alex Mathai", "Chenxi Huang", "Petros Maniatis"],
    499       "year": 2024,
    500       "relevance": "Benchmark for LLM-based crash resolution extending APR to kernel-level bugs."
    501     },
    502     {
    503       "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models",
    504       "authors": ["Andy Zhou", "Kai Yan", "Michal Shlapentokh-Rothman", "Haohan Wang", "Yu-Xiong Wang"],
    505       "year": 2024,
    506       "arxiv_id": "2310.04406",
    507       "relevance": "MCTS-based planning for LLM agents, referenced as potential extension for SemAgent's sampling strategy."
    508     },
    509     {
    510       "title": "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving",
    511       "authors": ["Daoguang Zan", "Zhirong Huang", "Wei Liu"],
    512       "year": 2025,
    513       "arxiv_id": "2504.02605",
    514       "relevance": "Multi-language extension of SWE-Bench concept for evaluating coding agents across programming languages."
    515     }
    516   ],
    517   "engagement_factors": {
    518     "practical_relevance": {
    519       "score": 2,
    520       "justification": "Practitioners building APR systems could adopt the semantic pipeline approach, though no code is released to enable direct use."
    521     },
    522     "surprise_contrarian": {
    523       "score": 1,
    524       "justification": "Confirms the intuition that deeper semantic understanding helps APR — not particularly surprising or contrarian."
    525     },
    526     "fear_safety": {
    527       "score": 0,
    528       "justification": "No safety, security, or risk angle; purely a software engineering tool improvement."
    529     },
    530     "drama_conflict": {
    531       "score": 0,
    532       "justification": "No controversy or dramatic claims; straightforward benchmark improvement paper."
    533     },
    534     "demo_ability": {
    535       "score": 0,
    536       "justification": "No code release, no demo, no tool that practitioners can try."
    537     },
    538     "brand_recognition": {
    539       "score": 1,
    540       "justification": "From Columbia University and uses Anthropic's Claude Sonnet 3.7, but neither the lab nor the tool has major brand recognition."
    541     }
    542   }
    543 }

Impressum · Datenschutz