scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24831B)
      1 {
      2   "paper": {
      3     "title": "BackportBench: A Multilingual Benchmark for Automated Backporting of Patches",
      4     "authors": [
      5       "Zhiqing Zhong",
      6       "Jiaming Huang",
      7       "Pinjia He"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv preprint",
     11     "arxiv_id": "2512.01396"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides a GitHub link: https://github.com/BackportBench/BackportBench. Section 8 (Data Availability) states 'The code and data of BackportBench are available at' this URL."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The benchmark data (202 task instances with Docker environments) is released at the same GitHub repository. Section 8 confirms code and data availability."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Each task instance is provided as a Docker container with all dependencies installed (Section 3.5: 'all BackportBench task instances are released as Docker containers with all dependencies installed')."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The benchmark includes executable Docker environments and test scripts for each task instance. The GitHub repository provides the benchmark suite for reproduction."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars are reported. All results are point estimates (e.g., '70.8% (143/202)') with no uncertainty quantification."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims (e.g., '(M)SWE-agent with Claude Sonnet 4 achieves the highest resolve rate') but provides no statistical significance tests. Comparisons are based solely on raw percentages."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports resolve rates with both percentage and raw counts (e.g., '70.8% (143/202)'), and provides context for comparison such as 'a 15% relative improvement over Mystique.' This provides sufficient magnitude context."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The benchmark contains 202 instances but there is no justification for why this size is sufficient for the claims made. Some sub-categories have very small samples (e.g., 3 namespace-only instances, 1 instance per non-file-matched category) with no discussion of statistical power."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Results appear to be from single runs. No variance, standard deviation, or multiple-run results are reported for any of the evaluated methods."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper includes multiple baselines: (M)SWE-agent, (M)Agentless, Oracle Retrieval, Mystique, and PPatHF. These cover agentic, procedural, oracle, and traditional patch porting methods."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The baselines include GPT-5 (2025-09-07), Claude Sonnet 4 (20250514), Qwen3-Coder, SWE-agent, and Agentless, which are all contemporary and competitive systems at the time of writing."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is performed. The paper does not test variants of the benchmark construction or the evaluation pipeline. The Oracle Retrieval method provides some insight into the impact of file localization but is not a systematic ablation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses resolve rate as the primary metric, with breakdowns by FAIL_TO_PASS and PASS_TO_PASS test outcomes, and further categorizes failures into five types (Generation Failed, Only F2P Failed, Only P2P Failed, Both Failed, Timeout)."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation of system outputs is not applicable here. The benchmark uses execution-based evaluation (test pass/fail), which is more rigorous than human judgment for this task."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is a benchmark construction paper, not a machine learning training paper. The benchmark itself serves as a test set for evaluating methods."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Extensive per-category breakdowns are provided: by programming language (Table 3), by content-level category (Table 4), by file-level category (Table 5), and by failure type (Tables 6, 8)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Failure cases are analyzed in detail in Tables 6 and 8, with breakdowns by failure type. The paper discusses why PPatHF fails (84.8% generation failures due to misaligned paths) and why Mystique fails (74.4% P2P failures due to function-level design)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results are reported: all methods perform poorly on logical/structural changes, all methods fail on 'No Overlap' file-level category, Mystique's backported patches frequently break existing functionality (74.4% P2P failures), and performance drops sharply for Java and JavaScript compared to Python."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims that 'the agentic method has outperformed traditional patch porting methods, especially on cases that require logical and structural changes' and 'the performance varies across different programming languages.' Both are supported by Tables 3, 4, and 7."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes limited causal claims. The suggestion that SWE-agent outperforms due to 'dynamic reflection-and-retry mechanism' (Section 5.1) is an interpretation but is hedged with 'potentially.' Most claims are comparative performance statements supported by controlled experiments on the same benchmark."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper bounds its findings to three programming languages (Python, Java, JavaScript) from three ecosystems (PyPI, Maven, npm). Section 6.2 (External Validity) explicitly discusses generalizability limitations, noting that findings depend on the specific packages used."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 6 discusses multiple alternative explanations: environment mismatch (Section 6.1), data contamination from pre-LLM-cutoff backports (Section 6.1), package-specific maintenance strategies biasing results (Section 6.2), and language-specific LLM performance differences (Section 6.2)."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Exact model versions are specified: 'GPT-5-Chat 2025-09-07', 'Claude-Sonnet-4-20250514', and 'Qwen3-Coder-480B-A35B-Instruct' (Section 4.2, Model Selection)."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper describes prompt adaptations in natural language ('we add the original patches and information from the original codebase... to the prompt') but does not provide the actual prompt text used for any of the methods."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "No hyperparameters are reported for any of the LLM-based methods. Temperature, top-p, max tokens, and other sampling parameters are not mentioned."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "SWE-agent's scaffolding is described as providing tools to interact with the codebase environment. Agentless's procedural pipeline (hierarchical localization, patch generation, test-based selection) is described. However, details rely on references to the original papers rather than full descriptions."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The data construction pipeline is documented in detail across Sections 3.1-3.5, including: data source selection (OSV database), deduplication, attribute filtering (3 criteria), manual validation (619 from 1,527 pairs), categorization, and execution-based validation. Specific filtering counts are provided at each stage."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 6 (Threats to Validity) provides a dedicated discussion with subsections on Internal Validity (6.1) and External Validity (6.2)."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Specific threats are discussed: environment mismatch due to loose dependency constraints, data contamination from pre-cutoff backports, package-specific maintenance strategies biasing results, and programming language coverage. These are specific to this study, not generic boilerplate."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6.2 states specific scope boundaries: the benchmark covers only three programming languages, findings depend on the 12 selected packages, and each package follows its own maintenance strategy which could bias results. The paper also notes the benchmark focuses on vulnerability patches specifically."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The full benchmark data including task instances, Docker environments, and test cases is available at the GitHub repository. The data collection and curation code is also open-sourced (Section 6.1)."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 3 describes the data collection in detail: OSV database as source, the 2024-08-29 data dump, three ecosystems (PyPI, Maven, npm), with specific filtering criteria and counts at each stage (37,284 records → 485 after filtering → 1,527 candidate commit pairs → 619 manually confirmed pairs → 202 final instances)."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants are recruited. The benchmark is constructed from public vulnerability databases and open-source repositories."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The full pipeline is documented in Figure 1 and Sections 3.1-3.5: data sources → deduplication/filtering (37,284→485) → commit combination (1,527 pairs) → manual validation (619 confirmed) → repository selection (12 repos, 51.1% coverage) → execution-based validation (202 final instances)."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or funding."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "All three authors are from The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China. Affiliations are clearly listed."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is a gap."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "While the paper acknowledges contamination risk in Section 6.1 ('many of them were conducted predate the LLM's knowledge cutoff date'), the actual training cutoff dates of GPT-5, Claude Sonnet 4, and Qwen3-Coder are not stated."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Section 6.1 explicitly discusses this: 'due to the scarcity of high-quality backports, many of them were conducted predate the LLM's knowledge cutoff date, which may introduce data contamination into the evaluation results.' They note the open-sourced code facilitates dynamic updates to mitigate this."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 6.1 acknowledges contamination risk and provides a mitigation plan: 'we open-source our data collection and curation code to facilitate dynamic updates for BackportBench.' They also plan automated approaches to identify new instances, reducing contamination over time."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants are involved in the evaluation. The manual annotation during benchmark construction is not a human subjects study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in the evaluation."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in the evaluation."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in the evaluation."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in the evaluation."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in the evaluation."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in the evaluation."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No inference costs or API expenditures are reported for any of the evaluated methods, despite using multiple commercial LLM APIs (GPT-5, Claude Sonnet 4) across 202 instances with three methods."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No computational budget is stated. The paper does not report total API spend, GPU hours, or wall-clock time for the experiments."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "(M)SWE-agent with Claude Sonnet 4 achieves the highest overall resolve rate of 70.8% on BackportBench.",
    290       "evidence": "Table 3 shows (M)SWE-agent with Claude Sonnet 4 resolving 143/202 instances (70.8%), compared to 69.8% for Oracle Retrieval with GPT-5 and 55.4% for (M)Agentless with GPT-5.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "All methods perform significantly worse on Java and JavaScript instances compared to Python instances.",
    295       "evidence": "Table 3 shows (M)SWE-agent with Claude Sonnet 4 achieving 91.1% on Python vs. 46.3% on Java and 43.5% on JavaScript. This pattern holds across all baseline/model combinations.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Agentic methods outperform traditional patch porting methods, especially on cases requiring logical and structural changes.",
    300       "evidence": "Table 7 shows MSWE-agent with Claude Sonnet 4 resolving 38.5% of logical/structural Java instances vs. Mystique at 7.7% and PPatHF at 0.0%. Overall, MSWE-agent achieves 46.3% vs. Mystique at 41.8% on Java instances.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Mystique's function-level backporting design causes it to break original functionality, as evidenced by 74.4% of failures being P2P test failures.",
    305       "evidence": "Table 8 shows Mystique's failure breakdown: 74.4% (29/39) of failures are 'Only P2P Failed,' meaning backported patches pass vulnerability tests but break existing functionality.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Most high-quality backports come from PyPI and Maven ecosystems, with npm having fewer backports despite having more vulnerability records.",
    310       "evidence": "Section 4.1 reports npm has 30.1% of vulnerability records but only 11.5% of 619 high-quality backports, while PyPI has 27.7% of records but 42.7% of backports.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "BackportBench is the first comprehensive benchmark suite for patch backporting.",
    315       "evidence": "Section 2 reviews related work showing existing methods focus on code-hunk or function-level scenarios and use equivalence-based metrics. No prior repository-level, multilingual benchmark with execution-based evaluation is identified.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "BackportBench is a multilingual benchmark of 202 real-world patch backporting problems across Python, Java, and JavaScript with executable Docker environments and test suites. Evaluation of LLM-based methods shows that the agentic approach (SWE-agent) outperforms procedural methods (Agentless) and traditional patch porting methods (Mystique, PPatHF), achieving up to 70.8% resolve rate overall but with significant performance variation across programming languages (91.1% Python vs. 46.3% Java and 43.5% JavaScript). The benchmark reveals that traditional patch porting methods frequently break existing functionality, a problem hidden by prior equivalence-based evaluation metrics.",
    323   "red_flags": [
    324     {
    325       "flag": "No statistical uncertainty quantification",
    326       "detail": "All comparative claims are made on raw percentages from single runs without confidence intervals, significance tests, or multi-run variance. Given some sub-categories have very small samples (e.g., 3 namespace-only instances), observed differences may not be statistically meaningful."
    327     },
    328     {
    329       "flag": "Potential selection bias in repository choice",
    330       "detail": "Only the top 4 repositories per ecosystem by backporting instance count were selected, and those with environment setup failures were replaced. This means the benchmark may over-represent well-maintained packages with good test infrastructure, not representative of the broader ecosystem."
    331     },
    332     {
    333       "flag": "Missing cost reporting",
    334       "detail": "The paper evaluates 3 methods x 3 LLMs x 202 instances (including the commercial GPT-5 and Claude Sonnet 4 APIs) without reporting any API costs or computational budget, making it difficult to assess practical feasibility."
    335     },
    336     {
    337       "flag": "Contamination risk acknowledged but not measured",
    338       "detail": "The paper acknowledges that many backporting instances predate LLM training cutoffs but does not quantify what fraction of instances may be contaminated or test whether contaminated instances have higher resolve rates."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "Swe-bench: Can language models resolve real-world github issues?",
    344       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    345       "year": 2023,
    346       "arxiv_id": "2310.06770",
    347       "relevance": "Foundational benchmark for evaluating LLM agents on repository-level GitHub issue resolution, which BackportBench extends to the patch backporting domain."
    348     },
    349     {
    350       "title": "Swe-agent: Agent-computer interfaces enable automated software engineering",
    351       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    352       "year": 2024,
    353       "relevance": "Key agentic baseline evaluated in the paper, representing the state-of-the-art in autonomous LLM-based software engineering agents."
    354     },
    355     {
    356       "title": "Agentless: Demystifying llm-based software engineering agents",
    357       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    358       "year": 2024,
    359       "arxiv_id": "2407.01489",
    360       "relevance": "Key procedural baseline evaluated in the paper, representing the alternative to agentic approaches for automated code modification."
    361     },
    362     {
    363       "title": "Multi-swe-bench: A multilingual benchmark for issue resolving",
    364       "authors": ["Daoguang Zan", "Zhirong Huang", "Wei Liu"],
    365       "year": 2025,
    366       "arxiv_id": "2504.02605",
    367       "relevance": "Extended SWE-bench to multiple languages; the paper uses MSWE-agent and MAgentless from this work for Java and JavaScript evaluation."
    368     },
    369     {
    370       "title": "Evaluating large language models trained on code",
    371       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    372       "year": 2021,
    373       "arxiv_id": "2107.03374",
    374       "relevance": "Foundational work on LLM code generation evaluation (HumanEval/Codex), establishing benchmarking practices for code-related tasks."
    375     },
    376     {
    377       "title": "Openhands: An open platform for ai software developers as generalist agents",
    378       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    379       "year": 2024,
    380       "arxiv_id": "2407.16741",
    381       "relevance": "Open platform for AI software engineering agents, relevant to the survey's coverage of agentic AI tools."
    382     },
    383     {
    384       "title": "Autocoderover: Autonomous program improvement",
    385       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    386       "year": 2024,
    387       "relevance": "Autonomous agent for program improvement with AST-based search tools, representing agentic approaches to software engineering."
    388     },
    389     {
    390       "title": "Large language model-based agents for software engineering: A survey",
    391       "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen"],
    392       "year": 2024,
    393       "arxiv_id": "2409.02977",
    394       "relevance": "Survey of LLM-based agents for software engineering tasks, providing landscape context for this benchmark paper."
    395     },
    396     {
    397       "title": "Mystique: Automated Vulnerability Patch Porting with Semantic and Syntactic-Enhanced LLM",
    398       "authors": ["Susheng Wu", "Ruisi Wang", "Yiheng Cao", "Bihuan Chen"],
    399       "year": 2025,
    400       "relevance": "State-of-the-art patch porting method evaluated as a baseline, using fine-tuned LLMs with semantic/syntactic signatures."
    401     },
    402     {
    403       "title": "Automated patch backporting in Linux (experience paper)",
    404       "authors": ["Ridwan Shariffdeen", "Xiang Gao", "Gregory J Duck", "Shin Hwei Tan", "Julia Lawall", "Abhik Roychoudhury"],
    405       "year": 2021,
    406       "relevance": "Prior work on automated patch backporting using syntactic transformation rules, representing traditional pre-LLM approaches."
    407     },
    408     {
    409       "title": "A Survey on Code Generation with LLM-based Agents",
    410       "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"],
    411       "year": 2025,
    412       "arxiv_id": "2508.00083",
    413       "relevance": "Recent survey on LLM-based code generation agents, relevant to understanding the broader landscape of agentic coding approaches."
    414     }
    415   ]
    416 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs