ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28911B)


      1 {
      2   "paper": {
      3     "title": "SVRepair: Structured Visual Reasoning for Automated Program Repair",
      4     "authors": [
      5       "Xiaoxuan Tang",
      6       "Jincheng Wang",
      7       "Liwei Luo",
      8       "Jingxuan Xu",
      9       "Sheng Zhou",
     10       "Dajun Chen",
     11       "Wei Jiang",
     12       "Yong Li"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv",
     16     "arxiv_id": "2602.06090"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "SVRepair proposes a multimodal APR framework that transforms visual artifacts (screenshots, control-flow graphs) into semantic scene graphs via a fine-tuned vision-language model (SVR), then drives a coding agent for fault localization and patch generation. It achieves 36.47% on SWE-Bench M (vs 35.98% for prior SOTA GUIRepair), 38.02% on MMCode, and 95.12% on CodeVision. Ablation shows that the structured visual representation contributes most of the improvement over raw vision input, while the iterative sub-artifact segmentation strategy only helps on the complex real-world SWE-Bench M benchmark.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "GitHub repository (https://github.com/codefuse-ai/CodeFuse-SVR) and HuggingFace model (http://huggingface.co/codefuse-ai/CodeFuse-SVR-8B) are provided on the first page of the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Evaluation uses standard public benchmarks (SWE-Bench M, MMCode, CodeVision). Training data uses WebSight (public) and public GitHub repositories. Model weights are released on HuggingFace."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions '8 NVIDIA H20 96GB GPUs' for training but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. No README commands or 'Reproducing Results' section described."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results are reported as point estimates (e.g., 36.47%, 38.02%, 95.12%) with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Claims of outperformance (e.g., 36.47% vs 35.98%) are based on raw number comparisons with no statistical significance tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Tables 1 and 2 report absolute Pass@1 percentages for all methods, providing baseline context for understanding the magnitude of improvements (e.g., SVRepair 36.47% vs GUIRepair 35.98% on SWE-Bench M)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is given for the benchmark sizes (SWE-Bench M has 617 instances, MMCode 3,548 questions), and no power analysis is performed. Benchmark sizes are taken as given."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "All results appear to be single-run numbers with no standard deviation, variance, or spread measures reported across runs."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Tables 1 and 2 include multiple baselines: RAG, SWE-Agent, Agentless Lite, OpenHands-Versa, GUIRepair (Table 1), and GPT-4o, Claude 3.5 Sonnet, Claude 4.0, Qwen3-VL-235B (Table 2)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include recent systems: GUIRepair (2025), OpenHands-Versa with Claude-Sonnet 4, Claude 4.0, and GPT-o3 are all contemporary models."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 3 presents a systematic ablation study with four configurations (V1-V4) isolating the contributions of vision input, SVR model, and sub-artifact feedback across all three benchmarks."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The main APR evaluation uses only Pass@1 (%Resolved) across all three benchmarks. The SVR model evaluation uses Rendering Accuracy and SSIM (Table 4), but these evaluate a sub-component, not the main system."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation is entirely automated via test suite pass/fail. No human evaluation of patch quality, readability, or correctness beyond test passing."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
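    101         "justification": "SVR is trained on WebSight and GitHub CFGs, while evaluation is on separate standard benchmarks (SWE-Bench M, MMCode, CodeVision). The training sources are distinct from the test benchmarks, although the paper does not analyze whether the training repositories overlap with the benchmark sources."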
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "Only aggregate Pass@1 numbers per benchmark are reported. No per-repository, per-difficulty, or per-task-type breakdowns within any benchmark."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The case study (Figure 3, Section 4.5) shows where the traditional method fails and SVRepair succeeds, but does not show or analyze cases where SVRepair itself fails."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4.4 honestly reports that sub-artifact feedback (V3→V4) provides no improvement on MMCode and CodeVision, with a substantive explanation that these benchmarks lack visual noise since images are manually generated."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of 36.47% on SWE-Bench M, 38.02% on MMCode, and 95.12% on CodeVision are all supported by Tables 1 and 2. 'State-of-the-art' claim is supported by comparison tables."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The ablation study (Table 3) uses controlled single-variable manipulation (V1→V2→V3→V4) to isolate the causal contribution of each component. The design is adequate for the causal claims made."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims 'Automated Program Repair' broadly, but SWE-Bench M covers only 17 JavaScript repositories, MMCode and CodeVision are code generation tasks adapted as APR, and the SVR model is trained only on HTML and control-flow graphs. The paper doesn't adequately bound its generalization claims."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No discussion of alternative explanations. The improvement over GUIRepair could be due to the different base model combination (SVR-8B+GPT-o3 vs GPT-o3 alone), or to differences in sampling strategy (greedy vs 40 candidates). These confounds are not addressed."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures Pass@1 (whether the top patch passes tests) and frames this as APR effectiveness. The measurement directly matches the claim — fixing bugs as verified by test suites."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Models are referenced by marketing names only: 'GPT-o3', 'GPT-4o', 'Claude-3.5 Sonnet', 'Claude 4.0', 'Claude-Sonnet 4', 'Qwen3-VL-8B', 'Qwen3-VL-235B'. No snapshot dates or API versions are provided."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Appendix A provides the full coding agent system prompt with a concrete input example. Appendix B provides the artifact segmentation prompt template with placeholders (problem_statement, code_snips) and structured output format."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "SVR training reports learning rate (1e-5) and conflicting epoch counts ('three epochs' in one sentence, '2 epochs' in the next). Inference uses 'greedy decoding' but no max tokens, top-p, or other generation parameters are stated for the coding LLM."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 3.2 describes the agent scaffold in detail: Docker environment, tool suite (grep, glob, read_file, write_file, edit_file, bash), cyclic Localization→Generation→Validation workflow, and the iterative visual segmentation feedback loop (Section 3.3)."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.1 describes data collection and transformation: WebSight HTML parsed into DOM→SSG nodes, GitHub repos filtered by popularity/issue count, functions >20 LOC extracted, CFGs built by staticfg and mapped to SSG."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6 'Limitation' is a dedicated section discussing scope, computational overhead, and Docker-based reproduction challenges."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 6 identifies specific limitations: scope centered on HTML renderings and control-flow graphs, computational overhead from iterative segmentation (mitigated by k=2 threshold), and OS-specific dependency issues with Docker reproduction."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 6 explicitly states: 'its scope is currently centered on HTML renderings and control-flow graphs' and acknowledges that extending to 'other artifacts like sequence diagrams' requires 'further domain-specific fine-tuning.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The SVR training data (processed SSGs from 37 GitHub repos) is not released. Only the model weights are on HuggingFace. Evaluation benchmarks are publicly available but the authors' own training data pipeline output is not."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.1 describes collection: 2M examples from WebSight, 37 high-rating GitHub repos selected by popularity and issue count, function extraction (>20 LOC), CFG construction via staticfg."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Evaluation uses standard public benchmarks (SWE-Bench M, MMCode, CodeVision)."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The pipeline stages are described (HTML→DOM→SSG, code→CFG→SSG) but no counts of examples at each stage, no filtering statistics, and no explanation of how many of the 37 repos' functions survived the >20 LOC threshold."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding, grants, or acknowledgments section is present in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly stated: Ant Group (7 authors) and Zhejiang University (1 author)."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "All primary authors are from Ant Group, which develops the CodeFuse product line (visible in the GitHub/HuggingFace URLs). They are evaluating their own system, creating a non-independent evaluation. The absence of any funding disclosure makes it impossible to assess independence."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present. The CodeFuse-SVR branding suggests commercial interest in the product."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates stated for any of the models used (GPT-o3, GPT-4o, Claude, Qwen). The SVR model's training data temporal scope is not specified either."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether the coding LLMs (GPT-o3, etc.) may have seen SWE-Bench M, MMCode, or CodeVision problems in their training data."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "SWE-Bench M tasks come from GitHub issues that predate most model training cutoffs. MMCode problems are from programming competitions. No contamination analysis is performed for any benchmark."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time reported. The system calls GPT-o3 iteratively with a feedback loop but does not quantify the cost."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Training hardware is mentioned ('8 NVIDIA H20 96GB GPUs') but total training time, total API spend for benchmark evaluation, and overall computational budget are not stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "All results appear to be single-run. No analysis of sensitivity to random seeds or stochastic variation."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never explicitly stated. Results are presented as single numbers with no indication of how many runs produced them."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No mention of hyperparameter search budget, number of configurations tried, or search methodology."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No explanation of how the final configuration was selected. The paper presents results from one configuration without discussing alternatives tried."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Multiple comparisons are made across baselines and benchmarks without any statistical testing, let alone correction for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Ant Group authors evaluate their own CodeFuse-SVR system against baselines without acknowledging self-evaluation bias. No independent evaluation or discussion of this confound."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "SVRepair uses greedy decoding (1 patch per iteration) while GUIRepair generates up to 40 candidates. This efficiency difference is mentioned but compute budgets are not compared quantitatively."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether the benchmarks validly measure multimodal APR capability. MMCode and CodeVision are code generation benchmarks adapted as APR — this construct validity gap is not addressed."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "SVRepair uses a custom scaffold (Docker environment, specific tools, iterative feedback loop) while baselines use their own scaffolds. Differences in performance could be due to scaffold rather than the SVR model, but this confound is not discussed."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether GPT-o3 or other models' training data includes solutions to SWE-Bench M issues, MMCode problems, or CodeVision examples."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether evaluation setup leaks information not available in real-world APR scenarios."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "SVR is trained on WebSight and 37 GitHub repos. No analysis of whether these overlap with SWE-Bench M's 17 JavaScript repositories or other benchmark sources."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention methods are applied (no canary strings, no membership inference, no decontamination)."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "SVRepair achieves state-of-the-art 36.47% Pass@1 on SWE-Bench M, outperforming GUIRepair (35.98%).",
    373       "evidence": "Table 1 shows SVRepair at 36.47% vs GUIRepair at 35.98%. Margin is 0.49 percentage points. No statistical test performed.",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "SVRepair achieves 38.02% on MMCode and 95.12% on CodeVision, surpassing strong multimodal baselines.",
    378       "evidence": "Table 2 shows SVRepair leading both benchmarks: 38.02% vs Claude 4.0's 37.02% on MMCode, and 95.12% vs GPT-4o's 92.07% on CodeVision.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The SVR structured visual representation significantly improves performance over raw vision input.",
    383       "evidence": "Table 3 ablation: V2→V3 (+SVR) increases MMCode from 16.33% to 38.02% (+21.69pp) and CodeVision from 85.36% to 95.12% (+9.76pp). SWE-Bench M improves from 33.08% to 35.01%.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "The sub-artifact feedback strategy further improves performance on complex real-world issues.",
    388       "evidence": "Table 3: V3→V4 improves SWE-Bench M from 35.01% to 36.47% (+1.46pp). No improvement on MMCode or CodeVision, as honestly reported and explained (these benchmarks lack visual noise).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "SVR-8B achieves rendering accuracy (94.29%) competitive with Qwen3-VL-235B (94.97%) and the highest SSIM (0.7892).",
    393       "evidence": "Table 4 compares SVR-8B against Qwen3-VL-8B and Qwen3-VL-235B on 1,300 pairs of code and control-flow graphs from GitHub repositories.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "SVRepair is more efficient than GUIRepair, using greedy decoding with a lightweight feedback loop vs up to 40 patch candidates.",
    398       "evidence": "Section 4.3 states SVRepair 'follows a greedy decoding strategy within a lightweight feedback loop' while 'GUIRepair relies on multi-sampling (temperature = 1) to produce up to 40 patch candidates.' No quantitative efficiency comparison provided.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Company evaluating own product",
    405       "detail": "All primary authors are from Ant Group, evaluating their own CodeFuse-SVR system. The GitHub and HuggingFace URLs use the 'codefuse-ai' organization. No independent evaluation or acknowledgment of self-evaluation bias."
    406     },
    407     {
    408       "flag": "Marginal improvement without statistical testing",
    409       "detail": "The headline SWE-Bench M result (36.47% vs 35.98% for GUIRepair) is a 0.49pp difference with no error bars, no significance test, and no multi-run variance. This margin could easily be within noise."
    410     },
    411     {
    412       "flag": "No error bars or variance on any result",
    413       "detail": "All experimental results across three benchmarks and the ablation study are single point estimates. Without variance information, it's impossible to assess whether differences are meaningful."
    414     },
    415     {
    416       "flag": "Inconsistent training details",
    417       "detail": "Section 4.1 states both 'three epochs' and '2 epochs' for SVR training within consecutive sentences, undermining confidence in the experimental setup description."
    418     },
    419     {
    420       "flag": "Benchmark adaptation questionable",
    421       "detail": "MMCode and CodeVision are code generation benchmarks adapted as APR tasks by treating 'high-level program requirements as issue descriptions.' This adaptation changes the task semantics, and construct validity of using these as APR benchmarks is not discussed."
    422     },
    423     {
    424       "flag": "Confounded baseline comparison",
    425       "detail": "SVRepair uses SVR-8B+GPT-o3, while baselines use varying models (GPT-4o for RAG/SWE-Agent, Claude-3.5 Sonnet for Agentless Lite). The comparison conflates framework and model differences. Only GUIRepair also uses GPT-o3, making it the only fair direct comparison."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    431       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    432       "year": 2024,
    433       "relevance": "Foundational agentic APR system that designs agent-computer interfaces for solving GitHub issues."
    434     },
    435     {
    436       "title": "SWE-bench: Can language models resolve real-world github issues?",
    437       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    438       "year": 2023,
    439       "arxiv_id": "2310.06770",
    440       "relevance": "Foundational benchmark for evaluating LLM agents on real-world software engineering tasks."
    441     },
    442     {
    443       "title": "SWE-bench multimodal: Do AI systems generalize to visual software domains?",
    444       "authors": ["John Yang", "Carlos E. Jimenez", "Alex L. Zhang"],
    445       "year": 2025,
    446       "relevance": "Multimodal extension of SWE-bench evaluating agents on visual software engineering issues across 17 JavaScript repositories."
    447     },
    448     {
    449       "title": "Seeing is fixing: Cross-modal reasoning with multimodal LLMs for visual software issue fixing",
    450       "authors": ["Kai Huang", "Jian Zhang", "Xiaofei Xie", "Chunyang Chen"],
    451       "year": 2025,
    452       "arxiv_id": "2506.16136",
    453       "relevance": "Prior SOTA multimodal APR system (GUIRepair) that translates visual artifacts into issue reproduction code."
    454     },
    455     {
    456       "title": "Agentless: Demystifying LLM-based software engineering agents",
    457       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    458       "year": 2024,
    459       "arxiv_id": "2407.01489",
    460       "relevance": "Non-agentic approach to LLM-based APR that achieves competitive results without agent scaffolding."
    461     },
    462     {
    463       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    464       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    465       "year": 2024,
    466       "arxiv_id": "2407.16741",
    467       "relevance": "Open-source platform for AI software development agents used as a baseline."
    468     },
    469     {
    470       "title": "Alibaba LingmaAgent: Improving automated issue resolution via comprehensive repository exploration",
    471       "authors": ["Yingwei Ma", "Qingping Yang", "Rongyu Cao"],
    472       "year": 2025,
    473       "relevance": "Agent-based approach to automated issue resolution with comprehensive repository exploration."
    474     },
    475     {
    476       "title": "AutoCodeRover: Autonomous program improvement",
    477       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    478       "year": 2024,
    479       "relevance": "Autonomous agent for program improvement using code search and context gathering."
    480     },
    481     {
    482       "title": "MMCode: Benchmarking multimodal large language models for code generation with visually rich programming problems",
    483       "authors": ["Kaixin Li", "Yuchen Tian", "Qisheng Hu"],
    484       "year": 2024,
    485       "arxiv_id": "2404.09486",
    486       "relevance": "Multimodal code generation benchmark with 3,548 questions and 6,620 images from programming competitions."
    487     },
    488     {
    489       "title": "SWE-Search: Enhancing software agents with Monte Carlo tree search and iterative refinement",
    490       "authors": ["Antonis Antoniades", "Albert Örwall", "Kexun Zhang"],
    491       "year": 2024,
    492       "arxiv_id": "2410.20285",
    493       "relevance": "MCTS-based approach to enhancing software engineering agents for issue resolution."
    494     },
    495     {
    496       "title": "Automated program repair: Emerging trends pose and expose problems for benchmarks",
    497       "authors": ["Joseph Renzullo", "Pemma Reiter", "Westley Weimer", "Stephanie Forrest"],
    498       "year": 2025,
    499       "relevance": "Survey on APR benchmarking challenges, relevant to understanding limitations of current evaluation methodology."
    500     },
    501     {
    502       "title": "Evaluating large language models trained on code",
    503       "authors": ["Mark Chen"],
    504       "year": 2021,
    505       "arxiv_id": "2107.03374",
    506       "relevance": "Foundational work on evaluating code generation models, introduced Pass@k metric used throughout this paper."
    507     }
    508   ],
    509   "engagement_factors": {
    510     "practical_relevance": {
    511       "score": 2,
    512       "justification": "APR tool with released code and model weights on GitHub/HuggingFace, but requires significant infrastructure (Docker, GPT-o3 API access) to use in practice."
    513     },
    514     "surprise_contrarian": {
    515       "score": 1,
    516       "justification": "Using structured scene graphs from visual artifacts for APR is a novel approach, but doesn't challenge any widely-held belief."
    517     },
    518     "fear_safety": {
    519       "score": 0,
    520       "justification": "No AI safety or security implications — this is a tool for fixing bugs, not an attack or risk vector."
    521     },
    522     "drama_conflict": {
    523       "score": 0,
    524       "justification": "No controversy or conflict angle."
    525     },
    526     "demo_ability": {
    527       "score": 2,
    528       "justification": "GitHub code and HuggingFace model are available, but running the full pipeline requires Docker setup, multiple models, and API access."
    529     },
    530     "brand_recognition": {
    531       "score": 1,
    532       "justification": "Ant Group (CodeFuse) is known in the Chinese tech ecosystem but not a top global AI research brand. Uses GPT-o3 which adds some recognition."
    533     }
    534   }
    535 }

Impressum · Datenschutz