scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31580B)
      1 {
      2   "paper": {
      3     "title": "Seeing is Fixing: Cross-Modal Reasoning with Multimodal LLMs for Visual Software Issue Fixing",
      4     "authors": [
      5       "Kai Huang",
      6       "Jian Zhang",
      7       "Xiaofei Xie",
      8       "Chunyang Chen"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2506.16136",
     13     "doi": "10.48550/arXiv.2506.16136"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "key_findings": "GUIRepair introduces cross-modal reasoning (Image2Code and Code2Image) for multimodal software bug fixing, resolving 157 of 517 SWE-bench M test instances with GPT-4o, outperforming all open-source baselines by at least 26 instances. The ablation study shows that, relative to the base agentless workflow (136 resolved), Image2Code alone yields a 7.35% improvement (+10 instances), Code2Image alone 8.82% (+12), and the full pipeline 15.44% (+21). Using o4-mini, GUIRepair resolves 175 instances, surpassing the best commercial system (Globant, 153) by 22 instances.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper states 'We have released our code and experimental data [27]' with a link to https://sites.google.com/view/guirepair as the project page."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses the publicly available SWE-bench M benchmark and states experimental data is released at [27]. SWE-bench M is a standard public benchmark."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions using fnm for Node.js version control, npm/pnpm/yarn, and Playwright, and specifies model names (gpt-4o-2024-08-06, text-embedding-3-small), but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. The Code2Image component explicitly requires manual environment setup per repository: 'we follow the procedure used in the SWE-bench M study [22] and perform this process manually.'"
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No confidence intervals or error bars are reported. All results are point estimates (e.g., '157 resolved', '30.37%') with no uncertainty quantification."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are used. Claims of superiority (e.g., 'solves 4 more task instances', '26 more') are based on raw count comparisons without any statistical testing."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Effect sizes are reported with baseline context: '15.44% improvement' (21 more instances over 136 base), '7.35% improvement' for Image2Code (10 more), '8.82% improvement' for Code2Image (12 more). Tables I and III provide both absolute and relative improvements."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification for sample size. The benchmark size (517 test, 102 dev) is inherited from SWE-bench M without discussion of statistical power or adequacy."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance or standard deviation is reported. Results appear to be from a single run of the pipeline. No mention of multiple runs or spread measures."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table I compares GUIRepair against 13 baseline systems, including SWE-agent, Agentless, Agentless Lite, RAG, Computer-Use Agents, and the commercial systems Globant and Zencoder."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines are selected from 'the top representative techniques on the official SWE-bench M leaderboard as of May 2025 (around the paper submission time).' These represent state-of-the-art systems."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table III presents a thorough ablation with four variants: GUIRepairbase (no components), GUIRepairI2C (Image2Code only), GUIRepairC2I (Code2Image only), and GUIRepairfull (both). Each component's individual and combined contributions are quantified."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Only one performance metric is used: resolved count / %Resolved (equivalent to Pass@1). While average cost is also reported, no alternative quality metrics (e.g., Pass@K for different K, partial resolution, localization accuracy) are provided."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation of the system's outputs. Evaluation is entirely automated via the SWE-bench M evaluation platform (test suite pass/fail)."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "SWE-bench M is divided into test (517 instances) and dev (102 instances). Main results use the test split; the dev split is used only for the generalizability study in RQ3."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table II provides per-repository breakdowns across 12 repositories (next, bpmn-js, carbon, eslint, etc.) for GUIRepair and top baselines."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "The paper presents case studies of successes (eslint-15243, next-4182, prism-1602) but does not systematically analyze failure cases or discuss where GUIRepair breaks down. scratch-gui has 0% resolution for all systems but this is not analyzed."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "Every experiment shows improvement. No configurations or approaches that were tried and failed are reported. All variants show monotonic improvement from base to full."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims are supported: '157 instances' with GPT-4o matches Table I; 'outperforming the best open-source baseline by 26 instances' matches Agentless Lite (131); '175 instances' with o4-mini matches Table V; 'outperforming the top commercial system by 22 instances' matches Globant (153)."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims about component contributions are supported by controlled ablation: Image2Code and Code2Image are individually added to the base workflow (Table III), demonstrating each component's independent and combined effect. The ablation design is a controlled single-variable manipulation."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Claims are generally bounded to SWE-bench M. The paper explicitly evaluates on 'SWE-bench M test' and 'SWE-bench M dev.' The title references 'Visual Software Issue Fixing' which matches the benchmark scope of visual front-end JavaScript issues."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Section VI discusses data leakage and repo configuration as threats but does not discuss alternative explanations for the observed improvements. For instance, whether the gains come from simply providing more context to the model (more tokens from documents and reproduction code) rather than the specific cross-modal design is not addressed."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures 'resolved instances' on SWE-bench M (Pass@1) and frames claims in terms of multimodal issue resolution effectiveness. The measurement and the claim are at the same granularity — no proxy gap exists."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-D specifies 'gpt-4o-2024-08-06' as the chat model and 'text-embedding-3-small' as the embedding model. GPT-4.1 and o4-mini are named in Table V but without snapshot dates. The main experiments use a versioned model."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Prompts are described in natural language (e.g., 'we prompt the chat model to analyze the issue report and identify potentially relevant documents') but no actual prompt text is provided in the paper or appendix."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section IV-D reports detailed hyperparameters: temperature (0 or 1), sampling times (1, 2, or 39), Top-N values (4, 6), chunk size (512), chunk overlap (0), context window (500 lines), maximum candidate files (4), and decoding strategy (greedy vs. sampling)."
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The full pipeline is described in detail across Section III: Knowledge Mining (document retrieval), Repro Generation, File Localization (chat + embedding), Hunk Localization, Patch Generation (Search/Replace), GUI Rendering, and Patch Selection. Figure 4 provides a comprehensive workflow diagram."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The paper describes how repository structures are extracted for localization (skeleton format with imports preserved, variable declarations removed), how documents are collected and merged from chat model and embedding model retrieval, and how patches are converted to unified diff format."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section VI 'Threats to Validity' discusses two specific threats: data leakage and repo configuration (manual environment setup)."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section VI discusses specific threats: (1) data leakage risk with GPT-4o on SWE-bench M, mitigated by comparing against same-model baselines; (2) manual dependency installation and environment deployment, noted as 'an inefficient process' with plans to use automated techniques and Docker images."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not explicitly state what the results do NOT show. There is no statement bounding results to JavaScript front-end libraries only, no acknowledgment that the approach may not generalize to non-visual issues or other programming languages, and no explicit scope exclusions."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper states 'We have released our code and experimental data [27]' with a URL to the project page. SWE-bench M itself is publicly available."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "SWE-bench M's composition is described: 619 task instances from 17 JavaScript libraries, divided into test (517) and dev (102). The benchmark source and structure are clearly documented."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. The study uses a standard benchmark (SWE-bench M) with no recruitment involved."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The full pipeline from input (issue reports + codebase) to output (submitted patch) is documented across Sections III and IV-D, including knowledge mining, repro generation, localization, patch generation, GUI rendering, and patch selection."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section mentioning grants or sponsors."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Technical University of Munich, Nanyang Technological University, and Singapore Management University. The authors are not evaluating a commercial product from their own company."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding is disclosed, so independence of the funder cannot be assessed."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The model's training data cutoff date is not stated for GPT-4o, GPT-4.1, or o4-mini. The paper does not discuss when any model's training data was collected."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Section VI mentions data leakage but only argues 'we keep the same model as baselines' as mitigation. No analysis of whether SWE-bench M issues appeared in GPT-4o's training data is performed."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "SWE-bench M contains issues from public GitHub repositories that likely predate GPT-4o's training cutoff. The paper acknowledges the leakage concern but provides no contamination analysis, canary strings, or temporal split verification."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Table I reports average cost per instance: GUIRepair costs $0.29/instance. Tables III and V also report per-variant costs (e.g., GUIRepairbase $0.08, GUIRepairfull $0.29). Baseline costs are also reported where available."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Only per-instance average costs are reported. Total computational budget (total API spend, GPU hours for environment setup, total wall-clock time) is not stated."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No seed sensitivity analysis is reported. Results appear to be from a single run. While multiple sampling is used within the pipeline (temperature=1, sampling times=2 or 39), no analysis of result stability across seeds is provided."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of independent end-to-end experimental runs is not stated. Internal sampling parameters are specified (sampling times of 1, 2, or 39 for different phases) but these are pipeline parameters, not independent replications."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search is reported. The paper states 'we follow the Agentless experience' for parameter settings but does not report any search budget or how the specific values (Top-6, Top-4, context window 500) were selected."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Configuration choices (Top-N values, temperature settings, sampling times, context window sizes) are adopted from Agentless without justification or comparison against alternative configurations."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors evaluate their own system (GUIRepair) against baselines. While official leaderboard results are used for most baselines, the authors implement their own GPT-4o version of Agentless Lite as a 'complementary baseline.' No acknowledgment of self-comparison bias."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Table III reports both resolved instances and average cost for each variant: GUIRepairbase ($0.08, 136), GUIRepairI2C ($0.10, 146), GUIRepairC2I ($0.27, 148), GUIRepairfull ($0.29, 157). The paper explicitly discusses cost-performance tradeoffs: 'GUIRepairC2I is almost three times more expensive.'"
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether SWE-bench M adequately measures multimodal issue resolution capability. The paper uses the benchmark without questioning its construct validity or comparing against alternative evaluation approaches."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The paper notes 'closed-source systems do not release the base model they use' making direct comparison unfair, but does not address the scaffold confound. GUIRepair uses an agentless pipeline while baselines use various scaffolds (agent-based, commercial). Performance differences could stem from scaffold design rather than the cross-modal components."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether SWE-bench M issues were created before GPT-4o's training cutoff. The temporal relationship between benchmark creation and model training is not analyzed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information that would not be available in real usage. The Image2Code component provides additional context that a real user might not have."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether SWE-bench M instances are independent (e.g., multiple issues from the same repository sharing structural similarities). Table II shows heavy concentration in certain repos (openlayers: 79 instances, carbon: 134)."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No concrete leakage detection method is used. The paper's only mitigation is using the same model as baselines, which does not detect or prevent leakage."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "GUIRepair resolves 157 instances on SWE-bench M test with GPT-4o, outperforming all open-source baselines by at least 26 instances.",
    375       "evidence": "Table I shows GUIRepair at 157 resolved (30.37%) vs. Agentless Lite Claude3.5 at 131 (25.34%) and GPT-4o at 127 (24.56%). Also outperforms commercial Globant (153) and Zencoder (140).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Image2Code and Code2Image together contribute a 15.44% improvement (21 more instances) over the base agentless workflow.",
    380       "evidence": "Table III: GUIRepairfull resolves 157 vs. GUIRepairbase resolves 136, a difference of 21 instances (15.44%). Ablation with four variants isolates each component's contribution.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "With o4-mini, GUIRepair achieves 175 resolved instances, outperforming the best commercial system (Globant, 153) by 22 instances.",
    385       "evidence": "Table V shows o4-mini GUIRepairfull at 175 resolved. Table I shows Globant at 153. However, this is a cross-model comparison (o4-mini vs. unknown Globant model).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Image2Code alone improves resolution by 7.35% (10 instances) through project-specific knowledge mining and reproduction code generation.",
    390       "evidence": "Table III: GUIRepairI2C resolves 146 vs. GUIRepairbase at 136. Case study of eslint-15243 (Figure 9) shows how knowledge mining helps locate the correct bug file.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Code2Image alone improves resolution by 8.82% (12 instances) through visual patch validation.",
    395       "evidence": "Table III: GUIRepairC2I resolves 148 vs. GUIRepairbase at 136. The prism-1602 case study (Figure 7) illustrates visual patch selection.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "GUIRepair generalizes across different base models (GPT-4o, GPT-4.1, o4-mini), with the full pipeline consistently outperforming the base workflow.",
    400       "evidence": "Table V: GUIRepairfull outperforms GUIRepairbase by 21 (GPT-4o), 13 (GPT-4.1), and 15 (o4-mini) instances across all three models.",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "This is the first work to explicitly address issues with visual cues in front-end libraries for automated issue resolution.",
    405       "evidence": "Section I contributions: 'To the best of our knowledge, this is the first work to explicitly address issues with visual cues in front-end libraries.' Section VII acknowledges DesignRepair and Iris but argues they target specific bug types.",
    406       "supported": "weak"
    407     }
    408   ],
    409   "red_flags": [
    410     {
    411       "flag": "Manual steps undermine reproducibility",
    412       "detail": "The Code2Image component requires manual setup of runtime environments for each JavaScript project: 'we follow the procedure used in the SWE-bench M study and perform this process manually.' This includes manually installing dependencies, building front-end projects, importing JS packages, and sometimes making manual modifications. This makes the approach not fully automated and significantly hampers independent reproduction."
    413     },
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All performance comparisons are based on raw count differences (e.g., '4 more', '26 more') without any statistical tests. On a 517-instance benchmark, a difference of 4 instances between GUIRepair (157) and Globant (153) may not be statistically significant."
    417     },
    418     {
    419       "flag": "Unfair cross-system comparisons",
    420       "detail": "Table I and the abstract compare GUIRepair (GPT-4o) against closed-source commercial systems (Globant, Zencoder) whose base models are unknown. The o4-mini comparison (175 vs. Globant's 153) conflates model capability with approach design."
    421     },
    422     {
    423       "flag": "No variance or reproducibility analysis",
    424       "detail": "Results are reported from what appears to be a single run. LLM-based systems with temperature > 0 can produce substantially different results across runs. The patch generation phase uses temperature=1 with 39 sampling rounds, making the output non-deterministic, yet no stability analysis is provided."
    425     },
    426     {
    427       "flag": "Weak data leakage mitigation",
    428       "detail": "The only contamination mitigation is 'we keep the same model as baselines.' This does not address whether GPT-4o has seen SWE-bench M solutions in its training data. No temporal analysis, canary strings, or decontamination methods are used."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "SWE-bench: Can language models resolve real-world github issues?",
    434       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. R. Narasimhan"],
    435       "year": 2024,
    436       "relevance": "Foundational benchmark for evaluating LLMs on real-world GitHub issue resolution, which SWE-bench M extends to multimodal settings."
    437     },
    438     {
    439       "title": "SWE-bench multimodal: Do ai systems generalize to visual software domains?",
    440       "authors": ["J. Yang", "C. E. Jimenez", "A. L. Zhang", "K. Lieret", "J. Yang", "X. Wu", "O. Press", "N. Muennighoff", "G. Synnaeve", "K. R. Narasimhan"],
    441       "year": 2025,
    442       "relevance": "The multimodal benchmark (SWE-bench M) used as the primary evaluation in this paper, extending SWE-bench to visual JavaScript software issues."
    443     },
    444     {
    445       "title": "Swe-agent: Agent-computer interfaces enable automated software engineering",
    446       "authors": ["J. Yang", "C. Jimenez", "A. Wettig", "K. Lieret", "S. Yao", "K. Narasimhan", "O. Press"],
    447       "year": 2024,
    448       "relevance": "Major agent-based baseline for SWE-bench; demonstrates the agent-computer interface paradigm for automated SE."
    449     },
    450     {
    451       "title": "Demystifying llm-based software engineering agents",
    452       "authors": ["C. S. Xia", "Y. Deng", "S. Dunn", "L. Zhang"],
    453       "year": 2025,
    454       "relevance": "Describes the Agentless approach to LLM-based SE, which GUIRepair extends with cross-modal components."
    455     },
    456     {
    457       "title": "Agentless lite: Rag-based swe-bench software engineering scaffold",
    458       "authors": ["C. S. Xia", "Y. Deng", "S. Dunn", "L. Zhang"],
    459       "year": 2025,
    460       "relevance": "Best open-source baseline on SWE-bench M; GUIRepair with GPT-4o (157 resolved) outperforms its strongest variant (Claude 3.5, 131) by 26 instances and its GPT-4o variant (127) by 30."
    461     },
    462     {
    463       "title": "Repairagent: An autonomous, llm-based agent for program repair",
    464       "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"],
    465       "year": 2025,
    466       "relevance": "LLM-based autonomous agent for program repair, representing the agent-based paradigm that GUIRepair competes with."
    467     },
    468     {
    469       "title": "Autocoderover: Autonomous program improvement",
    470       "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"],
    471       "year": 2024,
    472       "relevance": "Autonomous SWE system using specification inference for program improvement on SWE-bench."
    473     },
    474     {
    475       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using chatgpt",
    476       "authors": ["C. S. Xia", "L. Zhang"],
    477       "year": 2024,
    478       "relevance": "Conversation-based LLM approach to automated program repair, demonstrating cost-effective APR with ChatGPT."
    479     },
    480     {
    481       "title": "Programming with pixels: Computer-use meets software engineering",
    482       "authors": ["P. Aggarwal", "S. Welleck"],
    483       "year": 2025,
    484       "arxiv_id": "2502.18525",
    485       "relevance": "Computer-use agent approach to software engineering that directly uses screen pixels, a relevant baseline for multimodal SE."
    486     },
    487     {
    488       "title": "OpenHands: An open platform for ai software developers as generalist agents",
    489       "authors": ["X. Wang", "B. Li", "Y. Song", "F. F. Xu", "X. Tang", "M. Zhuge"],
    490       "year": 2024,
    491       "relevance": "Open platform for AI-based software development agents, representing the broader agentic coding ecosystem."
    492     },
    493     {
    494       "title": "Agentic bug reproduction for effective automated program repair at google",
    495       "authors": ["R. Cheng", "M. Tufano", "J. Cito", "J. Cambronero", "P. Rondon", "R. Wei", "A. Sun", "S. Chandra"],
    496       "year": 2025,
    497       "arxiv_id": "2502.01821",
    498       "relevance": "Bug reproduction as a key step in automated repair at industrial scale, directly related to GUIRepair's reproduction generation component."
    499     },
    500     {
    501       "title": "SWE-RL: Advancing llm reasoning via reinforcement learning on open software evolution",
    502       "authors": ["Y. Wei", "O. Duchenne", "J. Copet", "Q. Carbonneaux", "L. Zhang", "D. Fried", "G. Synnaeve", "R. Singh", "S. I. Wang"],
    503       "year": 2025,
    504       "arxiv_id": "2502.18449",
    505       "relevance": "RL-based approach to improving LLM reasoning for software engineering tasks, relevant to understanding model capability improvements."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "Addresses a real problem (visual bug fixing in front-end libraries) but requires manual environment setup per project, limiting immediate practical adoption."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "Cross-modal reasoning for APR is a novel framing, but the finding that understanding visual context helps fix visual bugs is expected rather than surprising."
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No AI safety or security concerns; the work is about automated bug fixing in front-end libraries."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "No controversial claims or challenges to existing systems; straightforward benchmark improvement paper."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "Code is claimed to be released at a project page, but significant manual setup is required per repository and the approach depends on commercial API access (GPT-4o)."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "Uses GPT-4o (OpenAI) and targets SWE-bench M (a known benchmark), but the research group (TUM/NTU/SMU) is not widely recognized in mainstream AI discourse."
    532     }
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs