ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29196B)


      1 {
      2   "paper": {
      3     "title": "SIADAFIX: Issue Description Response for Adaptive Program Repair",
      4     "authors": ["Xin Cao", "Nan Yu"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2510.16059",
      8     "doi": "10.48550/arXiv.2510.16059"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SIADAFIX proposes an adaptive program repair framework combining fast thinking (issue description optimization and classification) with slow thinking (iterative bug fix agent). The method achieves 60.7% Pass@1 on SWE-bench Lite using Claude-4 Sonnet, solving 182/300 problems. Ablation study shows incremental contribution of each component: basic Bug Fix Agent (47.3%), adding Checker (53.0%), adding Optimizer (52.3%), adding both Checker and Optimizer (57.7%), and the full system with Classifier (60.7%). The system adaptively selects easy, middle, or hard repair modes based on issue description complexity classification.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a GitHub URL: 'Our code is available at https://github.com/liauto-siada/siada-cli' in the abstract and references."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses SWE-bench Lite, a publicly available benchmark dataset. No proprietary data modifications were made to the evaluation dataset."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper. The paper does not specify library versions or dependencies."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper contains no README instructions, commands to run, or a 'Reproducing Results' section."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., '60.7% Pass@1') with no confidence intervals or error bars anywhere in the paper."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims SIADAFIX outperforms baselines (e.g., 'outperforming SWE-agent by 4.0 percentage points') based solely on comparing raw numbers without any statistical significance test."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports both absolute numbers and percentage point differences with baseline context, e.g., 'outperforming SWE-agent by 4.0 percentage points (182 vs. 170 problems solved)' and ablation improvements like '+8.7%' from BFA to BFA+CH on scikit-learn."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why the 300-problem SWE-bench Lite dataset is sufficient for the claims being made. No power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from a single run with no indication of variability across trials."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 2 compares SIADAFIX against four baselines: ExpeRepair-v1.0, Refact.ai Agent, KGCompass, and SWE-agent, all using Claude-4 Sonnet on SWE-bench Lite."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All baselines are recent (2024-2025 work): SWE-agent (2024), ExpeRepair (2025), KGCompass (2025), Refact.ai (2024). These represent current state-of-the-art open-source methods."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 presents a comprehensive ablation study with five configurations: BFA alone (47.3%), BFA+CH (53.0%), BFA+OP (52.3%), BFA+CH+OP (57.7%), and full SIADAFIX (60.7%), broken down by project."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only Pass@1 is used. The paper explicitly states 'This paper adopts Pass@1 as the main evaluation metric' with no secondary metrics such as cost efficiency, time-to-fix, or code quality measures."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of patches is performed. All evaluation is automated through SWE-bench's test suite verification."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "While SWE-bench Lite is used as the test set, there is no explicit mention of a separate validation split for tuning system hyperparameters (e.g., classifier thresholds, mode selection parameters). It is unclear whether any system design decisions were made based on SWE-bench Lite results."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 breaks down results by project (12 projects) for all methods, and Table 3 provides per-project ablation results."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No failure case analysis is provided. The paper does not discuss specific examples where SIADAFIX fails, nor does it analyze why certain problems remain unsolved (e.g., 0/3 on Flask)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Every component in the ablation study shows improvement. No approaches that were tried and abandoned, no configurations that failed, and no negative experimental outcomes are reported."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims '60.67% pass@1 performance' which matches Table 2's 182/300 (60.7%). The claim of 'state-of-the-art levels among all open-source methods' is supported by Table 2 showing SIADAFIX has the highest total among compared methods."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Table 3) uses controlled single-variable manipulation to test causal claims about each component's contribution (e.g., adding Checker improves from 47.3% to 53.0%). This is an adequate ablation design for the causal claims made."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper's title and abstract frame results broadly as 'adaptive program repair' and 'new insights for automated program repair,' but all evaluation is on SWE-bench Lite (Python projects from a limited set of repos). No explicit bounds on generalization are stated."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations for the results are discussed. For instance, the paper does not consider whether improvements come from increased compute budget (more LLM calls in hard mode) rather than architectural innovations."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures Pass@1 on SWE-bench Lite and frames it as 'program repair' capability broadly. No discussion of whether passing tests is a sufficient proxy for correct bug fixes, or whether test-passing patches might be overfitting to tests rather than addressing root causes."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper uses 'Claude-4 Sonnet model' throughout — a marketing name without a specific API version, snapshot date, or model ID. Model behavior changes across versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Appendices B (Issue Description Optimizer), C (Bug Fix Agent), D (Checker), E (Enhanced Checker), and F (Selector) with detailed instructions and templates."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max tokens, or other LLM API parameters are reported anywhere in the paper. The search tool mentions '300 files' as a limit, but core LLM hyperparameters are missing."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The agentic scaffolding is described in detail: Section 3 covers the two-phase workflow (decision + execution), Table 1 lists all tools with their purposes and features, Figure 2 shows the framework overview, and Figure 3 shows the Bug Fix Agent workflow."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The classifier training data is described only vaguely as 'extracted from our internal code generation requirement data' from '500,000 calls' without detailing how issue-like descriptions were extracted, filtered, or labeled. Feature extraction methods are listed in Appendix A but the data pipeline has unexplained steps."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "Section 5 is 'Conclusion and Future Work', which discusses future directions but contains no dedicated limitations discussion. There is no explicit limitations or threats-to-validity section."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed anywhere in the paper. No acknowledgment of specific methodological limitations."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show. Future work sections implicitly acknowledge gaps (multi-modal, domain knowledge) but never explicitly bound the claims."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (patch diffs, agent trajectories, per-problem outcomes) is released. Only aggregate results in tables are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The SWE-bench Lite benchmark is referenced but the classifier training data collection is only partially described: 'over 500,000 calls to LI AUTO's internal code generation tools from April to September 2025' without detailing how issue-like descriptions were selected, labeled for difficulty, or validated."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The evaluation uses a standard public benchmark (SWE-bench Lite)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The classifier training pipeline has undocumented steps: how 500K+ internal calls were filtered to training examples, how difficulty labels were assigned, and what validation was performed on the classifier. The evaluation pipeline (SWE-bench harness) is not described in detail either."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed. No acknowledgments section mentions grants or sponsors. The work is corporate (LI AUTO) but no explicit funding statement is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: both authors are from 'Code Intelligence Team, LI AUTO, Beijing, China.'"
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "The authors work at LI AUTO and evaluate their own product (SIADA-CLI, open-sourced by LI AUTO). Their employer has a direct commercial interest in positive results for their code intelligence tools."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is included. No disclosure of patents, equity, or other financial interests related to SIADA-CLI or the findings."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not state the training data cutoff for Claude-4 Sonnet. SWE-bench Lite problems come from real GitHub issues, and without knowing the model's training cutoff, contamination cannot be assessed."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether SWE-bench Lite problems or their solutions appeared in Claude-4 Sonnet's training data. No overlap analysis is performed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "SWE-bench was published in 2023 and the problems in its Lite subset come from public GitHub repositories. Claude-4 Sonnet may have seen both the benchmark and the original issues/fixes during training. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. It is a benchmark evaluation of an automated program repair tool."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, API cost, tokens consumed, or wall-clock time is reported. The method calls Claude-4 Sonnet multiple times per problem (especially in hard mode with multiple candidates) but no cost information is provided."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget, API spend, or hardware requirements are stated. The classifier training and full SWE-bench evaluation costs are not quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No multi-seed results are reported. All results appear to be from a single run. LLM outputs are non-deterministic, and seed sensitivity is not addressed."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. Results appear to be from a single execution without indication of replication."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The classifier has 20 features (Appendix A) and the system has many configurable parameters (search limits, mode thresholds) but no search methodology or budget is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No explanation of how the final system configuration was selected. The choice of parameters (e.g., search file limit of 300, mode thresholds for easy/middle/hard) appears unjustified."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own system (SIADA-CLI) against baselines without acknowledging potential author-evaluation bias. No independent evaluation or mitigation strategy is discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "SIADAFIX's hard mode uses multiple bug fix agent runs plus a selector, consuming significantly more compute than easy mode or simpler approaches like Agentless. This compute difference is not discussed or controlled for."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether SWE-bench Lite actually measures 'program repair' capability as claimed. The paper notes that SWE-bench Verified has different characteristics than Lite but does not critically examine what Lite measures."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "While all compared methods use Claude-4 Sonnet (controlling for model), the paper does not explicitly discuss the scaffold confound or acknowledge that performance differences are attributable to scaffolding rather than fundamental algorithmic advances."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "SWE-bench Lite problems come from public GitHub issues dating back several years. Claude-4 Sonnet may have seen both the issues and their resolutions during training. This temporal leakage risk is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides information not available in real usage scenarios. The agent has access to the full repository including test files, which may leak solution hints."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training and test data. SWE-bench problems come from popular open-source repositories likely present in Claude's training data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or temporal splits."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SIADAFIX achieves 60.7% Pass@1 on SWE-bench Lite (182/300 problems solved), reaching state-of-the-art among all open-source methods.",
    365       "evidence": "Table 2 shows 182/300 problems solved, compared to ExpeRepair (181), Refact.ai (180), KGCompass (175), and SWE-agent (170). Section 4.2.1.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Each component incrementally improves performance: BFA alone 47.3%; adding the Checker (BFA+CH) 53.0% or the Optimizer (BFA+OP) 52.3%; adding both (BFA+CH+OP) 57.7%; full SIADAFIX 60.7%.",
    370       "evidence": "Table 3 and Figure 1 show the ablation progression across all 12 projects. Section 4.2.2.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "The Checker component provides consistent improvement across most projects, achieving 53.0% overall (+5.7pp over baseline BFA).",
    375       "evidence": "Table 3 shows BFA+CH outperforms BFA on 9 of 12 projects, with notable improvements on django (+4.4pp) and scikit-learn (+8.7pp). Section 4.2.2.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The adaptive Classifier enables selecting appropriate easy, middle, and hard repair modes based on problem complexity.",
    380       "evidence": "Section 3.1.2 describes the classifier design and training on internal data. Table 3 shows BFA+CH+OP+CL (60.7%) outperforms BFA+CH+OP (57.7%). However, no classification accuracy or mode distribution is reported.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "The fast and slow thinking combination effectively balances repair efficiency and accuracy.",
    385       "evidence": "The claim is the central thesis of the paper. Ablation results support component contributions, but no efficiency metrics (cost, time, tokens) are reported to substantiate the 'efficiency' part of the claim. Section 1, Section 5.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Company evaluating its own product",
    392       "detail": "Both authors are from LI AUTO's Code Intelligence Team evaluating their own SIADA-CLI tool. No independent evaluation or acknowledgment of this conflict."
    393     },
    394     {
    395       "flag": "No error bars or multiple runs",
    396       "detail": "Single-run results on a 300-problem benchmark where LLM outputs are non-deterministic. The 1-problem margin over ExpeRepair (182 vs 181) may be within noise."
    397     },
    398     {
    399       "flag": "No statistical significance testing",
    400       "detail": "Claims of 'state-of-the-art' and 'outperforming' baselines are based solely on comparing raw numbers. The margin over ExpeRepair is 0.3pp (1 problem), which is not tested for significance."
    401     },
    402     {
    403       "flag": "No cost reporting despite variable compute usage",
    404       "detail": "Hard mode runs multiple bug fix agents plus a selector, consuming far more compute than easy mode or simpler baselines. No cost comparison is provided despite this being central to practical applicability."
    405     },
    406     {
    407       "flag": "Classifier trained on proprietary data",
    408       "detail": "The RandomForest difficulty classifier was trained on '500,000+ calls to LI AUTO's internal code generation tools' — data that is not released. The classifier weights are claimed to be open-sourced but the training data and process cannot be independently verified or reproduced."
    409     },
    410     {
    411       "flag": "No failure analysis",
    412       "detail": "The paper does not analyze why 118/300 problems remain unsolved, including 0/3 on Flask and 1/5 on xarray. No qualitative error analysis is provided."
    413     },
    414     {
    415       "flag": "No limitations section",
    416       "detail": "The paper has no limitations, threats to validity, or scope boundaries section. All results are presented uncritically with no acknowledgment of methodological weaknesses."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    422       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    423       "year": 2024,
    424       "arxiv_id": "2407.16741",
    425       "relevance": "Open platform for LLM-based software engineering agents, directly relevant as a baseline and framework in the agentic coding space."
    426     },
    427     {
    428       "title": "Trae Agent: An LLM-Based Agent for Software Engineering with Test-Time Scaling",
    429       "authors": ["Pengfei Gao", "Zhao Tian", "Xiangxin Meng"],
    430       "year": 2025,
    431       "arxiv_id": "2507.23370",
    432       "relevance": "LLM-based agent using test-time scaling for software engineering, directly comparable to SIADAFIX's approach of scaling compute for hard problems."
    433     },
    434     {
    435       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    436       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"],
    437       "year": 2024,
    438       "relevance": "Foundational work on agent-computer interfaces for program repair, used as a baseline in this paper's evaluation."
    439     },
    440     {
    441       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    442       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    443       "year": 2023,
    444       "arxiv_id": "2310.06770",
    445       "relevance": "The benchmark dataset used for evaluation in this paper; central benchmark for evaluating automated program repair."
    446     },
    447     {
    448       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    449       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    450       "year": 2024,
    451       "arxiv_id": "2407.01489",
    452       "relevance": "Demonstrates effective program repair without autonomous agents, representing the 'fast thinking' paradigm referenced in this paper."
    453     },
    454     {
    455       "title": "ExpeRepair: Dual-Memory Enhanced LLM-Based Repository-Level Program Repair",
    456       "authors": ["Fangwen Mu", "Junjie Wang", "Lin Shi"],
    457       "year": 2025,
    458       "arxiv_id": "2506.10484",
    459       "relevance": "Memory-enhanced program repair method, used as a key baseline achieving comparable results (60.3% vs 60.7%)."
    460     },
    461     {
    462       "title": "Enhancing Repository-Level Software Repair via Repository-Aware Knowledge Graphs",
    463       "authors": ["Boyang Yang", "Haoye Tian", "Jiadong Ren"],
    464       "year": 2025,
    465       "arxiv_id": "2503.21710",
    466       "relevance": "Knowledge graph approach to program repair (KGCompass), used as a baseline in this paper's evaluation."
    467     },
    468     {
    469       "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories",
    470       "authors": ["Islem Bouzenia", "Michael Pradel"],
    471       "year": 2025,
    472       "arxiv_id": "2506.18824",
    473       "relevance": "Analyzes the thought-action-result patterns of software engineering agents, relevant to understanding agent behavior in program repair."
    474     },
    475     {
    476       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    477       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    478       "year": 2024,
    479       "relevance": "Demonstrates cost-effective conversational program repair, relevant as a fast-thinking baseline and for cost comparison."
    480     },
    481     {
    482       "title": "Impact of Code Language Models on Automated Program Repair",
    483       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    484       "year": 2023,
    485       "relevance": "Studies how code language models affect automated program repair performance, relevant to understanding LLM capabilities in this domain."
    486     },
    487     {
    488       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    489       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    490       "year": 2024,
    491       "relevance": "Comprehensive survey of LLM applications in software engineering, providing broader context for the program repair space."
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 2,
    497       "justification": "Open-sourced CLI tool (SIADA-CLI) for automated program repair that practitioners could integrate into their workflows."
    498     },
    499     "surprise_contrarian": {
    500       "score": 0,
    501       "justification": "Incremental improvement over existing methods with no surprising or contrarian findings; confirms that more components help."
    502     },
    503     "fear_safety": {
    504       "score": 0,
    505       "justification": "No safety, security, or risk implications raised by this work."
    506     },
    507     "drama_conflict": {
    508       "score": 0,
    509       "justification": "No controversy or conflict; straightforward benchmark comparison paper."
    510     },
    511     "demo_ability": {
    512       "score": 2,
    513       "justification": "Code released on GitHub as a CLI tool, potentially installable and runnable, though requires Claude API access."
    514     },
    515     "brand_recognition": {
    516       "score": 1,
    517       "justification": "LI AUTO is a known Chinese EV company but not a major AI research lab; limited brand recognition in the AI research community."
    518     }
    519   }
    520 }

Impressum · Datenschutz