scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35378B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EXPEREPAIR: Dual-Memory Enhanced LLM-based Repository-Level Program Repair",
      6     "authors": [
      7       "Fangwen Mu",
      8       "Junjie Wang",
      9       "Lin Shi",
     10       "Song Wang",
     11       "Shoubin Li",
     12       "Qing Wang"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2506.10484",
     17     "doi": "10.48550/arXiv.2506.10484"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 49.3% pass@1 with Claude 3.7 Sonnet (supported by Figure 3) and 'outperforming all state-of-the-art open-source methods' (supported by Table 1 comparisons). All abstract claims are backed by results.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The ablation study (Table 2) makes causal claims about each component's contribution through controlled single-variable removal. Each variant removes one component from the full system, isolating its effect.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'Repository-Level Program Repair' broadly, but evaluation is only on SWE-Bench Lite (300 Python issues from 12 GitHub projects). No discussion of generalization to other languages, issue types, or project scales.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are discussed. The improvement could be partly due to the additional DeepSeek-R1 reviewer (used for patch selection but not by all baselines), the extra compute from memory retrieval, or the multi-sampling strategy. None of these confounds are addressed.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The primary claims are stated in terms of pass@1 on SWE-Bench Lite, which directly measures issue resolution. The paper does not significantly overframe these results — the main claims stay close to 'resolved X% of issues on this benchmark.'",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 'Limitations' provides a dedicated discussion of the approach's weakness in bug localization, explaining why memory-based localization is difficult and left for future work.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 6 discusses a specific limitation: bug localization lacks an automated oracle, making it hard to selectively accumulate localization experiences. They explain why a conservative strategy would limit memory diversity and coverage.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The limitations section discusses only one technical limitation (bug localization). No explicit statements about what the results do NOT show — no mention of generalization limits to non-Python projects, different issue types, or other benchmarks.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding acknowledgment section is present in the paper. The authors are from Chinese Academy of Sciences, Beihang University, and York University, which implies government/academic funding, but none is disclosed.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: Chinese Academy of Sciences, University of Chinese Academy of Sciences, Beihang University, and York University. Authors are not affiliated with the companies whose models they evaluate (Anthropic, OpenAI, DeepSeek).",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Authors are from academic institutions (Chinese Academy of Sciences, universities). Their implied funders (government/academic) have no financial stake in whether EXPEREPAIR outperforms competing APR methods.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial interests declaration is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Repository-level APR is defined in the introduction; episodic and semantic memory are grounded in cognitive science; pass@1 is a standard benchmark metric. The terms are sufficiently clear from context.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Contributions explicitly stated in introduction: novel APR method using dual memory, dynamic prompt generation mechanism, comprehensive experiments. Paper clearly articulates what it adds.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 engages with RAG approaches, fine-tuning, and agent-based frameworks (SWE-Agent, AutoCodeRover, etc.), explicitly positioning EXPEREPAIR as addressing the limitation that prior approaches treat each issue in isolation.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Code promised with GitHub link provided, but paper itself lacks detailed environment specs, dependency documentation, or step-by-step reproduction instructions needed to reproduce from the paper alone.",
    125         "source": "haiku",
    126         "code_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Section 7 states 'We release our code and data to support further research [10]' and reference [10] provides a GitHub URL: https://github.com/ExpeRepair/ExpeRepair.",
    130           "source": "opus"
    131         },
    132         "data_released": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "The evaluation uses SWE-Bench Lite, a publicly available benchmark (reference [17]). The paper also claims to release data alongside the code at the GitHub repository.",
    136           "source": "opus"
    137         },
    138         "environment_specified": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper. The implementation details (Appendix A.1) describe algorithmic parameters but not the software environment.",
    142           "source": "opus"
    143         },
    144         "reproduction_instructions": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No step-by-step reproduction instructions are included in the paper. The appendix provides implementation details but no commands or scripts to replicate experiments.",
    148           "source": "opus"
    149         }
    150       },
    151       "statistical_methodology": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Critical gaps: no confidence intervals (Table 1 shows bare percentages), no significance tests, no variance/error reporting, no power analysis. Difference between EXPEREPAIR 47.7% and DARS 47.0% presented without statistical testing.",
    155         "source": "haiku",
    156         "confidence_intervals_or_error_bars": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Tables 1 and 2 report only point estimates (e.g., 47.7% resolved) with no confidence intervals, error bars, or uncertainty quantification.",
    160           "source": "opus"
    161         },
    162         "significance_tests": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No statistical significance tests are reported. Claims that EXPEREPAIR 'outperforms' baselines are based solely on comparing point estimate percentages (e.g., 47.7% vs 47.0%).",
    166           "source": "opus"
    167         },
    168         "effect_sizes_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Results are reported as absolute percentages with baseline context (Table 1: EXPEREPAIR 47.7% vs DARS 47.0%, Agentless 40.7%, etc.), and the ablation (Table 2) shows the effect of each component removal (e.g., 47.7% → 41.3% without experience module).",
    172           "source": "opus"
    173         },
    174         "sample_size_justified": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "The evaluation uses SWE-Bench Lite's fixed 300 issues. No justification is provided for why this sample size is adequate, nor is a power analysis discussed.",
    178           "source": "opus"
    179         },
    180         "variance_reported": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be from single runs with no indication of result stability across multiple executions.",
    184           "source": "opus"
    185         }
    186       },
    187       "evaluation_design": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Strong design: 8 contemporary baselines (Table 1), ablation study (Table 2), multiple metrics (pass@1, ESR, RSR, cost), held-out test set, multiple LLMs tested (Figure 3). Limitations: no per-category breakdowns, limited failure case analysis.",
    191         "source": "haiku",
    192         "baselines_included": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Table 1 compares EXPEREPAIR against 8 open-source baselines including SWE-Agent, Agentless, AutoCodeRover, OpenHands, PatchPilot, DARS, and Moatless Tools.",
    196           "source": "opus"
    197         },
    198         "baselines_contemporary": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Baselines include recent methods from 2024-2025: DARS (2025), PatchPilot (2025), OpenHands (2024), and several with the same LLM (Claude 3.5 Sonnet V2).",
    202           "source": "opus"
    203         },
    204         "ablation_study": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Table 2 presents a systematic ablation removing three components: the full experience module, demonstrations only, and insights only, measuring resolved rate, ESR, and RSR for each.",
    208           "source": "opus"
    209         },
    210         "multiple_metrics": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Four metrics are used: % Resolved (pass@1), Average Cost ($), ESR (Execution Success Rate), and RSR (Reproduction Success Rate). The first two appear in Table 1, all four in Table 2.",
    214           "source": "opus"
    215         },
    216         "human_evaluation": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Section 4 states RSR is 'manually verified by human annotators' — human annotators evaluate whether the system's reproduction scripts successfully reproduce the target issue.",
    220           "source": "opus"
    221         },
    222         "held_out_test_set": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Results are reported on SWE-Bench Lite, a standard held-out benchmark of 300 issues. The seed issues used to populate memory are part of the benchmark (evaluated without memory), not a separate tuning set.",
    226           "source": "opus"
    227         },
    228         "per_category_breakdown": {
    229           "applies": true,
    230           "answer": false,
    231           "justification": "No per-project or per-category breakdown is provided. SWE-Bench Lite contains 12 projects but results are reported only as aggregate percentages. The intersection analysis (Figure 2) shows overlap with baselines but not per-category performance.",
    232           "source": "opus"
    233         },
    234         "failure_cases_discussed": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "No error analysis or specific failure cases are discussed. Section 6 (Limitations) discusses the methodological limitation of bug localization but does not analyze specific issues the system failed to resolve.",
    238           "source": "opus"
    239         },
    240         "negative_results_reported": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "No negative results are reported. Every ablation variant shows the full system outperforms reduced versions. No mention of approaches tried and abandoned, or configurations that failed.",
    244           "source": "opus"
    245         }
    246       },
    247       "setup_transparency": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Good transparency: exact model versions specified (Claude 3.5 V2, 3.7), actual prompts shown in Appendix A.2 (Figures 4-8), agent scaffolding described in detail. Hyperparameters mostly specified (top-5 demos, 3 iterations) though temperature value omitted.",
    251         "source": "haiku",
    252         "model_versions_specified": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The paper specifies 'Claude-3.5-Sonnet V2', 'Claude 3.7 Sonnet', 'DeepSeek-R1', and 'o1-mini' — these are versioned model names that identify specific model releases.",
    256           "source": "opus"
    257         },
    258         "prompts_provided": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Figures 4-8 in the appendix provide the actual prompt texts used for reproduction script generation, validation test generation, patch generation, patch refinement, and insight summarization.",
    262           "source": "opus"
    263         },
    264         "hyperparameters_reported": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "Algorithmic parameters are reported (iteration limits of 3, 4 candidate patches, top-5 retrieval, max 15 insights). However, key LLM generation parameters are missing — the paper only says 'high temperature' without specifying the value, and omits top-p and max tokens.",
    268           "source": "opus"
    269         },
    270         "scaffolding_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The agentic scaffolding is described in detail: two-phase workflow (Section 3.1), test agent and patch agent with ReAct-based iteration (Section 3.2), episodic and semantic memory modules with retrieval and update mechanisms (Section 3.3), and a review agent for patch selection.",
    274           "source": "opus"
    275         },
    276         "data_preprocessing_documented": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "The benchmark (SWE-Bench Lite) is used as-is without modification. The memory construction pipeline is documented in Section 3.3 with clear extraction and update procedures. Appendix A.1 describes implementation details.",
    280           "source": "opus"
    281         }
    282       },
    283       "data_integrity": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "Uses standard public benchmark (SWE-Bench Lite) but minimal documentation of data handling: no raw data availability, preprocessing steps, or data pipeline from collection to analysis documented.",
    287         "source": "haiku",
    288         "raw_data_available": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The paper states 'We release our code and data to support further research' with a GitHub repository link. SWE-Bench Lite is also publicly available for independent verification.",
    292           "source": "opus"
    293         },
    294         "data_collection_described": {
    295           "applies": true,
    296           "answer": true,
    297           "justification": "Section 4 describes the benchmark: 'SWE-Bench Lite benchmark consists of 300 GitHub issues drawn from 12 diverse real-world software projects written in Python, each accompanied by an issue report and the corresponding codebase.'",
    298           "source": "opus"
    299         },
    300         "recruitment_methods_described": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "No human participants in this study. The data source is a standard public benchmark (SWE-Bench Lite).",
    304           "source": "opus"
    305         },
    306         "data_pipeline_documented": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "The pipeline is documented in Sections 3.1-3.3 and Appendix A.1: issues are processed through test generation (3 iterations), patch generation (3 iterations, 4 candidates each), patch validation with additional tests, and memory updates after each issue.",
    310           "source": "opus"
    311         }
    312       },
    313       "contamination": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Critical omission: Claude 3.7 (early 2025) is evaluated on SWE-Bench Lite, a benchmark published in 2023 whose underlying GitHub issues are older still. The paper does not discuss model training cutoff, train-test overlap, or benchmark contamination risk, even though the issues were public before the models were trained.",
    317         "source": "haiku",
    318         "training_cutoff_stated": {
    319           "applies": true,
    320           "answer": false,
    321           "justification": "No training data cutoff dates are stated for any of the models used (Claude 3.5 Sonnet V2, Claude 3.7 Sonnet, DeepSeek-R1, o1-mini). This is critical because SWE-Bench Lite issues predate these models.",
    322           "source": "opus"
    323         },
    324         "train_test_overlap_discussed": {
    325           "applies": true,
    326           "answer": false,
    327           "justification": "No discussion of whether SWE-Bench Lite issues or their solutions appeared in the training data of the models. The GitHub issues in SWE-Bench Lite were public before model training.",
    328           "source": "opus"
    329         },
    330         "benchmark_contamination_addressed": {
    331           "applies": true,
    332           "answer": false,
    333           "justification": "SWE-Bench was published in 2023 (reference [17]) and the GitHub issues are older. All models used were trained after this, yet no contamination analysis is performed or discussed.",
    334           "source": "opus"
    335         }
    336       },
    337       "human_studies": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No human subjects. Human annotation used only for RSR verification (whether tests reproduce issues), not for evaluating patch quality or repair appropriateness.",
    341         "source": "haiku",
    342         "pre_registered": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study. Human annotators verified RSR as evaluators, not as study subjects.",
    346           "source": "opus"
    347         },
    348         "irb_or_ethics_approval": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants. The study is a benchmark evaluation of an automated program repair system.",
    352           "source": "opus"
    353         },
    354         "demographics_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants. The study evaluates automated methods on a software benchmark.",
    358           "source": "opus"
    359         },
    360         "inclusion_exclusion_criteria": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "No human participants. Evaluation is on a fixed public benchmark (SWE-Bench Lite).",
    364           "source": "opus"
    365         },
    366         "randomization_described": {
    367           "applies": false,
    368           "answer": false,
    369           "justification": "No human participants and no experimental conditions requiring randomization of subjects.",
    370           "source": "opus"
    371         },
    372         "blinding_described": {
    373           "applies": false,
    374           "answer": false,
    375           "justification": "No human participants. Automated benchmark evaluation does not involve blinding.",
    376           "source": "opus"
    377         },
    378         "attrition_reported": {
    379           "applies": false,
    380           "answer": false,
    381           "justification": "No human participants in this study.",
    382           "source": "opus"
    383         }
    384       },
    385       "cost_and_practicality": {
    386         "applies": true,
    387         "answer": true,
    388         "justification": "Inference cost clearly reported ($2.07/instance for EXPEREPAIR vs $12.24 for DARS). Per-instance cost discussed but total computational budget for the full seed phase not stated.",
    389         "source": "haiku",
    390         "inference_cost_reported": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "Table 1 reports average inference cost per instance: EXPEREPAIR costs $2.07 per issue on average. Costs are also reported for all baselines.",
    394           "source": "opus"
    395         },
    396         "compute_budget_stated": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "Only average per-instance cost is reported ($2.07). Total API spend for all 300 issues, cost of memory construction, and hardware used are not stated.",
    400           "source": "opus"
    401         }
    402       },
    403       "experimental_rigor": {
    404         "seed_sensitivity_reported": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "No multiple random seed results are reported. All results appear to be from a single run of the system. LLM sampling introduces randomness that is not quantified.",
    408           "source": "opus"
    409         },
    410         "number_of_runs_stated": {
    411           "applies": true,
    412           "answer": false,
    413           "justification": "The number of experimental runs is not explicitly stated. Results are presented as single numbers (e.g., 47.7%) without stating how many runs produced them.",
    414           "source": "opus"
    415         },
    416         "hyperparameter_search_budget": {
    417           "applies": true,
    418           "answer": false,
    419           "justification": "No hyperparameter search budget is reported. Values like k=5 for retrieval, max 3 iterations, and 4 candidate patches appear chosen without documented search or justification.",
    420           "source": "opus"
    421         },
    422         "best_config_selection_justified": {
    423           "applies": true,
    424           "answer": false,
    425           "justification": "No justification for how the configuration was selected. Key parameters (iteration limits, number of candidates, retrieval k) are stated without explaining the selection process or whether a validation set was used.",
    426           "source": "opus"
    427         },
    428         "multiple_comparison_correction": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper compares against 8+ baselines using only point estimates.",
    432           "source": "opus"
    433         },
    434         "self_comparison_bias_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "The authors do not acknowledge evaluation bias. Baseline results are taken from official papers/leaderboards (Table 1 note), which avoids re-implementation bias but still involves self-evaluation of their own system.",
    438           "source": "opus"
    439         },
    440         "compute_budget_vs_performance": {
    441           "applies": true,
    442           "answer": true,
    443           "justification": "Table 1 reports both resolved rate and average cost per instance, enabling compute-performance comparison. DARS achieves 47.0% at $12.24 while EXPEREPAIR achieves 47.7% at $2.07.",
    444           "source": "opus"
    445         },
    446         "benchmark_construct_validity": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No discussion of whether SWE-Bench Lite adequately measures repository-level program repair capability. The benchmark is used without questioning its construct validity or comparing against alternative benchmarks.",
    450           "source": "opus"
    451         },
    452         "scaffold_confound_addressed": {
    453           "applies": true,
    454           "answer": false,
    455           "justification": "The main comparison (Table 1) compares different systems each with their own scaffolding. The improvement could be from the scaffold design rather than the dual-memory mechanism. While the ablation isolates memory effects, the cross-system comparison does not control for scaffold differences.",
    456           "source": "opus"
    457         }
    458       },
    459       "data_leakage": {
    460         "temporal_leakage_addressed": {
    461           "applies": true,
    462           "answer": false,
    463           "justification": "No discussion of temporal leakage. SWE-Bench Lite issues were created before the models' training periods, meaning models may have seen the issues and their solutions during training.",
    464           "source": "opus"
    465         },
    466         "feature_leakage_addressed": {
    467           "applies": true,
    468           "answer": false,
    469           "justification": "No discussion of whether the models have been trained on the GitHub repositories underlying SWE-Bench Lite, which would leak feature information about code structure and issue context.",
    470           "source": "opus"
    471         },
    472         "non_independence_addressed": {
    473           "applies": true,
    474           "answer": false,
    475           "justification": "Not addressed. The seed issues used to populate memory are from the same SWE-Bench Lite benchmark as the inference issues, creating within-benchmark information sharing that is not discussed.",
    476           "source": "opus"
    477         },
    478         "leakage_detection_method": {
    479           "applies": true,
    480           "answer": false,
    481           "justification": "No concrete leakage detection or prevention methods are applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines.",
    482           "source": "opus"
    483         }
    484       }
    485     }
    486   },
    487   "claims": [
    488     {
    489       "claim": "EXPEREPAIR achieves 47.7% pass@1 on SWE-Bench Lite with Claude 3.5 Sonnet V2 and 49.3% with Claude 3.7",
    490       "evidence": "Table 1 directly reports these exact percentages; results section confirms",
    491       "supported": "strong"
    492     },
    493     {
    494       "claim": "Dual memory mechanism (episodic + semantic) improves repair performance",
    495       "evidence": "Ablation Table 2: full system 47.7% → without experience module 41.3% (6.4pp drop), without demonstrations 43.7%, without insights 46.0%",
    496       "supported": "strong"
    497     },
    498     {
    499       "claim": "EXPEREPAIR outperforms all state-of-the-art open-source methods",
    500       "evidence": "Table 1 shows EXPEREPAIR 47.7% exceeds DARS (47.0%), PatchPilot (45.3%), OpenHands (41.7%), and all others",
    501       "supported": "strong"
    502     },
    503     {
    504       "claim": "Episodic demonstrations provide more benefit than semantic insights",
    505       "evidence": "Ablation: removing demonstrations 47.7%→43.7% (4.0pp) vs removing insights 47.7%→46.0% (1.7pp)",
    506       "supported": "moderate"
    507     },
    508     {
    509       "claim": "Memory-based retrieval enables handling of complex/atypical issues",
    510       "evidence": "Intersection analysis (Figure 2) shows EXPEREPAIR uniquely resolves 9 issues no baseline solves, attributed to experience-driven design",
    511       "supported": "moderate"
    512     },
    513     {
    514       "claim": "The method works across different LLM models",
    515       "evidence": "Figure 3 tests 4 models: Claude 3.7 (49.3%), Claude 3.5 (47.7%), DeepSeek-R1 (45.0%), o1-mini (41.7%), confirming compatibility",
    516       "supported": "strong"
    517     }
    518   ],
    519   "methodology_tags": [
    520     "benchmark-eval",
    521     "observational"
    522   ],
    523   "key_findings": "EXPEREPAIR achieves state-of-the-art pass@1 of 47.7-49.3% on SWE-Bench Lite by continuously accumulating historical repair experience in two complementary memory systems. The dual-memory mechanism—concrete episodic demonstrations and abstract semantic insights—enables dynamic, context-aware prompt adaptation without requiring expensive fine-tuning. Ablation studies reveal episodic memory (4.0pp improvement) outweighs semantic insights (1.7pp) in importance. The method maintains cost efficiency at $2.07/instance while outperforming prior work including DARS (47.0% at $12.24/instance).",
    524   "red_flags": [
    525     {
    526       "flag": "Training data contamination unaddressed",
    527       "detail": "Claude 3.7 (2025) evaluated on SWE-Bench Lite issues from 2023-2024. No discussion of model training cutoff, potential overlap, or temporal bias despite critical importance for benchmark validity."
    528     },
    529     {
    530       "flag": "No statistical significance testing",
    531       "detail": "Results presented as point estimates without confidence intervals, p-values, or error bars. Marginal improvements (47.7% vs 47.0%) lack statistical validation; differences could be noise."
    532     },
    533     {
    534       "flag": "Bug localization not addressed",
    535       "detail": "In Section 6 the authors acknowledge that bug localization is critical but omit it from memory optimization because its correctness is hard to verify. This is a significant blind spot in the method."
    536     },
    537     {
    538       "flag": "Limited evaluation scope",
    539       "detail": "Single benchmark (SWE-Bench Lite), Python-only, 300 issues from 12 projects. Generalization to other languages, domains, or issue types unknown."
    540     },
    541     {
    542       "flag": "Memory retrieval not analyzed",
    543       "detail": "BM25 used for demonstration retrieval (Appendix A.1) but no analysis of retrieval quality, failure modes, or sensitivity to retrieval parameters."
    544     },
    545     {
    546       "flag": "Cold-start problem acknowledged but unresolved",
    547       "detail": "Performance of the two-phase approach (seed phase, then inference) depends entirely on seed set quality, but the seed set's composition and impact are not analyzed."
    548     },
    549     {
    550       "flag": "Human evaluation limited",
    551       "detail": "Human annotators only verify RSR (whether tests reproduce issues), not patch quality or appropriateness of retrieved demonstrations for reasoning."
    552     }
    553   ],
    554   "cited_papers": [
    555     {
    556       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    557       "relevance": "Foundational benchmark used for evaluation; establishes repository-level APR evaluation methodology"
    558     },
    559     {
    560       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    561       "relevance": "Leading agentic APR baseline; exemplifies agent-based approach that EXPEREPAIR builds upon"
    562     },
    563     {
    564       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    565       "relevance": "Procedural APR baseline providing alternative to agentic methods; demonstrates fixed-pipeline approach"
    566     },
    567     {
    568       "title": "AutoCodeRover: Autonomous Program Improvement",
    569       "relevance": "Agent-based APR competitor; part of baseline comparison set"
    570     },
    571     {
    572       "title": "Dual-Process Theories of Higher Cognition: Advancing the Debate",
    573       "relevance": "Cognitive science foundation for dual-memory systems (episodic vs semantic); motivates design choice"
    574     },
    575     {
    576       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    577       "relevance": "Algorithmic foundation for agent reasoning loops used in repair module"
    578     },
    579     {
    580       "title": "LLMs as Continuous Learners: Improving the Reproduction of Defective Code in Software Issues",
    581       "relevance": "Prior work on incremental learning for APR; related to EXPEREPAIR's accumulation strategy"
    582     }
    583   ],
    584   "engagement_factors": {
    585     "practical_relevance": {
    586       "score": 2,
    587       "justification": "The tool addresses a real practitioner need (automated bug fixing) and code is released, but requires significant LLM API costs and setup to use."
    588     },
    589     "surprise_contrarian": {
    590       "score": 1,
    591       "justification": "The dual-memory idea is interesting but not contrarian — augmenting LLM agents with memory is a well-explored direction."
    592     },
    593     "fear_safety": {
    594       "score": 0,
    595       "justification": "No safety or security concerns raised; this is a program repair tool with no adversarial implications."
    596     },
    597     "drama_conflict": {
    598       "score": 0,
    599       "justification": "No controversial claims or conflicts; straightforward benchmark improvement paper."
    600     },
    601     "demo_ability": {
    602       "score": 2,
    603       "justification": "Code is released on GitHub and the system runs on SWE-Bench Lite, but requires API keys for Claude/DeepSeek models."
    604     },
    605     "brand_recognition": {
    606       "score": 1,
    607       "justification": "Uses Claude 3.5/3.7 Sonnet (Anthropic), which adds some recognition, but the paper comes from an academic lab, not a major AI company."
    608     }
    609   },
    610   "hn_data": {
    611     "threads": [
    612       {
    613         "hn_id": "46728063",
    614         "title": "New York Times games are hard: A computational perspective",
    615         "points": 73,
    616         "comments": 33,
    617         "url": "https://news.ycombinator.com/item?id=46728063"
    618       },
    619       {
    620         "hn_id": "43695562",
    621         "title": "M1: Towards Scalable Test-Time Compute with Mamba Reasoning Models",
    622         "points": 33,
    623         "comments": 3,
    624         "url": "https://news.ycombinator.com/item?id=43695562"
    625       },
    626       {
    627         "hn_id": "44024987",
    628         "title": "Can You Trust Code Copilots? Evaluating LLMs from a Code Security Perspective",
    629         "points": 11,
    630         "comments": 2,
    631         "url": "https://news.ycombinator.com/item?id=44024987"
    632       },
    633       {
    634         "hn_id": "31833716",
    635         "title": "What does it take to solve the measurement problem?",
    636         "points": 5,
    637         "comments": 0,
    638         "url": "https://news.ycombinator.com/item?id=31833716"
    639       },
    640       {
    641         "hn_id": "43116772",
    642         "title": "AI Alignment at Your Discretion",
    643         "points": 3,
    644         "comments": 0,
    645         "url": "https://news.ycombinator.com/item?id=43116772"
    646       },
    647       {
    648         "hn_id": "44276478",
    649         "title": "Getting Explicit Instruction Right",
    650         "points": 2,
    651         "comments": 0,
    652         "url": "https://news.ycombinator.com/item?id=44276478"
    653       },
    654       {
    655         "hn_id": "45284415",
    656         "title": "Is In-Context Learning Learning?",
    657         "points": 2,
    658         "comments": 0,
    659         "url": "https://news.ycombinator.com/item?id=45284415"
    660       },
    661       {
    662         "hn_id": "31840313",
    663         "title": "What does it take to solve the measurement problem?",
    664         "points": 2,
    665         "comments": 0,
    666         "url": "https://news.ycombinator.com/item?id=31840313"
    667       },
    668       {
    669         "hn_id": "46345690",
    670         "title": "Computational complexity of New York Times games",
    671         "points": 1,
    672         "comments": 0,
    673         "url": "https://news.ycombinator.com/item?id=46345690"
    674       },
    675       {
    676         "hn_id": "45467729",
    677         "title": "AegisShield: Democratizing Cyber Threat Modeling with Generative AI",
    678         "points": 1,
    679         "comments": 0,
    680         "url": "https://news.ycombinator.com/item?id=45467729"
    681       }
    682     ],
    683     "top_points": 73,
    684     "total_points": 133,
    685     "total_comments": 38
    686   }
    687 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs