scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33769B)
      1 {
      2   "paper": {
      3     "title": "Input Reduction Enhanced LLM-based Program Repair",
      4     "authors": [
      5       "Boyang Yang",
      6       "Luyao Ren",
      7       "Xin Yin",
      8       "Jiadong Ren",
      9       "Haoye Tian",
     10       "Shunfu Jin"
     11     ],
     12     "year": 2025,
     13     "venue": "ICSE'26",
     14     "arxiv_id": "2507.15251",
     15     "doi": "10.48550/arXiv.2507.15251"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [
     19     "experimental_rigor",
     20     "data_leakage"
     21   ],
     22   "methodology_tags": [
     23     "benchmark-eval"
     24   ],
     25   "key_findings": "ReduceFix combines LLM-generated test input reducers with the classical ddmin algorithm to shrink failure-inducing inputs by 89.1% on average, improving LLM-based program repair pass@10 by up to 53.8% on the authors' LFTBench benchmark. Full failing test inputs often hurt repair accuracy compared to no-test baselines, confirming the 'lost-in-the-middle' effect in APR prompts. The approach transfers as a drop-in plugin to ChatRepair (+21.3%) and CREF (+2.6%), and shows improvements on 12 OSS-Fuzz crash instances.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Section 4.4 states 'the full artifact is published at https://github.com/GLEAM-Lab/ReduceFix' with an explicit URL."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "LFTBench and LFTBench-Py are described as part of the published artifact at the GitHub repository. The benchmark is constructed from publicly available AtCoder contest data."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup listing library versions is mentioned in the paper. Only model names and hyperparameters are provided."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper provides a GitHub URL for the artifact but does not include step-by-step reproduction instructions, README commands, or a 'Reproducing Results' section within the paper itself."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results in Tables 3–13 are point estimates (pass@k percentages, compression rates) with no confidence intervals, error bars, or ± notation."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5.3 states 'we leverage MWW tests to further confirm the improvement based on 4 LLMs, returning a two-sided p-value with < 0.05' to test significance of ReduceFix vs Origin Test and ReduceFix vs ddmin-only."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Throughout the paper, results are reported with both absolute differences and relative improvements with baseline context (e.g., 'pass@10 climbs from 20% to 25.5%', '53.8% relative', '21.3% relative gain'). This provides sufficient context for effect size interpretation."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No justification for why 200 bugs across 20 tasks, or 12 OSS-Fuzz instances, were chosen. No power analysis is discussed. The OSS-Fuzz evaluation uses only 12 instances, which is very small."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No standard deviation, IQR, or variance across experimental repetitions is reported. Results are from a single run of each configuration. The pass@k statistic aggregates over sampling but no spread measure across independent runs is given."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple baselines are compared: Baseline (no test input), Origin Test (full failing input), ddmin-only, and pure-LLM reduction. Additionally, ChatRepair and CREF are used as baselines for plug-in evaluation."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "ChatRepair (2024) and CREF (2024) are contemporary APR systems. The evaluated LLMs (DeepSeek-V3, Qwen2.5-Plus) are current models. The ddmin algorithm is a classical but still-relevant baseline for input reduction."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "RQ-3 (Section 5.4) is an ablation study testing five prompt variants (Baseline, Diff Lines, Reduced Test, Origin Test, Reduced + Origin) to isolate the contributions of length reduction vs. information selection. RQ-1 also ablates the reducer components (LLM+ddmin vs ddmin-only vs pure-LLM)."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Multiple metrics are reported: pass@1, pass@5, pass@10 for repair; success rate and compression rate (mean and median) for reduction; and token cost in Table 5."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "All evaluation is automated via test suite pass/fail. No human evaluation of patch quality, correctness, or readability is performed."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Patches are validated against the full official AtCoder hidden test suite, which is not used for any tuning or selection. Section 3.4 states 'each candidate... is executed against the full hidden suite I.' For OSS-Fuzz, Docker-grounded validation is used."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down by difficulty level (C, D, E&F) in Tables 3, 6, 7, 9–11, by input format category in Table 4, and per-project for OSS-Fuzz in Tables 12–13."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 5.2 includes a detailed 'Failed Case Study' analyzing submission 62869553 of task ABC372E, explaining why the reducer failed (renumbering masked the defect) and proposing a fix."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Multiple negative results are reported: Origin Test often hurts performance (e.g., GLM-4-9B-chat drops from 8.5% to 6.5% pass@10); 10/200 reductions failed; CREF improvement is only 2.6% relative; ImageMagick remained unsolved on OSS-Fuzz."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims are supported: 89.1% average compression (Table 3); up to 53.8% relative pass@10 improvement (Table 6: GLM-4-9B-chat rises from 6.5% to 10.0%, a 53.8% relative gain, while Qwen2.5-Coder-7B rises from 19.0% to 25.5%, a 34.2% relative gain); 21.3% ChatRepair improvement (Table 10); 2.6% CREF improvement (Table 11). All are verifiable in the results tables."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Causal claims ('ReduceFix improves repair') are supported by controlled experiments: same bugs, same LLMs, same hyperparameters, varying only the prompt condition. RQ-3 ablation isolates length reduction vs. information content through controlled single-variable manipulation. MWW significance tests confirm the differences."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title claims 'LLM-based Program Repair' broadly, and the abstract says 'a practical and powerful complement to LLM-based APR.' However, evaluation is primarily on competitive programming bugs from AtCoder (200 C++ and 20 Python) and only 12 OSS-Fuzz instances. Competitive programming is a narrow domain; the generalization to real-world software bugs is not bounded."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "RQ-3 explicitly tests the alternative explanation that gains come from length reduction alone (Diff Lines variant) vs. information content, showing both are needed. Section 6 discusses LLM stochasticity as a confound and addresses it via MWW tests and controlled decoding temperature."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures pass@k (patches passing test suites) and frames results as 'repair accuracy.' Pass@k is the standard, directly relevant metric for APR — no proxy gap exists. The metric is formally defined in Section 4.3."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Open-source models are identified by specific names (GLM-4-9B-chat, Qwen2.5-Coder-7B-instruct). However, Qwen2.5-Plus is a closed-weight hosted API service with no snapshot date or API version specified — the paper notes 'the provider does not disclose an exact parameter count.' DeepSeek-V3 is identified by its technical report but no API snapshot date is given."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Listing 1 shows the reducer generation prompt as a template with placeholders ({EXAMPLE_PROBLEM_ID_STR}, {example_problem_title}, etc.) whose fill values are not provided. The repair prompt in Section 3.4 is described only conceptually ('concatenates the task description P, the buggy submission s_w and the reduced input i*') without the actual text."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Table 2 comprehensively lists all hyperparameters: decoding temperature (0.0 for reducer, 0.8 for repair), wall-clock limit (60s), number of samples per bug (10), compilation timeout (10s), execution timeout (5s per test case)."
    165       },
    166       "scaffolding_described": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The three-stage pipeline (reducer generation, input reduction, patch generation) is described in detail through Algorithm 1 (full pseudocode), Figure 2 (overview diagram), and Sections 3.2–3.4 explaining each stage's mechanics including time limits, fallback logic, and truncation handling."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.2 documents the benchmark construction pipeline: contest selection (ABC 361–377), difficulty filter (C–F), input size filter (≥4KB), submission collection method (10 C++ and 1 Python per task, manually collected before July 1 2025), resulting in 200+20 bugs. Input format categorization is given in Table 1."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6 'Threats to Validity' provides substantial discussion organized into internal validity, construct validity, and external validity subsections."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 6 discusses specific threats: LLM stochasticity mitigated by temperature=0 for reducer generation and pass@k sampling for repair; compression ratio reaching 100% hiding variation (addressed by reporting both mean and median); dataset bias mitigated by including all qualifying AtCoder problems without cherry-picking."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Section 6 discusses what the authors DID to mitigate external validity concerns (tested on ChatRepair/CREF, OSS-Fuzz, LFTBench-Py) but does not explicitly state what the results do NOT show — e.g., no explicit acknowledgment that results may not apply to non-competitive-programming domains, languages beyond C++/Python, or bugs without long inputs."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full artifact is published at https://github.com/GLEAM-Lab/ReduceFix (Section 4.4), which should include LFTBench data, reducer scripts, and experimental results for independent verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 4.2 describes the data collection in detail: AtCoder contests 361–377, difficulty C–F, input size ≥4KB, 10 C++ and 1 Python wrong-answer submissions per task manually collected before July 1 2025, with median input size over 1MB."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data is sourced from AtCoder public contest submissions and OSS-Fuzz public crash instances — both standard public sources."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is documented: select ABC 361–377 → filter by test size ≥4KB and difficulty C–F → yields 20 tasks → collect 10 C++ and 1 Python submission per task → 200 C++ and 20 Python bugs. For OSS-Fuzz, the pipeline is: filter ARVO instances to single-file patches with inputs >1KB → select 12 smallest patches from 5 projects."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The Acknowledgments section discloses National Natural Science Foundation of China (Grant 62273292), Central Leading Local Science and Technology Development Project of Hebei Province (Grant 246Z0804G), and Innovation Capability Improvement Plan Project of Hebei Province (22567626H)."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are listed: Yanshan University, Peking University, Zhejiang University, and Aalto University. None of the authors are affiliated with the evaluated model providers (Qwen/Alibaba, DeepSeek, GLM/Zhipu)."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Funding is from Chinese national and provincial science foundations (NSFC, Hebei Province grants) with no apparent financial stake in whether ReduceFix outperforms baselines."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement or financial disclosure is present in the paper. Absence of a disclosure statement does not equal absence of conflicts."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper states LFTBench tasks are 'entirely after the knowledge cut-offs of the 4 LLMs we evaluate' but does not state the actual training cutoff dates for any of the four models. The reader cannot verify the claim without looking up each model's cutoff independently."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Section 4.2 explicitly discusses overlap risk: 'most of those benchmarks were released years ago and are drawn from popular open-source projects that large language models have almost certainly seen.' They chose post-cutoff AtCoder tasks to avoid this. For OSS-Fuzz, they acknowledge 'some OSS-Fuzz repositories may overlap with LLM pretraining corpora' but argue relative comparisons remain valid."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "The primary mitigation is temporal: LFTBench uses AtCoder contests 361–377, stated to be after all evaluated models' training cutoffs. For OSS-Fuzz, they acknowledge potential contamination and note that holding evaluation conditions fixed ensures relative differences remain valid despite potential leakage."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study. It is a benchmark evaluation of automated program repair systems."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. The study uses publicly available code submissions and automated testing."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants involved."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants involved."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants involved."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants involved."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants involved."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 5 reports token consumption and USD costs for reduction ($0.017 for ReduceFix vs $0.632 for pure-LLM on 20 problems). Section 4.1 lists per-token API pricing for Qwen2.5-Plus ($0.11/M input, $0.27/M output) and DeepSeek-V3 ($0.27/M input, $1.11/M output)."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No total computational budget is stated. Table 5 covers only the reduction phase for 20 problems. Total GPU hours for locally-deployed models (GLM-4-9B, Qwen2.5-Coder-7B), total API spend for the full experiment, and hardware specifications are not reported."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single experimental configurations with temperature 0.8 sampling."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "Section 4.3 and Table 2 state '10 candidate patches per bug' for repair inference. Section 3.2 states reducer generation uses greedy decoding (deterministic single run). The sampling protocol is explicitly defined."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Section 4.4 states 'All values were chosen with small pilot runs on tasks outside the benchmark' but does not report how many configurations were tried, what search method was used, or the compute spent on hyperparameter selection."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Section 4.4 states hyperparameters were 'chosen with small pilot runs on tasks outside the benchmark and kept unchanged throughout the study,' avoiding test-set snooping by using out-of-benchmark data for tuning."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "MWW tests are run across multiple LLMs and prompt conditions (at least 8 comparisons) but no Bonferroni, Holm, or other family-wise error rate correction is mentioned."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors compare their ReduceFix system against their own implementations of ddmin-only and pure-LLM baselines without acknowledging or discussing author-evaluation bias. No independent evaluation is performed."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "ReduceFix adds an extra LLM call for reducer generation and a ddmin execution phase, but the total compute cost of the full pipeline is not compared against baselines. Table 5 only covers the reduction phase cost, not the complete pipeline including repair inference."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper motivates why long-input benchmarks are needed (Section 4.2) but does not discuss whether LFTBench's competitive programming bugs are a valid proxy for real-world APR scenarios. The construct validity gap between AtCoder tasks and industrial software bugs is not addressed."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "All model comparisons in Table 6 use the identical ReduceFix pipeline, varying only the model. For ChatRepair and CREF integration (Tables 10–11), only the input is changed (reduced vs original) while keeping all other logic identical, properly isolating the reduction effect."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "The paper specifically uses AtCoder contests 361–377 'entirely after the knowledge cut-offs of the 4 LLMs we evaluate' (Section 4.2) as a temporal leakage prevention measure."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the evaluation setup leaks information not available in real APR usage. For instance, ReduceFix requires a correct reference solution A to drive the reducer's interestingness predicate, which is not available in real-world repair scenarios. This evaluation-setup feature leak is not discussed."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "The 200 bugs comprise 10 submissions per task across 20 tasks, meaning bugs from the same task share the same test inputs, problem description, and likely similar bug patterns. This structural non-independence is not discussed."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": true,
    370         "justification": "Temporal splitting is applied as a concrete leakage prevention method: selecting AtCoder contests after training cutoffs ensures benchmark problems did not appear in training data. This is a prevention method rather than detection, but the schema lists 'temporal splits' as a qualifying method."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "ReduceFix generates syntactically correct reducers for all 200 bugs, with 95% successfully reducing inputs by an average of 89.1%.",
    377       "evidence": "Tables 3 and 4 show 95.0% success rate across 200 bugs. Mean compression rates are 84.5% (C), 97.0% (D), 83.0% (E&F). All 200 reducer scripts passed syntax checks (Section 5.2).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "ReduceFix improves overall pass@10 by up to 53.8% relative to prompts that include the original test input.",
    382       "evidence": "Table 6 shows GLM-4-9B-chat pass@10 rises from 6.5% (Origin Test) to 10.0% (Reduced Test), a 53.8% relative gain. Similar improvements are shown for all four LLMs.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Supplying the full failing test input often hurts repair accuracy compared to the no-test baseline.",
    387       "evidence": "Table 6 shows GLM-4-9B-chat pass@10 drops from 8.5% (Baseline) to 6.5% (Origin Test); DeepSeek-V3 drops from 66.5% to 63.0%. This is consistent across multiple LLMs.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Adding ReduceFix as a plug-in to ChatRepair increases pass@10 by 21.3% relative.",
    392       "evidence": "Table 10 shows ChatRepair overall pass@10 rises from 30.5% to 37.0% (6.5pp absolute, 21.3% relative). Gains hold across all difficulty levels.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Adding ReduceFix to CREF increases pass@10 by 2.6% relative.",
    397       "evidence": "Table 11 shows CREF overall pass@10 rises from 39.0% to 40.0% (1.0pp absolute, 2.6% relative).",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "On OSS-Fuzz, reduced inputs raise pass@10 from 16.7% (Origin Test) to 41.7%.",
    402       "evidence": "Table 13 shows micro-average pass@10 across 12 instances: 16.7% for Origin Test, 41.7% for Reduced Test. However, this is based on only 12 instances across 5 projects.",
    403       "supported": "weak"
    404     },
    405     {
    406       "claim": "The benefit comes from the combination of brevity and complete failure evidence, not length reduction alone.",
    407       "evidence": "Table 9 (RQ-3) shows Reduced Test (25.5% pass@10) outperforms both Diff Lines (20.0%, same length but different info) and Reduced+Origin (19.0%, more info but longer), demonstrating both factors are needed.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "red_flags": [
    412     {
    413       "flag": "Very small OSS-Fuzz evaluation",
    414       "detail": "The OSS-Fuzz evaluation uses only 12 instances across 5 projects, which is too small for reliable generalization to industrial-scale repair scenarios. The paper selects the 12 instances with the smallest patches, which may bias toward easier bugs."
    415     },
    416     {
    417       "flag": "No variance across experimental repetitions",
    418       "detail": "All results are from single experimental runs with stochastic LLMs (temperature 0.8). No standard deviation, IQR, or variance across repetitions is reported. The pass@k computation aggregates sampling noise within a run but does not capture run-to-run variability."
    419     },
    420     {
    421       "flag": "Narrow evaluation domain",
    422       "detail": "220 of the 232 evaluated bugs (200 C++ and 20 Python; the remaining 12 are OSS-Fuzz crash instances) are from AtCoder competitive programming contests — a narrow domain with well-defined input/output specifications, reference solutions, and clear test oracles. Real-world APR scenarios rarely have these properties. The generalizability to industrial software bugs is unclear."
    423     },
    424     {
    425       "flag": "Multiple comparison issue",
    426       "detail": "MWW significance tests are run across multiple LLMs and prompt conditions without correction for multiple comparisons (no Bonferroni, Holm, etc.), inflating the risk of Type I errors."
    427     },
    428     {
    429       "flag": "Reference solution requirement not prominently disclosed",
    430       "detail": "ReduceFix's reducer requires a correct reference solution A to determine if the reduced input still triggers the bug (A(i) ≠ s_w(i), where s_w is the buggy submission). A reference solution is available in competitive programming but rarely in real-world repair scenarios. This strong assumption is not prominently discussed as a limitation."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    436       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    437       "year": 2024,
    438       "relevance": "Foundational conversational LLM-based APR system that ReduceFix extends and evaluates against."
    439     },
    440     {
    441       "title": "Cref: An llm-based conversational software repair framework for programming tutors",
    442       "authors": ["Boyang Yang", "Haoye Tian", "Weiguo Pian", "Haoran Yu", "Haitao Wang", "Jacques Klein", "Tegawendé F Bissyandé", "Shunfu Jin"],
    443       "year": 2024,
    444       "relevance": "Conversational LLM-based APR for tutoring that identified the test-input-hurts-repair phenomenon and is used as a baseline."
    445     },
    446     {
    447       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    448       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    449       "year": 2022,
    450       "relevance": "Pioneering work on zero-shot LLM-based APR that demonstrated LLMs can repair without task-specific training."
    451     },
    452     {
    453       "title": "ThinkRepair: Self-Directed Automated Program Repair",
    454       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang", "Zhenhao Li", "Limin Zeng", "Xiaohu Yang"],
    455       "year": 2024,
    456       "doi": "10.1145/3650212.3680359",
    457       "relevance": "Self-directed LLM APR approach that uses multi-step reasoning for patch generation."
    458     },
    459     {
    460       "title": "Automated repair of programs from large language models",
    461       "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"],
    462       "year": 2023,
    463       "relevance": "Early work on LLM-based program repair demonstrating strong results on standard benchmarks."
    464     },
    465     {
    466       "title": "Agentless: Demystifying llm-based software engineering agents",
    467       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    468       "year": 2024,
    469       "relevance": "LLM-based SE agent framework using SEARCH/REPLACE blocks for repository-level repair, whose format ReduceFix adopts for OSS-Fuzz evaluation."
    470     },
    471     {
    472       "title": "A Survey of LLM-based Automated Program Repair: Taxonomies, Design Paradigms, and Applications",
    473       "authors": ["Boyang Yang", "Zijian Cai", "Fengling Liu", "Bach Le", "Lingming Zhang", "Tegawendé F. Bissyandé", "Yang Liu", "Haoye Tian"],
    474       "year": 2025,
    475       "arxiv_id": "2506.23749",
    476       "relevance": "Comprehensive survey of LLM-based APR providing taxonomy and landscape of the field ReduceFix contributes to."
    477     },
    478     {
    479       "title": "Lost in the middle: How language models use long contexts",
    480       "authors": ["Nelson F Liu", "Kevin Lin", "John Hewitt", "Ashwin Paranjape", "Michele Bevilacqua", "Fabio Petroni", "Percy Liang"],
    481       "year": 2023,
    482       "relevance": "Key empirical finding that LLMs attend less to information in the middle of long contexts, motivating ReduceFix's input reduction approach."
    483     },
    484     {
    485       "title": "Simplifying and Isolating Failure-Inducing Input",
    486       "authors": ["Andreas Zeller", "Ralf Hildebrandt"],
    487       "year": 2002,
    488       "doi": "10.1109/32.988498",
    489       "relevance": "Original ddmin algorithm that forms the algorithmic foundation of ReduceFix's input reduction strategy."
    490     },
    491     {
    492       "title": "ContrastRepair: Enhancing conversation-based automated program repair via contrastive test case pairs",
    493       "authors": ["Jiaolong Kong", "Mingfei Cheng", "Xiaofei Xie", "Shangqing Liu", "Xiaoning Du", "Qi Guo"],
    494       "year": 2024,
    495       "relevance": "Conversation-based APR that uses contrastive test pairs to guide repair, related to ReduceFix's use of test evidence in prompts."
    496     },
    497     {
    498       "title": "Code repair with llms gives an exploration-exploitation tradeoff",
    499       "authors": ["Hao Tang", "Keya Hu", "Jin Zhou", "Si Cheng Zhong", "Wei-Long Zheng", "Xujie Si", "Kevin Ellis"],
    500       "year": 2024,
    501       "relevance": "Analysis of the exploration-exploitation tradeoff in LLM-based code repair, another approach embedding test cases in repair prompts."
    502     },
    503     {
    504       "title": "Arvo: Atlas of reproducible vulnerabilities for open source software",
    505       "authors": ["Xiang Mei", "Pulkit Singh Singaria", "Jordi Del Castillo", "Haoran Xi", "Tiffany Bao", "Ruoyu Wang", "Yan Shoshitaishvili", "Adam Doupé", "Hammond Pearce", "Brendan Dolan-Gavitt"],
    506       "year": 2024,
    507       "relevance": "Provides the OSS-Fuzz evaluation infrastructure (Docker images, ground-truth patches) used for ReduceFix's repository-level evaluation."
    508     },
    509     {
    510       "title": "MORepair: Teaching LLMs to Repair Code via Multi-Objective Fine-Tuning",
    511       "authors": ["Boyang Yang", "Haoye Tian", "Jiadong Ren", "Hongyu Zhang", "Jacques Klein", "Tegawendé F. Bissyandé", "Claire Le Goues", "Shunfu Jin"],
    512       "year": 2025,
    513       "relevance": "Multi-objective fine-tuning approach for LLM-based code repair, representing the fine-tuning-based APR paradigm."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 2,
    519       "justification": "ReduceFix is a practical tool with released code that can be plugged into existing APR pipelines, though it requires competitive-programming-style reference solutions to work."
    520     },
    521     "surprise_contrarian": {
    522       "score": 1,
    523       "justification": "The finding that including full test inputs can hurt LLM repair accuracy is somewhat counter-intuitive but aligns with known lost-in-the-middle effects."
    524     },
    525     "fear_safety": {
    526       "score": 0,
    527       "justification": "No AI safety or security implications; this is a program repair technique."
    528     },
    529     "drama_conflict": {
    530       "score": 0,
    531       "justification": "No controversy or conflict angle; straightforward benchmark evaluation paper."
    532     },
    533     "demo_ability": {
    534       "score": 2,
    535       "justification": "Code is released on GitHub (GLEAM-Lab/ReduceFix) and the benchmark is published, though setting up the evaluation requires AtCoder infrastructure."
    536     },
    537     "brand_recognition": {
    538       "score": 0,
    539       "justification": "Authors are from Yanshan University, Peking University, Zhejiang University, and Aalto University — respected institutions but not famous AI labs."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs