scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31245B)
      1 {
      2   "paper": {
      3     "title": "Tracing Errors, Constructing Fixes: Repository-Level Memory Error Repair via Typestate-Guided Context Retrieval",
      4     "authors": ["Xiao Cheng", "Zhihao Guo", "Huan Huo", "Yulei Sui"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2506.18394",
      8     "doi": "10.48550/arXiv.2506.18394"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "LTFix combines typestate-guided context retrieval with LLMs to repair C memory errors (use-after-free, double-free, memory leaks) at repository scale. It repaired 37 of 49 real-world memory errors across 14 open-source projects (>1M LoC), outperforming SAVER (14.50×) and ProveNFix (2.36×), and fixing 94% more errors than SWE-agent 1.0 while consuming 41× fewer tokens. Ablation studies show that both the error-propagation path and typestate-guided context trace are critical, with the full system achieving 22 correct patches vs 7-15 for ablated variants.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The Data Availability Statement says 'All implementation details and associated data are available to reviewers and will be made publicly available upon acceptance.' A promise of future release counts as NO."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The 49-error benchmark with PoC inputs is described but not released. The Data Availability Statement defers release to paper acceptance."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "§5.1 specifies: Ubuntu 22.04, 24-core 5.60 GHz Intel CPU, 64 GB memory, Valgrind 3.18.1, GDB Python API with GDB 12.1, GNU C Library 2.35, and Claude 3.5 Sonnet as the LLM."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, scripts, or README are provided. The implementation details in §5.1 describe the setup but not how to replicate experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are point estimates (e.g., '37 out of 49', '22 correct patches'). No confidence intervals or error bars are reported despite running experiments 5 times."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims like '14.50× and 2.36× more errors' and '94.7% increase' are based on raw count comparisons. No statistical significance tests are applied to any comparison."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Relative improvements are reported with baseline context throughout: 14.50× and 2.36× more repairs than SAVER/ProveNFix, 94.7% more errors fixed than SWE-agent, 41× fewer tokens, 81.5% vs 58.8% accuracy rates, and percentage improvements in ablation (47% over LTFix-NT, 214% over LTFix-F)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The benchmark contains 49 errors from 14 projects. No justification is given for why this size is sufficient to support the claims, and no power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "§5.1 states 'we conduct each experiment five times for every project' and use a 4/5 consistency criterion, but actual variance, standard deviation, or spread across the 5 runs is never reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Compared against SAVER, ProveNFix (traditional APR tools), SWE-agent 1.0 (LLM-based), and three ablation variants (LTFix-F, LTFix-M, LTFix-NT) in §5.2."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "ProveNFix (2024) and SWE-agent 1.0 (2024) are recent. SAVER (2020) is older but is the seminal work in memory error APR and still represents the state of the art in that specific niche."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "§5.7 presents ablation analysis with three variants: LTFix-F (file-level context), LTFix-M (method-level context), and LTFix-NT (no context trace), isolating the contribution of each component."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics reported: #Δ (total patches), #Δ✓ (correct patches), #Δ✗ (patches introducing new errors), #ΔO (non-harmful incorrect), #E✓ (fixed errors), token consumption, and accuracy rate."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "§5.3 states patches must be 'manually validated to align with the ground truth, adhering to the standards outlined in [42].' Three zero-day fixes were accepted by original project developers."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The same 49-error benchmark is used for both development and evaluation. No separation between development/tuning set and held-out test set is described."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 4 provides per-project breakdown across all 14 projects. Figure 9 shows per-project token consumption. Case studies in §5.5.2 analyze specific error types (complex data structures, pointer aliasing, cyclic allocation)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "§5.5.2 discusses where baselines fail and why. The paper reports 4 incorrect patches and 1 error-introducing patch for LTFix. Multi-threaded programs are excluded as a known limitation. §5.7 discusses cases ablated variants cannot handle."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The ablation study (§5.7) shows degraded performance for all reduced variants. LTFix-F achieves only 7 correct patches and introduces 3 new errors. LTFix itself produces 1 error-introducing patch and 4 incorrect patches (Table 4)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims are supported: 37/49 repairs (Table 4), 14.50× and 2.36× over SAVER/ProveNFix (Table 4), 94% more than SWE-agent at 41× fewer tokens (Figures 8-9), and three zero-day fixes (mentioned in §5.1). The claim about SAVER's dataset (Table 3: 122/153 = 79.7%) is also verified."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims about component contributions are supported by controlled ablation studies (§5.7): removing the context trace (LTFix-NT) reduces correct patches from 22 to 15, and falling back to file-level context (LTFix-F) reduces them to 7. Each ablation modifies a single variable."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract concludes with 'a promising paradigm for repository-level program repair through program analysis-guided, retrieval-augmented LLMs' which generalizes beyond what was tested (only C memory errors of 3 types across 14 projects). The title 'Repository-Level Memory Error Repair' is appropriately scoped, but broader claims in the abstract and conclusion overgeneralize."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "§6 Threats to Validity discusses three specific alternative explanations: (1) benchmark contamination from LLM training data, (2) choice of LLM may affect results, and (3) approach is limited to errors with PoC inputs and may not generalize beyond them."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures patch correctness (fixes the error, passes test suite, matches ground truth) and claims repair capability. The measured outcome directly matches the claimed capability with no proxy gap."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'Claude 3.5 Sonnet [3, 12]' without specifying which version (v1 vs v2), snapshot date, or API version. Multiple Claude 3.5 Sonnet versions exist with different capabilities."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "§4.3 describes the prompting strategy (role play, structured prompting with four steps) in natural language but does not provide the actual prompt text. Descriptions like 'the initiation of the error report typically begins with a statement like...' are templates, not full prompts."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for Claude 3.5 Sonnet. Chain-of-thought is mentioned as a technique but specific generation settings are absent."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The three-phase pipeline is described in detail: error replay with Valgrind/ASan (§4.1), typestate-guided context retrieval with GDB (§4.2 with formal algorithms and definitions), and structured prompting to LLM (§4.3). No agentic loop or tool use by the LLM, but the retrieval pipeline is thoroughly documented."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "§5.1 documents dataset construction: reverse-engineering SAVER's PoCs, excluding multi-threaded programs (which leaves 8 of SAVER's original projects), constructing PoC inputs for each error, collecting developer-approved ground-truth patches, and confirming errors with CVE identifiers."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 'Threats to Validity' provides substantive discussion across three categories: Dataset contamination, LLM Selection, and Repair Scope."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "§6 discusses specific threats: potential inclusion of benchmark patches in LLM training data, reliance on a single LLM (Claude 3.5 Sonnet), limitation to errors reproducible with PoC inputs, and exclusion of multi-threaded programs due to debugger-based tracing limitations."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "§6 explicitly states: 'Our approach does not aim to detect or repair all possible memory errors within a repository, but rather provides a targeted solution for memory errors that have been reproduced with a specific proof of concept.' Multi-threaded programs are explicitly excluded."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The benchmark, PoC inputs, generated patches, and experimental logs are not publicly available. Deferred to paper acceptance."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "§5.1 describes dataset construction: 14 real-world open-source C projects selected across diverse domains, errors confirmed by developers, 9 CVE-assigned vulnerabilities, PoC inputs reverse-engineered or constructed. Table 2 provides project statistics."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The study evaluates automated tools on open-source software projects."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from error identification to evaluation is documented: SAVER's dataset reverse-engineered (8 projects retained after excluding multi-threaded ones), extended with 6 additional projects, PoC inputs constructed, errors confirmed via dynamic analysis, ground-truth patches collected from project maintainers."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Xiao Cheng and Yulei Sui at University of New South Wales, Zhihao Guo and Huan Huo at University of Technology Sydney. These are academic institutions not affiliated with the evaluated tools."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, making it impossible to assess funder independence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses Claude 3.5 Sonnet but does not state its training data cutoff date. This is relevant since the benchmark uses open-source projects whose patches may be in the training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "§6 acknowledges: 'A potential threat to validity concerns the possible inclusion of our evaluated open-source projects and patches in the training dataset of the employed LLMs, which could introduce evaluation bias.'"
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "While §6 acknowledges the contamination risk, no mitigation measures are taken. The paper argues this affects all baselines equally but does not verify whether patches existed before the model's training cutoff or apply any decontamination method."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The evaluation is entirely automated on software projects."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates automated program repair tools on open-source code."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Figure 9 reports per-project token consumption for both LTFix and SWE-agent. §5.7 states LTFix's total consumption is 411,900 tokens vs SWE-agent's ~17M tokens (41× reduction). Per-project token counts are shown."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "§5.1 specifies the hardware (Ubuntu 22.04, 24-core 5.60 GHz Intel CPU, 64 GB memory). Total token consumption is reported (411,900 for LTFix). The 5-run experimental protocol is documented."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Experiments are run 5 times with a 4/5 consistency criterion, but actual per-seed results, variance, or sensitivity analysis across the 5 runs is not reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "§5.1 explicitly states: 'we conduct each experiment five times for every project. We only consider a result to be valid if it exhibits consistency in at least four out of the five runs.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. The framework configuration (prompting strategy, FTA specifications) appears fixed without systematic tuning."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No explanation of how the framework configuration was selected. The 4/5 consistency criterion is described but the choice of prompting structure and other design decisions are not justified through systematic comparison."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Multiple comparisons are made across 14 projects and 6 baselines. No correction for multiple comparisons is applied, and no statistical tests are used at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors built LTFix and compare it against their own re-implementations of baselines and ablated variants. No discussion of author-evaluation bias or independent evaluation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 9 directly compares token consumption (a proxy for compute cost) against repair effectiveness for LTFix vs SWE-agent across all 14 projects, showing LTFix achieves better performance at lower compute cost."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether the 49-error benchmark adequately represents the universe of memory errors in C. The benchmark is convenience-sampled from SAVER's dataset plus additional projects without discussing construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "LTFix uses typestate-guided retrieval + structured prompting while SWE-agent uses its own agent scaffold. The comparison attributes performance differences to the approaches but does not isolate the scaffold effect from the model effect. The ablation study partially addresses this but doesn't control for scaffolding."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The benchmark uses open-source projects with known patches (some CVE-assigned). Whether these patches existed before Claude 3.5 Sonnet's training cutoff is not discussed. The LLM could have seen the exact fixes during training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The framework provides the LLM with precise error locations, error types, and context traces. Whether this gives the LLM more information than would be available in practice (feature leakage) is not discussed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Multiple errors from the same projects (e.g., 14 errors from SmallerC, 5 from recutils) are treated independently. Potential non-independence from shared project conventions is not discussed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method is applied. §6 acknowledges contamination risk conceptually but does not use canary strings, membership inference, or temporal splits."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "LTFix repairs 37 out of 49 real-world memory errors across 14 open-source projects, outperforming SAVER by 14.50× and ProveNFix by 2.36×.",
    365       "evidence": "Table 4 shows LTFix fixes 37 errors (22 correct patches) vs SAVER's 2 errors (2 correct patches) and ProveNFix's 11 errors (6 correct patches) on the 14-project benchmark. Only 1 LTFix patch introduces a new error.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "LTFix outperforms SWE-agent 1.0 by repairing 94% more errors while consuming 41× fewer tokens.",
    370       "evidence": "Figure 8 shows LTFix fixes 37 vs SWE-agent's 19 errors (94.7% more). Figure 9 shows per-project token comparison; §5.7 reports LTFix total of 411,900 tokens vs SWE-agent's ~17M tokens.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "LTFix achieves 81.5% patch accuracy (22/27 correct) vs SWE-agent's 58.8% (10/17 correct).",
    375       "evidence": "Figure 8 reports these accuracy rates. LTFix also introduces only 1 error-introducing patch vs SWE-agent's 5.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Typestate-guided context retrieval reduces token consumption by 50× compared to full context traces while maintaining repair quality.",
    380       "evidence": "§5.7 states LTFix consumes 411,900 tokens total vs 22,950,414 tokens for full context traces, with the full system achieving superior repair performance.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Each component of LTFix contributes significantly: removing context trace (LTFix-NT) reduces correct patches from 22 to 15; falling back to file-level context (LTFix-F) reduces them to 7.",
    385       "evidence": "Figure 10 and §5.7 show ablation results across correct patches, new errors introduced, and total errors fixed for all variants.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Three zero-day memory errors were fixed with patches accepted and implemented by original developers.",
    390       "evidence": "Mentioned in abstract and §1 contributions but details deferred: 'Full details will be disclosed upon paper acceptance.'",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "On SAVER's dataset, LTFix achieves 79.7% fixing rate (122/153 errors) vs SAVER's 68% and ProveNFix's 75.8%.",
    395       "evidence": "Table 3 provides detailed per-project comparison on SAVER's original dataset across all three tools.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Small benchmark size",
    402       "detail": "The primary benchmark has only 49 errors across 14 projects. Large relative improvement claims (14.50×, 2.36×) are based on small absolute numbers (37 vs 2 and 11), making them sensitive to a few additional fixes. No statistical tests are applied."
    403     },
    404     {
    405       "flag": "Code and data not released",
    406       "detail": "Both the implementation and the benchmark are deferred to paper acceptance ('will be made publicly available upon acceptance'), preventing independent verification of all claims."
    407     },
    408     {
    409       "flag": "Zero-day fix details withheld",
    410       "detail": "The paper claims three zero-day fixes accepted by developers but states 'Full details will be disclosed upon paper acceptance.' These claims cannot be independently verified."
    411     },
    412     {
    413       "flag": "Consistency criterion hides variance",
    414       "detail": "Running experiments 5 times with a 4/5 consistency criterion filters out unstable results but the actual distribution of results across runs is never reported. A result that passes in 4/5 runs and fails in 1 is treated identically to one passing in 5/5."
    415     },
    416     {
    417       "flag": "Potential training data contamination",
    418       "detail": "The benchmark uses well-known open-source projects (radare2, quickjs, recutils) with documented CVEs. Claude 3.5 Sonnet may have seen these exact patches during training. The paper acknowledges this risk but applies no mitigation."
    419     },
    420     {
    421       "flag": "Uneven error distribution across projects",
    422       "detail": "SmallerC contributes 14 of the 49 errors (28.6%). Performance on this single project heavily influences aggregate results. LTFix fixes 12/14 SmallerC errors, accounting for nearly a third of its total fixes."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    428       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik R Narasimhan", "Ofir Press"],
    429       "year": 2024,
    430       "arxiv_id": "2405.15793",
    431       "relevance": "State-of-the-art LLM-based agent for automated software engineering, used as primary LLM baseline in this paper's evaluation."
    432     },
    433     {
    434       "title": "InferFix: End-to-End Program Repair with LLMs",
    435       "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano", "Xin Shi", "Shuai Lu", "Neel Sundaresan", "Alexey Svyatkovskiy"],
    436       "year": 2023,
    437       "relevance": "Combines static analysis (Infer) with LLMs for program repair, directly relevant to LLM-based automated program repair approaches."
    438     },
    439     {
    440       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    441       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    442       "year": 2023,
    443       "relevance": "Demonstrates conversational LLM-based program repair at low cost, relevant to LLM cost-effectiveness for automated repair."
    444     },
    445     {
    446       "title": "AutoCodeRover: Autonomous Program Improvement",
    447       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    448       "year": 2024,
    449       "relevance": "LLM-based autonomous program improvement agent, relevant to agentic approaches for code repair."
    450     },
    451     {
    452       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    453       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    454       "year": 2024,
    455       "arxiv_id": "2407.16741",
    456       "relevance": "Open platform for AI software engineering agents, representing the broader agentic AI development ecosystem."
    457     },
    458     {
    459       "title": "CodeAgent: Enhancing Code Generation with Tool-Integrated Agent Systems for Real-World Repo-level Coding Challenges",
    460       "authors": ["Kechi Zhang", "Jia Li", "Ge Li", "Xianjie Shi", "Zhi Jin"],
    461       "year": 2024,
    462       "arxiv_id": "2401.07339",
    463       "relevance": "Tool-integrated agent for repository-level code generation, relevant to agentic AI coding approaches."
    464     },
    465     {
    466       "title": "Lost in the middle: How language models use long contexts",
    467       "authors": ["Nelson F Liu", "Kevin Lin", "John Hewitt", "Ashwin Paranjape", "Michele Bevilacqua", "Fabio Petroni", "Percy Liang"],
    468       "year": 2024,
    469       "relevance": "Foundational work on LLM context window limitations that motivates the paper's context retrieval approach."
    470     },
    471     {
    472       "title": "Enhancing Static Analysis for Practical Bug Detection: An LLM-Integrated Approach",
    473       "authors": ["Haonan Li", "Yu Hao", "Yizhuo Zhai", "Zhiyun Qian"],
    474       "year": 2024,
    475       "relevance": "Combines static analysis with LLMs for bug detection in the Linux kernel, directly relevant to program analysis + LLM synergy."
    476     },
    477     {
    478       "title": "LLMDFA: Analyzing Dataflow in Code with Large Language Models",
    479       "authors": ["Chengpeng Wang", "Wuqi Zhang", "Zian Su", "Xiangzhe Xu", "Xiaoheng Xie", "Xiangyu Zhang"],
    480       "year": 2025,
    481       "relevance": "LLM-powered dataflow analysis for code, relevant to using LLMs for program analysis tasks."
    482     },
    483     {
    484       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    485       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    486       "year": 2022,
    487       "relevance": "Foundational prompting technique used in LTFix's structured prompting approach for repair generation."
    488     },
    489     {
    490       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    491       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    492       "year": 2023,
    493       "relevance": "Comprehensive study of LLM-based automated program repair, establishing the research landscape this paper extends."
    494     },
    495     {
    496       "title": "How Far Can We Go with Practical Function-Level Program Repair?",
    497       "authors": ["Jiahong Xiang", "Xiaoyang Xu", "Fanchu Kong", "Mingyuan Wu", "Haotian Zhang", "Yuqun Zhang"],
    498       "year": 2024,
    499       "relevance": "Explores function-level LLM-based program repair capabilities, directly relevant to the scope of automated repair."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 2,
    505       "justification": "The approach targets a real and costly problem (C memory errors in production code) but requires specialized tooling (Valgrind, GDB, PoC inputs) and is not released."
    506     },
    507     "surprise_contrarian": {
    508       "score": 1,
    509       "justification": "It is expected that combining program analysis with LLMs outperforms pure LLM approaches; the magnitude of improvement (14.50× over SAVER) is notable but not paradigm-shifting."
    510     },
    511     "fear_safety": {
    512       "score": 1,
    513       "justification": "Addresses memory safety vulnerabilities (including CVEs and zero-day bugs) in C, which is a genuine security concern, but the paper is about fixing rather than exploiting bugs."
    514     },
    515     "drama_conflict": {
    516       "score": 0,
    517       "justification": "No controversy or conflict. Standard academic comparison against prior work."
    518     },
    519     "demo_ability": {
    520       "score": 0,
    521       "justification": "No code, demo, or tool is released. Everything is deferred to paper acceptance."
    522     },
    523     "brand_recognition": {
    524       "score": 1,
    525       "justification": "Uses Claude 3.5 Sonnet (moderately well-known) and compares against SWE-agent. Authors are from Australian universities, not high-profile AI labs."
    526     }
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs