scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30406B)
      1 {
      2   "paper": {
      3     "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs",
      4     "authors": [
      5       "Mirazul Haque",
      6       "Petr Babkin",
      7       "Farima Farmahinifarahani",
      8       "Manuela Veloso"
      9     ],
     10     "year": 2025,
     11     "venue": "Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing",
     12     "arxiv_id": "2505.04441",
     13     "doi": "10.18653/v1/2025.knowledgenlp-1.17"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Simply incorporating execution traces into LLM prompts for automated program repair provides inconsistent benefits, improving performance in only 2 of 6 model-dataset configurations. Longer and more complex traces tend to reduce repair effectiveness. LLM-optimized (summarized) traces provide the most consistent improvements across configurations. Probing studies show LLMs have limited ability to generate execution traces from scratch (15–50% exact match), suggesting real traces provide information not easily inferred by the model.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The authors mention implementing a fine-tuning pipeline and custom wrappers but do not release them."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All three evaluation datasets are publicly available: Refactory (Hu et al., 2019), RunBugRun (Prenner and Robbes, 2023), and HumanEval-Java (Jiang et al., 2023). The authors did not create proprietary data."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using PySnooper and deepseek-coder-1.3b-instruct but does not list library versions or dependencies."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A researcher would need to reverse-engineer the experimental setup from the paper text."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 1 and 2 report only point estimates for CFA and CPA. No confidence intervals, error bars, or ± notation is provided for any result."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are used. All comparisons between prompt types and models are based solely on comparing raw accuracy numbers without any p-values or formal tests."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Raw accuracy values (CFA, CPA) are reported for all conditions with baselines, allowing readers to assess magnitude. For example, GPT-4 on HumanEval-Java shows Trace Prompt CPA=0.713 vs Error Prompt CPA=0.662, providing sufficient context to gauge improvement magnitude."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for sample sizes. The paper uses 138 programs from Refactory (which has ~2000), 157 from HumanEval-Java, and 1000 sampled from RunBugRun, but does not explain why these particular sizes were chosen or provide power analysis."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or other measure of spread is reported. Results are single-run numbers with one prediction per prompt, and there is no indication of how stable they are across different conditions."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Two baselines are included: Error Prompts (failing test case without trace) and Self-Debug (LLM-generated traces inspired by Chen et al., 2023). Both are evaluated across all datasets and models."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Self-Debug (Chen et al., 2023) and the prompt template from Xia et al. (2023) are recent. TraceFixer (Bouzenia et al., 2023) is used as a fine-tuning comparison. These are contemporary to the work."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple trace format variations are tested: raw traces, collated traces, LLM-optimized traces, confidence-based selection, and trace-length-based routing. This effectively ablates the trace representation component."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Two metrics are used: Correct Fix Accuracy (CFA, percentage of fixes passing all test cases) and Correct Program Accuracy (CPA, percentage of programs with at least one correct fix)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is performed. All evaluation is automated through test-suite pass/fail. The probing studies in Section 5.2 involve manual review of diffs but this evaluates trace understanding, not the repair quality."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "For prompting experiments, the full datasets serve as test data (no training involved). For the fine-tuning experiment (Section 5.1), an explicit 80/20 train/test split is used: '80% of the problems are randomly selected for training, and the rest are reserved for testing.'"
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by dataset (Refactory, HumanEval-Java, RunBugRun), by model (GPT-3.5, GPT-4), and by prompt type in Tables 1 and 2. Figure 2 provides per-dataset trace complexity analysis."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "RQ2 (Section 3.3) analyzes why trace-based prompts fail, showing distributions of trace complexity for correct vs incorrect fixes. Section 5.2 qualitatively analyzes LLM trace manipulation failures (missed variable modifications in loops, wrong function returns, hallucinated exceptions)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The core finding is negative: 'Trace prompts do not consistently outperform Error Prompts on program repair' (RQ1 Summary). Collated traces are described as 'disappointingly' not improving over standard traces. Multiple configurations show traces hurting performance."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are well-matched to results. The claim of 'limited performance improvement over trace-free baselines, in only 2 out of 6 tested dataset / model configurations' is directly supported by Table 1. Claims about LLM-optimized traces being more consistent are supported by Table 2."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The main causal claims come from controlled single-variable comparisons (same programs, same model, different prompt type). The trace complexity analysis (RQ2) is correlational but is presented as an observation rather than a causal claim. The paper uses appropriate hedging ('could highlight', 'implies')."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Program Repair with Code LLMs' is broader than what is tested. Only GPT-3.5 and GPT-4 are evaluated in the prompting experiments; the only open-source model, deepseek-coder-1.3b-instruct, appears solely in the fine-tuning comparison. Only algorithmic/self-contained programs are used. The paper acknowledges the model limitation, but the title and framing imply broader applicability to 'Code LLMs' generally."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Multiple alternative explanations are discussed: training data format familiarity for collated traces, attention dilution from long traces, truncation effects, LLM hallucination in Self-Debug, and the qualitative generational gap between GPT-3.5 and GPT-4. Section 4.2 discusses why collated traces fail (lack of training exposure, loop stretching, truncation)."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures test-suite pass rates (CFA, CPA) and frames these as measures of program repair effectiveness. The claims match the granularity of measurements — they do not overclaim that test-passing equates to broader code quality."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper says 'GPT-3.5 Turbo' and 'GPT-4' without specific version identifiers (e.g., gpt-3.5-turbo-0613, gpt-4-0613). For OPT traces they mention 'GPT4-32k' without a version. For fine-tuning, 'deepseek-coder-1.3b-instruct' is specified but this is a model family name, not a snapshot."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Prompts are described in natural language ('We follow the instruction template for complete function generation used by Xia et al., 2023') but the actual prompt text is not provided. Figure 1 shows example data (buggy program, test case, trace) but not the instruction template itself."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No API hyperparameters (temperature, top-p, max tokens) are reported for GPT-3.5 or GPT-4. For fine-tuning, the paper says 'We use the training settings and parameters suggested by deepseek-coder developers' without listing specific values. Prompt truncation at 200 lines is the only parameter mentioned."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The approach is single-turn prompting of LLMs."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Trace generation via PySnooper is described. Postprocessing steps are stated: 'removal of timestamps and stripping of terminal formatting command sequences.' Prompt truncation at 200 lines is documented. RunBugRun wrapper for I/O handling is described. Truncation rates are reported (5% for Refactory, ~10% for RunBugRun)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly mentions scope limitations, but this does not amount to a substantive discussion."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No specific threats to validity are discussed. The paper acknowledges using only GPT models and algorithmic datasets but does not frame these as specific threats or analyze their impact on conclusions."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "While the paper notes 'there is undoubtedly scope for including more proprietary as well as open source models,' it does not explicitly state what the results do NOT show or what populations/settings are excluded. No dedicated scope boundary statements are made."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw experimental data (model outputs, generated fixes, execution traces) is made available for verification. Only aggregated accuracy numbers are reported in tables."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.1 describes dataset selection: 15 datasets surveyed with criteria (size, diversity, unit test availability, origin). Three datasets selected with clear rationale. RunBugRun sampling of 1000 Python bugs is described. Trace generation via PySnooper is detailed."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard public benchmarks (Refactory, RunBugRun, HumanEval-Java)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The pipeline from raw datasets to final experimental samples has gaps. Refactory has ~2000 faulty programs but only 138 are used without explanation of the filtering. The total number of prompts per dataset is given but intermediate steps are not fully documented."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding sources are disclosed. The disclaimer section states that the paper was prepared by J.P. Morgan AI Research, but this does not constitute a funding disclosure."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors are clearly identified as J.P. Morgan AI Research, with locations (New York and Palo Alto) specified. The affiliation is prominent in the paper header."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "J.P. Morgan does not have a direct financial stake in whether execution traces improve GPT-based program repair. The paper evaluates OpenAI models, not J.P. Morgan products."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing-interests or financial-interests statement is provided. The disclaimer is legal boilerplate, not a conflicts-of-interest declaration."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for GPT-3.5 Turbo or GPT-4. This is critical since the benchmarks predate these models."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether benchmark problems appeared in GPT training data. HumanEval (published 2021), Refactory (2019), and CodeNet/RunBugRun (2021-2023) all predate or overlap with GPT model training periods."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "HumanEval is a widely known benchmark published in 2021, well before GPT-3.5 and GPT-4 training. Refactory and CodeNet are similarly public. No contamination analysis or acknowledgment is provided."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No API costs, tokens consumed, or wall-clock time reported. The approach makes many GPT-3.5/GPT-4 API calls (hundreds per dataset-model pair) and OPT traces require an additional GPT-4-32k call per example, but costs are never quantified."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget is stated. Neither API spend for prompting experiments nor GPU hours for the deepseek-coder fine-tuning are reported."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No multiple-seed experiments are conducted. Single predictions per prompt are generated with no analysis of sensitivity to random seeds or sampling variation."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "The paper explicitly states: 'we generate a single prediction per test case-specific prompt and aggregate across prompts when computing metrics' (Section 3.1, Metrics)."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search is described for the prompting experiments. For the trace-length routing threshold, 6 values are tested (25-50) but no search budget is stated. Fine-tuning uses default deepseek-coder parameters without search."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "For trace-length-based routing (TRL OPT), the paper reports 'we only report the best results in the table' but provides full results for all threshold values in Appendix Figures 6 and 7, enabling verification."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No statistical tests are performed at all, despite comparing many prompt types across multiple datasets and models. No multiple comparison correction could be applied since no tests were run."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors re-implement TraceFixer's fine-tuning pipeline ('As we didn't have access to TraceFixer's code, we implemented our own fine-tuning pipeline') and Self-Debug baseline, but do not acknowledge or discuss the bias of evaluating their own implementations of baselines."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "OPT traces require an additional GPT-4-32k call per example, and confidence-based selection requires extra LLM queries. These compute differences relative to simpler baselines are not discussed or quantified."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Section 3.1 discusses dataset selection criteria and explicitly acknowledges the gap between algorithmic benchmarks and realistic programs: 'While realistic datasets are ideal, evaluating them requires significant manual effort due to complex dependencies. Algorithmic datasets offer advantages like manageable length and easily testable, self-contained functions.'"
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No agentic scaffolding is used. The approach is single-turn prompting."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. HumanEval (2021), Refactory (2019), and CodeNet (2021) all predate GPT-3.5 and GPT-4 training, meaning solutions may exist in training data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks answer information. The execution traces themselves come from running the buggy program, which could provide information not available in realistic debugging scenarios."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of independence between train and test data. The fine-tuning experiment uses a random 80/20 split but does not verify that problems don't share structural similarities."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is used (no canary strings, membership inference tests, n-gram overlap analysis, or temporal splits)."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Simply incorporating execution traces into LLM prompts provides limited improvement over trace-free baselines, helping in only 2 of 6 model-dataset configurations.",
    370       "evidence": "Table 1 shows Trace Prompts outperform Error Prompts only for GPT-4 on HumanEval-Java (CPA 0.713 vs 0.662) and RunBugRun (CFA 0.558 vs 0.529). In all GPT-3.5 configurations and GPT-4 on Refactory, traces underperform or match error-only prompts.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "The effectiveness of trace-based prompts decreases as trace complexity (length and number of variable modifications) increases.",
    375       "evidence": "Figure 2 shows median trace length and variable changes are significantly higher for failing fixes than correct ones on HumanEval-Java and RunBugRun with GPT-4. Refactory shows a partial exception where trace length is lower for failing fixes.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLM-optimized (summarized) traces provide the most consistent performance gains across all configurations.",
    380       "evidence": "Table 2 shows OPT Trace achieves top-3 CPA in all 6 configurations and best or second-best CFA in all 6. However, OPT still underperforms 'RQ1 Best' (the best non-optimized prompt) in 4 of 6 CPA comparisons.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Trace-based prompting outperforms fine-tuning a smaller LLM (deepseek-coder-1.3b) for APR.",
    385       "evidence": "Figure 4 shows all prompting techniques outperform fine-tuned models on CPA and CFA across datasets. However, the comparison is between GPT-3.5/4 (much larger models) and deepseek-coder-1.3b with limited training data (459-517 samples).",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "LLMs have limited ability to generate execution traces from scratch, supporting the value of real execution traces.",
    390       "evidence": "Table 3 shows GPT-4 trace prediction accuracy of 50% on reference Refactory programs, 26% on failing programs, and 15% on GeeksforGeeks. Trace collating accuracy ranges from 45% to 88%.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No statistical testing",
    397       "detail": "All comparisons across 6+ configurations and multiple prompt types are based solely on point estimates. No significance tests, confidence intervals, or variance measures are reported, making it impossible to assess whether observed differences are meaningful or due to chance."
    398     },
    399     {
    400       "flag": "Benchmark contamination unaddressed",
    401       "detail": "HumanEval (2021), Refactory (2019), and CodeNet (2021) were all publicly available before GPT-3.5 and GPT-4 training. The models may have seen benchmark solutions, which could differentially affect trace-based vs trace-free prompts. This is never discussed."
    402     },
    403     {
    404       "flag": "Unfair fine-tuning comparison",
    405       "detail": "The fine-tuning comparison pits a 1.3B parameter model trained on 459-517 samples against GPT-3.5/GPT-4 (orders of magnitude larger). The authors acknowledge limited training data but still present the comparison as evidence that prompting outperforms fine-tuning."
    406     },
    407     {
    408       "flag": "Missing model versions and hyperparameters",
    409       "detail": "No specific GPT model versions, API snapshot dates, or sampling parameters (temperature, top-p) are reported. GPT model behavior changes across versions, making these experiments unreproducible."
    410     },
    411     {
    412       "flag": "Unexplained dataset subsetting",
    413       "detail": "Refactory has ~2000 faulty programs but only 138 are used. No explanation is given for this 93% reduction. The selection criteria could significantly affect results."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Teaching large language models to self-debug",
    419       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    420       "year": 2023,
    421       "arxiv_id": "2304.05128",
    422       "relevance": "Key baseline: Self-Debug approach that prompts LLMs to generate their own execution traces for code repair."
    423     },
    424     {
    425       "title": "TraceFixer: Execution trace-driven program repair",
    426       "authors": ["Islem Bouzenia", "Yangruibo Ding", "Kexin Pei", "Baishakhi Ray", "Michael Pradel"],
    427       "year": 2023,
    428       "relevance": "Fine-tuning baseline that trains CodeT5 with execution traces for APR, showing 13% improvement on synthetic bugs."
    429     },
    430     {
    431       "title": "TRACED: Execution-aware pre-training for source code",
    432       "authors": ["Yangruibo Ding", "Ben Steenhoek", "Kexin Pei", "Gail Kaiser", "Wei Le", "Baishakhi Ray"],
    433       "year": 2023,
    434       "relevance": "Pre-training approach using execution traces that improved clone detection and vulnerability detection over AST-based methods."
    435     },
    436     {
    437       "title": "Automated program repair in the era of large pre-trained language models",
    438       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    439       "year": 2023,
    440       "relevance": "Provides the instruction template for complete function generation used as the prompting framework in this study."
    441     },
    442     {
    443       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    444       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    445       "year": 2022,
    446       "relevance": "Zero-shot LLM-based APR approach that established baselines for prompt-based program repair."
    447     },
    448     {
    449       "title": "Impact of code language models on automated program repair",
    450       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    451       "year": 2023,
    452       "relevance": "Evaluates code LLMs for APR and provides the HumanEval-Java dataset with synthetic bugs used in this study."
    453     },
    454     {
    455       "title": "SelfAPR: Self-supervised program repair with test execution diagnostics",
    456       "authors": ["He Ye", "Matias Martinez", "Xiapu Luo", "Tao Zhang", "Martin Monperrus"],
    457       "year": 2022,
    458       "relevance": "Uses compiler and test diagnostics during self-supervised training for APR, a precursor to trace-augmented repair."
    459     },
    460     {
    461       "title": "Is ChatGPT the ultimate programming assistant – how far is it?",
    462       "authors": ["Haoye Tian", "Weiqi Lu", "Tsz On Li", "Xunzhu Tang", "Shing-Chi Cheung", "Jacques Klein", "Tegawendé F. Bissyandé"],
    463       "year": 2023,
    464       "relevance": "Evaluates ChatGPT on programming tasks including APR, relevant to understanding LLM capability for code repair."
    465     },
    466     {
    467       "title": "RunBugRun – an executable dataset for automated program repair",
    468       "authors": ["Julian Aron Prenner", "Romain Robbes"],
    469       "year": 2023,
    470       "relevance": "One of the three main evaluation datasets, containing a quarter million submissions for 4000 distinct problems."
    471     },
    472     {
    473       "title": "Can large language models reason about program invariants?",
    474       "authors": ["Charles Sutton", "David Bieber", "Kensen Shi", "Kexin Pei", "Pengcheng Yin"],
    475       "year": 2023,
    476       "relevance": "Examines LLM reasoning about program execution behavior, relevant to understanding trace comprehension capabilities."
    477     },
    478     {
    479       "title": "Code execution with pre-trained language models",
    480       "authors": ["Chenxiao Liu", "Shuai Lu", "Weizhu Chen", "Daxin Jiang", "Alexey Svyatkovskiy", "Shengyu Fu", "Neel Sundaresan", "Nan Duan"],
    481       "year": 2023,
    482       "relevance": "Program state prediction pre-training that improved code search and generation, related to execution-aware code models."
    483     }
    484   ],
    485   "engagement_factors": {
    486     "practical_relevance": {
    487       "score": 1,
    488       "justification": "The idea of augmenting APR prompts with traces is interesting but results are mixed and no tools or code are released for practitioners to use."
    489     },
    490     "surprise_contrarian": {
    491       "score": 1,
    492       "justification": "The finding that execution traces often don't help and can hurt LLM-based repair is mildly contrarian to the intuition that more information should help."
    493     },
    494     "fear_safety": {
    495       "score": 0,
    496       "justification": "No safety, security, or risk implications in this work."
    497     },
    498     "drama_conflict": {
    499       "score": 0,
    500       "justification": "No controversy or conflict angle."
    501     },
    502     "demo_ability": {
    503       "score": 0,
    504       "justification": "No code, demo, or tool released."
    505     },
    506     "brand_recognition": {
    507       "score": 1,
    508       "justification": "J.P. Morgan AI Research is a recognized corporate lab; GPT-3.5/GPT-4 are well-known models."
    509     }
    510   }
    511 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs