scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32530B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs",
      6     "authors": [
      7       "Mirazul Haque",
      8       "Petr Babkin",
      9       "Farima Farmahinifarahani",
     10       "Manuela Veloso"
     11     ],
     12     "year": 2025,
     13     "venue": "4th International Workshop on Knowledge-Augmented Methods for NLP",
     14     "arxiv_id": "2505.04441",
     15     "doi": "10.18653/v1/2025.knowledgenlp-1.17"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims are well-matched to results. The claim of 'limited performance improvement over trace-free baselines, in only 2 out of 6 tested dataset / model configurations' is directly supported by Table 1. Claims about LLM-optimized traces being more consistent are supported by Table 2.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The main causal claims come from controlled single-variable comparisons (same programs, same model, different prompt type). The trace complexity analysis (RQ2) is correlational but is presented as an observation rather than a causal claim. The paper uses appropriate hedging ('could highlight', 'implies').",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title 'Program Repair with Code LLMs' is broader than what is tested. Only GPT-3.5 and GPT-4 are evaluated (no open-source models). Only algorithmic/self-contained programs are used. The paper acknowledges the model limitation but the title and framing imply broader applicability to 'Code LLMs' generally.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Multiple alternative explanations are discussed: training data format familiarity for collated traces, attention dilution from long traces, truncation effects, LLM hallucination in Self-Debug, and the qualitative generational gap between GPT-3.5 and GPT-4. Section 4.2 discusses why collated traces fail (lack of training exposure, loop stretching, truncation).",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper measures test-suite pass rates (CFA, CPA) and frames these as measures of program repair effectiveness. The claims match the granularity of measurements — they do not overclaim that test-passing equates to broader code quality.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly mentions scope limitations but does not constitute substantive discussion.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No specific threats to validity are discussed. The paper acknowledges using only GPT models and algorithmic datasets but does not frame these as specific threats or analyze their impact on conclusions.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "While the paper notes 'there is undoubtedly scope for including more proprietary as well as open source models,' it does not explicitly state what the results do NOT show or what populations/settings are excluded. No dedicated scope boundary statements are made.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources are disclosed. The disclaimer section states the paper was prepared by J.P. Morgan AI Research but does not constitute a funding disclosure.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors are clearly identified as J.P. Morgan AI Research, with locations (New York and Palo Alto) specified. The affiliation is prominent in the paper header.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "J.P. Morgan does not have a direct financial stake in whether execution traces improve GPT-based program repair. The paper evaluates OpenAI models, not J.P. Morgan products.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is provided. The disclaimer is legal boilerplate, not a conflicts-of-interest declaration.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms defined: APR ('reduce human effort in debugging'), execution traces ('structured runtime data revealing program behavior'), CFA/CPA metrics provided in Section 3.1.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit research questions (RQ1-3) clearly frame contribution: understanding if/how execution traces help APR, trace complexity effects, and optimal trace formats.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 discusses SelfAPR, TRACED, TraceFixer, Self-Debug with explicit positioning: 'To the best of our knowledge, all of these works do not consider the effect of putting execution traces in the prompt of a pretrained LLM.'",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The authors mention implementing a fine-tuning pipeline and custom wrappers but do not release them.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All three evaluation datasets are publicly available: Refactory (Hu et al., 2019), RunBugRun (Prenner and Robbes, 2023), and HumanEval-Java (Jiang et al., 2023). The authors did not create proprietary data.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using PySnooper and deepseek-coder-1.3b-instruct but does not list library versions or dependencies.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A researcher would need to reverse-engineer the experimental setup from the paper text.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Tables 1 and 2 report only point estimates for CFA and CPA. No confidence intervals, error bars, or ± notation is provided for any result.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are used. All comparisons between prompt types and models are based solely on comparing raw accuracy numbers without any p-values or formal tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Raw accuracy values (CFA, CPA) are reported for all conditions with baselines, allowing readers to assess magnitude. For example, GPT-4 on HumanEval-Java shows Trace Prompt CPA=0.713 vs Error Prompt CPA=0.662, providing sufficient context to gauge improvement magnitude.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification is given for sample sizes. The paper uses 138 programs from Refactory (which has ~2000), 157 from HumanEval-Java, and 1000 sampled from RunBugRun, but does not explain why these particular sizes were chosen or provide power analysis.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or any spread measure is reported. Results are single-run numbers with one prediction per prompt. No indication of result stability across different conditions.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Two baselines are included: Error Prompts (failing test case without trace) and Self-Debug (LLM-generated traces inspired by Chen et al., 2023). Both are evaluated across all datasets and models.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Self-Debug (Chen et al., 2023) and the prompt template from Xia et al. (2023) are recent. TraceFixer (Bouzenia et al., 2023) is used as a fine-tuning comparison. These are contemporary to the work.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Multiple trace format variations are tested: raw traces, collated traces, LLM-optimized traces, confidence-based selection, and trace-length-based routing. This effectively ablates the trace representation component.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two metrics are used: Correct Fix Accuracy (CFA, percentage of fixes passing all test cases) and Correct Program Accuracy (CPA, percentage of programs with at least one correct fix).",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation is performed. All evaluation is automated through test-suite pass/fail. The probing studies in Section 5.2 involve manual review of diffs but this evaluates trace understanding, not the repair quality.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "For prompting experiments, the full datasets serve as test data (no training involved). For the fine-tuning experiment (Section 5.1), an explicit 80/20 train/test split is used: '80% of the problems are randomly selected for training, and the rest are reserved for testing.'",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by dataset (Refactory, HumanEval-Java, RunBugRun), by model (GPT-3.5, GPT-4), and by prompt type in Tables 1 and 2. Figure 2 provides per-dataset trace complexity analysis.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "RQ2 (Section 3.3) analyzes why trace-based prompts fail, showing distributions of trace complexity for correct vs incorrect fixes. Section 5.2 qualitatively analyzes LLM trace manipulation failures (missed variable modifications in loops, wrong function returns, hallucinated exceptions).",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The core finding is negative: 'Trace prompts do not consistently outperform Error Prompts on program repair' (RQ1 Summary). Collated traces are described as 'disappointingly' not improving over standard traces. Multiple configurations show traces hurting performance.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "The paper says 'GPT-3.5 Turbo' and 'GPT-4' without specific version identifiers (e.g., gpt-3.5-turbo-0613, gpt-4-0613). For OPT traces they mention 'GPT4-32k' without a version. For fine-tuning, 'deepseek-coder-1.3b-instruct' is specified but this is a model family name, not a snapshot.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Prompts are described in natural language ('We follow the instruction template for complete function generation used by Xia et al., 2023') but the actual prompt text is not provided. Figure 1 shows example data (buggy program, test case, trace) but not the instruction template itself.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "No API hyperparameters (temperature, top-p, max tokens) are reported for GPT-3.5 or GPT-4. For fine-tuning, the paper says 'We use the training settings and parameters suggested by deepseek-coder developers' without listing specific values. Prompt truncation at 200 lines is the only parameter mentioned.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The approach is single-turn prompting of LLMs.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Trace generation via PySnooper is described. Postprocessing steps are stated: 'removal of timestamps and stripping of terminal formatting command sequences.' Prompt truncation at 200 lines is documented. RunBugRun wrapper for I/O handling is described. Truncation rates are reported (5% for Refactory, ~10% for RunBugRun).",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "No raw experimental data (model outputs, generated fixes, execution traces) is made available for verification. Only aggregated accuracy numbers are reported in tables.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.1 describes dataset selection: 15 datasets surveyed with criteria (size, diversity, unit test availability, origin). Three datasets selected with clear rationale. RunBugRun sampling of 1000 Python bugs is described. Trace generation via PySnooper is detailed.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. All data comes from standard public benchmarks (Refactory, RunBugRun, HumanEval-Java).",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The pipeline from raw datasets to final experimental samples has gaps. Refactory has ~2000 faulty programs but only 138 are used without explanation of the filtering. The total number of prompts per dataset is given but intermediate steps are not fully documented.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff dates are stated for GPT-3.5 Turbo or GPT-4. This is critical since the benchmarks predate these models.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether benchmark problems appeared in GPT training data. HumanEval (published 2021), Refactory (2019), and CodeNet/RunBugRun (2021-2023) all predate or overlap with GPT model training periods.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "HumanEval is a widely known benchmark published in 2021, well before GPT-3.5 and GPT-4 training. Refactory and CodeNet are similarly public. No contamination analysis or acknowledgment is provided.",
    308           "source": "opus"
    309         }
    310       },
    311       "cost_and_practicality": {
    312         "inference_cost_reported": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No API costs, tokens consumed, or wall-clock time reported. The approach makes many GPT-3.5/GPT-4 API calls (hundreds per dataset-model pair) and OPT traces require an additional GPT-4-32k call per example, but costs are never quantified.",
    316           "source": "opus"
    317         },
    318         "compute_budget_stated": {
    319           "applies": true,
    320           "answer": false,
    321           "justification": "No total computational budget is stated. Neither API spend for prompting experiments nor GPU hours for the deepseek-coder fine-tuning are reported.",
    322           "source": "opus"
    323         }
    324       },
    325       "human_studies": {
    326         "pre_registered": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "irb_or_ethics_approval": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "demographics_reported": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "inclusion_exclusion_criteria": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "randomization_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         },
    356         "blinding_described": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No human participants in this study.",
    360           "source": "opus"
    361         },
    362         "attrition_reported": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "No human participants in this study.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No multiple-seed experiments are conducted. Single predictions per prompt are generated with no analysis of sensitivity to random seeds or sampling variation.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": true,
    379           "justification": "The paper explicitly states: 'we generate a single prediction per test case-specific prompt and aggregate across prompts when computing metrics' (Section 3.1, Metrics).",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search is described for the prompting experiments. For the trace-length routing threshold, 6 values are tested (25-50) but no search budget is stated. Fine-tuning uses default deepseek-coder parameters without search.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": true,
    391           "justification": "For trace-length-based routing (TRL OPT), the paper reports 'we only report the best results in the table' but provides full results for all threshold values in Appendix Figures 6 and 7, enabling verification.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "No statistical tests are performed at all, despite comparing many prompt types across multiple datasets and models. No multiple comparison correction could be applied since no tests were run.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors re-implement TraceFixer's fine-tuning pipeline ('As we didn't have access to TraceFixer's code, we implemented our own fine-tuning pipeline') and Self-Debug baseline, but do not acknowledge or discuss the bias of evaluating their own implementations of baselines.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "OPT traces require an additional GPT-4-32k call per example, and confidence-based selection requires extra LLM queries. These compute differences relative to simpler baselines are not discussed or quantified.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": true,
    415           "justification": "Section 3.1 discusses dataset selection criteria and explicitly acknowledges the gap between algorithmic benchmarks and realistic programs: 'While realistic datasets are ideal, evaluating them requires significant manual effort due to complex dependencies. Algorithmic datasets offer advantages like manageable length and easily testable, self-contained functions.'",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No agentic scaffolding is used. The approach is single-turn prompting.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "No discussion of temporal leakage. HumanEval (2021), Refactory (2019), and CodeNet (2021) all predate GPT-3.5 and GPT-4 training, meaning solutions may exist in training data.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether the evaluation setup leaks answer information. The execution traces themselves come from running the buggy program, which could provide information not available in realistic debugging scenarios.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "No discussion of independence between train and test data. The fine-tuning experiment uses a random 80/20 split but does not verify that problems don't share structural similarities.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection or prevention method is used (no canary strings, membership inference tests, n-gram overlap analysis, or temporal splits).",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Execution traces do not consistently improve APR performance over error-only prompts",
    456       "evidence": "Table 1: only 2/6 dataset-model pairs show improvement with trace prompts (GPT-4 HumanEval-Java, GPT-4 RunBugRun). RQ1 Summary explicitly states this finding.",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "Longer and more complex execution traces reduce LLM effectiveness at program repair",
    461       "evidence": "Figure 2: median trace length and variable modifications higher for incorrect fixes (HumanEval-Java, RunBugRun). RQ2 directly investigates this relationship.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "LLM-optimized (condensed) traces provide the most consistent performance improvements",
    466       "evidence": "Table 2: OPT traces among top-3 performers on CPA for all model-dataset pairs; best CFA for 3/6 pairs. RQ3 Summary: 'OPT is the most consistent type of prompting technique.'",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "Trace-based prompting outperforms fine-tuning a smaller LLM on APR",
    471       "evidence": "Figure 4 shows all prompting techniques (Error, Trace, OPT) exceed fine-tuned deepseek-coder-1.3b CPA and CFA across datasets.",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "GPT-4 benefits from execution traces whereas GPT-3.5 does not",
    476       "evidence": "Table 1: GPT-4 shows improvements on 2/3 datasets; GPT-3.5 shows none. Paper notes 'qualitative generational gap...emergent abilities of LLMs.'",
    477       "supported": "moderate"
    478     },
    479     {
    480       "claim": "LLMs struggle to generate accurate execution traces from program code alone",
    481       "evidence": "Table 3 probing study: trace prediction 15-50% exact match. Paper notes this explains Self-Debug baseline weakness and validates utility of real traces.",
    482       "supported": "strong"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval",
    487     "case-study"
    488   ],
    489   "key_findings": "Execution traces provide limited and inconsistent benefits for LLM-based program repair, improving performance in only 2 of 6 tested configurations. The paper finds that trace effectiveness decreases with complexity (longer traces, more variable assignments). LLM-optimized traces—condensed by GPT-4—offer the most consistent improvements. Notably, trace-based prompting outperforms fine-tuning smaller models, and probing studies reveal LLMs struggle to generate or collate execution traces accurately (15-88% accuracy), suggesting real traces contribute information LLMs cannot easily infer.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical significance testing",
    493       "detail": "All results reported as point estimates without confidence intervals or p-values. Differences (e.g., 79.1% vs 73.7%) could be within noise—unclear if findings are statistically robust."
    494     },
    495     {
    496       "flag": "Limited baseline coverage",
    497       "detail": "Only compares against Self-Debug and error prompts. No comparison to other APR methods (symbolic execution, constraint-based repair) or recent LLM approaches."
    498     },
    499     {
    500       "flag": "Possible data contamination",
    501       "detail": "HumanEval is a famous benchmark likely in GPT training data. Paper acknowledges this indirectly (GeeksForGeeks 'eliminates prompt leakage') but main results not systematically checked for contamination."
    502     },
    503     {
    504       "flag": "Missing artifact details for reproducibility",
    505       "detail": "Actual prompts deferred to external template. No released code, logged API calls, or processed outputs. Exact GPT model versions (API snapshots) not specified."
    506     },
    507     {
    508       "flag": "No dedicated limitations section",
    509       "detail": "Lacks systematic discussion of threats to validity, generalization boundaries, or scope limitations. Scattered inline caveats are insufficient for a methodological review."
    510     },
    511     {
    512       "flag": "Hyperparameters not reported",
    513       "detail": "Temperature, top-p, max_tokens for GPT models missing. Fine-tuning setup refers to external documentation. Critical for reproducibility."
    514     },
    515     {
    516       "flag": "Small probing study sample",
    517       "detail": "Trace understanding evaluated on 34-38 Refactory examples. GeeksForGeeks sample (300) helps but still limited for generalizing about LLM trace abilities."
    518     }
    519   ],
    520   "cited_papers": [
    521     {
    522       "title": "Teaching large language models to self-debug",
    523       "relevance": "Self-Debug baseline method; inspiration for trace-free prompting approach tested in RQ1."
    524     },
    525     {
    526       "title": "Execution trace-driven program repair",
    527       "relevance": "TraceFixer: prior work using traces with fine-tuning; compared against in fine-tuning study (Section 5.1)."
    528     },
    529     {
    530       "title": "Automated program repair in the era of large pre-trained language models",
    531       "relevance": "Xia et al. (2023): template for prompt construction and baseline evaluation methodology."
    532     },
    533     {
    534       "title": "TRACED: Execution-aware pre-training for source code",
    535       "relevance": "Pre-training approach integrating execution information; contrasts with zero-shot prompting approach in this paper."
    536     },
    537     {
    538       "title": "SelfAPR: Self-supervised program repair with test execution diagnostics",
    539       "relevance": "Earlier work using execution diagnostics for APR; cited as motivation for including runtime information."
    540     },
    541     {
    542       "title": "Program state prediction pre-training for code search and generation",
    543       "relevance": "Liu et al. (2023): pre-training strategy using program state; relevant to incorporating dynamic program information."
    544     }
    545   ],
    546   "engagement_factors": {
    547     "practical_relevance": {
    548       "score": 1,
    549       "justification": "The idea of augmenting APR prompts with traces is interesting but results are mixed and no tools or code are released for practitioners to use."
    550     },
    551     "surprise_contrarian": {
    552       "score": 1,
    553       "justification": "The finding that execution traces often don't help and can hurt LLM-based repair is mildly contrarian to the intuition that more information should help."
    554     },
    555     "fear_safety": {
    556       "score": 0,
    557       "justification": "No safety, security, or risk implications in this work."
    558     },
    559     "drama_conflict": {
    560       "score": 0,
    561       "justification": "No controversy or conflict angle."
    562     },
    563     "demo_ability": {
    564       "score": 0,
    565       "justification": "No code, demo, or tool released."
    566     },
    567     "brand_recognition": {
    568       "score": 1,
    569       "justification": "J.P. Morgan AI Research is a recognized corporate lab; GPT-3.5/GPT-4 are well-known models."
    570     }
    571   },
    572   "hn_data": {
    573     "threads": [
    574       {
    575         "hn_id": "43120088",
    576         "title": "Show HN: We have just released our first Debloating tool for Containers",
    577         "points": 5,
    578         "comments": 4,
    579         "url": "https://news.ycombinator.com/item?id=43120088"
    580       },
    581       {
    582         "hn_id": "42657501",
    583         "title": "The GAN is dead; long live the GAN - A Modern GAN Baseline",
    584         "points": 3,
    585         "comments": 1,
    586         "url": "https://news.ycombinator.com/item?id=42657501"
    587       },
    588       {
    589         "hn_id": "44439235",
    590         "title": "Wider or Deeper? Scaling LLM Inference-Time Compute with Adaptive Tree Search",
    591         "points": 3,
    592         "comments": 0,
    593         "url": "https://news.ycombinator.com/item?id=44439235"
    594       },
    595       {
    596         "hn_id": "44312317",
    597         "title": "Self-Supervised Contrastive Learning Approximates Supervised CL",
    598         "points": 3,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=44312317"
    601       },
    602       {
    603         "hn_id": "44363141",
    604         "title": "Revisiting the Othello World Model Hypothesis",
    605         "points": 1,
    606         "comments": 0,
    607         "url": "https://news.ycombinator.com/item?id=44363141"
    608       }
    609     ],
    610     "top_points": 5,
    611     "total_points": 15,
    612     "total_comments": 5
    613   }
    614 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs