scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32037B)
      1 {
      2   "paper": {
      3     "title": "RunBugRun – An Executable Dataset for Automated Program Repair",
      4     "authors": [
      5       "Julian Aron Prenner",
      6       "Romain Robbes"
      7     ],
      8     "year": 2023,
      9     "venue": "arXiv.org",
     10     "arxiv_id": "2304.01102",
     11     "doi": "10.48550/arXiv.2304.01102"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval"],
     16   "key_findings": "RunBugRun provides ~450,000 executable buggy/fixed program pairs across 8 languages with test cases and fine-grained bug labels, derived from programming contest submissions. A CodeT5-based NPR baseline achieved 59.8% plausibility across all languages (best on Python at 69%, worst on JavaScript at 44.5%), while the G&V baseline Cardumen fixed only ~5% of Java bugs under a constrained time budget. Cross-language knowledge transfer significantly boosts performance for low-resource languages (e.g., +34.5pp for PHP, +31.8pp for Go), and performance strongly degrades with the number of token changes required (r = −0.996).",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a GitHub repository URL: https://github.com/giganticode/run_bug_run (Section 8), which includes the dataset and infrastructure to compile, execute, and test programs."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The full dataset of ~450,000 buggy/fixed program pairs, test cases, bug labels, and error messages is released through the project website referenced in Section 8."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Section 4 specifies exact compiler/interpreter versions: GCC 11 for C/C++, CPython 3.9, CRuby 3.0, OpenJDK 17, PHP 8.1.2, Go 1.18.1, NodeJS 12.22. For the NPR baseline: NVIDIA RTX 3090 with 24GB, HuggingFace transformers library with codet5-small checkpoint."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 8 states: 'Details on how to download, install and run RunBugRun are provided on the project website.' The infrastructure includes sandbox-based execution using bwrap, and the paper provides detailed methodology for reproducing baseline experiments."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as point estimates (e.g., '59.8%' plausibility rate, '64.4%' for C) without any confidence intervals, error bars, or uncertainty measures."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims performance differences across languages and training regimens (e.g., 'performance is best for Python (69%)... hardest languages were Ruby (47.6%)') without any statistical significance tests. Pearson's r is reported but without p-values for most comparisons."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Pearson's r values are reported (r = −0.996 for changes vs. NPR performance, r = 0.33 for training size vs. performance). Performance ranges are given with baseline context (e.g., '68.7% for single change fixes to 42.8% for six changes'). Transfer effects are quantified (e.g., '+34.5pp for PHP')."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for the sample sizes used. The manual correctness assessment uses 200 randomly selected patches with no power analysis or justification for this number. The 450K dataset size is not justified from a statistical power perspective."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "All baseline results appear to be from single experimental runs. No standard deviations, variance across seeds, or any spread measures are reported for either the NPR or G&V baseline evaluations."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Two baselines are provided: Cardumen, a G&V approach (Section 5.2), and a fine-tuned CodeT5-small NPR model. These represent the two main paradigms in APR."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "CodeT5 (2021) was a contemporary pre-trained code model at the time of writing. Cardumen was justified as 'one of the most successful G&V systems on QuixBugs' (Section 4), a similar benchmark. Both were reasonable choices for 2023."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "RQ2.3 presents ablation-style experiments with 17 different training regimens: training on all languages, excluding one language, or training on a single language only (Table 5). RQ2.4 varies the number of candidates from 1 to 10 (Figure 7)."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The primary evaluation metric is plausibility rate (whether patches pass all test cases). While the paper provides per-label r-scores and Pearson correlations as analytical tools, the fundamental evaluation metric for the baselines is a single metric: test-pass plausibility rate."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 7 ('Plausibility vs correctness') reports manual assessment of 200 randomly selected plausible fixes, finding 3.5% (7/200) to be incorrect. 46.5% were identical to the ground-truth patch."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The dataset has explicit train/validation/test splits (Section 4, 'Data splitting'). The test set contains ~32,500 bugs. Splits ensure no user's submissions to the same problem cross set boundaries, and trigram-based deduplication prevents cross-set near-duplicates."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Extensive breakdowns are provided: per-language performance (Table 5), per-label performance with aggregated categories (Table 4), per-change-count performance, and strengths/weaknesses analysis (RQ2.2). Figure 4 shows language distribution across splits."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "RQ2.2 provides a detailed weak-spot analysis for both baselines. For CodeT5: struggles with adding if-branches and continue keywords. For Cardumen: fails entirely on adding function calls or type conversions. Specific failing label categories are identified."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Cardumen's poor performance (~5% plausibility) is reported honestly. The paper discusses where models fail (RQ2.2 weak points). Transfer learning limitations are noted: 'perhaps there is a limit to the effectiveness of transfer learning' for complex bugs (Section 6)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims about dataset size (~450K), 8 languages, executable programs, test cases, fine-grained labels, and two baselines are all supported by detailed results in Sections 4-5. The abstract appropriately hedges ('basic evaluation results')."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The main causal claim is about cross-language knowledge transfer (RQ2.3). This is supported by controlled experiments: training with/without specific languages while holding other variables constant (Table 5). The ablation design with 17 models provides adequate causal evidence for transfer effects."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 7 explicitly bounds the scope: 'The dataset consists of short implementations of solutions to algorithmic problems. There is a significant difference between the program code in the dataset and the code of larger software projects.' The title and abstract accurately describe it as a dataset for APR, not a general code quality tool."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 6 discusses alternative explanations for Cardumen's poor performance (constrained time budget, low redundancy in small programs). For transfer learning, the paper considers language similarity as a confounding factor ('Java and C we conjecture that this might be due to the languages' similarity to C++')."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper explicitly distinguishes plausibility (passing all tests) from correctness (Section 7, 'Plausibility vs correctness'): 'Passing all test cases is no guarantee for a patch to be fully correct.' They quantify the gap through manual assessment (3.5% false positives) and discuss test suite strength as a mitigating factor."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The NPR baseline uses the specific 'codet5-small' model checkpoint from HuggingFace. Cardumen is run via ASTOR with Flacoco fault localization. All compiler/interpreter versions are specified (GCC 11, CPython 3.9, OpenJDK 17, etc.)."
    146       },
    147       "prompts_provided": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "The paper does not use prompting. CodeT5 is fine-tuned as a seq2seq model (buggy→fixed translation) with language identification token prefixes, not prompted. Cardumen is a G&V tool that does not use prompts."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 5.2.1 reports: learning rate 1e-4, batch size 16, 2 gradient accumulation steps, 5 epochs, beam search with 6 beams, 5 candidates per bug. For Cardumen: -maxtime 10 -maxgen 100. Memory limit 512MB, execution timeout 3 seconds."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. CodeT5 is a straightforward seq2seq fine-tuned model, and Cardumen is a template-based G&V tool."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 4 provides extensive documentation: language filtering, buggy/fixed pair extraction using LCS-based diffing with a 6-token threshold, syntax error filtering, Python 2→3 conversion, sandboxed execution (130M+ runs), test case filtering (45% pass rate threshold, 12% timeout rate), flakiness checks, trigram deduplication, and rule-based labeling with TreeSitter."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 7 'Limitations, Biases and Threats to Validity' is a dedicated section with multiple subsections addressing limited scope, lack of diversity, tokenization, labeling, and plausibility vs. correctness."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Specific threats discussed include: short algorithmic programs vs. large projects (limited scope), same-problem solution similarity reducing bug diversity, language-specific tokenizer limitations, rule-based labeling possibly missing important change classes, and 3.5% false-positive plausibility rate from manual assessment."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 7 explicitly states: programs lack 'complex class hierarchies, UI-related code, large numbers of methods, functions and code files, dependencies on software frameworks and libraries.' The paper also states bugs are limited to 1-6 token changes and are single-file only."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The full dataset is released including all source programs, test cases, labels, error messages, and execution results. The underlying Project CodeNet source is also publicly available."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4 describes data collection in detail: sourced from Project CodeNet (AIZU and AtCoder submissions), with specific criteria for pair selection (same problem, same user, a difference of at most 6 tokens between rejected and accepted submissions)."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data comes from Project CodeNet, a well-known public dataset of programming contest submissions."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 4 documents the full pipeline: 13M+ submissions → language filtering → pair extraction → syntax error filtering → Python 2→3 conversion → execution (130M+ runs) → test case filtering → flakiness checks → labeling → deduplication → splitting. Specific counts are given at key stages (e.g., 9,082 example test cases extracted, 331,834 from CodeContests, 600+ generated)."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are listed: Julian Aron Prenner at inf.unibz.it (University of Bolzano) and Romain Robbes at u-bordeaux.fr (University of Bordeaux). Neither evaluates a product from their own institution."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "Since no funding is disclosed, independence of funding cannot be assessed. The absence of a funding statement makes this unanswerable."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not state CodeT5's pre-training data cutoff date. CodeT5 was pre-trained on code data but no temporal bounds are given for what code was included."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Section 4 ('Deduplication') extensively addresses train/test overlap: exact duplicates are removed, cross-set near-duplicates are removed using trigram similarity, and splits ensure the same user's submissions to the same problem don't cross sets."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper does not discuss whether Project CodeNet submissions could have appeared in CodeT5's pre-training corpus. Since both Project CodeNet and CodeT5 were released in 2021, there is a non-trivial contamination risk that is not addressed."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. The study mines programming contest submissions and runs automated evaluations."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study uses publicly available programming contest submissions."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. Contest submitters are anonymized in the source dataset."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants. Inclusion/exclusion criteria for submissions (not people) are thoroughly documented."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants or experimental conditions involving people."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants or conditions requiring blinding."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No specific inference cost or latency per example is reported for the NPR baseline. Cardumen is given a 10-minute time budget per bug, but no aggregate cost is reported. The paper mentions 'several weeks' for 130M+ executions but does not quantify per-bug evaluation cost."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "The GPU (single NVIDIA RTX 3090, 24GB) and the training duration (5 epochs) are stated, but no total GPU hours, wall-clock training time, or total compute budget is reported. Execution cost is described only as 'several weeks.'"
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of random seeds or sensitivity analysis. All results appear to be from a single training run of each model configuration."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is not stated. The 17 different training configurations appear to each be single runs."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Specific hyperparameters are reported but no search budget or tuning methodology is described. It is unclear how the learning rate (1e-4), batch size (16), etc. were selected."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper uses specific hyperparameters without explaining how they were selected. No validation-based selection procedure is described."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "The paper does not perform multiple formal statistical hypothesis tests requiring correction. Analyses are primarily descriptive (plausibility rates, correlations)."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors implemented the NPR baseline and configured Cardumen with significantly constrained parameters (-maxtime 10 -maxgen 100). While the Cardumen constraints are acknowledged, the paper does not discuss author-evaluation bias or the risk that their implementation choices may systematically favor one baseline over another."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "CodeT5 and Cardumen operate under very different compute regimes (GPU-based neural inference vs. 10-minute search budget per bug). The paper acknowledges Cardumen's constrained time but does not systematically analyze performance as a function of compute budget."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Section 7 ('Limited scope') explicitly discusses construct validity: short algorithmic programs differ from large software projects, bug diversity is limited by same-problem solutions, and the bug type distribution is compared to real-world distributions (Pan et al. findings). Section 3 provides extensive motivation for why this benchmark design measures what it claims."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No scaffolding is involved. CodeT5 is a direct seq2seq model and Cardumen is a template-based G&V tool."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The paper does not discuss whether CodeT5's pre-training data temporally overlaps with Project CodeNet submissions. Both were publicly available around 2021, creating a plausible temporal leakage risk."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The paper does not discuss whether any information in the model's input (e.g., problem IDs, language tokens) could leak information about the expected fix."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "The paper addresses non-independence through deduplication: exact duplicates removed, trigram-based cross-set near-duplicate removal, and ensuring same-user/same-problem submissions don't cross splits (Section 4, 'Deduplication' and 'Data splitting')."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Trigram similarity is used as a concrete leakage detection method for cross-set near-duplicates. Exact matching is used for full duplicates. The paper explicitly compares this to prior work which only used exact matching (Section 2)."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "RunBugRun contains ~450,000 executable buggy/fixed program pairs in 8 programming languages with test cases and fine-grained bug labels.",
    368       "evidence": "Section 5.1 provides detailed statistics: 417,101 training, 32,504 test, 7,144 validation instances across C, C++, Java, Python, Ruby, PHP, JavaScript, Go (Figure 4). Bug labels (136 leaf categories), test cases, and error messages are documented.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "The CodeT5-based NPR baseline achieves 59.8% plausibility rate across all languages with 5 candidates per bug.",
    373       "evidence": "Section 5.2.2 reports this figure. Per-language breakdown in Table 5: Python 69%, C 64.4%, Java 62.9%, PHP 55.7%, C++ 54.9%, Go 47.8%, Ruby 47.6%, JavaScript 44.5%.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Cross-language knowledge transfer significantly boosts repair performance for low-resource languages.",
    378       "evidence": "Table 5 (RQ2.3) shows 17 models with different training regimens. Transfer effects: +34.5pp for PHP, +31.8pp for Go, +28.3pp for JavaScript when training on all languages vs. target language only. Small gains for high-resource languages: +1.2pp for C++, +2.2pp for Python.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "There is a strong negative correlation (r = −0.996) between the number of fixing changes and NPR baseline performance.",
    383       "evidence": "Section 5.2.2 reports Pearson's r = −0.996 for the NPR baseline and r = −0.6823 for the G&V baseline. Performance ranges from 68.7% (1 change) to 42.8% (6 changes) for the NPR model.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Cardumen (G&V baseline) fixed only ~5% of Java bugs, performing poorly compared to the NPR baseline.",
    388       "evidence": "Section 5.2.2 reports 157 of ~3,300 Java bugs fixed. The authors acknowledge the constrained time budget (-maxtime 10 -maxgen 100) and low redundancy in small programs as contributing factors.",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "Only 3.5% of plausible NPR patches were found to be incorrect upon manual assessment.",
    393       "evidence": "Section 7 reports manual assessment of 200 randomly selected plausible fixes: 7 (3.5%) incorrect, 46.5% identical to ground truth. However, 19 patches were difficult to assess and required execution-based comparison.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "No variance or error bars on any results",
    400       "detail": "All baseline results appear to be from single experimental runs. No standard deviations, confidence intervals, or seed sensitivity analysis is reported for any of the 17+ model configurations trained and evaluated. This makes it impossible to assess result stability."
    401     },
    402     {
    403       "flag": "Unfair G&V baseline comparison",
    404       "detail": "Cardumen was given a 10-minute time budget and 100 generations per bug, which the paper acknowledges is 'unusually low' compared to typical G&V budgets of hours. This severely constrains Cardumen's performance, making the NPR vs. G&V comparison misleading. The paper notes this but the headline comparison (5% vs 60%) is still presented."
    405     },
    406     {
    407       "flag": "Pre-training contamination not addressed",
    408       "detail": "CodeT5 was pre-trained on code data, and Project CodeNet was publicly released in the same year (2021) as CodeT5, so overlap cannot be ruled out. The paper does not discuss whether CodeT5's pre-training corpus included any programming contest submissions that overlap with the benchmark data."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    414       "authors": ["Yue Wang"],
    415       "year": 2021,
    416       "arxiv_id": "2109.00859",
    417       "relevance": "Pre-trained code model used as the NPR baseline in the paper, representative of neural approaches to program repair."
    418     },
    419     {
    420       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    421       "authors": ["René Just", "Darioush Jalali", "Michael D. Ernst"],
    422       "year": 2014,
    423       "doi": "10.1145/2610384.2628055",
    424       "relevance": "The most widely used APR benchmark; RunBugRun positions itself as complementary with executable infrastructure inspired by Defects4J."
    425     },
    426     {
    427       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    428       "authors": ["Thibaud Lutellier"],
    429       "year": 2020,
    430       "doi": "10.1145/3395363.3397369",
    431       "relevance": "NPR system that mined 3.2M buggy/fixed code pairs for training; relevant for understanding data-hungry neural repair approaches."
    432     },
    433     {
    434       "title": "Neural Program Repair with Execution-Based Backpropagation",
    435       "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"],
    436       "year": 2022,
    437       "doi": "10.1145/3510003.3510222",
    438       "relevance": "RewardRepair integrates test execution feedback into NPR training, directly relevant to RunBugRun's goal of enabling execution-based NPR."
    439     },
    440     {
    441       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    442       "authors": ["Zimin Chen"],
    443       "year": 2019,
    444       "doi": "10.1109/TSE.2019.2940179",
    445       "arxiv_id": "1901.01808",
    446       "relevance": "Pioneering seq2seq approach to NPR that frames repair as neural machine translation, foundational to the NPR paradigm RunBugRun targets."
    447     },
    448     {
    449       "title": "FixEval: Execution-based Evaluation of Program Fixes for Programming Problems",
    450       "authors": ["Md Mahim Anjum Haque"],
    451       "year": 2022,
    452       "arxiv_id": "2206.07796",
    453       "relevance": "Compared execution-based and match-based evaluation metrics for NPR, finding execution-based metrics are better performance indicators — directly supports RunBugRun's motivation."
    454     },
    455     {
    456       "title": "SelfAPR: Self-supervised Program Repair with Test Execution Diagnostics",
    457       "authors": ["He Ye"],
    458       "year": 2022,
    459       "arxiv_id": "2203.12755",
    460       "relevance": "Demonstrates that error messages and stack traces from execution improve repair accuracy, motivating RunBugRun's collection of execution data."
    461     },
    462     {
    463       "title": "Competition-Level Code Generation with AlphaCode",
    464       "authors": ["Yujia Li"],
    465       "year": 2022,
    466       "arxiv_id": "2203.07814",
    467       "relevance": "AlphaCode's CodeContests dataset provided 331,834 additional test cases used in RunBugRun; relevant to AI code generation evaluation."
    468     },
    469     {
    470       "title": "CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks",
    471       "authors": ["Ruchir Puri"],
    472       "year": 2021,
    473       "arxiv_id": "2105.12655",
    474       "relevance": "The source dataset from which RunBugRun is derived; 13M+ programming contest submissions across 4,000+ problems."
    475     },
    476     {
    477       "title": "Ultra-Large Repair Search Space with Automatically Mined Templates: The Cardumen Mode of Astor",
    478       "authors": ["Matias Martinez", "Martin Monperrus"],
    479       "year": 2018,
    480       "relevance": "G&V baseline used in RunBugRun evaluation; template-based repair approach representative of generate-and-validate paradigm."
    481     },
    482     {
    483       "title": "Unified Pre-training for Program Understanding and Generation",
    484       "authors": ["Wasi Uddin Ahmad"],
    485       "year": 2021,
    486       "arxiv_id": "2103.06333",
    487       "relevance": "PLBART multi-language code model whose language identification token approach was adopted by RunBugRun's NPR baseline."
    488     },
    489     {
    490       "title": "Empirical review of Java program repair tools: A large-scale experiment on 2,141 bugs and 23,551 repair attempts",
    491       "authors": ["Thomas Durieux"],
    492       "year": 2019,
    493       "relevance": "Found that APR tools overfit to the Defects4J benchmark, directly motivating RunBugRun's larger and more diverse evaluation set."
    494     }
    495   ],
    496   "engagement_factors": {
    497     "practical_relevance": {
    498       "score": 2,
    499       "justification": "Provides a usable benchmark and infrastructure for APR researchers; practitioners building repair tools can directly use this dataset for training and evaluation."
    500     },
    501     "surprise_contrarian": {
    502       "score": 1,
    503       "justification": "The finding that a simple NPR baseline outperforms the G&V baseline by roughly 12x (59.8% plausibility across all languages vs. ~5% of Java bugs fixed by Cardumen) is notable, and cross-language transfer effects are interesting, but the overall message (execution helps, multilingual data is needed) confirms existing intuitions."
    504     },
    505     "fear_safety": {
    506       "score": 0,
    507       "justification": "No safety or security concerns raised; the paper is about improving automated program repair tools."
    508     },
    509     "drama_conflict": {
    510       "score": 0,
    511       "justification": "No controversy or conflict; straightforward dataset contribution paper."
    512     },
    513     "demo_ability": {
    514       "score": 2,
    515       "justification": "Full dataset, infrastructure, and code released on GitHub (https://github.com/giganticode/run_bug_run); researchers can download and run evaluations."
    516     },
    517     "brand_recognition": {
    518       "score": 0,
    519       "justification": "From University of Bolzano and University of Bordeaux; not a major AI lab or associated with a well-known product."
    520     }
    521   }
    522 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs