scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32899B)
      1 {
      2   "paper": {
      3     "title": "Improving Automated Program Repair with Domain Adaptation",
      4     "authors": ["Armin Zirak", "Hadi Hemmati"],
      5     "year": 2022,
      6     "venue": "ACM Transactions on Software Engineering and Methodology",
      7     "arxiv_id": "2212.11414",
      8     "doi": "10.1145/3631972"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Domain shift significantly degrades APR model accuracy, with TFix-Large losing 11.82% weighted average accuracy when tested on unseen projects. The proposed domain adaptation framework using FullFineTuning improves TFix-Large by 13.05% and CodeXGLUE by up to 39.6% on target projects, with adaptation taking only minutes versus days for full retraining. A novel bug generator model (TBug) synthesizes bug-fix pairs for zero-shot domain adaptation, improving accuracy on projects with no labeled data. Larger models benefit more from domain adaptation but are also more vulnerable to domain shift.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'We publish all the source codes, models and results in a public repository' with a footnote linking to https://github.com/arminzirak/TFix."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: the TFix JavaScript dataset created by Berabi et al. and the CodeXGLUE Java dataset by Tufano et al./Lu et al. Both are referenced public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper specifies hardware (32GB v100l GPU, 4 CPU cores, 30GB RAM on ComputeCanada Cedar) in Section 4.4 but does not provide software dependency specifications such as requirements.txt, Dockerfile, or library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper provides a GitHub repository link but does not include step-by-step reproduction instructions in the paper itself. It states 'We use the scripts published by authors of TFix and CodeXGLUE' but provides no detailed commands or reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (exact match percentages) in Tables 4-20 with no confidence intervals, error bars, or uncertainty measures."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims improvements (e.g., '13.05% improvement') by comparing raw numbers between methods without any statistical significance tests (no p-values, t-tests, or other tests)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context throughout. For example, TFix-Large Default accuracy is 54.19% and FFT achieves 67.24%, with the improvement of 13.05% explicitly stated. Per-project results provide before/after comparisons."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The thresholds of ≥150 samples for target project selection (TFix) and ≥50 for CodeXGLUE are stated but not formally justified via power analysis. The paper acknowledges 'projects with very few samples are not proper candidates' but provides no statistical justification for the thresholds."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results appear to be from single experimental runs. No variance, standard deviation, or spread across multiple runs or random seeds is reported in any table."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper includes two baseline approaches: 'Default' (pretrained model as-is, equivalent to excluded design) and 'Baseline' (included design with target data in training). All DA methods are compared against both (Section 5.2)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "TFix (2021, ICML) and CodeXGLUE (2021) were state-of-the-art APR methods at the time. The DA methods compared (FullFineTuning, Adapter Layers, Curriculum Learning) are established NLP domain adaptation approaches cited from recent literature."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares 5 DA method variants (FFT, TLWAL, CLS, CLL, CLC) plus Default and Baseline across 8 projects and 2 model sizes, effectively ablating the DA approach. Three types of curriculum learning are compared. The bug generator is tested with and without metadata (Table 12)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper evaluates effectiveness (Exact Match), efficiency (preparation time, inference time, model size), and reliability (exposure bias). RQ2 systematically addresses all three dimensions."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated using Exact Match. No human evaluation of generated fixes is performed. The paper acknowledges Exact Match as a limitation but includes no manual inspection of fix quality."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper explicitly separates train/validation/test splits per project and error type (80/20 split, Section 5.1.1, Table 3). The test set is held constant across all RQs: 'we want to keep the test set the same between RQs[1-3].'"
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Per-project breakdowns are provided in all results tables (Tables 4-6, 10-11, 13-20). TBug accuracy is broken down by error type (Table 12, 51 error types). Results are shown for both model sizes."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where methods fail: ONM is too easy for comparison, LivelyKernel has too few samples for reliable DA (Section 5.2.2), CurriculumLearning underperforms on TFix-Large, CodeXGLUE achieves 0% on some projects (Coprhd Controller), and the synthetic approach shows limited benefit on small CodeXGLUE datasets."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Negative results include: CurriculumLearning methods performing worse than FFT and TLWAL, TFix-Small showing small exposure bias loss after DA (Table 10), accuracy decreasing in some projects after DA (e.g., Dcos-ui with TFix-Small FFT drops from 94.60 to 86.49 with synthetic data), and CodeXBUG showing limited improvements on some projects."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims 'improve TFix by 13.05%' which matches Table 5 (67.24-54.19=13.05). However, the claim of 'CodeXGLUE by 23.4%' does not match the supervised DA results in Table 17 (39.64% improvement). The 23.42% figure actually matches the synthetic data result from Table 19 (28.82-5.40=23.42), suggesting a mix-up between supervised and unsupervised results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims about DA improving accuracy. The study design supports these: included vs excluded scenarios control for domain shift with the same test set, DA methods are applied to the same pretrained model, and exposure bias tests verify the model doesn't simply overfit. Single-variable manipulation (adding target data to training) is used."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title 'Improving Automated Program Repair with Domain Adaptation' and abstract frame results broadly for APR in general, but experiments cover only 2 specific APR models (TFix, CodeXGLUE) on 2 languages (JavaScript, Java) with 19 total projects. Section 5.6 acknowledges 'results might not be applicable to other APR methods' but the title does not reflect this bounded scope."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5.7 (Validity Threats) addresses specific alternatives: the accuracy drop could be due to fewer training samples (argued against: only 2% excluded), improvements could be from more data rather than domain-specific learning (addressed via exposure bias analysis in RQ2.3), and construct validity of Exact Match is discussed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper consistently frames results in terms of the measured metric (Exact Match accuracy) without inflating it to broader claims about code quality or repair capability. Section 5.6 explicitly discusses the limitations of Exact Match as a proxy and acknowledges it as a limitation of APR research generally."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "T5-Small and T5-Large are specified as the model architectures from Raffel et al. [74], and CodeBERT from Feng et al. [31] with specific architecture details (6 layers, 768-dimensional hidden states, 12 attention heads). These are fixed pretrained artifacts, not evolving API models."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper uses fine-tuned sequence-to-sequence models, not prompt-based LLMs. The input formatting ('fix error type error message buggy line: error context ⇒ fixed line') is documented but this is model input format, not prompting."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper states 'We use the scripts published by authors of TFix and CodeXGLUE to run the experiments. The only change we make... is the data division. We do not change other configurations and hyperparameters.' However, the actual hyperparameter values (learning rate, batch size, etc.) are not reported. Adapter layer dimensions are mentioned as hyperparameters but values are not given."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a standard ML training and evaluation pipeline with fine-tuned sequence-to-sequence models."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data preprocessing is documented: TFix input formatting with ESLint metadata (Section 2.1.1), CodeXGLUE variable normalization and AST processing (Section 2.1.2), data splitting per project and error type with 80/20 ratio (Section 5.1.1), project filtering criteria (≥150 samples for TFix, ≥50 for CodeXGLUE), and statistics of resulting splits (Tables 2-3)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.6 'Limitations of the Proposed Methods and the Conducted Study' provides extensive discussion of limitations. Section 5.7 'Validity Threats' covers construct, internal, external, and conclusion validity in four subsections."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5.7 discusses specific threats: limited to 2 APR methods that may not generalize, CodeXGLUE dataset has few samples per project, excluded data is only 2% (addressing internal validity concern), Exact Match metric limitations, and unavailability of Error Removal metric code. These are specific to this study, not generic disclaimers."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope boundaries: 'One may argue that the results and findings might not be applicable to other APR methods' (Section 5.6), 'we narrow the scope of this paper to logical one-line errors' (Section 2.1), and 'Detection and localization of buggy code lines are separate lines of research outside the scope of this paper.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper publishes 'all the source codes, models and results in a public repository' (Section 1). The underlying datasets (TFix and CodeXGLUE) are also publicly available from the original papers."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection is described for both datasets: TFix collected 5.5M GitHub commits and extracted ~100K aligned bug-fix pairs using ESLint, bipartite matching, and Myers Diff (Section 2.1.1). CodeXGLUE used GitHub events with commit-message-based bug-fix selection, variable normalization, and filtering of lexical/syntactical errors (Section 2.1.2)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard public benchmarks (TFix JavaScript dataset, CodeXGLUE Java dataset)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The data pipeline is documented: project labeling, filtering by sample count thresholds, splitting per project and error type (80/20), aggregation into source/target sets with explicit statistics (Tables 2-3), and validation/test separation. The pipeline from pretrained model through DA to evaluation is depicted in Figures 3, 7, 8, and 9."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper text. University of Calgary researchers likely have funding but it is not disclosed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: both authors are from the University of Calgary, Canada. They are not affiliated with any company whose product is being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The authors use ComputeCanada resources, which could imply government funding, but this is not stated."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interest declaration appears in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not state the training data cutoff dates for T5 or CodeBERT's pre-training. While the fine-tuning data splits are carefully controlled, the pre-trained models' training data boundaries are not discussed."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "The entire paper is designed around controlling train/test overlap at the project level. The included/excluded experimental design (Section 5.1) explicitly measures the effect of having target project data in the training set. However, pre-training data overlap for T5/CodeBERT is not discussed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper does not discuss whether the TFix or CodeXGLUE benchmark datasets were available online before T5 or CodeBERT were pre-trained, leaving potential pre-training contamination unaddressed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. It is a computational evaluation of APR models."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study uses publicly available code datasets."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 9 reports inference times (0.39 sec for TFix-Small, 1.06 sec for TFix-Large). Table 7 reports model preparation times for all DA methods (ranging from seconds to minutes for DA, vs. 1d 19h for Baseline with TFix-Large)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 4.4 specifies: 'single node of ComputeCanada (Cedar) with 32GB v100l GPU, 4 CPU cores, and 30GBs RAM.' Training times are reported in Table 7. Model sizes are in Table 8. They note 'All experiments are possible with 16GB GPU, 1 CPU core, and 20 GB RAM.'"
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single experimental runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state how many times experiments were run. Results are presented as single values per configuration without any indication of repeated trials."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Adapter layer dimensions and curriculum learning parameters are mentioned as hyperparameters, but no search budget, search method, or number of configurations tried is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "EarlyStopping is used with validation: 'It stops training epochs if no more improvement happens in four consecutive epochs. After each epoch, the model is evaluated, and the best version is stored.' All configurations (5 DA methods × 8 projects × 2 sizes) are reported in the tables."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper compares multiple methods across multiple projects (5 DA methods × 8 projects × 2 model sizes) without any correction for multiple comparisons (no Bonferroni, Holm, or similar adjustments)."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement their own DA methods and compare against their own implementations of Default and Baseline. They do not acknowledge or address potential bias from evaluating their own system."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "While preparation times (Table 7) and accuracy (Tables 5-6) are reported separately, performance is not explicitly plotted or compared as a function of compute budget. The Baseline approach uses vastly more compute (1d 19h vs 7 min) but this trade-off is not formally analyzed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses Exact Match as the primary metric and acknowledges it as a limitation in Section 5.6 ('we consider the metrics used in this study as a limitation') but does not validate whether Exact Match actually measures code repair quality (e.g., via manual inspection of generated fixes). A correct fix that differs textually from the reference is counted as wrong."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are evaluated directly via sequence-to-sequence inference."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper does not discuss whether T5 or CodeBERT's pre-training data temporally overlaps with the bug-fix benchmark datasets. The TFix dataset comes from GitHub commits that could be in T5's training corpus (C4/Common Crawl)."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "TFix uses ESLint metadata (error type, message, location) as additional input features. While this is by design, the paper does not discuss whether this auxiliary information constitutes feature leakage compared to a real deployment scenario where such information may differ."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The entire paper addresses non-independence of train and test data through the cross-project domain shift framework. The included/excluded design explicitly tests the impact of shared projects between training and testing. Data is split by both project and error type."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The project-based data splitting serves as a leakage prevention method: target project data is completely excluded from training in the 'excluded' design. The paper validates this separation by comparing included vs excluded scenarios and measuring the resulting performance gap."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Domain shift reduces TFix-Large accuracy by 11.82% (weighted average) when tested on unseen projects.",
    365       "evidence": "Table 4 shows weighted average drops from 66.01% (Included) to 54.19% (Excluded) for TFix-Large across 8 target projects. Per-project breakdowns show drops in 7 of 8 projects (Section 5.1.2).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "FullFineTuning improves TFix-Large effectiveness by 13.05% over the Default approach.",
    370       "evidence": "Table 5 shows weighted average improvement from 54.19% (Default) to 67.24% (FFT) for TFix-Large. Improvement observed in 7 of 8 target projects (Section 5.2.2, RQ2.1).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "TFix-Small is relatively resistant to domain shift compared to TFix-Large.",
    375       "evidence": "Table 4 shows TFix-Small weighted average drops only 1.47% (from 56.40% to 54.93%) vs 11.82% for TFix-Large. Authors attribute this to smaller model capacity preventing overfitting to project-specific distributions (Section 5.1.2).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The DA framework is efficient, with adaptation taking minutes rather than days.",
    380       "evidence": "Table 7 shows FFT adaptation takes 7m 09s for TFix-Large vs 1d 19h 19m for Baseline retraining. All DA methods take under 8 minutes for TFix-Large (Section 5.2.2, RQ2.2).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Domain-adapted models do not suffer from catastrophic exposure bias.",
    385       "evidence": "Tables 10-11 show that adapted TFix-Large models actually improve slightly on general data (from 40.01% pretrained to 46.39% average after FFT). TFix-Small shows a small drop from 38.03% to ~34% (Section 5.2.2, RQ2.3).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "TBug generates synthetic bugs with 44.68% exact match accuracy (TBug-Large with metadata).",
    390       "evidence": "Table 12 reports per-error-type accuracy for TBug, with weighted average of 44.68% for TBug-Large with metadata vs 27.13% without metadata (Section 5.3.2, RQ3.1).",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Synthetic data from TBug improves TFix-Large accuracy by 5.66% on projects with no labeled data.",
    395       "evidence": "Table 14 shows weighted average improvement from 54.19% (Default) to 59.85% (FFT with synthetic data) for TFix-Large. Improvement in 5 of 8 projects, same in 3 (Section 5.3.2, RQ3.2).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "CodeXGLUE is sensitive to domain shift, with accuracy dropping 26.13% on the small dataset.",
    400       "evidence": "Table 15 shows weighted average drop from 31.53% (Included) to 5.40% (Excluded) for CodeXGLUE small dataset. Snobot2015 drops from 85.29% to 0% (Section 5.4.2, RQ4.1).",
    401       "supported": "strong"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or statistical significance tests",
    407       "detail": "All results are single-run point estimates. No confidence intervals, error bars, standard deviations, or statistical significance tests are reported despite numerous claims of one method 'outperforming' another. With small test sets (36-115 samples), observed differences could be due to chance."
    408     },
    409     {
    410       "flag": "Numerical inconsistencies between abstract and tables",
    411       "detail": "The abstract claims the DA framework improves CodeXGLUE by 23.4%, but the supervised DA improvement in Table 17 is approximately 39.6% (weighted average). The 23.42% figure matches the synthetic data result (Table 19: 28.82-5.40=23.42), suggesting a mix-up between supervised and unsupervised results in the abstract."
    412     },
    413     {
    414       "flag": "Small test sets for some projects",
    415       "detail": "Several target projects have very small test sets (LivelyKernel: 36, Dcos-ui: 37, ONM: 39, Sequelize: 40). A single correct/incorrect prediction changes accuracy by 2.5-2.8 percentage points, making per-project comparisons unreliable without significance testing."
    416     },
    417     {
    418       "flag": "No seed sensitivity analysis",
    419       "detail": "All experiments appear to be single-run results. Fine-tuning results can vary substantially across random seeds, especially with small adaptation datasets (98-345 samples). Without multiple runs, the reported improvements may not be stable."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Tfix: Learning to fix coding errors with a text-to-text transformer",
    425       "authors": ["Berkay Berabi", "Jingxuan He", "Veselin Raychev", "Martin Vechev"],
    426       "year": 2021,
    427       "relevance": "State-of-the-art NMT-based APR model using T5 transformers, one of the two primary methods studied in this paper."
    428     },
    429     {
    430       "title": "Codexglue: A machine learning benchmark dataset for code understanding and generation",
    431       "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"],
    432       "year": 2021,
    433       "arxiv_id": "2102.04664",
    434       "relevance": "ML benchmark for code tasks including APR using CodeBERT encoder-decoder, the second primary method studied."
    435     },
    436     {
    437       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    438       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    439       "year": 2020,
    440       "arxiv_id": "2002.08155",
    441       "relevance": "Bimodal pretrained model for programming and natural languages, used as the encoder in CodeXGLUE's APR method."
    442     },
    443     {
    444       "title": "Sequencer: Sequence-to-sequence learning for end-to-end program repair",
    445       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano"],
    446       "year": 2019,
    447       "relevance": "Pioneering NMT-based APR model that formulates program repair as a sequence-to-sequence translation task."
    448     },
    449     {
    450       "title": "Coconut: combining context-aware neural translation models using ensemble for program repair",
    451       "authors": ["Thibaud Lutellier", "Hung Viet Pham", "Lawrence Pang"],
    452       "year": 2020,
    453       "relevance": "Context-aware NMT-based APR model using ensemble methods, representative of the NMT APR approach."
    454     },
    455     {
    456       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    457       "authors": ["Colin Raffel", "Noam Shazeer", "Adam Roberts"],
    458       "year": 2020,
    459       "relevance": "The T5 model architecture that underpins TFix, demonstrating transfer learning capabilities for text generation tasks."
    460     },
    461     {
    462       "title": "On Distribution Shift in Learning-based Bug Detectors",
    463       "authors": ["Jingxuan He", "Luca Beurer-Kellner", "Martin Vechev"],
    464       "year": 2022,
    465       "arxiv_id": "2204.10049",
    466       "relevance": "Analyzes domain shift effects on bug detection models including CuBERT and GNN, closely related to this work's domain shift analysis in APR."
    467     },
    468     {
    469       "title": "SelfAPR: Self-supervised Program Repair with Test Execution Diagnostics",
    470       "authors": ["He Ye", "Matias Martinez", "Xiapu Luo"],
    471       "year": 2022,
    472       "arxiv_id": "2203.12755",
    473       "relevance": "Self-supervised APR approach using AST perturbations for synthetic bug generation, directly comparable to the TBug data synthesis method proposed in this paper."
    474     },
    475     {
    476       "title": "Self-supervised bug detection and repair",
    477       "authors": ["Miltiadis Allamanis", "Henry Jackson-Flux", "Marc Brockschmidt"],
    478       "year": 2021,
    479       "relevance": "BugLab: jointly trained bug detector and selector for synthetic bug creation, related to the bug generation approach in this paper."
    480     },
    481     {
    482       "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation",
    483       "authors": ["Michele Tufano", "Cody Watson", "Gabriele Bavota"],
    484       "year": 2019,
    485       "relevance": "Foundational work formulating APR as NMT and creating the Java bug-fix dataset used by CodeXGLUE."
    486     },
    487     {
    488       "title": "GraphCodeBERT: Pre-training code representations with data flow",
    489       "authors": ["Daya Guo", "Shuo Ren", "Shuai Lu"],
    490       "year": 2020,
    491       "arxiv_id": "2009.08366",
    492       "relevance": "Extension of CodeBERT incorporating data flow for code representation, relevant to pretrained code models for APR."
    493     }
    494   ],
    495   "engagement_factors": {
    496     "practical_relevance": {
    497       "score": 2,
    498       "justification": "Practitioners building APR tools could directly apply the domain adaptation framework to improve model performance on their specific projects, with code publicly available."
    499     },
    500     "surprise_contrarian": {
    501       "score": 1,
    502       "justification": "The finding that domain shift degrades APR performance is somewhat expected from NLP research; the main novelty is quantifying it specifically for program repair."
    503     },
    504     "fear_safety": {
    505       "score": 0,
    506       "justification": "No AI safety or security concerns are raised by this work on automated program repair domain adaptation."
    507     },
    508     "drama_conflict": {
    509       "score": 0,
    510       "justification": "No controversy or dramatic claims; a methodical empirical study of domain adaptation for APR."
    511     },
    512     "demo_ability": {
    513       "score": 1,
    514       "justification": "Code is released on GitHub, but replicating the results requires specific datasets and compute infrastructure; it is not a simple pip-installable tool."
    515     },
    516     "brand_recognition": {
    517       "score": 0,
    518       "justification": "Authors are from the University of Calgary, not a major AI lab. TFix and CodeXGLUE are known in the APR community but not widely outside it."
    519     }
    520   }
    521 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs