scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33055B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DEAR: A Novel Deep Learning-based Approach for Automated Program Repair",
      6     "authors": [
      7       "Yi Li",
      8       "Shaohua Wang",
      9       "Tien N. Nguyen"
     10     ],
     11     "year": 2022,
     12     "venue": "International Conference on Software Engineering",
     13     "arxiv_id": "2205.01859",
     14     "doi": "10.1145/3510003.3510177"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims are supported: '42%–683%' improvement matches Table 1 calculations (CoCoNuT 33→47 is 42%, CODIT 6→47 is 683%). '31–145 more bugs' on BigFix matches Table 4. '169 (25.3%) multi-hunk/multi-statement bugs' matches Table 6.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper makes causal claims ('DEAR outperforms') and justifies them through controlled ablation studies (RQ4) where individual components are removed one at a time from the system. The single-variable manipulation design is adequate for the ablation claims.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title claims 'Automated Program Repair' generally, and the abstract says 'fixing for the general bugs.' All evaluation is on Java only. The Limitations section acknowledges 'we currently focus on Java', but the title, abstract, and contribution claims are not bounded to Java.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not discuss alternative explanations for DEAR's improvements. The Threats to Validity section is one short paragraph about Java focus and baseline reimplementation, with no substantive discussion of confounding factors (e.g., dataset-specific properties, architectural advantages unrelated to multi-hunk capability).",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures number of correctly fixed bugs (matching developer fixes or passing test suites) and claims to evaluate automated program repair capability. The measurement directly matches the claim with no proxy gap.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A 'Limitations' paragraph appears at the end of Section 6 with five specific limitations, and a 'Threats to Validity' paragraph discusses Java-only evaluation and baseline reimplementation.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The Limitations section identifies specific issues: (1) rare/OOV names are challenging, (2) only bugs with failing tests, (3) cannot generate fixes with many new statements, (4) expansion may produce incorrect hunks, (5) Java-only. These are specific to this study.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states it cannot handle: security/vulnerability bugs, non-failing-test bugs, fixes with 'several new statements added or arbitrarily large sizes,' and currently focuses on Java only.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The Acknowledgments section discloses NSF funding: grants CNS-2120386, CCF-1723215, CCF-1723432, TWC-1723198, CCF-1518897, and CNS-1513263.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated: Yi Li and Shaohua Wang at New Jersey Institute of Technology, Tien N. Nguyen at University of Texas at Dallas. No commercial product is being evaluated.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The US National Science Foundation (NSF) is the funder and has no financial stake in whether DEAR outperforms specific APR tools.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms including 'hunks', bug types (Type-1 through Type-5), multi-hunk/multi-statement bugs, and fault localization are explicitly defined with examples.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three explicit contributions are listed: the FL technique for multi-hunk bugs, the compositional divide-and-conquer fixing approach, and the enhanced two-layer LSTM with attention and cycle training.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper engages substantively with both DL-based (DLFix, CoCoNuT, CURE, SequenceR, etc.) and pattern-based APR approaches, explaining why each is limited for multi-hunk bugs and how DEAR builds on DLFix's architecture.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper states 'Our data and tool are publicly available [2]' and reference [2] points to https://github.com/AutomatedProgramRepair-2021/dear-auto-fix.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All three evaluation datasets are publicly available: Defects4J v1.2.0 [1], BigFix [18], and CPatMiner [26]. The paper also claims data availability at the GitHub repository [2].",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions hardware ('a workstation with a 8-core Intel CPU and a single GTX Titan GPU') but provides no software dependencies, library versions, requirements.txt, or Dockerfile.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the approach and settings but does not include a README or reproduction guide.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results are reported as point estimates (e.g., '47 bugs fixed', '15.1%') with no confidence intervals or error bars in any table or figure.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper claims DEAR 'outperforms' baselines based solely on comparing raw numbers (e.g., 47 vs 36 bugs). No statistical significance tests (p-values, t-tests, etc.) are reported.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The paper provides both absolute differences and relative improvements: 'DEAR can auto-fix 32, 41, 33, 17, 14, and 11 more bugs... (i.e., 213%, 683%, 236%, 57%, 42%, and 31% relative improvements)' with baseline values clearly stated.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No justification for dataset sizes. The paper uses existing benchmarks (395 bugs in Defects4J, 26k in BigFix, 44k in CPatMiner) without discussing whether these are adequate for the claims being made.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No standard deviations, variance, or multi-run results are reported. All results appear to be from single experimental runs.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Extensive baselines: 6 DL-based APR models (DLFix, CoCoNuT, SequenceR, Tufano19, CODIT, CURE) and 8 pattern-based APR tools (Elixir, ssFix, CapGen, FixMiner, Avatar, Hercules, SimFix, TBar).",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include CURE (2021), CoCoNuT (2020), DLFix (2020), Hercules (2019), and TBar (2019) — all published within 1–3 years of the submission. These represent the state-of-the-art at the time.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "RQ4 (Section 6.4) presents a sensitivity analysis removing hunk detection, multi-statement expansion, and attention+cycle training individually, showing each component's contribution (Table 9).",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper reports number of correct patches, number of plausible patches, top-K metrics (top-1, top-3, top-5), and per-bug-type breakdowns.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "All evaluation is automated: test-case validation for Defects4J, exact match with developer fixes for BigFix and CPatMiner. No human evaluation of patch quality.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "For Defects4J: trained on CPatMiner and tested on Defects4J with 'no overlap between the two datasets.' For BigFix/CPatMiner: 80%/10%/10% split for training/tuning/testing. Cross-dataset evaluation also performed.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down by Defects4J project (Chart, Closure, Lang, Math, Mockito, Time) in Tables 1, 7; by bug type (Types 1-5) in Tables 2, 3, 6, 8; and across multiple datasets.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "The Limitations section discusses where DEAR fails: rare/OOV names, non-failing-test bugs, large dependent fixes. The sensitivity analysis (RQ4) discusses incorrect hunk detection and incorrect expansion. Specific multi-hunk bugs that Hercules handles but DEAR misses are analyzed.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Table 9 shows that adding hunk detection causes DEAR to fix 2 fewer Type-1 bugs, and expansion causes 3 fewer Type-1/Type-3 bugs. The paper acknowledges 'multi-statement expansion may expand the buggy hunk incorrectly.'",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "The paper references 'Google's pre-trained BERT model [8]' without specifying which variant (base/large) or version. GloVe and TreeCaps are referenced by citation only. No version identifiers for any pre-trained component.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "DEAR does not use prompting. It uses custom neural network models (tree-based LSTM, fine-tuned BERT, RNN) trained on data.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Section 5.3.1 reports detailed hyperparameter settings: BERT (epoch=4, batch=32, lr=1e-4), LSTM (epoch=200, lr=0.003, batch=128), GloVe (v-size=200, lr=0.001, batch=64, epoch=200), plus the search ranges tried.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "DEAR does not use agentic scaffolding. It is a direct neural model pipeline (fault localization → hunk detection → expansion → tree-based repair).",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3.1 documents subtree pairing rules. Section 3.2 describes context building including alpha-renaming, GloVe encoding, TreeCaps summarization. Section 5.3.1-5.3.2 describe dataset splitting (80/10/10), cross-dataset setup, and that CPatMiner was used for training while Defects4J for testing.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "All three datasets are publicly available: Defects4J [1] with source code and test cases, BigFix [18], and CPatMiner [26]. The authors also claim to release their data at [2].",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 5.2 describes the three datasets with their properties: 'Defects4J v1.2.0 [1] with 395 bugs with test cases; BigFix [18] with +26k bugs in +1.8 million buggy methods; CPatMiner [26] with +44k bugs from 5,832 Java projects.'",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data comes from standard public benchmarks (Defects4J, BigFix, CPatMiner).",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The paper documents: CPatMiner used for AST differencing, subtree pairing rules (Section 3.1), context building pipeline (Section 3.2), dataset splits (80/10/10), and test set sizes (4,415 CPatMiner, 2,594 BigFix tested bugs). The pipeline from raw data through training and evaluation is traceable.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "DEAR trains its own models (LSTM, fine-tuned BERT, RNN) on specific datasets. It does not evaluate a pre-trained model's inherent capability on a benchmark. The concern of pre-training contamination does not apply.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "Same rationale: DEAR is not evaluating a pre-trained model on a benchmark. The authors control train/test splits directly. They do use separate datasets for training and testing on Defects4J.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "DEAR is a custom-trained system, not a pre-trained model being evaluated on benchmarks. Pre-training contamination is not applicable.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study. All evaluation is automated.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Section 6.5 reports: 'predicting on CPatMiner took 2.4-3.1 seconds for each candidate patch,' 'predicting on BigFix took 3.6-4.2 seconds,' 'Predicting on Defects4J took only 2.1 seconds.' Test validation times also reported.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Section 6.5 reports training time: '+22 hours' on CPatMiner, '18-19 hours' on BigFix. Hardware stated: '8-core Intel CPU and a single GTX Titan GPU.' Training parameters: DEAR 0.39M vs CURE 3.1M.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from a single run.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The paper does not state how many experimental runs produced the reported results.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "The paper lists hyperparameter ranges searched (Section 5.3.1) but does not report the total number of configurations tried or compute spent on the search.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": true,
    390           "justification": "The paper states 'We tuned all approaches with the aforementioned parameters on the same CPatMiner dataset to obtain the best performance' and reports the best settings found. Tuning on CPatMiner and testing on Defects4J avoids selection on the test set.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "The paper makes numerous comparisons across 6+ DL baselines and 8 pattern-based baselines without any statistical tests, let alone corrections for multiple comparisons.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors re-implemented CURE ('which is unavailable') and replicated other baselines. They note 'We tried our best to re-implement the pattern-based APR baselines and CURE for a fair comparison' but do not discuss the systematic bias of authors implementing their own baselines.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": true,
    408           "justification": "Section 6.5 compares DEAR's training parameters (0.39M) against CURE (3.1M), showing DEAR achieves better results with 7x fewer parameters. Training times are also reported.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "No discussion of whether Defects4J, BigFix, or CPatMiner actually measure real-world bug-fixing capability. The paper uses these benchmarks without questioning their construct validity.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No agentic scaffolding is involved. All tools are compared as direct model pipelines. The FL tool (Ochiai) is controlled to be the same across approaches.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of whether training data temporally overlaps with test bugs, or whether BERT pre-training data includes code from the evaluation datasets.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether any features leak information about the fix. In RQ2, actual bug locations are provided to all tools, which is acknowledged but is the evaluation design rather than leakage.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "For BigFix and CPatMiner, random 80/10/10 splits are used without discussing whether train and test bugs come from the same projects or share structural similarities that could inflate performance.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No concrete leakage detection or prevention method is applied. For Defects4J, the paper states 'no overlap between the two datasets' (CPatMiner and Defects4J) but provides no verification.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "DEAR outperforms all DL-based APR baselines by 42%–683% on Defects4J in terms of number of auto-fixed bugs using top-1 patches.",
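    455       "evidence": "Table 1 shows DEAR fixes 47 bugs vs 36 (CURE), 33 (CoCoNuT), 30 (DLFix), 15 (SequenceR), 14 (Tufano19), 6 (CODIT). The relative gains therefore range from 31% (over CURE) to 683% (over CODIT); the stated 42% lower bound corresponds to CoCoNuT, not CURE.",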
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "DEAR is the first DL-based APR model capable of fixing multi-hunk/multi-statement bugs that require dependent changes to multiple statements at once.",
    460       "evidence": "Table 2 shows all existing DL baselines (DLFix, CoCoNuT, CURE) fix 0 bugs of Types 2–5 (multi-statement/multi-hunk), while DEAR fixes 18.",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "DEAR achieves comparable performance to state-of-the-art pattern-based APR tools on Defects4J.",
    465       "evidence": "Table 7 shows DEAR fixes 47 bugs vs Hercules (49) and TBar (43); DEAR fixes 12 bugs Hercules missed and 15 more multi-hunk bugs than TBar.",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "DEAR requires 7× fewer training parameters than the best baseline CURE while achieving better results.",
    470       "evidence": "RQ5 reports 0.39M parameters for DEAR vs 3.1M for CURE on CPatMiner (7× ratio), with DEAR fixing more bugs on all datasets.",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Each component of DEAR (hunk detection, multi-statement expansion, attention-cycle training) independently contributes to performance.",
    475       "evidence": "Table 9 shows removing hunk detection drops total from 47 to 35, removing expansion drops to 43, removing attention-cycle drops to 40.",
    476       "supported": "strong"
    477     },
    478     {
    479       "claim": "DEAR's approach generalizes across datasets, outperforming baselines in both within-dataset and cross-dataset settings.",
    480       "evidence": "Tables 4 and 5 show DEAR achieves highest Top-1/3/5 rates on both BigFix and CPatMiner, including cross-dataset training scenarios.",
    481       "supported": "moderate"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval"
    486   ],
    487   "key_findings": "DEAR advances DL-based automated program repair by introducing a multi-hunk, multi-statement fixing capability that no prior DL-based APR system possessed. On Defects4J, DEAR fixes 47 bugs (31%–683% more than baselines) including 18 multi-statement bugs that all DL baselines score zero on, reaching parity with the best pattern-based tools. DEAR achieves this with 7× fewer parameters than the best baseline (CURE) by combining spectrum-based fault localization with fine-tuned BERT, a divide-and-conquer AST subtree strategy, and cycle-trained tree-LSTM with attention. The ablation study confirms each component independently contributes, with multi-statement expansion being the key enabler for the most challenging bug types.",
    488   "red_flags": [
    489     {
    490       "flag": "No statistical significance testing",
    491       "detail": "All comparative claims are based on raw bug counts with no significance tests, confidence intervals, or variance estimates across runs. A difference of 11 bugs (DEAR vs CURE on Defects4J) on a 395-bug dataset is presented as conclusive without any uncertainty quantification."
    492     },
    493     {
    494       "flag": "Single-run results",
    495       "detail": "No indication that experiments were repeated multiple times; DL training has stochasticity but no variance across runs is reported."
    496     },
    497     {
    498       "flag": "CURE re-implementation risk",
    499       "detail": "CURE's source code was 'unavailable' so authors re-implemented it. Re-implementations may underperform the original, potentially inflating DEAR's advantage over this key baseline."
    500     },
    501     {
    502       "flag": "No environment specification",
    503       "detail": "Reproducibility is hampered by absence of dependency specifications, CUDA version, or container configuration; hardware is listed but software stack is not."
    504     },
    505     {
    506       "flag": "Java-only evaluation",
    507       "detail": "All three benchmarks are Java-only, so the claims about fixing 'general bugs' are supported only for Java, despite the broad framing of the title and abstract."
    508     }
    509   ],
    510   "cited_papers": [
    511     {
    512       "title": "DLFix: Context-Based Code Transformation Learning for Automated Program Repair",
    513       "relevance": "Direct predecessor and baseline; DEAR enhances DLFix's tree-based LSTM architecture with multi-hunk capability."
    514     },
    515     {
    516       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    517       "relevance": "State-of-the-art DL-based APR baseline using GPT; primary competitor that DEAR outperforms on all datasets."
    518     },
    519     {
    520       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    521       "relevance": "Key DL baseline for comparison; DEAR fixes 14–42% more bugs across datasets."
    522     },
    523     {
    524       "title": "Graph-Based Mining of in-the-Wild, Fine-Grained, Semantic Code Change Patterns (CPatMiner)",
    525       "relevance": "Provides the largest training dataset (44k+ bugs) and the AST-based change detection tool used throughout DEAR's pipeline."
    526     },
    527     {
    528       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    529       "relevance": "State-of-the-art pattern-based APR baseline used to show DEAR reaches pattern-based tool performance levels."
    530     },
    531     {
    532       "title": "Harnessing Evolution for Multi-Hunk Program Repair (Hercules)",
    533       "relevance": "The only prior APR tool targeting multi-hunk bugs; DEAR shows complementary results, fixing 7 bugs Hercules missed."
    534     },
    535     {
    536       "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
    537       "relevance": "DEAR fine-tunes BERT for the hunk detection component to learn fixing-together relationships among statements."
    538     },
    539     {
    540       "title": "Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial Networks",
    541       "relevance": "Source of cycle training technique adapted for DEAR's tree-based LSTM to improve code transformation learning."
    542     },
    543     {
    544       "title": "Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks",
    545       "relevance": "Foundational Tree-LSTM architecture that DEAR extends with attention and cycle training for APR."
    546     },
    547     {
    548       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    549       "relevance": "DL-based APR baseline using sequence-to-sequence translation; represents the single-statement-at-a-time limitation DEAR overcomes."
    550     }
    551   ],
    552   "engagement_factors": {
    553     "practical_relevance": {
    554       "score": 2,
    555       "justification": "DEAR is a working APR tool with released code that practitioners could apply to Java bug fixing, though it requires significant ML pipeline setup."
    556     },
    557     "surprise_contrarian": {
    558       "score": 1,
    559       "justification": "The multi-hunk capability is novel for DL-based APR but the general finding that a better approach fixes more bugs is expected."
    560     },
    561     "fear_safety": {
    562       "score": 0,
    563       "justification": "No AI safety or security concerns raised; purely a software engineering tool."
    564     },
    565     "drama_conflict": {
    566       "score": 0,
    567       "justification": "No controversy; straightforward academic contribution with incremental improvements."
    568     },
    569     "demo_ability": {
    570       "score": 1,
    571       "justification": "Code is released on GitHub but requires training data, GPU setup, and ML pipeline expertise to run."
    572     },
    573     "brand_recognition": {
    574       "score": 0,
    575       "justification": "Academic authors from NJIT and UT Dallas; no major industry lab or famous product involved."
    576     }
    577   },
    578   "hn_data": {
    579     "threads": [
    580       {
    581         "hn_id": "36018657",
    582         "title": "DarkBERT: A Language Model for the Dark Side of the Internet",
    583         "points": 142,
    584         "comments": 59,
    585         "url": "https://news.ycombinator.com/item?id=36018657"
    586       },
    587       {
    588         "hn_id": "38162779",
    589         "title": "Category Theory for Programming",
    590         "points": 47,
    591         "comments": 6,
    592         "url": "https://news.ycombinator.com/item?id=38162779"
    593       },
    594       {
    595         "hn_id": "30565951",
    596         "title": "Improved Approximation Algorithms and Lower Bounds for Search-Diversification",
    597         "points": 5,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=30565951"
    600       },
    601       {
    602         "hn_id": "35994539",
    603         "title": "DarkBERT: A Language Model for the Dark Side of the Internet",
    604         "points": 4,
    605         "comments": 0,
    606         "url": "https://news.ycombinator.com/item?id=35994539"
    607       },
    608       {
    609         "hn_id": "36013633",
    610         "title": "DarkBERT: A Language Model for the Dark Side of the Internet",
    611         "points": 3,
    612         "comments": 1,
    613         "url": "https://news.ycombinator.com/item?id=36013633"
    614       },
    615       {
    616         "hn_id": "43311058",
    617         "title": "A programmable environment for shape optimization and shapeshifting problems",
    618         "points": 2,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=43311058"
    621       },
    622       {
    623         "hn_id": "46211392",
    624         "title": "A Simple Proof of the Riemann Hypothesis",
    625         "points": 1,
    626         "comments": 0,
    627         "url": "https://news.ycombinator.com/item?id=46211392"
    628       },
    629       {
    630         "hn_id": "44067109",
    631         "title": "The effectiveness of Large Language Models in the mechanical design domain",
    632         "points": 1,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=44067109"
    635       },
    636       {
    637         "hn_id": "40350177",
    638         "title": "GPT-4 passes most of the 297 written Polish Board Certification Examinations",
    639         "points": 1,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=40350177"
    642       }
    643     ],
    644     "top_points": 142,
    645     "total_points": 206,
    646     "total_comments": 66
    647   }
    648 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs