ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (31723B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A Deep Dive into Large Language Models for Automated Bug Localization and Repair",
      6     "authors": [
      7       "Soneya Binta Hossain",
      8       "Nan Jiang",
      9       "Qiang Zhou",
     10       "Xiaopeng Li",
     11       "Wen-Hao Chiang",
     12       "Yingjun Lyu",
     13       "Hoan Nguyen",
     14       "Omer Tripp"
     15     ],
     16     "year": 2024,
     17     "venue": "Proc. ACM Softw. Eng.",
     18     "arxiv_id": "2404.11595",
     19     "doi": "10.1145/3660773"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract claims SOTA on CodeXGLUE (supported by Table 1, PolyCoder-2.7B at 25.07% vs NSEdit 23.86%) and better/comparable performance on Defects4J (supported by Table 3, Top-10 through Top-100 rankings).",
     27         "source": "opus"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Causal claims about prompt design improving accuracy are supported by controlled experiments (RQ3) where only the prompt changes while model, data, and ground-truth locations remain fixed. The adjustment module ablation (RQ5) is similarly controlled.",
     33         "source": "opus"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The title claims 'Large Language Models' broadly but experiments use only 6 models from 110M to 2.7B parameters—far smaller than current frontier models. Datasets are predominantly Java. These scope limitations are not acknowledged in the title or abstract framing.",
     39         "source": "opus"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Section 5 (Threats to Validity) offers generic disclaimers: 'there's a possibility that our results may not generalize across other datasets' and 'it is conceivable that [scripts] might contain bugs.' No substantive alternative explanations for the observed results are discussed (e.g., whether improvements are due to reduced sequence length rather than inductive bias).",
     45         "source": "opus"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper measures exact match accuracy and test-case pass rates, and frames results in terms of 'bug fixing accuracy'—which directly matches the measurements. No proxy gap exists between what is measured and what is claimed.",
     51         "source": "opus"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5 'Threats to Validity' provides a dedicated subsection discussing generalizability, implementation correctness, and accuracy metrics.",
     59         "source": "opus"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Section 5 states generic threats: 'there's a possibility that our results may not generalize across other datasets' and 'it is conceivable that [tools] might contain bugs.' These are boilerplate disclaimers rather than threats specific to this study's methodology.",
     65         "source": "opus"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that findings may be specific to models in the 110M–2.7B range, to Java code, or to single-hunk bugs.",
     71         "source": "opus"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding sources are mentioned despite 6 of 8 authors being affiliated with Amazon Web Services, suggesting corporate funding.",
     79         "source": "opus"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are clearly listed: Soneya Binta Hossain (University of Virginia), Nan Jiang (Purdue), and Qiang Zhou, Xiaopeng Li, Wen-Hao Chiang, Yingjun Lyu, Hoan Nguyen, Omer Tripp (all Amazon Web Services).",
     85         "source": "opus"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No funding is disclosed. With 6/8 authors from AWS, the work is implicitly corporate-funded. While the paper does not evaluate AWS products specifically, the lack of funding disclosure prevents assessment of independence.",
     91         "source": "opus"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     97         "source": "opus"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Token-granulated localization is explicitly defined and contrasted with line-level localization; exact match metric is defined; the three framework components are clearly described.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 1.4 lists four explicit contributions: token-granulated direction, four-prompt design, adjustment module for tokenizer discrepancy, and the comprehensive empirical study.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4 discusses NMT-based, edit-based, and LLM-based APR methods, explicitly positioning Toggle's differences (token vs. line granularity, separated localization/fixing, prompt design study) relative to prior work.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. Toggle's implementation is not released.",
    128           "source": "opus"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The paper uses publicly available datasets: CodeXGLUE (Tufano Small/Medium), CodeReviewer, Defects4J, and the GitHub dataset. All are referenced with citations and are accessible.",
    134           "source": "opus"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions GPU instances and Hugging Face models but does not specify library versions or environment details.",
    140           "source": "opus"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology in detail but does not include scripts, commands, or a README for reproducing experiments.",
    146           "source": "opus"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "All results in Tables 1–8 are reported as single point estimates (e.g., '25.07%') with no confidence intervals, error bars, or ± notation.",
    154           "source": "opus"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Claims like 'PolyCoder-2.7B reaching the highest accuracy of 25.07%' outperforming the baseline of 23.86% are made by comparing raw numbers with no statistical significance tests.",
    160           "source": "opus"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Results are reported with baseline context in all tables (e.g., Table 1 shows baseline accuracies alongside Toggle results, Table 4 shows improvement from Prompt 1 at 16.07% to Prompt 4 at 56.98%), allowing the reader to assess the magnitude of improvements.",
    166           "source": "opus"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No justification is given for sample sizes. The number of patches per bug (210 for Defects4J) is noted as 'relatively small' compared to prior work but not formally justified.",
    172           "source": "opus"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Section 5 mentions that the authors 'repeated each experiment several times to confirm consistency', but no standard deviations, variance, or spread measures appear in any results table.",
    178           "source": "opus"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Multiple baselines are included: NSEdit, CoText for CodeXGLUE; CodeT5, NSEdit for CodeReviewer; CURE, RewardRepair, Recoder, KNOD, Tare, AlphaRepair, TENURE for Defects4J (Tables 1, 3).",
    186           "source": "opus"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include methods from 2021–2023 (KNOD, Tare, AlphaRepair, TENURE from ICSE 2023), which are contemporary for a 2024 publication.",
    192           "source": "opus"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "RQ3 (four prompts compared), RQ4 (with/without contextual information), and RQ5 (with/without adjustment module) constitute comprehensive ablation studies isolating each component's contribution.",
    198           "source": "opus"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Exact match accuracy is used for CodeXGLUE/CodeReviewer, test-case validation for Defects4J, and Top-K metrics (Top-10, 30, 50, 100) for patch ranking.",
    204           "source": "opus"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "For Defects4J, Section 3.2.1 states: 'we manually executed all corrective patches to ensure they indeed pass all tests and effectively fix the bugs,' constituting manual verification of system outputs.",
    210           "source": "opus"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Section 3.1.1 states datasets are split 80/10/10 for train/validation/test. Defects4J (RQ2) serves as a completely unseen test set not used in fine-tuning.",
    216           "source": "opus"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Results are broken down by dataset (Tufano Small, Medium, CodeReviewer ±comments), by model backbone (6 LLMs), and by prompt type (4 prompts) across Tables 1, 4, 6, 7, 8.",
    222           "source": "opus"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Figure 7 shows a concrete failure case where correct localization leads to incorrect fix, and Figure 2 illustrates tokenizer discrepancy failures. Section 3.5.2 discusses why the bug fixing model prematurely closes methods.",
    228           "source": "opus"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "CodeGPT underperforms other models due to Java-only pretraining (Section 3.1.3). Prompt 4 underperforms Prompt 3 on Tufano datasets despite outperforming with ground-truth locations (Table 8 vs Table 4). Location prediction accuracy drops significantly for dual-token prediction (Table 7).",
    234           "source": "opus"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Specific model names with parameter counts are given: CodeGPT-110M (Java checkpoint), CodeParrot-110M (multi-language), CodeGen-350M/2B, PolyCoder-400M/2.7B, CodeT5-large (347M). These are identifiable on Hugging Face.",
    242           "source": "opus"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Figure 5 provides the complete structure of all four prompt formats with concrete examples showing how buggy code, shared prefix, shared suffix, and separators are arranged. The prompts are deterministically constructed from code, and the construction rules are fully specified.",
    248           "source": "opus"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": false,
    253           "justification": "No training hyperparameters are reported—no learning rate, batch size, number of epochs, optimizer, temperature for generation, or beam size. Only dataset splits (80/10/10) and number of generated patches (70/210) are mentioned.",
    254           "source": "opus"
    255         },
    256         "scaffolding_described": {
    257           "applies": false,
    258           "answer": false,
    259           "justification": "Toggle is a pipeline of fine-tuned models (localization → adjustment → fixing), not an agentic scaffold. No tool use, retry logic, or feedback mechanisms are involved.",
    260           "source": "opus"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Section 2.2 describes dataset characteristics and selection criteria. For Defects4J, 240 single-hunk bugs are selected. The GitHub dataset excludes Defects4J patches via AST comparison. Sections 2.3.1–2.3.3 detail how input prompts are constructed from buggy/fixed code pairs.",
    266           "source": "opus"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "While the benchmark datasets are public, the paper does not release model outputs, generated patches, predicted locations, or fine-tuned model weights. Independent verification of reported numbers is not possible.",
    274           "source": "opus"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 2.2 describes each dataset in detail: Tufano Small (58,350 samples) and Medium (65,465) from GitHub commits, CodeReviewer (183,881 multilingual samples), Defects4J (835 bugs from 17 projects, 240 single-hunk used), and the GitHub dataset (1,083,185 commits).",
    280           "source": "opus"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants. All data comes from standard public benchmarks.",
    286           "source": "opus"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The pipeline from bug localization to adjustment to bug fixing is described in detail (Sections 2.3.1–2.3.3). Figure 3 provides a visual overview. The adjustment model training pipeline is described step-by-step in Section 2.3.3 with concrete shift ranges (-3 to +3).",
    292           "source": "opus"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The pre-training data cutoff dates for CodeGPT, CodeParrot, CodeGen, PolyCoder, and CodeT5 are not stated. The paper does not mention when these models' training data was collected.",
    300           "source": "opus"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "Section 2.2 states that, for the GitHub dataset, 'patches associated with the Defects4J project or those resembling any in Defects4J v1.2 or v2.0 were meticulously excluded, based on an AST comparative analysis.' Standard splits are used for other datasets.",
    306           "source": "opus"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "While Defects4J overlap with training data is addressed, no discussion of whether CodeXGLUE benchmark examples could appear in the pre-training data of the base models (CodeGPT, CodeParrot, etc.) is provided.",
    312           "source": "opus"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in this study. All evaluation is on code benchmarks.",
    320           "source": "opus"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in this study.",
    326           "source": "opus"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in this study.",
    332           "source": "opus"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in this study.",
    338           "source": "opus"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in this study.",
    344           "source": "opus"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in this study.",
    350           "source": "opus"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants in this study.",
    356           "source": "opus"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No inference cost, latency, or tokens consumed are reported. The number of generated patches (210 per bug) is noted as 'relatively small' but no wall-clock time or compute cost is given.",
    364           "source": "opus"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No GPU hours, training time, or total computational budget is stated despite fine-tuning 6 LLMs across 4 datasets with 4 prompt styles (96 experiments) plus adjustment model training.",
    370           "source": "opus"
    371         }
    372       },
    373       "experimental_rigor": {
    374         "seed_sensitivity_reported": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "No mention of multiple random seeds. All results appear to be single-run numbers despite Section 5 claiming the authors 'repeated each experiment several times to confirm consistency.'",
    378           "source": "opus"
    379         },
    380         "number_of_runs_stated": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "Section 5 vaguely states 'repeated each experiment several times to confirm consistency' but the exact number of runs is not stated and no variance is shown in results tables.",
    384           "source": "opus"
    385         },
    386         "hyperparameter_search_budget": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "No hyperparameter search budget is reported. Training hyperparameters are not even listed, let alone the search process for selecting them.",
    390           "source": "opus"
    391         },
    392         "best_config_selection_justified": {
    393           "applies": true,
    394           "answer": true,
    395           "justification": "All configurations are systematically reported: 6 models × 4 prompts in Table 4, all models on all datasets in Tables 1 and 8. CodeParrot-110M was selected for Defects4J based on 'superior performance over the similarly sized CodeGPT-110M and comparable performance to larger models' (Section 3.2.1).",
    396           "source": "opus"
    397         },
    398         "multiple_comparison_correction": {
    399           "applies": true,
    400           "answer": false,
    401           "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons, despite making dozens of pairwise comparisons across 6 models, 4 prompts, and 4 datasets.",
    402           "source": "opus"
    403         },
    404         "self_comparison_bias_addressed": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "The authors re-implement some baselines (NSEdit on CodeReviewer, Section 3.1.2) and compare their own system against these re-implementations without acknowledging or addressing author-evaluation bias.",
    408           "source": "opus"
    409         },
    410         "compute_budget_vs_performance": {
    411           "applies": true,
    412           "answer": false,
    413           "justification": "Models range from 110M to 2.7B parameters (25× difference) but no performance-per-compute analysis is provided. Finding 1 notes larger models perform better but does not account for the additional compute required.",
    414           "source": "opus"
    415         },
    416         "benchmark_construct_validity": {
    417           "applies": true,
    418           "answer": false,
    419           "justification": "The paper uses exact match as the primary metric, noting only that it is 'commonly used' and 'a more fairer measure of accuracy than either BLEU or CodeBLEU scores' (Section 3.1.1). No deeper discussion of whether exact match captures real-world bug fixing utility.",
    420           "source": "opus"
    421         },
    422         "scaffold_confound_addressed": {
    423           "applies": false,
    424           "answer": false,
    425           "justification": "No agentic scaffolding is involved. Toggle is a deterministic pipeline of fine-tuned models, not a scaffold-based system.",
    426           "source": "opus"
    427         }
    428       },
    429       "data_leakage": {
    430         "temporal_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of whether the base models' pre-training data temporally overlaps with the benchmark datasets. CodeXGLUE and Defects4J predate the models' training, creating potential temporal leakage.",
    434           "source": "opus"
    435         },
    436         "feature_leakage_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "No discussion of whether the evaluation setup leaks information not available in real usage scenarios.",
    440           "source": "opus"
    441         },
    442         "non_independence_addressed": {
    443           "applies": true,
    444           "answer": true,
    445           "justification": "Section 2.2 states the GitHub dataset excluded patches 'associated with the Defects4J project or those resembling any in Defects4J v1.2 or v2.0' using AST comparative analysis, directly addressing train-test independence for the Defects4J evaluation.",
    446           "source": "opus"
    447         },
    448         "leakage_detection_method": {
    449           "applies": true,
    450           "answer": true,
    451           "justification": "An AST comparative analysis was used to identify and exclude Defects4J-related patches from the GitHub training dataset (Section 2.2), constituting a concrete decontamination method.",
    452           "source": "opus"
    453         }
    454       }
    455     }
    456   },
    457   "claims": [
    458     {
    459       "claim": "Toggle achieves new state-of-the-art on the CodeXGLUE code refinement benchmark (PolyCoder-2.7B: 25.07% EM on Tufano Small vs. NSEdit baseline of 23.86%)",
    460       "evidence": "Table 1 shows PolyCoder-2.7B and CodeGen-2B exceeding both NSEdit and CoText baselines on Tufano Small and Medium",
    461       "supported": "weak"
    462     },
    463     {
    464       "claim": "Token-granulated bug localization significantly outperforms line-level localization for bug fixing (e.g., line-granular reduces CodeGPT-110M to 28.5% vs. Prompt 2's 34.19% EM)",
    465       "evidence": "Mentioned in Section 3.3.2 for one model on one dataset; not systematically evaluated across all models",
    466       "supported": "weak"
    467     },
    468     {
    469       "claim": "Prompt 4 (excluding shared prefix and suffix) substantially outperforms simpler prompts under oracle location conditions (CodeGPT: 56.98% vs 16.07% for Prompt 1)",
    470       "evidence": "Table 4 shows consistent improvement across all 6 LLMs from Prompt 1 to Prompt 4 with ground-truth locations",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Toggle (CodeParrot-110M) outperforms all existing APR methods on Defects4J Top-10, Top-30, Top-50, and Top-100 metrics",
    475       "evidence": "Table 3 shows Toggle achieves 41/58/64/74 vs best alternatives 36/51/62/70 (KNOD) for those metrics",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Additional contextual information (buggy line numbers, code review comments) improves localization accuracy by ~20-30pp",
    480       "evidence": "Table 5 shows consistent improvement across all three datasets; e.g., Tufano Small starting token: 39.07% → 60.37% with line numbers",
    481       "supported": "strong"
    482     },
    483     {
    484       "claim": "The adjustment module consistently improves bug-fixing accuracy across all 16 dataset-model combinations",
    485       "evidence": "Table 6 shows improvement in every cell, ranging from 0.07pp to 2.02pp",
    486       "supported": "strong"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval"
    491   ],
    492   "key_findings": "Toggle introduces token-level (rather than line-level) bug localization integrated with a generative bug-fixing model and an optional adjustment module to bridge tokenizer discrepancies. Under oracle location conditions, Prompt 4 (excluding shared prefix and suffix) consistently yields the highest accuracy across all models, with improvements of more than 40pp over the baseline Prompt 1 (e.g., CodeGPT: 16.07% → 56.98%). On Defects4J, Toggle with a 110M parameter model outperforms larger and more complex existing APR systems on Top-10 through Top-100 patch generation metrics. However, benefits of Prompt 4 over Prompt 3 are dataset-dependent when predicted (imperfect) locations are used, and SOTA claims for CodeXGLUE rest on comparison with only two baselines.",
    493   "red_flags": [
    494     {
    495       "flag": "Thin SOTA baseline coverage",
    496       "detail": "The 'new state-of-the-art on CodeXGLUE' claim is supported by comparison with only 2 baselines (NSEdit, CoText); no comprehensive leaderboard sweep or comparison with more recent methods is provided."
    497     },
    498     {
    499       "flag": "No statistical significance testing",
    500       "detail": "All comparative claims are made without significance tests or confidence intervals despite multiple comparisons across 6 models, 4 prompts, and 4-5 datasets."
    501     },
    502     {
    503       "flag": "No hyperparameters reported",
    504       "detail": "Learning rate, batch size, optimizer, number of epochs, and other training hyperparameters are entirely absent, making replication impossible even with released code."
    505     },
    506     {
    507       "flag": "Code not released",
    508       "detail": "Toggle's implementation is not made available despite being a systems paper; results cannot be reproduced without reimplementing the entire three-component pipeline."
    509     },
    510     {
    511       "flag": "Pre-training contamination unaddressed",
    512       "detail": "All base models (CodeGPT, CodeParrot, etc.) were pre-trained on GitHub code; Tufano and Defects4J datasets also derive from GitHub; no analysis of pre-training/benchmark overlap is performed."
    513     },
    514     {
    515       "flag": "No conflict of interest disclosure",
    516       "detail": "Five of eight authors are Amazon Web Services employees; no funding disclosure or COI statement appears despite the institutional affiliation with a major cloud/developer-tools company."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Impact of Code Language Models on Automated Program Repair",
    522       "relevance": "Direct prior work using LLMs for APR that Toggle builds upon and compares against"
    523     },
    524     {
    525       "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models",
    526       "relevance": "AlphaRepair, a key LLM-based APR baseline evaluated on Defects4J"
    527     },
    528     {
    529       "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair",
    530       "relevance": "Contemporary SOTA APR method Toggle outperforms on Defects4J metrics"
    531     },
    532     {
    533       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    534       "relevance": "Primary benchmark used for evaluation; Toggle claims SOTA on its code refinement task"
    535     },
    536     {
    537       "title": "CodeReviewer: Pre-Training for Automating Code Review Activities",
    538       "relevance": "Dataset and baseline used for evaluation; contextual information (code review comments) study"
    539     },
    540     {
    541       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    542       "relevance": "Primary generalizability benchmark for held-out evaluation"
    543     },
    544     {
    545       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    546       "relevance": "Base model used for the bug localization component of Toggle"
    547     },
    548     {
    549       "title": "Fix Bugs with Transformer through a Neural-Symbolic Edit Grammar (NSEdit)",
    550       "relevance": "Primary CodeXGLUE SOTA baseline that Toggle claims to surpass"
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 1,
    556       "justification": "Toggle addresses a practical problem (automated bug fixing) but no code is released, limiting immediate use."
    557     },
    558     "surprise_contrarian": {
    559       "score": 1,
    560       "justification": "Token-level vs line-level localization is a novel granularity shift, but does not fundamentally challenge conventional wisdom about LLM-based APR."
    561     },
    562     "fear_safety": {
    563       "score": 0,
    564       "justification": "No AI safety or security concerns raised; the work focuses on improving code repair accuracy."
    565     },
    566     "drama_conflict": {
    567       "score": 0,
    568       "justification": "No controversy or conflict; straightforward benchmark improvement paper."
    569     },
    570     "demo_ability": {
    571       "score": 0,
    572       "justification": "No code, demo, or tool released for anyone to try."
    573     },
    574     "brand_recognition": {
    575       "score": 1,
    576       "justification": "Amazon Web Services is well-known but not a top-tier AI research brand; the work is published at FSE, a respected but niche SE venue."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [
    581       {
    582         "hn_id": "40205264",
    583         "title": "Urban highways are barriers to social ties",
    584         "points": 6,
    585         "comments": 0,
    586         "url": "https://news.ycombinator.com/item?id=40205264"
    587       },
    588       {
    589         "hn_id": "41103162",
    590         "title": "Beyond Deepfake Images: Detecting AI-Generated Videos [pdf]",
    591         "points": 3,
    592         "comments": 0,
    593         "url": "https://news.ycombinator.com/item?id=41103162"
    594       },
    595       {
    596         "hn_id": "40165320",
    597         "title": "Generation of Low-Inclination, Neptune-Crossing TNOs by Planet Nine",
    598         "points": 2,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=40165320"
    601       }
    602     ],
    603     "top_points": 6,
    604     "total_points": 11,
    605     "total_comments": 0
    606   }
    607 }

Impressum · Datenschutz