scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32828B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Extensive Study on Model Architecture and Program Representation in the Domain of Learning-based Automated Program Repair",
      6     "authors": [
      7       "Dániel Horváth",
      8       "Viktor Csuvik",
      9       "Tibor Gyimóthy",
     10       "László Vidács"
     11     ],
     12     "year": 2023,
     13     "venue": "IEEE/ACM International Workshop on Automated Program Repair (APR)",
     14     "arxiv_id": null,
     15     "doi": "10.1109/APR59189.2023.00013"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims match Table II results: 19.88% on java-small with text (CodeT5-base), 11.87% on java-medium (CodeT5-base), 30.64% on java-small with cmdseq (CodeT5-base), 18.53% on medium with cmdseq (CodeT5-base). The claim that AST+text 'significantly underperform' is supported by <1% exact match results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The primary claims are comparative ('cmdseq outperforms text'), supported by controlled experiments that vary one factor at a time (same model, different representations). The ablation design with matched conditions is adequate for these claims.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title claims 'An Extensive Study' in 'the Domain of Learning-based Automated Program Repair' broadly, but experiments cover only 2 languages (Java, JavaScript), 2 specific datasets, and 4-5 model architectures. Conclusions like 'each deep-learning setting requires its own data representation' generalize beyond what was tested.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper offers brief speculation that FixJS underperformance 'may be due to' smaller dataset size and unique-sample filtering, but does not systematically discuss alternative explanations. The AST+text failure is attributed to 'insufficient model size' without rigorous analysis.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper uses exact match with developer patches as the evaluation metric and frames results as 'accuracy' for automated program repair. While Section I acknowledges exact match is stringent, the paper does not discuss that exact match misses semantically correct but syntactically different patches, a well-known limitation in APR evaluation.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusions (Section VII) briefly mention future directions but do not substantively discuss limitations of the current study.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No threats to validity are discussed. The paper acknowledges some issues in passing (e.g., 'further experiments would be needed' for AST+text, model size not optimized) but these are not framed as specific threats to the study's conclusions.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what the results do NOT show. It mentions that 'Optimizing the size of the model is out of the scope of this paper' but does not bound the generalizability of its claims to specific languages, datasets, or model families.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The Acknowledgement section discloses multiple funding sources: ÚNKP-22-3-SZTE New National Excellence Program, European Union project RRF-2.3.1-21-2022-00004 (Artificial Intelligence National Laboratory), and project TKP2021-NVA-09 from the National Research, Development and Innovation Fund.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are affiliated with the Department of Software Engineering, University of Szeged, Hungary. Affiliations are clearly listed in the paper header. Two co-authors (Csuvik, Vidács) authored the FixJS dataset used in evaluation.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Funding is from Hungarian government ministries and EU programs supporting general AI research. None of the funders have a financial stake in which code representation or model performs best.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Program representations clearly defined (text: raw tokens; cmdseq: [DELETE]/[INSERT]/[LOCATION] tokens; AST+text: flattened AST chains). Models referenced to papers. 'Accuracy' defined as exact match with developer fix.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Contribution explicitly stated: compare representation choices across models and languages to understand strengths/limitations of DL for APR and provide practical guidance.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section VI substantially engages with prior work (GenProg, DeepDebug, NSEdit, Hoppity). It shows how this work differs from Namavar et al. by using Transformers instead of NMT. The work is clearly positioned in the literature.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "A GitHub repository is provided at https://github.com/AAI-USZ/APR23-representations (footnote 1 in Section I): 'Our setup, data, and methods used are also available in a GitHub repository.'",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Both datasets are publicly available: the Java dataset is part of the CodeXGLUE benchmark (Tufano et al.), and FixJS is a published dataset. The paper's GitHub repository also makes their processed data available.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Section IV mentions Python 3.8, PyTorch, PyTorch-Lightning, and the transformers library, along with an RTX 3090 GPU. However, no specific library versions, requirements.txt, or Dockerfile are provided — just library names without version numbers.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper describes experimental setup (hyperparameters, datasets, hardware) in Section IV but does not provide step-by-step reproduction instructions. The GitHub repo is linked but the paper itself contains no commands, scripts, or README-style instructions for reproducing the experiments.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table II reports only point estimates for the 'Accuracy' and '100% accuracy' (exact match) metrics. No confidence intervals, error bars, or ± notation are provided for any results.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper makes numerous comparative claims (e.g., 'outperforms all the other representations, by at least 6%') but provides no statistical significance tests. All comparisons are based on raw number differences.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Table III explicitly reports the differences between representations and datasets with baseline context. For example, CodeT5-base on Java small: text 19.88% vs command sequence 30.64%, difference 10.76%. Both absolute values and differences are provided.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Dataset sizes are stated (58,350 Java small, 65,455 Java medium, 9,662 FixJS small, 11,410 FixJS medium) but no justification or power analysis is provided for why these sizes are sufficient for the claims being made.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "All results appear to be from single training runs. No standard deviations, variance across seeds, or spread measures are reported anywhere in the paper.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The study compares multiple models (T5, CodeT5, RoBERTa+CodeBERT+GPTNeo) across multiple representations (text, cmdseq, AST+text) and references NSEdit as the state-of-the-art on the CodeXGLUE leaderboard (Section VI).",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The paper references NSEdit (2022) as the state-of-the-art on CodeXGLUE at time of writing, and uses recent pre-trained models (CodeT5 2021, CodeBERT 2020). The baselines are appropriate for a 2023 workshop paper.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "The study systematically varies components: pretrained vs not pretrained (T5-base vs T5-base empty), LM-head-only vs full fine-tuning (T5-base LM vs T5-base), different representations on same models, and different models on same representations. Table II shows results for all configurations.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Table II reports two metrics: 'Accuracy' (appears to be average token/sequence-level accuracy) and '100% accuracy' (exact match rate). Values differ substantially, e.g., T5-base on Java small text: 0.9756 Accuracy vs 0.1491 100% accuracy.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Evaluation is entirely automated via exact match comparison with developer patches. While Section V includes qualitative examples of generated patches (Listings 1-8), there is no systematic human evaluation of output quality.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Figure 1 clearly shows separate TRAIN, TEST, and VALID splits. The paper follows 'the standard evaluation procedure in learning-based APR approaches' (Section II) with early stopping on validation loss and evaluation on the test set.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table II provides results broken down by model, representation, pretrained status, dataset (Java/FixJS), and dataset size (small/medium). Table III further compares across representations and datasets.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section V presents multiple failure examples with analysis: Listing 3 shows an incorrect cmdseq patch with discussion of deletion bias, Listing 7 shows an incorrect text patch with discussion of overfitting to simple patterns like 'return 1'.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Several negative results are reported: AST+text representation achieves below 1% exact match ('significantly underperform'), training only the LM head 'is not enough' (0% exact match after 42 epochs), and cmdseq underperforms text on FixJS in some configurations.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific HuggingFace model identifiers are provided: t5-base, codet5-base, codebert-base, roberta-base, gpt-neo-125M. These identify specific model checkpoints. Model parameter counts are also given (222M for base models, 346M for AST+text composite).",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "The paper fine-tunes sequence-to-sequence models rather than using prompting. Input is the source code representation fed directly to the model, not a prompt.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Section IV reports: learning rate 5e-5, Adam optimizer with epsilon 1e-8, batch sizes (16 for small, 8 for medium), sequence lengths (256 for small, 384 for medium), max 50 epochs, early stopping with delta 0.05 and patience 8, cross-entropy loss.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The models are standard fine-tuned transformer models with direct input-output evaluation.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section III describes preprocessing for both datasets: Java dataset uses variable/method abstraction (e.g., myVar → VARIABLE_1) with idioms preserved, FixJS uses similar abstraction with duplicate filtering. Section II.B documents the three representation transformations (text, cmdseq, AST+text) in detail with examples in Table I.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Both evaluation datasets are publicly available: the Java dataset through CodeXGLUE and FixJS through its original release. The paper's GitHub repository provides access to experimental setup and data.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section III describes both datasets: the Java dataset was mined from GitHub with normalization by Tufano et al., with sizes given. FixJS contains bug-fixing GitHub commits with abstraction and deduplication by Csuvik & Vidács. Dataset sizes and splitting are documented.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. The study uses standard benchmark datasets (CodeXGLUE Java code refinement, FixJS).",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from raw datasets to model input is documented: Section III describes the original datasets, Section II.B describes how each code representation (text, cmdseq, AST+text) is generated, and Section IV describes the training/evaluation procedure.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The paper uses pre-trained models (T5, CodeT5, CodeBERT, RoBERTa) that were trained on large corpora including web text and GitHub code, but never states the training data cutoff dates for any of these models.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether the Java or FixJS test examples appeared in the pre-training data of T5, CodeT5, or CodeBERT. The Java dataset was mined from GitHub, and these models were pre-trained on GitHub code, creating a clear overlap risk.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
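    307           "justification": "The Java dataset (Tufano et al. 2019) was mined from public GitHub code and released before the code-specific pre-trained models used here (CodeBERT 2020, CodeT5 2021) were trained; it was later included in the CodeXGLUE benchmark (2021). This contamination risk is not discussed at all.",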
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study. It is a purely computational benchmark evaluation.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference cost, latency, or per-example cost is reported. Only training time is mentioned.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Section IV reports hardware (RTX 3090 GPU) and training times: 'about half an hour on small, and one hour on the medium dataset' for text/cmdseq, '1 hour and 40 minutes on small, and 2 hours on medium' for AST+text on Java. FixJS training ranged from 5-20 minutes per epoch. Total training time 'from 1 hour to about 1 day.'",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No mention of random seeds or sensitivity analysis across seeds. All results appear to be from single training runs.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "The number of experimental runs is never stated. Results are presented without indication of how many runs produced them.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search is described. The paper uses fixed hyperparameters (learning rate 5e-5, batch sizes, etc.) without reporting whether any search was conducted or why these values were chosen beyond citing reference [27].",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": true,
    391           "justification": "Early stopping on validation loss is used with patience 8 and minimum delta 0.05: 'we assume that the model converged, and stop the training, and load the best model so far for testing.' This is a principled model selection approach.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": false,
    396           "answer": false,
    397           "justification": "No statistical tests are performed at all, making multiple comparison correction inapplicable.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors implement all model configurations and baselines themselves without acknowledging potential self-comparison bias. No reference to Lucic et al.'s finding that authors' implementations of baselines systematically underperform.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "The AST+text model is 346M parameters vs 222M for the other configurations, and required halved batch sizes. Performance is compared directly without controlling for or discussing these compute differences.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper does not discuss whether exact-match evaluation on CodeXGLUE/FixJS actually measures real-world APR capability. While Section I briefly mentions patch correctness issues in APR, it does not question whether these specific benchmarks have construct validity.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No agentic scaffolding is used. Models are evaluated directly as fine-tuned seq2seq systems.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "Not discussed. The Java dataset (2019) predates the code-specific pre-trained models, and both it and FixJS are built from public GitHub commits. Models pre-trained on GitHub code may have seen solutions in the test sets.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "Not discussed. The paper does not analyze whether the input representations leak information about the target patches.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "The paper notes that 'duplicated samples can occur in the Java dataset' while FixJS filters duplicates, but does not verify independence between train and test sets or between pre-training data and evaluation data.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Command sequence representation outperforms text representation on Java dataset by roughly 7-11 percentage points in exact-match accuracy (10.76pp on java-small, 6.66pp on java-medium for CodeT5-base)",
    456       "evidence": "Table II: CodeT5 cmdseq achieves 30.64% (java-small) and 18.53% (java-medium) vs text at 19.88% and 11.87% respectively. Table III confirms 12-21pp average improvement of cmdseq over text on Java.",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "AST+text representation significantly underperforms compared to text and command sequence representations",
    461       "evidence": "Table II: AST+text achieves 0.38 accuracy (java-small) and 0.28 (java-medium) compared to 30.64% and 18.53% for cmdseq. Dramatic failure acknowledged in text.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Representation choice effects vary by dataset—no single representation is optimal across all languages",
    466       "evidence": "FixJS dataset (JavaScript) shows text performing marginally better than cmdseq in some cases, contrasting with Java results. Table III shows FixJS cmdseq average 0.0281 vs text 0.0334.",
    467       "supported": "moderate"
    468     },
    469     {
    470       "claim": "Pre-trained transformer models substantially outperform models trained from scratch",
    471       "evidence": "Table II: pre-trained T5-base achieves 97.56% on the 'Accuracy' metric (java-small) vs 93.71% for the same architecture initialized with empty weights (no pre-training, trained from scratch) on text representation.",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "CodeT5 is superior to vanilla T5 on command-sequence representation for Java",
    476       "evidence": "Table II cmdseq: CodeT5-base achieves 83.71% (java-small) and 80.51% (java-medium) vs T5 at 82.01% and 81.07%. Differences are small and mixed: CodeT5 leads by 1.70pp on java-small but trails T5 by 0.56pp on java-medium.",
    477       "supported": "weak"
    478     },
    479     {
    480       "claim": "Dataset size and composition significantly affect model performance across representations",
    481       "evidence": "Java dataset (58,350 samples) shows 30%+ accuracy; FixJS (9,662 samples) shows 6-9% accuracy on same representations. Authors attribute to dataset size and filtering differences.",
    482       "supported": "strong"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "benchmark-eval",
    487     "empirical"
    488   ],
    489   "key_findings": "The paper demonstrates that code representation choice critically impacts transformer model performance for automated program repair. Command sequence representation (focusing on edit operations) achieves ~31% exact-match accuracy on the Java small benchmark versus ~20% for raw text (~19% versus ~12% on Java medium), by reducing redundant code generation. However, this advantage doesn't uniformly transfer across languages (JavaScript shows marginal differences) or to combined AST+text representations, which catastrophically fail. Pre-trained models substantially outperform training from scratch, but representation selection remains dataset-dependent and cannot be universally optimized.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical significance testing",
    493       "detail": "Results reported as single accuracy numbers without confidence intervals, p-values, or multiple runs. Reported differences (6-12pp) lack statistical rigor; unclear if meaningful."
    494     },
    495     {
    496       "flag": "No variance or uncertainty quantification",
    497       "detail": "Single accuracy value per configuration; no standard deviation, error bars, or multiple-run averaging. Reproducibility across runs unknown."
    498     },
    499     {
    500       "flag": "Missing limitations section",
    501       "detail": "No formal threats-to-validity, limitations, or scope boundary analysis. Generalization claims not qualified."
    502     },
    503     {
    504       "flag": "Potential benchmark contamination",
    505       "detail": "CodeT5 pre-trained on GitHub code; CodeXGLUE uses GitHub code. Train/test overlap not discussed or ruled out."
    506     },
    507     {
    508       "flag": "AST+text failure unexplained",
    509       "detail": "Catastrophic underperformance (0.38 vs 0.80 accuracy) speculatively attributed to 'model size' but not systematically investigated."
    510     },
    511     {
    512       "flag": "No comparison to state-of-the-art",
    513       "detail": "NSEdit (24%+ accuracy) mentioned as CodeXGLUE leader but not compared against. Evaluation limited to model ablations."
    514     },
    515     {
    516       "flag": "Single evaluation metric",
    517       "detail": "Only exact-match accuracy reported; no BLEU, semantic similarity, human judgment, or functional correctness evaluation."
    518     },
    519     {
    520       "flag": "Limited generalization scope",
    521       "detail": "Two languages with normalized identifiers (abstract variable names). Findings may not transfer to real-world, non-abstracted code."
    522     }
    523   ],
    524   "cited_papers": [
    525     {
    526       "title": "Automatically finding patches using genetic programming",
    527       "authors": "Weimer et al.",
    528       "year": 2009,
    529       "relevance": "Foundational APR work using genetic algorithms; provides historical context for oracle-based patch validation approaches."
    530     },
    531     {
    532       "title": "Automatic software repair: A survey",
    533       "authors": "Gazzola, Micucci, Mariani",
    534       "year": 2019,
    535       "relevance": "Comprehensive APR survey covering repair techniques and challenges; situates this work in broader APR landscape."
    536     },
    537     {
    538       "title": "Generating bug-fixes using pretrained transformers",
    539       "authors": "Drain et al. (DeepDebug)",
    540       "year": 2021,
    541       "relevance": "Directly relevant transformer-based APR approach showing effectiveness of pre-trained models for bug fixing."
    542     },
    543     {
    544       "title": "On learning meaningful code changes via neural machine translation",
    545       "authors": "Tufano et al.",
    546       "year": 2019,
    547       "relevance": "Original dataset paper (Java code refinement) used in this evaluation; primary benchmark source."
    548     },
    549     {
    550       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    551       "authors": "Lu et al.",
    552       "year": 2021,
    553       "relevance": "Benchmark suite containing the Java code refinement task; defines evaluation leaderboard and SOTA baseline (NSEdit)."
    554     },
    555     {
    556       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    557       "authors": "Raffel et al. (T5)",
    558       "year": 2020,
    559       "relevance": "Foundational T5 model architecture used as baseline; one of primary transformer models evaluated."
    560     },
    561     {
    562       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    563       "authors": "Wang et al.",
    564       "year": 2021,
    565       "relevance": "Code-specialized transformer variant; shows best performance on command-sequence representation for Java."
    566     },
    567     {
    568       "title": "Fix bugs with transformer through a neural-symbolic edit grammar",
    569       "authors": "Hu et al. (NSEdit)",
    570       "year": 2022,
    571       "relevance": "State-of-the-art APR method on CodeXGLUE (24.04% accuracy); mentioned but not compared against in this work."
    572     }
    573   ],
    574   "engagement_factors": {
    575     "practical_relevance": {
    576       "score": 1,
    577       "justification": "Provides guidance on representation choice for APR researchers but is not a ready-to-use tool for practitioners."
    578     },
    579     "surprise_contrarian": {
    580       "score": 1,
    581       "justification": "AST+text underperforming is somewhat unexpected, but overall finding that representation matters is not surprising."
    582     },
    583     "fear_safety": {
    584       "score": 0,
    585       "justification": "No safety, security, or AI risk implications in this work."
    586     },
    587     "drama_conflict": {
    588       "score": 0,
    589       "justification": "No controversy or conflict — a straightforward empirical comparison study."
    590     },
    591     "demo_ability": {
    592       "score": 1,
    593       "justification": "Code is released on GitHub but requires training infrastructure and significant setup to reproduce."
    594     },
    595     "brand_recognition": {
    596       "score": 0,
    597       "justification": "University of Szeged; published at a small workshop (APR 2023), not a major venue or lab."
    598     }
    599   },
    600   "hn_data": {
    601     "threads": [],
    602     "top_points": 0,
    603     "total_points": 0,
    604     "total_comments": 0
    605   }
    606 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs