scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24759B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Extensive Study on Model Architecture and Program Representation in the Domain of Learning-based Automated Program Repair",
      6     "authors": [
      7       "Dániel Horváth",
      8       "Viktor Csuvik",
      9       "Tibor Gyimóthy",
     10       "László Vidács"
     11     ],
     12     "year": 2023,
     13     "venue": "IEEE/ACM International Workshop on Automated Program Repair (APR)",
     14     "arxiv_id": null,
     15     "doi": "10.1109/APR59189.2023.00013"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims (representation impact, command sequence outperformance, ast+text failure) are directly supported by Table II results with specific accuracy numbers.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Controlled experiments vary representation while holding dataset and model fixed, providing causal evidence. However, no formal ablation study isolating representation components.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Scope explicitly bounded to 'two popular programming languages, Java and JavaScript' and specific datasets. Authors note differences between datasets and languages.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Paper discusses why FixJS underperforms (smaller dataset, stricter deduplication), why ast+text fails (insufficient model size for encoder), and overfitting patterns in examples.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Exact-match accuracy to developer patches is used as the metric, but paper never discusses whether exact match equals successful repair or what approximate correctness means.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Section VII is 'Conclusions' with no dedicated limitations or threats-to-validity section. Limitations are scattered throughout the text rather than systematically presented.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Some specific issues mentioned (overfitting, model size, dataset difficulty differences) but no systematic discussion of threats to validity like temporal generalization, metric validity, or dataset contamination.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Explicitly states scope: 'two popular programming languages, Java and JavaScript', 'real-world defects from open-source projects', transformer-based models only.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgement section lists multiple funding sources: ÚNKP program, EU project RRF-2.3.1-21-2022-00004, national project TKP2021-NVA-09.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors from Department of Software Engineering, University of Szeged—clear academic affiliation with no apparent connection to evaluated products.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Hungarian government ministries and EU are independent of outcomes about which code representation is best for APR.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No explicit competing interests statement included. No mention of patents, equity, or consulting arrangements.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "APR defined in introduction; representations explained (text, command sequence, ast+text); models specified with references; exact-match metric clearly described.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Explicitly states: 'find out which program representation fits better for the APR task' and 'provide a broader vision of the importance of how we choose to represent the data'.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section VI cites GenProg, DeepDebug, NSEdit, and Hoppity and explains how this work differs, contrasting its use of transformers with the NMT models used in related work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Paper states 'Our setup, data, and methods used are also available in a GitHub repository' with link: https://github.com/AAI-USZ/APR23-representations",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Uses public datasets: Tufano et al. (CodeXGLUE benchmark) for Java and FixJS for JavaScript. Both are publicly available.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Provides Python 3.8, PyTorch, PyTorch-Lightning, transformers library, RTX 3090 GPU. But no requirements.txt or complete dependency list with versions mentioned in paper.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Paper describes methodology but provides no step-by-step reproduction instructions. Code is on GitHub, but instructions are not in the paper itself.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table II reports single accuracy percentages with no error bars, confidence intervals, or multiple runs reported. No cross-validation results shown.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests reported. Differences are stated (e.g., 'command sequence outperforms') but without p-values or hypothesis tests.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Table III reports absolute differences: command sequence vs text shows improvements of 0.1-0.3 on java-small/medium. Differences quantified in percentages.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Java: 58,350-65,455 samples; FixJS: 9,662-11,410 samples. No power analysis or justification for why these sizes are sufficient.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Only single accuracy numbers reported per model/representation/dataset combination. No variance, std dev, or multiple runs shown.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Compares multiple models (T5, CodeT5, RoBERTa, GPTNeo), representations (text, cmdseq, ast+text), and pre-trained vs from-scratch. Cites NSEdit achieving 24.04%.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "NSEdit (2022), T5 (2019), and CodeT5 (2021) are contemporaneous with the 2023 submission. Models are reasonably recent.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Varies representation, model, dataset, and pre-training status. Shows effect of each factor but not fully systematic ablation of individual representation components.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "Only accuracy (exact match percentage) reported. No recall, precision, partial credit, or other metrics that would capture near-correct patches.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Shows example patches with developer fixes (Listings) but no formal human evaluation of whether generated patches are acceptable to developers.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Datasets are split into train/test. Authors state 'After training the models are evaluated using the standard evaluation procedure'.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": false,
    217           "justification": "Results shown by dataset and representation, but not by bug type, difficulty level, or code category.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Listings 3-4 and 7-8 show failure examples. Authors discuss overfitting (e.g., 'model is biased towards guessing single deletion commands').",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Prominently reports that ast+text representation 'significantly underperform...achieving results below one percent'.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specifies T5-base, CodeT5-base, codebert-base, gpt-neo-125M with references to papers. Pre-trained vs empty weights clearly stated.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "This is a sequence-to-sequence fine-tuning task, not a prompt-based approach. No prompts involved.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Provides learning rate (5e-5), Adam optimizer, sequence lengths (256/384), batch sizes (16/8), epochs (50), early stopping (delta 0.05, patience 8), loss function.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "Standard seq2seq task, no agentic scaffolding involved.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Describes variable/method name abstraction for both Java (per-file index reset) and JavaScript (includes raw commit info). Vocabulary reduction explained.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Java dataset available in public CodeXGLUE benchmark. FixJS from published MSR workshop paper. Both datasets are publicly accessible.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Java: 'Java source codes mined from GitHub' from Tufano et al. JavaScript: 'bug-fixing information for GitHub commits' from FixJS. Collection methods described at appropriate level.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human subjects involved. Using existing datasets from GitHub.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Preprocessing steps documented, train/test splits mentioned, dataset normalization described. Full pipeline is traceable.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "Not an LLM evaluation with training cutoff dates. Fine-tuning models on fixed datasets. Not applicable.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "Paper uses pre-trained models (T5 2019, CodeT5 trained on GitHub) and evaluates on GitHub data. Potential overlap between pre-training and test sets not discussed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "CodeT5 was trained on 'public GitHub repositories'. Test sets are also from GitHub. Risk of pre-training contamination not addressed or analyzed.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human subjects.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human subjects.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human subjects.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human subjects.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human subjects.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human subjects.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human subjects.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Training time reported (1 hour to 1 day). Inference time/latency for generating patches NOT reported, which is critical for practical deployment.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware (RTX 3090) and training times given, but total GPU-hours or cost budget not explicitly calculated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Command sequence representation outperforms text and AST+text on Java dataset",
    374       "evidence": "Table II shows CodeT5-base on java-small achieves 30.64% with cmdseq-token vs 19.88% with text representation. java-medium: 18.53% cmdseq vs 11.87% text.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "AST+text representation significantly underperforms all other representations",
    379       "evidence": "Table II shows RoBERTa+CodeBERT+GPTNeo achieves 0.3862% accuracy on java-small and 0.2783% on java-medium (both below one percent)—dramatically below text (97%) and cmdseq (83%).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Program representation effectiveness varies by programming language and dataset",
    384       "evidence": "On Java, cmdseq outperforms text by ~10pp. On FixJS, this advantage reverses (text 92.45% vs cmdseq 65.02% for T5-base). Table III shows opposite trends.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Pre-trained models significantly outperform models trained from scratch",
    389       "evidence": "T5-base (pretrained) achieves 0.9756 on java-small vs T5-base (empty, no pretraining) at 0.9371. CodeT5-base pretrained 0.9684 vs empty on cmdseq 0.7884.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "FixJS dataset is significantly harder to learn on than the Java dataset",
    394       "evidence": "Best accuracy on FixJS (93.69%) lags best on Java (97.95%). Authors note FixJS has fewer samples (9,662 vs 58,350), stricter deduplication, and likely language-specific difficulty.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Exact-match accuracy is the appropriate measure of APR success",
    399       "evidence": "Paper uses exact-match as sole metric: 'the generated patch should be exactly the same as the one in the dataset'. No discussion of whether approximate correctness counts.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "empirical",
    405     "benchmark-eval",
    406     "comparative"
    407   ],
    408   "key_findings": "This empirical study demonstrates that code representation choice significantly impacts deep learning model performance on automated program repair. Command sequence representation (with [INSERT]/[DELETE] tokens) achieves 30.64% exact-match accuracy on the Java-small dataset, outperforming text representation by 10.8 percentage points. However, AST+text representation catastrophically underperforms (<1% accuracy), suggesting that additional syntactic information can degrade performance if not properly integrated. Results vary substantially by programming language and dataset: the same representations show opposite performance orderings on Java versus JavaScript, indicating that representation effectiveness is dataset and language-dependent. Pre-trained models consistently outperform from-scratch training by large margins across all settings.",
    409   "red_flags": [
    410     {
    411       "flag": "No variance or confidence intervals",
    412       "detail": "Single accuracy numbers reported with no cross-validation, multiple runs, or error bars. Cannot assess result reliability or statistical significance."
    413     },
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "Differences between models/representations presented as point estimates without hypothesis tests. Unknown whether observed differences are statistically meaningful or noise."
    417     },
    418     {
    419       "flag": "Exact-match metric is extremely strict",
    420       "detail": "Only counts patches identical to developer fix as correct. Patches that are 99% correct or functionally equivalent are counted as complete failures."
    421     },
    422     {
    423       "flag": "No human validation of results",
    424       "detail": "No formal evaluation of whether generated patches are actually acceptable, executable, or solve the intended problem. Only that they match developer's exact fix."
    425     },
    426     {
    427       "flag": "Pre-training contamination not addressed",
    428       "detail": "CodeT5 was trained on 'public GitHub repositories' and test sets are also from GitHub. Potential overlap in training/test distributions not analyzed."
    429     },
    430     {
    431       "flag": "Limited generalization evidence",
    432       "detail": "Only Java and JavaScript tested. Unclear if findings (especially cmdseq advantage) generalize to Python, C++, Go, or other languages."
    433     },
    434     {
    435       "flag": "AST+text catastrophic failure under-investigated",
    436       "detail": "Dramatic collapse to <1% accuracy is noted but root cause is speculative ('insufficient model size'). No systematic investigation of why additional information hurts performance."
    437     },
    438     {
    439       "flag": "Inference cost completely missing",
    440       "detail": "Training time reported but not inference latency. For practical APR deployment, knowing how long to generate a patch per code sample is critical."
    441     },
    442     {
    443       "flag": "No formal limitations section",
    444       "detail": "Limitations scattered throughout text rather than systematically documented. No discussion of threats to validity or external validity concerns."
    445     },
    446     {
    447       "flag": "State-of-the-art comparison unclear",
    448       "detail": "NSEdit achieves 24.04% on java-small (cited as SOTA), but this paper claims 30.64%. Unclear if results are directly comparable (different dataset splits?) or if this work exceeds SOTA."
    449     }
    450   ],
    451   "cited_papers": [
    452     {
    453       "title": "Automatically finding patches using genetic programming",
    454       "relevance": "Foundational APR work using genetic algorithms and oracle-based patch validation. Establishes patch correctness as an open problem."
    455     },
    456     {
    457       "title": "Generating bug-fixes using pretrained transformers",
    458       "relevance": "DeepDebug: applies pre-trained transformers to APR with copy-attention mechanism. Shows effectiveness of transfer learning for bug repair."
    459     },
    460     {
    461       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    462       "relevance": "T5 paper: the base model architecture used for sequence-to-sequence fine-tuning in this study."
    463     },
    464     {
    465       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    466       "relevance": "Domain-specific variant of T5 trained on CodeSearchNet. Core model evaluated in this paper."
    467     },
    468     {
    469       "title": "Fix bugs with transformer through a neural-symbolic edit grammar",
    470       "relevance": "NSEdit: state-of-the-art baseline (24.04% accuracy) on CodeXGLUE code refinement. Uses command sequence approach similar to this paper."
    471     },
    472     {
    473       "title": "Hoppity: Learning Graph Transformations To Detect and Fix Bugs in Programs",
    474       "relevance": "Graph-based neural approach to APR on large JavaScript dataset. Alternative to sequence-based representations."
    475     },
    476     {
    477       "title": "A controlled experiment of different code representations for learning-based program repair",
    478       "relevance": "Directly related empirical study by Namavar et al. comparing code representations for APR using NMT models (vs transformers here)."
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 2,
    484       "justification": "Provides actionable guidance on representation choice for practitioners building APR systems. But lacks deployment guidance, inference costs, and production recommendations."
    485     },
    486     "surprise_contrarian": {
    487       "score": 2,
    488       "justification": "Finding that simpler text beats complex AST+text representation is somewhat counterintuitive. Variation by language is expected but quantified results show magnitude of effect."
    489     },
    490     "fear_safety": {
    491       "score": 0,
    492       "justification": "Pure technical methodology paper on program repair. No AI safety, security, or risk concerns raised or addressed."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "Straightforward technical comparison. No controversy, conflict, or debate angle."
    497     },
    498     "demo_ability": {
    499       "score": 2,
    500       "justification": "Code and datasets are public on GitHub, enabling reproduction. But no interactive demo or one-click tool to try the system."
    501     },
    502     "brand_recognition": {
    503       "score": 1,
    504       "justification": "University of Szeged is established but not top-tier (not MIT, Stanford, Google, Meta, DeepMind). Published at APR workshop, not a top-tier venue like ICSE or FSE."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [],
    509     "top_points": 0,
    510     "total_points": 0,
    511     "total_comments": 0
    512   }
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs