scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31972B)
      1 {
      2   "paper": {
      3     "title": "An Extensive Study on Model Architecture and Program Representation in the Domain of Learning-based Automated Program Repair",
      4     "authors": [
      5       "Dániel Horváth",
      6       "Viktor Csuvik",
      7       "Tibor Gyimóthy",
      8       "László Vidács"
      9     ],
     10     "year": 2023,
     11     "venue": "IEEE/ACM International Workshop on Automated Program Repair (APR)",
     12     "doi": "10.1109/APR59189.2023.00013"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "Command sequence representation outperforms text representation on Java APR datasets (30.64% vs 19.88% exact match on java-small), but this advantage does not hold on the JavaScript FixJS dataset. AST+text representation significantly underperforms all other representations, achieving below 1% exact match accuracy despite a larger model (346M vs 222M parameters). The study demonstrates that no single code representation works best across all datasets and model configurations, suggesting representation choice must be tuned per setting.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "A GitHub repository is provided at https://github.com/AAI-USZ/APR23-representations (footnote 1 in Section I): 'Our setup, data, and methods used are also available in a GitHub repository.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Both datasets are publicly available: the Java dataset is part of the CodeXGLUE benchmark (Tufano et al.), and FixJS is a published dataset. The paper's GitHub repository also makes their processed data available."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section IV mentions Python 3.8, PyTorch, PyTorch-Lightning, and the transformers library, along with an RTX 3090 GPU. However, no specific library versions, requirements.txt, or Dockerfile are provided — just library names without version numbers."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper describes experimental setup (hyperparameters, datasets, hardware) in Section IV but does not provide step-by-step reproduction instructions. The GitHub repo is linked but the paper itself contains no commands, scripts, or README-style instructions for reproducing the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Table II reports only point estimates for accuracy and 100% accuracy. No confidence intervals, error bars, or ± notation are provided for any results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes numerous comparative claims (e.g., 'outperforms all the other representations, by at least 6%') but provides no statistical significance tests. All comparisons are based on raw number differences."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table III explicitly reports the differences between representations and datasets with baseline context. For example, CodeT5-base on Java small: text 19.88% vs command sequence 30.64%, a difference of 10.76 percentage points. Both absolute values and differences are provided."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Dataset sizes are stated (58,350 Java small, 65,455 Java medium, 9,662 FixJS small, 11,410 FixJS medium) but no justification or power analysis is provided for why these sizes are sufficient for the claims being made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "All results appear to be from single training runs. No standard deviations, variance across seeds, or spread measures are reported anywhere in the paper."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The study compares multiple models (T5, CodeT5, RoBERTa+CodeBERT+GPTNeo) across multiple representations (text, cmdseq, AST+text) and references NSEdit as the state-of-the-art on the CodeXGLUE leaderboard (Section VI)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper references NSEdit (2022) as the state-of-the-art on CodeXGLUE at time of writing, and uses recent pre-trained models (CodeT5 2021, CodeBERT 2020). The baselines are appropriate for a 2023 workshop paper."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The study systematically varies components: pretrained vs not pretrained (T5-base vs T5-base empty), LM-head-only vs full fine-tuning (T5-base LM vs T5-base), different representations on same models, and different models on same representations. Table II shows results for all configurations."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Table II reports two metrics: 'Accuracy' (appears to be average token/sequence-level accuracy) and '100% accuracy' (exact match rate). Values differ substantially, e.g., T5-base on Java small text: 0.9756 Accuracy vs 0.1491 100% accuracy."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Evaluation is entirely automated via exact match comparison with developer patches. While Section V includes qualitative examples of generated patches (Listings 1-8), there is no systematic human evaluation of output quality."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 1 clearly shows separate TRAIN, TEST, and VALID splits. The paper follows 'the standard evaluation procedure in learning-based APR approaches' (Section II) with early stopping on validation loss and evaluation on the test set."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table II provides results broken down by model, representation, pretrained status, dataset (Java/FixJS), and dataset size (small/medium). Table III further compares across representations and datasets."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section V presents multiple failure examples with analysis: Listing 3 shows an incorrect cmdseq patch with discussion of deletion bias, Listing 7 shows an incorrect text patch with discussion of overfitting to simple patterns like 'return 1'."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative results are reported: AST+text representation achieves below 1% exact match ('significantly underperform'), training only the LM head 'is not enough' (0% exact match after 42 epochs), and cmdseq underperforms text on FixJS in some configurations."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims match Table II results: 19.88% on java-small with text (CodeT5-base), 11.87% on java-medium (CodeT5-base), 30.64% on java-small with cmdseq (CodeT5-base), 18.53% on medium with cmdseq (CodeT5-base). The claim that AST+text 'significantly underperform' is supported by <1% exact match results."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The primary claims are comparative ('cmdseq outperforms text'), supported by controlled experiments that vary one factor at a time (same model, different representations). The ablation design with matched conditions is adequate for these claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'An Extensive Study' in 'the Domain of Learning-based Automated Program Repair' broadly, but experiments cover only 2 languages (Java, JavaScript), 2 specific datasets, and 4-5 model architectures. Conclusions like 'each deep-learning setting requires its own data representation' generalize beyond what was tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper offers brief speculation that FixJS underperformance 'may be due to' smaller dataset size and unique-sample filtering, but does not systematically discuss alternative explanations. The AST+text failure is attributed to 'insufficient model size' without rigorous analysis."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper uses exact match with developer patches as the evaluation metric and frames results as 'accuracy' for automated program repair. While Section I acknowledges exact match is stringent, the paper does not discuss that exact match misses semantically correct but syntactically different patches, a well-known limitation in APR evaluation."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific HuggingFace model identifiers are provided: t5-base, codet5-base, codebert-base, roberta-base, gpt-neo-125M. These identify specific model checkpoints. Model parameter counts are also given (222M for base models, 346M for AST+text composite)."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The paper fine-tunes sequence-to-sequence models rather than using prompting. Input is the source code representation fed directly to the model, not a prompt."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section IV reports: learning rate 5e-5, Adam optimizer with epsilon 1e-8, batch sizes (16 for small, 8 for medium), sequence lengths (256 for small, 384 for medium), max 50 epochs, early stopping with delta 0.05 and patience 8, cross-entropy loss."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The models are standard fine-tuned transformer models with direct input-output evaluation."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section III describes preprocessing for both datasets: Java dataset uses variable/method abstraction (e.g., myVar → VARIABLE_1) with idioms preserved, FixJS uses similar abstraction with duplicate filtering. Section II.B documents the three representation transformations (text, cmdseq, AST+text) in detail with examples in Table I."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusions (Section VII) briefly mention future directions but do not substantively discuss limitations of the current study."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed. The paper acknowledges some issues in passing (e.g., 'further experiments would be needed' for AST+text, model size not optimized) but these are not framed as specific threats to the study's conclusions."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state what the results do NOT show. It mentions that 'Optimizing the size of the model is out of the scope of this paper' but does not bound the generalizability of its claims to specific languages, datasets, or model families."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Both evaluation datasets are publicly available: the Java dataset through CodeXGLUE and FixJS through its original release. The paper's GitHub repository provides access to experimental setup and data."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section III describes both datasets: the Java dataset was mined from GitHub with normalization by Tufano et al., with sizes given. FixJS contains bug-fixing GitHub commits with abstraction and deduplication by Csuvik & Vidács. Dataset sizes and splitting are documented."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. The study uses standard benchmark datasets (CodeXGLUE Java code refinement, FixJS)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline from raw datasets to model input is documented: Section III describes the original datasets, Section II.B describes how each code representation (text, cmdseq, AST+text) is generated, and Section IV describes the training/evaluation procedure."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The Acknowledgement section discloses multiple funding sources: ÚNKP-22-3-SZTE New National Excellence Program, European Union project RRF-2.3.1-21-2022-00004 (Artificial Intelligence National Laboratory), and project TKP2021-NVA-09 from the National Research, Development and Innovation Fund."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All four authors are affiliated with the Department of Software Engineering, University of Szeged, Hungary. Affiliations are clearly listed in the paper header. Two co-authors (Csuvik, Vidács) authored the FixJS dataset used in evaluation."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funding is from Hungarian government ministries and EU programs supporting general AI research. None of the funders have a financial stake in which code representation or model performs best."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper uses pre-trained models (T5, CodeT5, CodeBERT, RoBERTa) that were trained on large corpora including web text and GitHub code, but never states the training data cutoff dates for any of these models."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether the Java or FixJS test examples appeared in the pre-training data of T5, CodeT5, or CodeBERT. The Java dataset was mined from GitHub, and these models were pre-trained on GitHub code, creating a clear overlap risk."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The Java dataset (Tufano et al. 2019), later included in the CodeXGLUE benchmark (2021), was publicly available before the pre-trained models (released 2020-2021) were trained. This contamination risk is not discussed at all."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. It is a purely computational benchmark evaluation."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or per-example cost is reported. Only training time is mentioned."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section IV reports hardware (RTX 3090 GPU) and training times: 'about half an hour on small, and one hour on the medium dataset' for text/cmdseq, '1 hour and 40 minutes on small, and 2 hours on medium' for AST+text on Java. FixJS training ranged from 5-20 minutes per epoch. Total training time 'from 1 hour to about 1 day.'"
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of random seeds or sensitivity analysis across seeds. All results appear to be from single training runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is never stated. Results are presented without indication of how many runs produced them."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search is described. The paper uses fixed hyperparameters (learning rate 5e-5, batch sizes, etc.) without reporting whether any search was conducted or why these values were chosen beyond citing reference [27]."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Early stopping on validation loss is used with patience 8 and minimum delta 0.05: 'we assume that the model converged, and stop the training, and load the best model so far for testing.' This is a principled model selection approach."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed at all, making multiple comparison correction inapplicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement all model configurations and baselines themselves without acknowledging potential self-comparison bias. No reference to Lucic et al.'s finding that authors' implementations of baselines systematically underperform."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The AST+text model is 346M parameters vs 222M for the other configurations, and required halved batch sizes. Performance is compared directly without controlling for or discussing these compute differences."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether exact-match evaluation on CodeXGLUE/FixJS actually measures real-world APR capability. While Section I briefly mentions patch correctness issues in APR, it does not question whether these specific benchmarks have construct validity."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No agentic scaffolding is used. Models are evaluated directly as fine-tuned seq2seq systems."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Not discussed. The Java dataset (2019) was publicly available before the pre-trained models were released, and both the Java and FixJS datasets are built from public GitHub commits. Models pre-trained on GitHub code may have seen solutions in the test sets."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. The paper does not analyze whether the input representations leak information about the target patches."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The paper notes that 'duplicated samples can occur in the Java dataset' while FixJS filters duplicates, but does not verify independence between train and test sets or between pre-training data and evaluation data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Command sequence representation outperforms text representation on Java datasets, achieving 30.64% exact match on java-small and 18.53% on java-medium (vs 19.88% and 11.87% for text).",
    369       "evidence": "Table II shows CodeT5-base with cmdseq achieving 30.64% and 18.53% on Java small/medium respectively, vs 19.88% and 11.87% with text representation. Table III confirms cmdseq advantages across most Java configurations.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "AST+text representation significantly underperforms compared to other representations, achieving below 1% exact match accuracy.",
    374       "evidence": "Table II shows the RoBERTa+CodeBERT+GPTNeo model on AST+text achieves 0.77% on Java small and 0.00% on Java medium, 0.10% on FixJS small and 0.00% on FixJS medium.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The advantage of command sequence representation does not hold on the FixJS JavaScript dataset, where text representation performs better on average.",
    379       "evidence": "Table III shows average FixJS 100% accuracy for text is 3.34% vs 2.81% for cmdseq. The Text vs Cmd column shows negative values for some FixJS configurations (e.g., -4.96% for T5-base small, -7.54% for CodeT5-base small).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Training only the language model head of a pre-trained T5 model is not sufficient for the APR task.",
    384       "evidence": "Table II shows T5-base LM achieves 0.00% exact match on both Java small (42 epochs) and Java medium (25 epochs), despite high raw accuracy values (0.9104, 0.8577).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "No single representation fits every dataset and every model — each deep-learning setting requires its own data representation.",
    389       "evidence": "Table III shows cmdseq is better for Java but text is sometimes better for FixJS. However, this conclusion is drawn from only 2 datasets and three model families (T5, CodeT5, and the RoBERTa+CodeBERT+GPTNeo composite), limiting its generalizability.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "No uncertainty quantification",
    396       "detail": "All results appear to be single-run experiments with no error bars, standard deviations, or confidence intervals. Deep learning results are known to vary significantly across random seeds (Henderson et al. 2018), making single-run comparisons unreliable."
    397     },
    398     {
    399       "flag": "No statistical significance tests",
    400       "detail": "The paper makes numerous comparative claims ('outperforms by at least 6%') without any statistical tests. With single-run results and no significance testing, observed differences may be within noise."
    401     },
    402     {
    403       "flag": "Uncontrolled model size confound",
    404       "detail": "The AST+text model has 346M parameters vs 222M for text/cmdseq models, and required halved batch sizes. The poor AST+text performance could be partially due to these training differences rather than the representation itself, but this confound is not analyzed."
    405     },
    406     {
    407       "flag": "Pre-training data contamination risk",
    408       "detail": "The pre-trained models were trained on large corpora including web text (T5) and GitHub code (CodeT5, CodeBERT). The Java evaluation dataset was mined from GitHub. Pre-trained models may have seen test examples during pre-training, yet no contamination analysis is performed."
    409     },
    410     {
    411       "flag": "Co-authors authored one of the evaluation datasets",
    412       "detail": "Two co-authors (Csuvik, Vidács) authored the FixJS dataset used for evaluation. While this is disclosed through the citation, it is not explicitly discussed as a potential conflict when interpreting FixJS results."
    413     },
    414     {
    415       "flag": "No limitations section",
    416       "detail": "The paper has no dedicated limitations or threats-to-validity section despite making broad claims about code representation for APR. Known limitations (single runs, model size confound, limited language coverage) are not systematically discussed."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Automatically finding patches using genetic programming",
    422       "authors": ["W. Weimer", "T. Nguyen", "C. Le Goues", "S. Forrest"],
    423       "year": 2009,
    424       "relevance": "Foundational APR work (GenProg) using genetic programming, establishing the search-based generate-and-validate patch generation paradigm."
    425     },
    426     {
    427       "title": "Generating bug-fixes using pretrained transformers",
    428       "authors": ["D. Drain", "C. Wu", "A. Svyatkovskiy", "N. Sundaresan"],
    429       "year": 2021,
    430       "relevance": "DeepDebug: seminal work on pre-trained transformers for automated bug fixing, directly related to this study's approach."
    431     },
    432     {
    433       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    434       "authors": ["C. Raffel", "N. Shazeer", "A. Roberts"],
    435       "year": 2020,
    436       "relevance": "Introduces T5, one of the primary model architectures evaluated in this study for code repair."
    437     },
    438     {
    439       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    440       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. H. Hoi"],
    441       "year": 2021,
    442       "relevance": "Introduces CodeT5, the best-performing model architecture in this study's experiments."
    443     },
    444     {
    445       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    446       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    447       "year": 2020,
    448       "relevance": "Pre-trained code model used as the program encoder in the AST+text configuration."
    449     },
    450     {
    451       "title": "Fix bugs with transformer through a neural-symbolic edit grammar",
    452       "authors": ["Y. Hu", "X. Shi", "Q. Zhou", "L. Pike"],
    453       "year": 2022,
    454       "relevance": "NSEdit: state-of-the-art on CodeXGLUE code refinement leaderboard using neural-symbolic edit sequences, directly compared with this study."
    455     },
    456     {
    457       "title": "CURE: Code-aware neural machine translation for automatic program repair",
    458       "authors": ["N. Jiang", "T. Lutellier", "L. Tan"],
    459       "year": 2021,
    460       "relevance": "Code-aware NMT approach to APR, demonstrating the effectiveness of leveraging code structure in neural repair."
    461     },
    462     {
    463       "title": "DEAR: A Novel Deep Learning-based Approach for Automated Program Repair",
    464       "authors": ["Y. Li", "S. Wang", "T. N. Nguyen"],
    465       "year": 2022,
    466       "relevance": "Recent deep learning approach for APR showing promising results, part of the landscape this study maps."
    467     },
    468     {
    469       "title": "Hoppity: Learning Graph Transformations To Detect and Fix Bugs in Programs",
    470       "authors": ["E. Dinella", "H. Dai", "Z. Li", "M. Naik", "L. Song", "K. Wang"],
    471       "year": 2020,
    472       "relevance": "Graph-based neural network for bug detection and fixing using AST transformations, relevant to the AST representation study."
    473     },
    474     {
    475       "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation",
    476       "authors": ["M. Tufano", "C. Watson", "G. Bavota", "M. Di Penta", "M. White", "D. Poshyvanyk"],
    477       "year": 2019,
    478       "relevance": "Source of the Java APR dataset used in this study, foundational work on NMT-based program repair."
    479     },
    480     {
    481       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    482       "authors": ["S. Lu", "D. Guo", "S. Ren"],
    483       "year": 2021,
    484       "relevance": "Benchmark suite including code refinement task used in this study's evaluation."
    485     },
    486     {
    487       "title": "FixJS: A Dataset of Bug-Fixing JavaScript Commits",
    488       "authors": ["V. Csuvik", "L. Vidács"],
    489       "year": 2022,
    490       "relevance": "JavaScript bug-fixing dataset used as one of the two evaluation datasets in this study."
    491     },
    492     {
    493       "title": "CODIT: Code Editing with Tree-Based Neural Models",
    494       "authors": ["S. Chakraborty", "Y. Ding", "M. Allamanis", "B. Ray"],
    495       "year": 2022,
    496       "doi": "10.1109/tse.2020.3020502",
    497       "relevance": "Tree-based neural model for code editing, related approach to this study's AST+text representation experiments."
    498     },
    499     {
    500       "title": "A controlled experiment of different code representations for learning-based program repair",
    501       "authors": ["M. Namavar", "N. Nashid", "A. Mesbah"],
    502       "year": 2022,
    503       "relevance": "Most directly related work studying code representation effects on APR, but limited to NMT models and abstraction-level variations."
    504     },
    505     {
    506       "title": "Studying the usage of text-to-text transfer transformer to support code-related tasks",
    507       "authors": ["A. Mastropaolo", "S. Scalabrino", "N. Cooper", "D. Nader Palacio", "D. Poshyvanyk", "R. Oliveto", "G. Bavota"],
    508       "year": 2021,
    509       "relevance": "Study of T5 for code-related tasks including bug fixing, demonstrating transformer effectiveness across SE tasks."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 1,
    515       "justification": "Provides guidance on representation choice for APR researchers but is not a ready-to-use tool for practitioners."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "AST+text underperforming is somewhat unexpected, but the overall finding that representation matters is not surprising."
    520     },
    521     "fear_safety": {
    522       "score": 0,
    523       "justification": "No safety, security, or AI risk implications in this work."
    524     },
    525     "drama_conflict": {
    526       "score": 0,
    527       "justification": "No controversy or conflict — a straightforward empirical comparison study."
    528     },
    529     "demo_ability": {
    530       "score": 1,
    531       "justification": "Code is released on GitHub but requires training infrastructure and significant setup to reproduce."
    532     },
    533     "brand_recognition": {
    534       "score": 0,
    535       "justification": "University of Szeged; published at a small workshop (APR 2023), not a major venue or lab."
    536     }
    537   }
    538 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs