scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27413B)
      1 {
      2   "paper": {
      3     "title": "Detect-Localize-Repair: A Unified Framework for Learning to Debug with CodeT5",
      4     "authors": [
      5       "Nghi D. Q. Bui",
      6       "Yue Wang",
      7       "Steven C.H. Hoi"
      8     ],
      9     "year": 2022,
     10     "venue": "Conference on Empirical Methods in Natural Language Processing",
     11     "arxiv_id": "2211.14875",
     12     "doi": "10.48550/arXiv.2211.14875"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "CodeT5-DLR, a unified multi-task framework for bug detection, localization, and repair, outperforms individual task-specific baselines across all three debugging tasks on newly collected Java (single-line) and Python (multi-line) datasets. Joint training with detection, localization, and repair objectives yields better performance than training on each objective individually. In end-to-end evaluation, the model correctly localizes 33.93% of buggy lines and repairs 46.93% of single-line Java bugs.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL for the CodeT5-DLR code is provided in the paper. The paper references the existing CodeT5 GitHub repo and baseline repos, but does not release its own implementation."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The conclusion states 'We will make our datasets publicly available to facilitate research on this topic,' which is a promise of future release, not an actual release with a download link."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section 4 mentions 'NVIDIA A100 GPUs with 40 GB memory' and 'CodeT5-base (220M)' but provides no requirements.txt, library versions, or detailed environment specifications sufficient to recreate the setup."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided in the paper."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 3, 4, 5, and 6 are reported as point estimates with no confidence intervals, error bars, or ± notation."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper repeatedly claims the model 'significantly outperforms' baselines (abstract, Section 1, Section 4) but reports no statistical significance tests (no p-values, t-tests, or any other statistical test)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Results are presented as raw numbers in tables. While baseline context is available for comparison, no explicit effect sizes (percentage improvement, Cohen's d, etc.) are computed or discussed."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Dataset sizes are reported (Table 2: ~75K SL-Java, ~190K ML-Python instances) but no justification for these sizes or power analysis is provided."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or results across multiple runs are reported anywhere in the paper. All results appear to be single-run numbers."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Extensive baselines are included: SpotBugs (static analysis), TBCNN, CodeBERT, GraphCodeBERT, PLBART, DeepLineDP, LineVul, plus ablation variants CodeT5-D, CodeT5-L, CodeT5-R (Tables 3-5)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include PLBART (2021), LineVul (2022), DeepLineDP (2022), and CodeT5 (2021), which are contemporary to this 2022 paper."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper compares CodeT5-D (detection only), CodeT5-L (localization only), CodeT5-R (repair only), and CodeT5-DLR (all three) across all tasks, demonstrating the contribution of joint training (Tables 3-6)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics are used per task: F1 + FPR for detection (Table 3), MRR + MAP + FPR for localization (Table 4), EM + BLEU for repair (Table 5)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of the system's outputs is performed. All evaluation is automated via metrics."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 2 shows explicit train/val/test splits for both SL-Java (52,789/7,465/15,250) and ML-Python (132,243/22,395/35,457) with separate project counts."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Figure 5 provides per-bug-pattern F1 breakdown across 13 patterns (P0-P12), and results are split by dataset type (single-line Java vs multi-line Python)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 4 shows a CHANGE_NUMERAL bug where the model detects and localizes correctly but suggests the wrong fix, with discussion of why this is challenging (Section 4.4.1)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 5 reports that pointer network design does not work well for their case due to class imbalance between buggy and non-buggy lines, 'confirmed with experiments.' Figure 4 shows a repair failure."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims the model 'significantly outperforms existing baselines from both NLP and software engineering domains,' which is supported by Tables 3-5 showing consistent improvements across all tasks and datasets."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The main causal claim is that 'jointly training with the three objectives yields better performance.' The ablation study (CodeT5-D/L/R vs CodeT5-DLR in Tables 3-6) provides controlled single-variable manipulation to support this."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'A Unified Framework for Learning to Debug' implies general debugging ability, but results are limited to single-line Java bugs and multi-line Python bugs from GitHub commits. The framework's applicability to other languages, bug types, or real-world debugging scenarios is not tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Section 5 discusses design choices vs pointer networks but does not consider alternative explanations for the improvements (e.g., whether the gains come from additional training signal/data rather than task complementarity, or whether larger models would close the gap)."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures EM, BLEU, F1, MRR, MAP and states claims in terms of these metrics without overreaching to broader claims about 'debugging ability' beyond what was measured."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper specifies 'CodeT5-base (220M)' (Section 4) and provides the GitHub URL for the checkpoint. Baselines use 'public checkpoints' of CodeBERT, GraphCodeBERT, and PLBART. These are fixed pretrained checkpoints, not API models with changing versions."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The paper fine-tunes models rather than using prompting. No prompts are involved."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Only 'maximum source and target sequence lengths to 512' and hardware (A100 GPUs) are stated. Learning rate, batch size, number of epochs, optimizer, beam search width, and other key hyperparameters are not reported."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The system is a fine-tuned encoder-decoder model."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3 describes the data pipeline: Pydriller for mining GitHub commits, keyword-based commit filtering (with cited accuracy: 96-97.6%), Lizard for function extraction, tree-sitter for bug pattern identification, and comparison between before/after function versions."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 8 'Limitations' provides substantive discussion of two specific weaknesses: inconsistency between the detection and repair modules, and the limitation of using only within-function information."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 8 discusses specific limitations: 'while the function-level bug detection module indicates that a function is not buggy, the program repair module continues to generate fixes' and the lack of cross-function context. These are specific to this study's design."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The limitations section discusses weaknesses but does not explicitly state what the results do NOT show or which settings are excluded. For example, there is no statement that results apply only to the specific bug types, languages, or GitHub-mined data tested."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The datasets are promised for future release but no download link is provided. Raw data is not available for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3 describes the collection procedure in detail: using Pydriller to mine GitHub commits, keyword filtering on commit messages, Lizard for function extraction, tree-sitter for bug pattern identification, with cited validation of the keyword heuristic (96-97.6% accuracy)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The paper describes commit filtering but does not describe how GitHub projects were selected. There is no discussion of which projects were included, whether this introduces selection bias (e.g., toward popular/well-maintained projects), or what the project selection criteria were."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "While the pipeline stages are described (commits → function extraction → bug pattern identification), the paper does not document how many examples were filtered at each stage. Table 2 shows final counts but intermediate filtering counts are missing."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations with Salesforce Research Asia are clearly stated on the first page. The connection to the evaluated CodeT5 model (also from Salesforce) is implicit from the shared authorship."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "All authors are Salesforce employees evaluating Salesforce's own CodeT5 model. Salesforce has a commercial interest in CodeT5 performing well, making the funder non-independent of the outcome."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper does not state the training data cutoff date for CodeT5's pretraining corpus, despite CodeT5 being pretrained on GitHub data that could overlap with the test set."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "While the differing project counts in Table 2 suggest that the train/val/test splits use different projects, the paper does not discuss whether CodeT5's pretraining data (collected from GitHub) overlaps with the fine-tuning test data, which also comes from GitHub."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "CodeT5 was pretrained on a large-scale GitHub corpus. The test data is also collected from GitHub commits. The paper does not address whether test examples could have appeared in CodeT5's pretraining data."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or per-example timing is reported for any of the three tasks."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is mentioned (NVIDIA A100 GPUs, 40 GB memory) but total GPU hours, training time, or overall compute budget is not quantified."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is not stated anywhere in the paper."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search budget or search method is reported; the key hyperparameters themselves (learning rate, batch size, epochs) are also unreported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No description of how the final model configuration was selected. The paper does not state whether validation set performance was used for model selection."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple comparisons are made across 6+ baselines, 3 tasks, and 2 datasets with no correction applied. No statistical tests are used at all."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors (Salesforce) evaluate their own CodeT5 model against baselines they fine-tuned from public checkpoints. The bias of self-evaluation is not acknowledged or discussed."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "CodeT5-DLR is trained on three objectives simultaneously while baselines are trained on single objectives. The difference in training compute is not discussed or controlled for."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether its benchmarks (keyword-filtered GitHub commits) actually measure real-world debugging ability, or whether the bug types are representative of bugs developers encounter in practice."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. The system is a fine-tuned model without agentic components."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "CodeT5 was pretrained on GitHub code, and the test data is also from GitHub commits. No discussion of whether the model could have seen test solutions during pretraining."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the input format (with [SEP] tokens marking line boundaries and commit-derived bug labels) leaks information not available in real-world debugging scenarios."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Table 2 shows different project counts for train/val/test suggesting project-level splitting, but this is not explicitly discussed. No analysis of whether train and test examples share structural similarities (e.g., similar codebases, duplicate patterns)."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method (canary strings, deduplication, n-gram overlap analysis) is applied."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "CodeT5-DLR significantly outperforms existing baselines on function-level bug detection, achieving 63.46 F1 on SL-Java and 54.83 F1 on ML-Python.",
    369       "evidence": "Table 3 shows CodeT5-DLR outperforms PLBART (59.01/52.33), CodeBERT (55.67/50.24), GraphCodeBERT (56.44/49.30), and SpotBugs (4.6 F1) on both datasets.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "CodeT5-DLR outperforms baselines on line-level bug localization, achieving MRR@5 of 34.67 on SL-Java and MAP@5 of 33.75 on ML-Python.",
    374       "evidence": "Table 4 shows improvements over PLBART (30.98 MRR@5, 30.56 MAP@5) and other baselines including DeepLineDP and LineVul.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "CodeT5-DLR outperforms baselines on program repair with 10.30 EM on SL-Java and 6.30 EM on ML-Python.",
    379       "evidence": "Table 5 shows improvements over CodeT5-R (7.30/6.01 EM) and PLBART (6.02/5.39 EM).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Joint training with detection, localization, and repair objectives yields better performance than single-task training.",
    384       "evidence": "Ablation study comparing CodeT5-D, CodeT5-L, CodeT5-R vs CodeT5-DLR across Tables 3-6 shows consistent improvement from joint training.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "In end-to-end evaluation, the model correctly localizes 33.93% of buggy lines and repairs 46.93% of single-line Java bugs.",
    389       "evidence": "Table 6 shows end-to-end pipeline results with CodeT5-DLR achieving 33.93 BL and 46.93 PR on SL-Java.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Conflict of interest: company evaluating own model",
    396       "detail": "All authors are Salesforce Research employees evaluating Salesforce's CodeT5 model. CodeT5-DLR is shown to outperform all non-Salesforce baselines, and there is no acknowledgment of this conflict."
    397     },
    398     {
    399       "flag": "'Significantly outperforms' without significance tests",
    400       "detail": "The paper repeatedly uses 'significantly outperforms' language (abstract, introduction, contributions) but performs no statistical significance tests. All comparisons are based on point estimates without error bars or variance."
    401     },
    402     {
    403       "flag": "No variance or multi-run results",
    404       "detail": "All results appear to be single-run numbers with no standard deviation, confidence intervals, or seed sensitivity analysis, making it impossible to assess result stability."
    405     },
    406     {
    407       "flag": "Pretraining-test data overlap risk",
    408       "detail": "CodeT5 was pretrained on GitHub code, and the evaluation datasets are also collected from GitHub commits. No analysis of potential data contamination between CodeT5's pretraining corpus and the test sets."
    409     },
    410     {
    411       "flag": "Dataset and code not released",
    412       "detail": "The datasets are promised for future release but not provided at publication time, and the CodeT5-DLR code is not released at all, preventing independent verification of results."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    418       "authors": ["Yue Wang", "Weishi Wang", "Shafiq R. Joty", "Steven C. H. Hoi"],
    419       "year": 2021,
    420       "relevance": "Foundation model used in this work; key pretrained model for code intelligence tasks."
    421     },
    422     {
    423       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    424       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    425       "year": 2020,
    426       "relevance": "Major pretrained code model used as baseline; represents encoder-only approach to code understanding."
    427     },
    428     {
    429       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    430       "authors": ["Daya Guo", "Shuo Ren", "Shuai Lu"],
    431       "year": 2020,
    432       "arxiv_id": "2009.08366",
    433       "relevance": "Pretrained code model incorporating data flow information, used as baseline for all three tasks."
    434     },
    435     {
    436       "title": "Unified Pre-training for Program Understanding and Generation",
    437       "authors": ["Wasi Uddin Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"],
    438       "year": 2021,
    439       "relevance": "PLBART model used as a key baseline; unified pretraining approach for code understanding and generation."
    440     },
    441     {
    442       "title": "Self-supervised Bug Detection and Repair",
    443       "authors": ["Miltiadis Allamanis", "Henry Jackson-Flux", "Marc Brockschmidt"],
    444       "year": 2021,
    445       "relevance": "Joint bug localization and repair approach at token level; key related work for neural bug detection and fixing."
    446     },
    447     {
    448       "title": "Neural Program Repair by Jointly Learning to Localize and Repair",
    449       "authors": ["Marko Vasic", "Aditya Kanade", "Petros Maniatis", "David Bieber", "Rishabh Singh"],
    450       "year": 2019,
    451       "arxiv_id": "1904.01720",
    452       "relevance": "Pioneering work on joint neural learning for bug localization and repair."
    453     },
    454     {
    455       "title": "CURE: Code-aware Neural Machine Translation for Automatic Program Repair",
    456       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    457       "year": 2021,
    458       "relevance": "Neural machine translation approach to automated program repair."
    459     },
    460     {
    461       "title": "CoCoNuT: Combining Context-aware Neural Translation Models Using Ensemble for Program Repair",
    462       "authors": ["Thibaud Lutellier", "Hung Viet Pham", "Lawrence Pang"],
    463       "year": 2020,
    464       "relevance": "Context-aware neural program repair using ensemble methods."
    465     },
    466     {
    467       "title": "LineVul: A Transformer-based Line-level Vulnerability Prediction",
    468       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn"],
    469       "year": 2022,
    470       "relevance": "Transformer-based line-level vulnerability prediction using attention scores; adapted as baseline for bug localization."
    471     },
    472     {
    473       "title": "DeepLineDP: Towards a Deep Learning Approach for Line-level Defect Prediction",
    474       "authors": ["Chanathip Pornprasit", "Chakkrit Tantithamthavorn"],
    475       "year": 2022,
    476       "relevance": "Deep learning approach for line-level software defect prediction; baseline for bug localization task."
    477     },
    478     {
    479       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    480       "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"],
    481       "year": 2021,
    482       "relevance": "Major benchmark suite for code intelligence tasks including defect detection and code refinement."
    483     },
    484     {
    485       "title": "DLFix: Context-based Code Transformation Learning for Automated Program Repair",
    486       "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
    487       "year": 2020,
    488       "relevance": "Context-based neural approach to automated program repair."
    489     }
    490   ],
    491   "engagement_factors": {
    492     "practical_relevance": {
    493       "score": 2,
    494       "justification": "A unified debugging framework with practical appeal to developers, but no released code or tool to actually use."
    495     },
    496     "surprise_contrarian": {
    497       "score": 0,
    498       "justification": "Multi-task learning improving over single-task training is an expected finding with no contrarian element."
    499     },
    500     "fear_safety": {
    501       "score": 0,
    502       "justification": "No AI safety or security concerns raised; this is a developer productivity tool."
    503     },
    504     "drama_conflict": {
    505       "score": 0,
    506       "justification": "No controversy or conflict angle."
    507     },
    508     "demo_ability": {
    509       "score": 0,
    510       "justification": "No code, demo, or tool released at publication time."
    511     },
    512     "brand_recognition": {
    513       "score": 1,
    514       "justification": "Salesforce Research is moderately known in the AI/NLP community but not a top-tier brand for developer tools."
    515     }
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs