scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30393B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detect-Localize-Repair: A Unified Framework for Learning to Debug with CodeT5",
      6     "authors": [
      7       "Nghi D. Q. Bui",
      8       "Yue Wang",
      9       "Steven C.H. Hoi"
     10     ],
     11     "year": 2022,
     12     "venue": "Conference on Empirical Methods in Natural Language Processing",
     13     "arxiv_id": "2211.14875",
     14     "doi": "10.48550/arXiv.2211.14875"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims the model 'significantly outperforms existing baselines from both NLP and software engineering domains,' which is supported by Tables 3-5 showing consistent improvements across all tasks and datasets.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The main causal claim is that 'jointly training with the three objectives yields better performance.' The ablation study (CodeT5-D/L/R vs CodeT5-DLR in Tables 3-6) provides controlled single-variable manipulation to support this.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title 'A Unified Framework for Learning to Debug' implies general debugging ability, but results are limited to single-line Java bugs and multi-line Python bugs from GitHub commits. The framework's applicability to other languages, bug types, or real-world debugging scenarios is not tested.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Section 5 discusses design choices vs pointer networks but does not consider alternative explanations for the improvements (e.g., whether the gains come from additional training signal/data rather than task complementarity, or whether larger models would close the gap).",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures EM, BLEU, F1, MRR, MAP and states claims in terms of these metrics without overreaching to broader claims about 'debugging ability' beyond what was measured.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 8 'Limitations' provides substantive discussion of two specific weaknesses: inconsistency between the detection and repair modules, and the limitation of using only within-function information.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 8 discusses specific limitations: 'while the function-level bug detection module indicates that a function is not buggy, the program repair module continues to generate fixes' and the lack of cross-function context. These are specific to this study's design.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The limitations section discusses weaknesses but does not explicitly state what the results do NOT show or which settings are excluded. For example, there is no statement that results apply only to the specific bug types, languages, or GitHub-mined data tested.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source or acknowledgments section is present in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations with Salesforce Research Asia are clearly stated on the first page. The connection to the evaluated CodeT5 model (also from Salesforce) is implicit from the shared authorship.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "All authors are Salesforce employees evaluating Salesforce's own CodeT5 model. Salesforce has a commercial interest in CodeT5 performing well, making the funder non-independent of the outcome.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Bug detection, localization, and repair are defined operationally, but 'productivity', used in the abstract ('improving productivity of software developers'), is never defined. 'Unified framework' is not formally defined either; it means 'multi-task' learning, but this is not stated.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three contributions clearly stated: (1) unified DLR framework, (2) two new line-level debugging datasets, (3) extensive evaluations. Reader knows what the paper adds.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6 covers pretrained models and neural debugging. The Discussion section engages with Allamanis et al. (2021), explaining why pointer networks don't work for their setting. However, the related work mostly lists papers without a deep synthesis of how this work fits into the research landscape.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No repository URL for the CodeT5-DLR code is provided in the paper. The paper references the existing CodeT5 GitHub repo and baseline repos, but does not release its own implementation.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The conclusion states 'We will make our datasets publicly available to facilitate research on this topic,' which is a promise of future release, not an actual release with a download link.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Section 4 mentions 'NVIDIA A100 GPUs with 40 GB memory' and 'CodeT5-base (220M)' but provides no requirements.txt, library versions, or detailed environment specifications sufficient to recreate the setup.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided in the paper.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables 3, 4, 5, and 6 are reported as point estimates with no confidence intervals, error bars, or ± notation.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper repeatedly claims the model 'significantly outperforms' baselines (abstract, Section 1, Section 4) but reports no statistical significance tests (no p-values, t-tests, or any other statistical test).",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "Results are presented as raw numbers in tables. While baseline context is available for comparison, no explicit effect sizes (percentage improvement, Cohen's d, etc.) are computed or discussed.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Dataset sizes are reported (Table 2: ~75K SL-Java, ~190K ML-Python instances) but no justification for these sizes or power analysis is provided.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or results across multiple runs are reported anywhere in the paper. All results appear to be single-run numbers.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Extensive baselines are included: SpotBugs (static analysis), TBCNN, CodeBERT, GraphCodeBERT, PLBART, DeepLineDP, LineVul, plus ablation variants CodeT5-D, CodeT5-L, CodeT5-R (Tables 3-5).",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include PLBART (2021), LineVul (2022), DeepLineDP (2022), and CodeT5 (2021), which are contemporary to this 2022 paper.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The paper compares CodeT5-D (detection only), CodeT5-L (localization only), CodeT5-R (repair only), and CodeT5-DLR (all three) across all tasks, demonstrating the contribution of joint training (Tables 3-6).",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple metrics are used per task: F1 + FPR for detection (Table 3), MRR + MAP + FPR for localization (Table 4), EM + BLEU for repair (Table 5).",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation of the system's outputs is performed. All evaluation is automated via metrics.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Table 2 shows explicit train/val/test splits for both SL-Java (52,789/7,465/15,250) and ML-Python (132,243/22,395/35,457) with separate project counts.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Figure 5 provides per-bug-pattern F1 breakdown across 13 patterns (P0-P12), and results are split by dataset type (single-line Java vs multi-line Python).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Figure 4 shows a CHANGE_NUMERAL bug where the model detects and localizes correctly but suggests the wrong fix, with discussion of why this is challenging (Section 4.4.1).",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Section 5 reports that pointer network design does not work well for their case due to class imbalance between buggy and non-buggy lines, 'confirmed with experiments.' Figure 4 shows a repair failure.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "The paper specifies 'CodeT5-base (220M)' (Section 4) and provides the GitHub URL for the checkpoint. Baselines use 'public checkpoints' of CodeBERT, GraphCodeBERT, and PLBART. These are fixed pretrained checkpoints, not API models with changing versions.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "The paper fine-tunes models rather than using prompting. No prompts are involved.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Only 'maximum source and target sequence lengths to 512' and hardware (A100 GPUs) are stated. Learning rate, batch size, number of epochs, optimizer, beam search width, and other key hyperparameters are not reported.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. The system is a fine-tuned encoder-decoder model.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3 describes the data pipeline: Pydriller for mining GitHub commits, keyword-based commit filtering (with cited accuracy: 96-97.6%), Lizard for function extraction, tree-sitter for bug pattern identification, and comparison between before/after function versions.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The datasets are promised for future release but no download link is provided. Raw data is not available for independent verification.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3 describes the collection procedure in detail: using Pydriller to mine GitHub commits, keyword filtering on commit messages, Lizard for function extraction, tree-sitter for bug pattern identification, with cited validation of the keyword heuristic (96-97.6% accuracy).",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "The paper describes commit filtering but does not describe how GitHub projects were selected. There is no discussion of which projects were included, whether this introduces selection bias (e.g., toward popular/well-maintained projects), or what the project selection criteria were.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "While the pipeline stages are described (commits → function extraction → bug pattern identification), the paper does not document how many examples were filtered at each stage. Table 2 shows final counts but intermediate filtering counts are missing.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper does not state the training data cutoff date for CodeT5's pretraining corpus, despite CodeT5 being pretrained on GitHub data that could overlap with the test set.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "While train/val/test splits use different projects (Table 2), the paper does not discuss whether CodeT5's pretraining data (collected from GitHub) overlaps with the fine-tuning test data, which also comes from GitHub.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "CodeT5 was pretrained on a large-scale GitHub corpus. The test data is also collected from GitHub commits. The paper does not address whether test examples could have appeared in CodeT5's pretraining data.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or per-example timing is reported for any of the three tasks.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Hardware is mentioned (NVIDIA A100 GPUs, 40 GB memory) but total GPU hours, training time, or overall compute budget is not quantified.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs is not stated anywhere in the paper.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search budget or search method is reported, despite key hyperparameters (learning rate, batch size, epochs) being unreported.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "No description of how the final model configuration was selected. The paper does not state whether validation set performance was used for model selection.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "Multiple comparisons are made across 6+ baselines, 3 tasks, and 2 datasets with no correction applied. No statistical tests are used at all.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors (Salesforce) evaluate their own CodeT5 model against baselines they fine-tuned from public checkpoints. The bias of self-evaluation is not acknowledged or discussed.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "CodeT5-DLR is trained on three objectives simultaneously while baselines are trained on single objectives. The difference in training compute is not discussed or controlled for.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper does not discuss whether its benchmarks (keyword-filtered GitHub commits) actually measure real-world debugging ability, or whether the bug types are representative of bugs developers encounter in practice.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved. The system is a fine-tuned model without agentic components.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "CodeT5 was pretrained on GitHub code, and the test data is also from GitHub commits. No discussion of whether the model could have seen test solutions during pretraining.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the input format (with [SEP] tokens marking line boundaries and commit-derived bug labels) leaks information not available in real-world debugging scenarios.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "Table 2 shows different project counts for train/val/test suggesting project-level splitting, but this is not explicitly discussed. No analysis of whether train and test examples share structural similarities (e.g., similar codebases, duplicate patterns).",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No concrete leakage detection or prevention method (canary strings, deduplication, n-gram overlap analysis) is applied.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Joint training on three objectives (detect-localize-repair) improves performance compared to single-objective training.",
    455       "evidence": "Table 6 shows CodeT5-DLR achieves 33.93% MRR for localization and 46.93% EM for repair vs single-objective variants on end-to-end task. Table 4 shows CodeT5-DLR 27.67% MRR@1 vs CodeT5-L 24.40%. Table 5 shows CodeT5-DLR 10.30% EM vs CodeT5-R 7.30% for program repair.",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "CodeT5-DLR significantly outperforms baseline models (CodeBERT, GraphCodeBERT, PLBART, SpotBugs) on all three debugging tasks.",
    460       "evidence": "Table 3: F1 63.46% vs PLBART 59.01%. Table 4: MRR@1 26.78% vs PLBART 23.02%. Table 5: EM 10.30% vs PLBART 6.02%.",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Line-level bug granularity is more practical than token or file level for developer workflows.",
    465       "evidence": "Paper argues line-level mirrors human debugging. No direct empirical validation against token or file-level approaches.",
    466       "supported": "moderate"
    467     },
    468     {
    469       "claim": "Real-world bug datasets from GitHub are more useful for training than synthetic bug datasets.",
    470       "evidence": "Paper compares favorably to Allamanis et al. (2021) which uses synthetic bugs, but no direct side-by-side comparison on same test set.",
    471       "supported": "weak"
    472     },
    473     {
    474       "claim": "The unified framework enables end-to-end bug detection-localization-repair pipeline with 33.93% buggy line detection and 46.93% repair accuracy for single-line Java bugs.",
    475       "evidence": "Table 6 reports 33.93% MRR for line localization and 46.93% BLEU for repair in combined setting. These are low absolute rates but improved over single-objective models.",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Different bug patterns require different modeling approaches; CHANGE_NUMERAL is 'very challenging' compared to identifier/method changes.",
    480       "evidence": "Figure 4 failure case and Figure 5 showing differential performance by pattern type (P0-P12). However, no explicit analysis of why patterns differ.",
    481       "supported": "moderate"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval",
    486     "case-study",
    487     "observational"
    488   ],
    489   "key_findings": "The paper proposes CodeT5-DLR, a unified framework fine-tuning CodeT5 for joint multi-task learning on bug detection (63.46% F1), line-level localization (27.67% MRR@1), and program repair (10.30% EM). Joint training outperforms single-objective variants by 2-3pp, validated through ablation studies. The model significantly outperforms CodeBERT, GraphCodeBERT, and PLBART baselines across all tasks. End-to-end evaluation achieves 33.93% buggy line localization and 46.93% repair accuracy for single-line Java bugs, though multi-line Python results are lower (28.49%/41.21%). Bug pattern analysis shows varying difficulty: identifier/method changes are easier than numeral changes. However, absolute success rates remain low (<11% exact code match), limiting practical utility.",
    490   "red_flags": [
    491     {
    492       "flag": "Train-test contamination risk",
    493       "detail": "CodeT5 was pretrained on GitHub source code; evaluation uses GitHub bug-fix commits. No analysis of whether test commits overlap with CodeT5 pretraining data. Critical issue for LLM-based code models but unaddressed."
    494     },
    495     {
    496       "flag": "No statistical significance testing",
    497       "detail": "The paper claims the model 'significantly outperforms' baselines but reports no p-values, confidence intervals, or statistical tests. All results are point estimates without variance measures."
    498     },
    499     {
    500       "flag": "Module inconsistency",
    501       "detail": "Paper acknowledges that the detection module can indicate a function is not buggy while the repair module continues to generate fixes. Unified framework doesn't actually unify decision-making; modules contradict each other."
    502     },
    503     {
    504       "flag": "Low repair success rates",
    505       "detail": "EM of 10.30% (Java) and 6.3% (Python) means model generates exactly correct fixes <11% of the time. Claims of 'improvement' are relative to weak baselines; absolute performance is poor."
    506     },
    507     {
    508       "flag": "Missing critical hyperparameters",
    509       "detail": "Learning rate, batch size, epochs, optimizer, dropout, loss weights not reported. Reproducibility impossible without these."
    510     },
    511     {
    512       "flag": "Proxy outcome gap",
    513       "detail": "Abstract makes claims about 'improving productivity' but the paper evaluates only on automated metrics (F1, FPR, MRR, MAP, EM, BLEU). Large gap between narrow proxy metrics and broad productivity claim."
    514     },
    515     {
    516       "flag": "No variance reporting",
    517       "detail": "All results are single point estimates. No standard deviations, multiple runs, random seed effects, or variance discussion. Cannot assess robustness."
    518     },
    519     {
    520       "flag": "Code release unclear",
    521       "detail": "Paper states datasets will be made 'publicly available' (a promise of future release, with no download link), and it is unclear whether code/model will be released. Only 'datasets' explicitly mentioned; reproducibility of model training uncertain."
    522     }
    523   ],
    524   "cited_papers": [
    525     {
    526       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    527       "relevance": "Key baseline pretrained model; foundation for comparison of encoder-based approaches to code understanding"
    528     },
    529     {
    530       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    531       "relevance": "Foundation model; pretrained encoder-decoder architecture adapted for unified debugging framework"
    532     },
    533     {
    534       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    535       "relevance": "Baseline incorporating data flow information; compared across all three debugging tasks"
    536     },
    537     {
    538       "title": "Self-supervised Bug Detection and Repair",
    539       "relevance": "Prior work on joint bug detection-repair; paper argues their approach is more practical with real-world data vs synthetic bugs"
    540     },
    541     {
    542       "title": "How Often Do Single-Statement Bugs Occur? The ManySStuBs4J Dataset",
    543       "relevance": "Prior dataset work for single-statement bug detection; methodology followed for dataset collection"
    544     },
    545     {
    546       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    547       "relevance": "Prior neural program repair approach; compared against on repair task"
    548     },
    549     {
    550       "title": "On the Naturalness of Software Code",
    551       "relevance": "Theoretical foundation for treating code as natural language for neural methods"
    552     },
    553     {
    554       "title": "A Call to Action for Evaluating Explainability of Machine Learning",
    555       "relevance": "Evaluation methodology reference for measuring model interpretability"
    556     }
    557   ],
    558   "engagement_factors": {
    559     "practical_relevance": {
    560       "score": 2,
    561       "justification": "A unified debugging framework with practical appeal to developers, but no released code or tool to actually use."
    562     },
    563     "surprise_contrarian": {
    564       "score": 0,
    565       "justification": "Multi-task learning improving over single-task training is an expected finding with no contrarian element."
    566     },
    567     "fear_safety": {
    568       "score": 0,
    569       "justification": "No AI safety or security concerns raised; this is a developer productivity tool."
    570     },
    571     "drama_conflict": {
    572       "score": 0,
    573       "justification": "No controversy or conflict angle."
    574     },
    575     "demo_ability": {
    576       "score": 0,
    577       "justification": "No code, demo, or tool released at publication time."
    578     },
    579     "brand_recognition": {
    580       "score": 1,
    581       "justification": "Salesforce Research is moderately known in the AI/NLP community but not a top-tier brand for developer tools."
    582     }
    583   },
    584   "hn_data": {
    585     "threads": [],
    586     "top_points": 0,
    587     "total_points": 0,
    588     "total_comments": 0
    589   }
    590 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs