scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27021B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detect-Localize-Repair: A Unified Framework for Learning to Debug with CodeT5",
      6     "authors": [
      7       "Nghi D. Q. Bui",
      8       "Yue Wang",
      9       "Steven C. H. Hoi"
     10     ],
     11     "year": 2022,
     12     "venue": "Conference on Empirical Methods in Natural Language Processing",
     13     "arxiv_id": "2211.14875",
     14     "doi": "10.48550/arXiv.2211.14875"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims (unified framework, three tasks, new datasets, performance improvements) are directly supported by results in Tables 3-6 and dataset description in Section 3.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims ('joint training improves performance') are supported by ablation studies (Table 6) comparing CodeT5-DLR vs CodeT5-D/L/R, showing joint training consistently outperforms individual training.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Claims in abstract/conclusion about 'neural-based techniques for debugging' are broader than the tested scope (Java/Python, line-level, function-level, GitHub commits). Generalizability to other languages, proprietary code, or fine-grained code changes not discussed.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No alternative explanations explored. Why joint training works is attributed to task complementarity (brief intuition) without deeper investigation or comparison to other training strategies.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "EM and BLEU measure code similarity but not whether fixes actually work in practice. No discussion of whether high BLEU/EM correlates with functionally correct repairs or if exact syntactic match is necessary.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 8 (Limitations) is a dedicated section discussing module inconsistency and lack of cross-function context, not just a concluding remark.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Limitations mention lack of cross-function context and module inconsistency, but miss critical threats: dataset representativeness (GitHub biases), the impact of the bug-fix keyword heuristic's imperfect accuracy (cited as 96%, leaving roughly 4% of commits potentially mislabeled), and generalization to different code domains.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Explicit boundaries set (Java/Python, line-level, GitHub commits) but limitations section does not clearly state what the results do NOT show. No discussion of whether approach works for other languages, proprietary code, or non-GitHub datasets.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source explicitly stated in the paper. Salesforce Research affiliation suggests internal funding but this is not disclosed.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All authors list Salesforce Research Asia affiliation clearly. Note that CodeT5, the base model of the proposed framework, is itself a Salesforce Research model (Wang et al., 2021), so the paper evaluates an extension of the authors' own lab's model against external baselines.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Presumed Salesforce funding is independent of the evaluated outcome (CodeT5 performance comparison to baselines). No Salesforce tool or product is the subject of evaluation.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or declaration of patents/equity/consulting arrangements provided.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms defined in Section 2: 'bug detection' as binary function classification, 'bug localization' as line-level labeling with problem formulation in 2.1, 'program repair' as sequence-to-sequence translation.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three contributions explicitly stated in introduction: (1) unified DLR framework, (2) two new datasets for Java/Python, (3) empirical evaluation. Contributions are concrete and testable.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6 reviews related work in pretrained language models and neural debugging, discussing CodeBERT, GraphCodeBERT, PLBART. Distinguishes from Allamanis et al. (2021) joint approach by using real bugs vs synthetic, and function/line-level vs token-level granularity.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "Code not released. Paper states 'available upon request' or via CodeT5 GitHub link, but fine-tuned models, adaptation code, and training scripts are not provided.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Paper states 'we will make our datasets publicly available' (future tense). At time of publication, datasets are not released, only promised.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Hardware specified (NVIDIA A100, 40GB), model version stated (CodeT5-base 220M), and max sequence length (512), but missing: Python version, PyTorch/TensorFlow versions, CUDA version, dependency list, training scripts.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions. Method section describes what was done but not how to reproduce: no training code, no inference code, datasets not available at publication, many hyperparameters missing.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Tables 3-6 report point estimates only. No error bars, confidence intervals, standard deviations, or variance across runs reported despite using deep learning models which typically require multiple runs.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Claims of 'significant improvements' made throughout (e.g., 'significantly outperforms') but no statistical significance tests, p-values, t-tests, or other hypothesis tests provided.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "Point improvements shown in tables (e.g., CodeT5-DLR 63.46 F1 vs PLBART 59.01 F1) but no formal effect size measures, Cohen's d, or percentage improvements with context provided.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Dataset sizes provided in Table 2 (52K-132K training examples) but no justification, power analysis, or discussion of adequacy. Why these particular sizes were chosen is not explained.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Only point estimates reported in results tables. No standard deviation, confidence intervals, or variance across multiple runs mentioned, despite training neural networks which typically have random seed variation.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple baselines included: SpotBugs (static analysis), TBCNN, CodeBERT, GraphCodeBERT, PLBART (neural models), and DeepLineDP/LineVul (vulnerability detection adapted to bug localization).",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines span 2016-2022. For a 2022 submission, CodeBERT (2020), GraphCodeBERT (2020), and PLBART (2021) are contemporary and competitive. SpotBugs is an established production tool.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Ablation study comparing CodeT5-D (detection only), CodeT5-L (localization only), CodeT5-R (repair only) vs CodeT5-DLR (joint) in Tables 3-6, demonstrating benefit of joint training.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Detection: F1 and FPR. Localization: MRR, MAP, FPR at k=1 and k=5. Repair: EM and BLEU. Multiple metrics capture different aspects of performance for each task.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "No human evaluation provided, but not critical since ground truth is available from commits and automatic metrics (EM, BLEU) are standard for code generation tasks. Not applicable.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Table 2 shows explicit train/validation/test splits for both SL-Java and ML-Python datasets. Results reported on held-out test sets.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Section 4.4.2 and Figure 5 provide breakdown by 13 bug patterns for SL-Java detection task (CHANGE_OPERATOR, CHANGE_IDENTIFIER, etc.). ML-Python lacks per-category breakdown.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 4.4.1 shows failure example (Figure 4): CHANGE_NUMERAL pattern where model correctly localizes but fails to predict exact numeral (3476→3344). Explains why certain patterns are hard.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Table 4 includes incomplete results (CodeT5-DLR-new marked with 'x' for ML-Python). Trade-offs discussed: FPR increases from 3.04 to 8.05 as k increases from 1 to 5. Some patterns show CodeT5-L outperforming CodeT5-DLR.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Base model specified as 'CodeT5-base (220M)' with GitHub link, but no snapshot date, commit hash, or version tag for the exact weights used. Reproducer cannot guarantee identical weights.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "No prompts used. This is supervised fine-tuning on labeled data, not prompt-based generation. Not applicable.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Reported: max sequence lengths (512), GPU hardware (A100 40GB). Missing: learning rate, batch size, number of epochs, optimizer (Adam? SGD?), warmup steps, dropout, weight decay, gradient clipping.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding or in-context prompting. Supervised fine-tuning approach. Not applicable.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Collection process well documented: commit keyword filtering (96% accuracy cited), Lizard for function extraction, tree-sitter for pattern identification. Train/val/test splitting described. Some details sparse (exact regex for keyword matching).",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Datasets promised but not made available at publication time. Independent verification of data quality cannot be done. 'Will be made publicly available' is future commitment, not current release.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Collection pipeline clearly described: GitHub commits with bug-fix keywords using PyDriller, function extraction with Lizard, line-level bug indicators. Heuristic validation referenced (96% accuracy from prior work).",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human recruitment. Data sourced from GitHub commits. Not applicable.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Full pipeline documented: commit mining → function extraction → line-level annotation → pattern identification (for Java). Before/after code snapshots preserved for repair task.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "CodeT5 pretrained on large GitHub corpus but pretraining cutoff date not stated. Fine-tuning data from GitHub commits but no temporal cutoff specified. No discussion of when GitHub data was harvested.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "Train/test split at commit level is good, but CodeT5 was pretrained on large GitHub corpus which almost certainly overlaps with test commits. This potential contamination not discussed.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Custom datasets collected by authors (not standard benchmarks). Potential overlap between CodeT5 pretraining and fine-tuning data is not addressed as a standard benchmark issue.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants. Not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. Not applicable.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants. Not applicable.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants. Not applicable.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants. Not applicable.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants. Not applicable.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants. Not applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "Inference latency, throughput, and computational cost not reported. No discussion of wall-clock time to detect/localize/repair a given function or deployment feasibility.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Hardware specified (A100 40GB) but no total compute budget, training time in hours/days, number of iterations, or estimated cost for reproducers provided.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "CodeT5-DLR achieves state-of-the-art performance on bug detection, localization, and repair tasks",
    373       "evidence": "Tables 3-5 show CodeT5-DLR outperforms all baselines (PLBART, CodeBERT, GraphCodeBERT, DeepLineDP, LineVul) on F1 (63.46 vs 59.01), MRR (26.78 vs 23.02), and EM (10.30 vs 6.02)",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Joint training of three tasks yields better performance than training on individual tasks",
    378       "evidence": "Table 6 and earlier tables show CodeT5-DLR consistently outperforms CodeT5-D/L/R variants. E.g., Table 3 CodeT5-DLR F1=63.46 vs CodeT5-D F1=59.28 on detection",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The unified framework successfully mirrors how developers debug code (detect → localize → repair)",
    383       "evidence": "Intuitive argument in Section 1 and Figure 3 example, but no user study validation of whether this pipeline matches real developer workflows",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "CodeT5-DLR achieves 33.93% buggy line localization and 46.93% repair accuracy in end-to-end evaluation",
    388       "evidence": "Table 6 explicitly reports these numbers for SL-Java unified debugging procedure",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "The bug-fix keyword heuristic is reliable for identifying real bug fixes",
    393       "evidence": "Heuristic cited as 96% accurate from Karampatsis & Sutton (2020) and Ray et al. (2016). However, 4% false positive rate could impact dataset quality",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Line-level granularity is more practical than function-level or token-level bug localization",
    398       "evidence": "Argued in Section 1 but not empirically validated. Previous work cited supporting practicality argument",
    399       "supported": "weak"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "empirical"
    405   ],
    406   "key_findings": "CodeT5-DLR unifies three interdependent debugging tasks (detection, localization, repair) via multi-task learning on a pretrained language model. Evaluation on newly collected Java and Python datasets shows the joint approach outperforms independently trained baselines and other neural models. The model achieves 63.46% F1 on function-level bug detection, 26.78 MRR@1 on line-level localization, and 10.30% exact match on repair. End-to-end performance is more modest: 33.93% buggy lines correctly localized and 46.93% repaired for single-line Java bugs, degrading to 28.49% and 41.21% for multi-line Python bugs. Ablation studies confirm joint training benefits.",
    407   "red_flags": [
    408     {
    409       "flag": "No error bars or variance reporting",
    410       "detail": "All results are point estimates with no confidence intervals, standard deviations, or multiple-run variance despite training stochastic neural networks."
    411     },
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "Claims of 'significant improvements' are unsupported by p-values, t-tests, or other hypothesis tests. Differences could be within noise."
    415     },
    416     {
    417       "flag": "Code and data not released",
    418       "detail": "Reproducibility impossible at publication. Datasets promised but not delivered; fine-tuned models and training code absent."
    419     },
    420     {
    421       "flag": "Potential data contamination",
    422       "detail": "CodeT5 pretrained on GitHub corpus with no specified cutoff date. Fine-tuning and test data from same GitHub source. Train/test contamination not discussed."
    423     },
    424     {
    425       "flag": "Missing critical hyperparameters",
    426       "detail": "Learning rate, batch size, epochs, optimizer, warmup, dropout not specified. Reproduction would require guessing or reverse-engineering."
    427     },
    428     {
    429       "flag": "Low absolute performance",
    430       "detail": "End-to-end accuracy is 33.93% localization and 46.93% repair for single-line bugs. Multi-line performance is worse. May be too low for production use."
    431     },
    432     {
    433       "flag": "No human evaluation",
    434       "detail": "While automatic metrics exist, human validation of whether high EM/BLEU scores correspond to actually correct fixes would strengthen claims."
    435     },
    436     {
    437       "flag": "Limited ablation on design",
    438       "detail": "Why this specific loss combination (Ldetect + Llocalize + Lrepair)? Other joint training strategies not explored."
    439     },
    440     {
    441       "flag": "Class imbalance not addressed",
    442       "detail": "Datasets have far more non-buggy than buggy lines. No discussion of how class imbalance affects training or whether techniques like SMOTE/weighting were used."
    443     },
    444     {
    445       "flag": "GitHub bias in datasets",
    446       "detail": "Real-world bugs from GitHub may not represent all types of bugs (e.g., embedded systems, legacy code). Generalizability assumed but not tested."
    447     }
    448   ],
    449   "cited_papers": [
    450     {
    451       "title": "Self-supervised bug detection and repair",
    452       "authors": "Allamanis et al.",
    453       "year": 2021,
    454       "relevance": "Prior joint approach to bug localization and repair; uses synthetic data and token-level granularity, contrasting with this work's real data and line-level approach"
    455     },
    456     {
    457       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code",
    458       "authors": "Wang et al.",
    459       "year": 2021,
    460       "relevance": "Foundation model used in this work; pretrained on GitHub code with identifier-aware objectives"
    461     },
    462     {
    463       "title": "CodeBERT: A Pre-trained Model for Programming Language and Natural Language",
    464       "authors": "Feng et al.",
    465       "year": 2020,
    466       "relevance": "Baseline model and related work on pretrained models for code understanding"
    467     },
    468     {
    469       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    470       "authors": "Guo et al.",
    471       "year": 2020,
    472       "relevance": "Baseline incorporating data flow structure for code representation"
    473     },
    474     {
    475       "title": "How often do single-statement bugs occur? The ManySStuBs4J dataset",
    476       "authors": "Karampatsis & Sutton",
    477       "year": 2020,
    478       "relevance": "Prior work on single-line bug datasets and heuristic (96% accuracy) for identifying bugs from commits"
    479     },
    480     {
    481       "title": "Neural Program Repair by Jointly Learning to Localize and Repair",
    482       "authors": "Vasic et al.",
    483       "year": 2019,
    484       "relevance": "Earlier joint localization-repair approach using pointer networks; motivates unified framework design"
    485     },
    486     {
    487       "title": "Unified Pre-training for Program Understanding and Generation",
    488       "authors": "Ahmad et al. (PLBART)",
    489       "year": 2021,
    490       "relevance": "Baseline pretrained model evaluated on debugging tasks"
    491     },
    492     {
    493       "title": "On the Accuracy of Spectrum-Based Fault Localization",
    494       "authors": "Abreu et al.",
    495       "year": 2007,
    496       "relevance": "Traditional program analysis approach to bug localization; contrasts with neural methods"
    497     },
    498     {
    499       "title": "Patching as Translation: The Data and the Metaphor",
    500       "authors": "Ding et al.",
    501       "year": 2020,
    502       "relevance": "Neural program repair via sequence-to-sequence translation, foundational for repair objective design"
    503     },
    504     {
    505       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    506       "authors": "Jiang et al.",
    507       "year": 2021,
    508       "relevance": "Recent neural repair approach using context-aware translation; compared baselines"
    509     }
    510   ],
    511   "engagement_factors": {
    512     "practical_relevance": {
    513       "score": 2,
    514       "justification": "Addresses real need (developer productivity) but end-to-end 33-46% accuracy may be too low for production systems without expert review."
    515     },
    516     "surprise_contrarian": {
    517       "score": 1,
    518       "justification": "Unified framework is incremental; joint training benefit expected. No surprising findings about debugging or code understanding."
    519     },
    520     "fear_safety": {
    521       "score": 0,
    522       "justification": "No AI safety or misalignment concerns. Bug fixing is positive application."
    523     },
    524     "drama_conflict": {
    525       "score": 0,
    526       "justification": "Solid technical work without controversy, limitations clearly acknowledged, no dramatic claims."
    527     },
    528     "demo_ability": {
    529       "score": 1,
    530       "justification": "Hard to demo without code/data release. Could build interactive demo if artifacts were available but cannot reproduce as-is."
    531     },
    532     "brand_recognition": {
    533       "score": 2,
    534       "justification": "Salesforce Research and CodeT5 (Meta) have credibility but not top-tier labs like OpenAI, DeepMind, or FAIR."
    535     }
    536   },
    537   "hn_data": {
    538     "threads": [],
    539     "top_points": 0,
    540     "total_points": 0,
    541     "total_comments": 0
    542   }
    543 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs