scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26780B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A Deep Dive into Large Language Models for Automated Bug Localization and Repair",
      6     "authors": [
      7       "Soneya Binta Hossain",
      8       "Nan Jiang",
      9       "Qiang Zhou",
     10       "Xiaopeng Li",
     11       "Wen-Hao Chiang",
     12       "Yingjun Lyu",
     13       "Hoan Nguyen",
     14       "Omer Tripp"
     15     ],
     16     "year": 2024,
     17     "venue": "Proc. ACM Softw. Eng.",
     18     "arxiv_id": "2404.11595",
     19     "doi": "10.1145/3660773"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The abstract claims state-of-the-art on CodeXGLUE and Defects4J; Table 1 and Table 3 confirm Toggle (PolyCoder-2.7B 25.07%) exceeds NSEdit (23.86%) and fixes more bugs in Top-10/30/50/100 on Defects4J than any compared method.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Causal claims (e.g., 'prompt 4 significantly improves bug fixing accuracy') are tested through controlled ablations in RQ3 using ground-truth bug locations to isolate prompt effects, and RQ5 enables/disables the adjustment module across 16 configurations.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The Threats to Validity section acknowledges results may not generalize beyond the studied datasets, and the Defects4J generalizability test uses only 240 single-hunk Java bugs; findings are generally scoped to the specific benchmarks tested.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper attributes improvements to 'inductive bias' from token-level localization and prompt design but does not discuss alternative explanations such as constrained-generation making the fine-tuning task easier or model pre-training data effects.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper explicitly defines exact match (EM) as its primary metric and distinguishes it from BLEU/CodeBLEU; for Defects4J, patch correctness is verified via test execution, and the paper does not conflate EM with real-world utility.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5 is explicitly titled 'THREATS TO VALIDITY' and spans a dedicated paragraph discussing generalization, tooling bugs, and metric validity.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The threats section mentions 'results may not generalize across other datasets' without specifying what properties would limit generalization, and the 'scripts might contain bugs' concern is boilerplate; no quantified or domain-specific threats are identified.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper explicitly restricts the Defects4J evaluation to 'single-hunk' bugs (240 bugs) and the fine-tuning models are bounded to specific parameter ranges (110M–2.7B); scope is stated within individual RQ setups.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "There is no acknowledgment section or funding disclosure anywhere in the paper.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations (University of Virginia, Purdue University, Amazon Web Services) are explicitly listed in the author block.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No funder is identified, so funder independence is moot; however, five of eight authors are Amazon Web Services employees evaluating their own research framework, so affiliation bias remains a concern.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Key terms such as 'token-granulated bug localization,' 'exact match metric,' 'inductive bias,' 'shared prefix/suffix,' and 'single-hunk bugs' are defined or explained contextually with examples and figures.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 1.4 explicitly lists four contributions: granularity shift to token-level, four novel prompt designs, adjustment module for tokenizer discrepancies, and comprehensive empirical study.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4 (Related Work) positions Toggle against specific prior methods (NSEdit, CoText, CURE, KNOD, AlphaRepair, Recoder) with direct performance comparisons; the paper explains how its token-level approach differs from line-level methods in prior LLM-APR work.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No Toggle source code release is mentioned anywhere in the paper; base model checkpoints are referenced via Hugging Face but the Toggle framework itself is not released.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "All datasets used (CodeXGLUE/Tufano, CodeReviewer, Defects4J, GitHub) are publicly available benchmarks referenced with citations.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Only PyTorch and Hugging Face are mentioned; no version numbers, requirements file, or Dockerfile are provided.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step reproduction instructions are provided; the paper describes the methodology but not how to replicate the experimental setup from scratch.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No confidence intervals or error bars appear in any of the results tables (Tables 1–8); only point estimates are reported.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical significance tests are applied to any comparative claims, despite numerous comparisons between methods and prompts.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Absolute performance values with baselines are consistently reported (e.g., 25.07% vs 23.86% on Tufano Small), providing enough context to assess effect magnitudes.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The choice of 240 Defects4J single-hunk bugs and 210 patches per bug is not statistically justified; no power analysis is discussed.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "The paper states experiments were 'repeated several times to confirm consistency' but no variance, standard deviation, or spread across runs is reported in any table.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Baselines include NSEdit, CoText (Table 1) and CURE, RewardRepair, Recoder, KNOD, Tare, AlphaRepair, TENURE (Table 3).",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include papers from 2021–2023 (NSEdit 2022, KNOD 2023, Tare 2023, AlphaRepair 2022), which are competitive and recent relative to the 2024 submission.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "RQ3 ablates across four prompt designs; RQ5 ablates the adjustment module enabled vs disabled across 4 models × 4 datasets; RQ4 ablates the effect of contextual information.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "The paper uses exact match (EM) for CodeXGLUE/CodeReviewer and Top-K (K=10,30,50,100,200) metrics for Defects4J, plus start/end token accuracy for localization.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "Human evaluation of system outputs is not relevant to this task; patch correctness is verified via automated exact match and test execution on Defects4J.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Datasets are explicitly split 80/10/10 into training, validation, and test sets; Defects4J is kept entirely held-out from fine-tuning.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Results are broken down by dataset (Tufano Small, Tufano Medium, CodeReviewer w/o comment, CodeReviewer w/ comment) and by model backbone for all major experiments.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Figure 7 explicitly shows a failure case where correct bug location still produces incorrect fix, and RQ6 discusses conditions under which prompt 4 underperforms prompt 3.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "CodeGPT underperforms on multilingual datasets due to Java-only pretraining; prompt 4 underperforms prompt 3 on Tufano datasets; smaller models don't benefit as much from the adjustment module.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Models are specified with parameter counts and sources: CodeGPT-110M, CodeParrot-110M, CodeGen-350M, CodeGen-2B, PolyCoder-400M, PolyCoder-2.7B, CodeT5-large (347M); Hugging Face references are provided.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "All four prompts are illustrated in Figure 5 with concrete code examples showing the exact format including separator tokens and truncation strategy.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": false,
    253           "justification": "No learning rates, batch sizes, number of epochs, or optimizer settings are reported for any of the fine-tuning experiments.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 2.3 describes the Toggle framework architecture in detail including the localization model (CodeT5 encoder with attention-based prediction), four prompt designs, and adjustment module (CodeT5 encoder with FC layer).",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "The GitHub dataset preprocessing is documented (commit filtering by keywords, AST-based Defects4J deduplication); train/validation/test splits of 80/10/10 are stated; adjustment module training data collection procedure is described.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "All benchmarks used (CodeXGLUE/Tufano, CodeReviewer, Defects4J) are publicly available; citations and URLs are provided for access.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "The GitHub dataset curation is described (commit message keywords, single-statement patches, AST-based Defects4J exclusion); the other datasets reference published papers describing their collection.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants; all data is from public code repositories and established benchmarks.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The full pipeline from dataset splits through fine-tuning to patch generation and evaluation is described, including the 7-shift range used for adjustment module training data.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No pre-training cutoff dates are stated for any of the base models (CodeGPT, CodeParrot, CodeGen, PolyCoder, CodeT5) despite their pre-training corpora potentially overlapping with public benchmarks.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "The paper explicitly excludes Defects4J samples from the GitHub fine-tuning dataset via AST comparison to prevent data leakage into the held-out generalization test.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "The pre-trained base models (CodeParrot, CodeGen, etc.) were trained on large code corpora that likely include CodeXGLUE and Defects4J data; this pre-training contamination risk is never discussed.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "The paper mentions 'resource-intensive nature of larger models' as justification for testing only smaller models in some RQs, but no actual inference cost, latency, or GPU-hours are reported.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total computational budget (GPU hours, cloud costs, hardware used) is stated anywhere in the paper.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Toggle achieves new state-of-the-art on CodeXGLUE code refinement benchmark (Tufano Small 25.07%, Tufano Medium 16.19%)",
    378       "evidence": "Table 1 shows PolyCoder-2.7B at 25.07% vs NSEdit's 23.86% on Tufano Small and 16.19% vs CoText's 15.36% on Tufano Medium",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Toggle outperforms all compared APR methods on Defects4J in Top-10, Top-30, Top-50, and Top-100 metrics",
    383       "evidence": "Table 3 shows Toggle fixes 41 bugs in Top-10 vs next-best 36 (Recoder), 58 vs 51, 64 vs 62, 74 vs 70 respectively",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Larger LLMs yield better bug fixing accuracy after fine-tuning with Toggle prompts",
    388       "evidence": "Table 1 consistently shows larger models outperform smaller ones (e.g., CodeGen-2B 24.73% vs CodeGen-350M 23.19% on Tufano Small)",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Token-granulated prompts (3 and 4) significantly outperform standard prompting (prompt 1) for bug fixing",
    393       "evidence": "Table 4 shows CodeGPT-110M improves from 16.07% (prompt 1) to 56.98% (prompt 4) on Tufano Small using ground-truth bug locations",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Contextual information (buggy line numbers, code review comments) significantly improves bug localization accuracy",
    398       "evidence": "Table 5 shows starting token accuracy for Tufano Small improves from 39.07% to 60.37% (+21%) with buggy line numbers",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "The adjustment module consistently improves bug fixing accuracy across all models and datasets",
    403       "evidence": "Table 6 shows improvement in all 16 configurations, e.g., CodeParrot-110M on Tufano Small improves from 21.78% to 23.51%",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "Prompt 4 outperforms prompt 3 only when both start and end token locations are highly accurate",
    408       "evidence": "Table 8 shows prompt 3 superior on Tufano datasets but prompt 4 superior on CodeReviewer datasets where partial location accuracy is higher (65.76% vs 53.23%)",
    409       "supported": "moderate"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval"
    414   ],
    415   "key_findings": "Toggle introduces token-granulated bug localization and repair, demonstrating that preventing LLMs from regenerating non-buggy shared prefix/suffix significantly improves accuracy (prompt 1 to prompt 4: 16.07% to 56.98% for CodeGPT on Tufano Small). The system achieves state-of-the-art on CodeXGLUE code refinement and outperforms all compared methods on Defects4J in Top-10 through Top-100 metrics using only 110M parameter models and 210 generated patches. Contextual information (line numbers, code review comments) improves localization accuracy by 20-30 percentage points. The choice between prompts 3 and 4 with predicted locations is dataset-dependent, with prompt 4 winning when partial location accuracy is high and additional context is available.",
    416   "red_flags": [
    417     {
    418       "flag": "No statistical significance tests",
    419       "detail": "All comparisons between Toggle and baselines, and between prompt configurations, are made without any statistical significance testing despite many tables of numerical comparisons."
    420     },
    421     {
    422       "flag": "No variance or confidence intervals",
    423       "detail": "Despite claiming experiments were repeated multiple times, no standard deviation, confidence intervals, or error bars are reported for any results."
    424     },
    425     {
    426       "flag": "No hyperparameters reported",
    427       "detail": "Learning rates, batch sizes, number of epochs, and optimizer configurations for all fine-tuning experiments are absent, making reproduction impossible."
    428     },
    429     {
    430       "flag": "No code release",
    431       "detail": "The Toggle framework is not released; only the public base model checkpoints are referenced, preventing independent verification of results."
    432     },
    433     {
    434       "flag": "Pre-training contamination unaddressed",
    435       "detail": "Base LLMs (CodeGPT, CodeParrot, CodeGen, PolyCoder, CodeT5) were trained on large code corpora that likely include CodeXGLUE and Defects4J benchmarks; this contamination risk is never discussed."
    436     },
    437     {
    438       "flag": "Asymmetric patch count in Defects4J comparison",
    439       "detail": "Toggle generates 210 patches per bug (Top-100 is the primary comparison), while competing methods (Tare, AlphaRepair, TENURE) generate 500+ patches and report only Top-500+ results, so the Top-100 comparison is potentially favorable to Toggle."
    440     },
    441     {
    442       "flag": "No funding disclosure",
    443       "detail": "Five of eight authors are Amazon Web Services employees; no funding source or competing interests are disclosed."
    444     }
    445   ],
    446   "cited_papers": [
    447     {
    448       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    449       "relevance": "Primary benchmark for evaluation; Toggle achieves state-of-the-art on its code refinement tasks"
    450     },
    451     {
    452       "title": "CodeReviewer: Pre-Training for Automating Code Review Activities",
    453       "relevance": "Provides dataset and CodeT5 baseline for code review-guided bug fixing experiments"
    454     },
    455     {
    456       "title": "Defects4J: A Database of existing faults to enable controlled testing studies for Java programs",
    457       "relevance": "Primary generalizability benchmark of 835 real-world Java bugs; the out-of-distribution evaluation is restricted to 240 single-hunk bugs"
    458     },
    459     {
    460       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    461       "relevance": "Backbone model for bug localization; used as both a baseline and the encoder in Toggle's localization module"
    462     },
    463     {
    464       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    465       "relevance": "Key APR baseline compared on Defects4J (18 bugs in Top-10 vs Toggle's 41)"
    466     },
    467     {
    468       "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair",
    469       "relevance": "Strong APR baseline using tree-based decoding; compared on Defects4J across all Top-K metrics"
    470     },
    471     {
    472       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    473       "relevance": "AlphaRepair baseline demonstrating LLMs used for APR without fine-tuning; contextualizes Toggle's fine-tuning approach"
    474     },
    475     {
    476       "title": "Impact of Code Language Models on Automated Program Repair",
    477       "relevance": "Prior work on LLM-based APR that Toggle directly builds on and improves over"
    478     },
    479     {
    480       "title": "Fix Bugs with Transformer through a Neural-Symbolic Edit Grammar",
    481       "relevance": "NSEdit — primary baseline for CodeXGLUE leaderboard comparison; Toggle surpasses it on all Tufano datasets"
    482     },
    483     {
    484       "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation",
    485       "relevance": "Source of Tufano Small/Medium datasets used as primary fine-tuning and evaluation benchmarks"
    486     }
    487   ],
    488   "engagement_factors": {
    489     "practical_relevance": {
    490       "score": 2,
    491       "justification": "APR is directly useful to developers; Toggle is a concrete working system tested on real bug benchmarks, though it requires fine-tuning and infrastructure to deploy."
    492     },
    493     "surprise_contrarian": {
    494       "score": 1,
    495       "justification": "Token-level vs line-level localization is a novel framing but the performance improvements are expected given the design rationale."
    496     },
    497     "fear_safety": {
    498       "score": 0,
    499       "justification": "No AI risk or safety concerns raised; the paper is purely about automated software engineering."
    500     },
    501     "drama_conflict": {
    502       "score": 0,
    503       "justification": "Standard benchmark competition paper with no controversy or conflict angle."
    504     },
    505     "demo_ability": {
    506       "score": 1,
    507       "justification": "The framework is described in detail but no public demo or code is released, limiting hands-on accessibility."
    508     },
    509     "brand_recognition": {
    510       "score": 1,
    511       "justification": "Amazon Web Services affiliation for five authors adds some recognition, but this is not a top-name lab publication; published at FSE which is a respected venue."
    512     }
    513   },
    514   "hn_data": {
    515     "threads": [
    516       {
    517         "hn_id": "40205264",
    518         "title": "Urban highways are barriers to social ties",
    519         "points": 6,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=40205264"
    522       },
    523       {
    524         "hn_id": "41103162",
    525         "title": "Beyond Deepfake Images: Detecting AI-Generated Videos [pdf]",
    526         "points": 3,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=41103162"
    529       },
    530       {
    531         "hn_id": "40165320",
    532         "title": "Generation of Low-Inclination, Neptune-Crossing TNOs by Planet Nine",
    533         "points": 2,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=40165320"
    536       }
    537     ],
    538     "top_points": 6,
    539     "total_points": 11,
    540     "total_comments": 0
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs