scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28834B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Explainable Automated Debugging via Large Language Model-driven Scientific Debugging",
      6     "authors": [
      7       "Sungmin Kang",
      8       "Bei Chen",
      9       "Shin Yoo",
     10       "Jian-Guang Lou"
     11     ],
     12     "year": 2023,
     13     "venue": "Empirical Software Engineering",
     14     "arxiv_id": "2304.02195",
     15     "doi": "10.1007/s10664-024-10594-x"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims are backed by results: competitive repair performance shown in Tables 1-2, confidence signaling via <DONE> shown in Figure 3, and human study accuracy/satisfaction figures match reported numbers.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The debugger ablation (RQ2) is a controlled experiment testing the causal contribution of actual code execution; the human study uses a within-subjects randomized design (each participant sees 3 of 6 bugs with explanations) enabling causal inference about explanation benefit.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion states AutoSD can 'significantly ease developer use of automated techniques' broadly, but evaluation covers only single-method Java/Python bugs on three specific benchmarks with a 20-person study, making such broad generalization unjustified.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The threats section addresses implementation errors and data leakage but does not discuss alternative explanations for accuracy improvements (e.g., novelty effect, easier bugs allocated to explanation condition, or demand characteristics in a 5-minute study).",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes 'plausible' patches (pass tests) from 'correct' patches (semantically equivalent to developer fix), and measures developer accuracy as patch-review correctness rather than claiming broader productivity gains.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 contains both a 'Threats to Validity' subsection (6.1) and a 'Limitations' subsection (6.2), each with substantive discussion.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are named: incorrect implementations (addressed by a planned public release), patch correctness being assessed by manual inspection, data leakage (addressed by constructing the ARHE dataset), and bias in the human study (countered by the argument that the accuracy improvements are hard to attribute to bias).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 6.2 explicitly states AutoSD only handles single-method bugs, requires method-level FL as input, and is approximately 5× slower than LLM-Base — these are concrete scope boundaries.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source is disclosed anywhere in the paper text; the internship at Microsoft Research Asia is noted in a footnote but no research funding statement appears.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations (KAIST and Microsoft Research Asia) are clearly listed on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Two authors are from Microsoft Research Asia; the paper evaluates AutoSD built on ChatGPT/OpenAI products, and Microsoft holds significant investment in OpenAI — a non-independent relationship exists even without explicit funding.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Scientific Debugging is formally defined citing Zeller (hypothesis/prediction/experiment/observation/conclusion cycle), APR and fault localization are explained in Section 2.1, and the <DONE> token's role is precisely specified.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 1 lists four explicit bullet-point contributions: identifying LLM-based explainable debugging, empirical evaluation on three benchmarks, a developer study, and user feedback guidelines.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper engages substantively with prior APR techniques (Recoder, InCoder, Jiang et al.), developer expectation studies (Kochhar et al., Noller et al., Kirbas et al.), and Scientific Debugging foundations (Zeller, Siegmund et al.), showing how AutoSD differs from and builds on each.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "Section 6.1 states 'we plan to make our implementation and repair results publicly available for scrutiny' — this is a future promise, not a current release.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Defects4J v1.2/v2.0 and HumanEval are publicly available standard benchmarks used unmodified; the ARHE dataset construction is described in detail in the appendix though its separate release is only planned.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements file, Dockerfile, or dependency specification is provided or mentioned; only the debugger tools (jdb for Java, pdb for Python) are named.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper describes the approach conceptually with prompts in the appendix, but provides no step-by-step instructions for running the system on the benchmarks.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Only the stochastic template-based baseline reports mean ± std dev (85.77 ± 4.20); LLM-Base and AutoSD patch counts in Tables 1-2 are reported as single integers with no variance information.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to compare AutoSD vs LLM-Base or AutoSD vs Recoder/InCoder; the human study time comparison is noted as not significant but no test is reported.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage-point differences are reported throughout: <DONE> predictions are 12.4pp more likely to be plausible; the debugger ablation reduces the plausible rate from 73% to 63%.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The 20-participant human study size and 12 bugs are not justified with power analysis; the paper does not discuss whether the study is adequately powered to detect expected effect sizes.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "LLM-based runs on Defects4J and ARHE generate 10 patches per bug but no variance across runs is reported for AutoSD or LLM-Base; only the template baseline includes a standard deviation.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Three baselines are used: LLM-Base (direct LLM patching), Recoder (DL-based APR), and finetuned InCoder; a template-based baseline is added for ARHE.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Recoder and InCoder results are taken from Jiang et al. 2023, a contemporaneous large-scale empirical APR study, and InCoder was finetuned with perfect FL giving it an advantage over AutoSD.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "RQ2 explicitly ablates the debugger/code execution component, replacing actual observations with LLM-hallucinated observations, and measures the impact on plausibility and <DONE> reliability.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "APR evaluation uses both plausible and correct patch counts; human study measures accuracy, time, helpfulness ratings, and post-questionnaire satisfaction scores.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "A formal human study (n=20, including 6 professional developers) evaluates system-generated explanations for patch review, measuring accuracy and time with and without explanations.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "AutoSD is zero-shot with no training phase, so train/test split is not applicable; the benchmarks serve as evaluation sets without requiring holdout.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Defects4J results are broken down by v1.2 and v2.0; ARHE appendix breaks down by mutator type; human study results are shown per-bug in Figure 5.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "RQ6 provides a dedicated analysis of 25 failure cases where all hypotheses were rejected, finding 13/25 failures due to uncovered breakpoints; a disliked explanation for BIP002 is shown in Figure 7.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Two cases where explanations reduced accuracy (ARHE105, BIP003) are reported and explained; professional developer dissatisfaction (5/6 unsatisfied) is prominently reported in RQ5.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "ChatGPT is described only as 'a sibling model to InstructGPT' with no API version or snapshot date; though Codex (code-davinci-002) is named specifically, the primary model lacks versioning.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "The full Scientific Debugging prompt for Defects4J is reproduced verbatim in Appendix Section 4, including all instruction text, examples, and DSL definitions.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "No temperature, top-p, max tokens, or other API hyperparameters are reported for any of the LLMs evaluated.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Section 3 describes the full hypothesize-observe-conclude loop, the DSL commands (REPLACE/ADD/DEL/RUN), debugger integration, rejected-hypothesis removal before patching, and the <DONE> token mechanism in detail.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "ARHE construction is documented in the appendix (7 mutators, 200 bugs, reversibility classification); Defects4J uses standard settings with method-level FL and 10 candidates matching Jiang et al. settings.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw data (patch outputs, human study responses) is not currently released; only future availability is promised in Section 6.1.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "ARHE construction from HumanEval via mutation is documented; human study data collection procedure (6 bugs per participant, randomized explanation/no-explanation, 3 questions per bug) is described in Section 4.2.2.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "The paper states participants were recruited from undergraduate/graduate students with at least 1 year of Python experience and professional developers from a software testing company, with career spans noted (3-10 years).",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from benchmark bug → AutoSD patch generation → patch selection for human study → randomized explanation assignment → survey collection is described, though raw outputs are not yet public.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff is stated for ChatGPT; the paper mentions RLHF training but does not specify a knowledge cutoff date.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 6.1 External Validity explicitly discusses data contamination concerns and notes ARHE was constructed to mitigate them, as HumanEval was designed to avoid contamination by Chen et al.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Contamination is only addressed for ARHE; Defects4J v1.2 and v2.0 solutions were publicly available before ChatGPT's training cutoff and the paper does not assess whether the model has memorized these fixes.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration is mentioned for the human study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "Section 4.2.2 states 'Our human study received IRB review exemption (IRB-23-054)'.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": false,
    327           "justification": "Only role categories are reported (8 undergrad, 6 grad students, 6 professionals with 3-10 year careers); no gender, institution, language background, or other demographics are provided.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333           "justification": "The inclusion criterion is explicit: 'at least 1 year of Python experience' for students, plus professional developers from a software testing company.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": true,
    338           "answer": true,
    339           "justification": "Participants were randomly assigned to one of two groups of 6 bugs; within each group, explanations were randomly provided for 3 of 6 bug reviews, with order randomized.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": true,
    344           "answer": false,
    345           "justification": "No blinding procedure is described; participants know they are in a study and whether they see the explanation is apparent from the interface, not hidden from them.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": false,
    351           "justification": "No mention of participant dropout or attrition; the paper reports final counts but does not state whether all recruited participants completed the study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "AutoSD is noted to take 'about five times longer to generate a patch' than LLM-Base in terms of wall-clock time, but no API cost or token count is reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No compute budget (total API calls, GPU hours, or cost) is stated for any of the experiments.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "AutoSD achieves competitive automated program repair performance compared to prior techniques (Recoder, InCoder, Codex-based approaches) on Defects4J v1.2 and v2.0",
    374       "evidence": "Table 2: AutoSD correct=76 (D4Jv1.2) and 113 (D4Jv2.0) vs Recoder 24/11, InCoder 41/28, LLM-Base 87/110",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The <DONE> token reliably predicts higher patch correctness, and its reliability depends on actual code execution",
    379       "evidence": "Figure 3: <DONE>-predicted plausible patches are correct at 89% vs 82% without; in hallucination ablation <DONE> is 11pp LESS reliable than random, reversed from 12.4pp MORE reliable with real execution",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "AutoSD-generated explanations improve developer accuracy in patch review for real-world bugs without increasing review time",
    384       "evidence": "Figure 5: accuracy improved with explanations in 7 of 12 cases (5 concentrated in BugsInPy); time difference not statistically significant in any case",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "70% of participants consider explanations an important factor when using automated program repair tools",
    389       "evidence": "Post-questionnaire (Figure 6): 70% agreed explanations were important; 55% were satisfied with the Scientific Debugging explanation format",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Professional developers are less satisfied with AutoSD explanations than students",
    394       "evidence": "Figure 6b: only 1 of 6 professional developers was satisfied with AutoSD overall; Figure 6a: more than half of students were satisfied",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "AutoSD performance scales with underlying LLM capability",
    399       "evidence": "Figure 4: plausible patches on ARHE increase from near-zero (CodeGen-6B) to ~179 (Codex) to ~189 (ChatGPT); AutoSD improves proportionally",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "The most common failure mode is AutoSD suggesting breakpoints that are never covered during execution",
    404       "evidence": "RQ6 analysis of 25 failure cases where all hypotheses rejected: 13/25 (52%) caused by uncovered breakpoints",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "case-study",
    411     "qualitative"
    412   ],
    413   "key_findings": "AutoSD uses LLMs to emulate Scientific Debugging (iterative hypothesis-experiment-conclusion cycles with real debugger execution), achieving competitive automated program repair performance on Defects4J and ARHE while generating human-readable explanations. A 20-person human study found explanations improved developer patch-review accuracy for 5 of 6 real-world bugs without increasing review time, though professional developers were largely dissatisfied with the format. The most common failure mode is AutoSD generating hypotheses pointing to uncovered code paths; actual code execution (vs. LLM hallucination of results) is critical for the <DONE> confidence signal to be meaningful.",
    414   "red_flags": [
    415     {
    416       "flag": "ChatGPT unversioned",
    417       "detail": "The primary model is described only as 'ChatGPT (a sibling model to InstructGPT)' with no API version or snapshot date, making reproducibility impossible as the model evolves."
    418     },
    419     {
    420       "flag": "No significance tests",
    421       "detail": "Comparisons between AutoSD and baselines in Tables 1-2 use raw counts with no statistical tests; it is unknown whether differences (e.g., 76 vs 87 on D4Jv1.2) are statistically meaningful."
    422     },
    423     {
    424       "flag": "Small human study, no power analysis",
    425       "detail": "20 participants and 12 bugs provide very low statistical power; the accuracy improvement on 5 of 6 BugsInPy bugs could be due to bug selection rather than a treatment effect."
    426     },
    427     {
    428       "flag": "Code not released",
    429       "detail": "Only a future release is promised; without the code, the competitive repair numbers on Defects4J cannot be verified."
    430     },
    431     {
    432       "flag": "Defects4J contamination unaddressed",
    433       "detail": "ChatGPT was trained on publicly available code; Defects4J developer patches are on GitHub, making contamination of these results plausible and unaddressed."
    434     },
    435     {
    436       "flag": "No variance on LLM repair counts",
    437       "detail": "LLM-based methods are non-deterministic but Tables 1-2 report single counts without variance, obscuring whether differences between methods are reliable."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Impact of Code Language Models on Automated Program Repair",
    443       "relevance": "Primary baseline source providing Recoder and InCoder results on Defects4J for comparison with AutoSD"
    444     },
    445     {
    446       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    447       "relevance": "Core evaluation benchmark for automated program repair used throughout the paper"
    448     },
    449     {
    450       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    451       "relevance": "Source of HumanEval benchmark used to construct ARHE; Codex is evaluated as a component of AutoSD"
    452     },
    453     {
    454       "title": "Trust Enhancement Issues in Program Repair",
    455       "relevance": "Developer expectation study showing explanations are the most-wanted APR output, motivating AutoSD"
    456     },
    457     {
    458       "title": "Practitioners' Expectations on Automated Fault Localization",
    459       "relevance": "Survey showing 85% of developers want rationale for FL/APR results, core motivation for the paper"
    460     },
    461     {
    462       "title": "Why Programs Fail: A Guide to Systematic Debugging (Zeller 2009)",
    463       "relevance": "Foundational text defining Scientific Debugging that AutoSD emulates"
    464     },
    465     {
    466       "title": "Towards Developer-Centered Automatic Program Repair: Findings from Bloomberg",
    467       "relevance": "Industrial APR deployment case study showing all patches require developer review"
    468     },
    469     {
    470       "title": "BugsInPy: A Database of Existing Bugs in Python Programs",
    471       "relevance": "Python bug benchmark used for the human study's real-world bugs"
    472     },
    473     {
    474       "title": "Training Language Models to Follow Instructions with Human Feedback (InstructGPT)",
    475       "relevance": "Training approach behind ChatGPT, the primary model used in AutoSD"
    476     },
    477     {
    478       "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
    479       "relevance": "Codex-based APR baseline providing comparison point under 200-candidate patch generation"
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 3,
    485       "justification": "Directly addresses a known pain point in industrial APR adoption (explanations for developer acceptance) and includes results from professional developers."
    486     },
    487     "surprise_contrarian": {
    488       "score": 1,
    489       "justification": "The finding that professional developers were much less satisfied than students (5/6 unsatisfied) challenges the assumption that explainability universally helps adoption."
    490     },
    491     "fear_safety": {
    492       "score": 0,
    493       "justification": "No AI safety or risk concerns are raised; the paper is focused on software engineering productivity."
    494     },
    495     "drama_conflict": {
    496       "score": 1,
    497       "justification": "The stark student-vs-professional developer satisfaction split (majority satisfied vs. 1/6 satisfied) creates a notable tension in the results."
    498     },
    499     "demo_ability": {
    500       "score": 2,
    501       "justification": "The system could be demoed on any Python/Java bug with a failing test, and the prompt is fully published, though no public tool or API is currently available."
    502     },
    503     "brand_recognition": {
    504       "score": 1,
    505       "justification": "Microsoft Research Asia affiliation and use of ChatGPT/Codex provide moderate brand recognition, though this is not a marquee Microsoft product paper."
    506     }
    507   },
    508   "hn_data": {
    509     "threads": [
    510       {
    511         "hn_id": "43578430",
    512         "title": "DeepSeek: Inference-Time Scaling for Generalist Reward Modeling",
    513         "points": 163,
    514         "comments": 35,
    515         "url": "https://news.ycombinator.com/item?id=43578430"
    516       },
    517       {
    518         "hn_id": "22875937",
    519         "title": "Air-ViBeR: Exfiltrating Data from Air-Gapped Computers via Covert Vibrations",
    520         "points": 9,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=22875937"
    523       },
    524       {
    525         "hn_id": "39941576",
    526         "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
    527         "points": 3,
    528         "comments": 1,
    529         "url": "https://news.ycombinator.com/item?id=39941576"
    530       },
    531       {
    532         "hn_id": "37040795",
    533         "title": "Retroformer: Retrospective Large Language Agents",
    534         "points": 1,
    535         "comments": 1,
    536         "url": "https://news.ycombinator.com/item?id=37040795"
    537       },
    538       {
    539         "hn_id": "38765461",
    540         "title": "SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis",
    541         "points": 1,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=38765461"
    544       },
    545       {
    546         "hn_id": "26728012",
    547         "title": "Revisiting Rashomon: A Comment on “The Two Cultures”",
    548         "points": 1,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=26728012"
    551       },
    552       {
    553         "hn_id": "22896956",
    554         "title": "Exfiltrating Data from Air-Gapped Computers via Covert Surface ViBrAtIoNs",
    555         "points": 1,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=22896956"
    558       }
    559     ],
    560     "top_points": 163,
    561     "total_points": 179,
    562     "total_comments": 37
    563   }
    564 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs