scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28242B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Fixing 7,400 Bugs for 1$: Cheap Crash-Site Program Repair",
      6     "authors": [
      7       "Han Zheng",
      8       "Ilia Shumailov",
      9       "Tianqi Fan",
     10       "Aiden Hall",
     11       "Mathias Payer"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2505.13103",
     16     "doi": "10.48550/arXiv.2505.13103"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The title claims '7,400 bugs' but the evaluation covers only 358 bugs — the 7,400 figure is never derived in the paper. The abstract's '73.5%' combined fixing rate cannot be reconciled with the body (195 + 60 additional = 255/358 ≈ 71.2%). The conclusion also swaps the 29.6% and 45.9% figures relative to Section 5.2.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Comparative experiments against CodeRover-S, Agentless, and VulMaster on the same 358-bug benchmark with identical evaluation metrics adequately support the causal claim that WILLIAMT's design reduces cost while maintaining repair rate.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Fixing 7,400 Bugs' dramatically overstates the 358-bug evaluation scope; the paper claims 'broad applicability and scalability' beyond what the ARVO benchmark (4 memory corruption types from OSS-Fuzz) can support.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss why WILLIAMT and CodeRover-S fix largely disjoint sets of bugs, nor whether ARVO's 15-minute compilation filter systematically selects simpler bugs where templates work better.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Appendix A explicitly analyzes the gap between 'plausible' fixes (PoC does not crash) and actual fixes, showing only 56/165 plausible patches pass manual review with broader inputs; the authors recommend manual developer verification.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Discussion' substantively covers three limitations: incorrect crash site analysis, semantically disruptive patch insertion within conditionals, and the imprecise plausible fix metric.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named with examples: LLM variable identification fails when required variables are beyond the crash frame; patch insertion above a crash site inside an if-statement breaks control flow; the plausible metric doesn't test inputs beyond the PoC.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state that results are bounded to memory corruption bugs (HBO, GBO, SBO, UAF) and do not apply to logic bugs or other vulnerability classes; the broad title actively misleads about scope.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper; two authors are employed at Google/Google DeepMind and two at EPFL, but no grant numbers or sponsorship are disclosed.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (EPFL, Google DeepMind, Google Zurich, Google New York) are clearly listed in the paper header.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed; N/A.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are clearly defined: 'crash-site repair' vs 'root cause analysis', 'plausible fix' metric, spatial vs temporal memory corruption, and the three-stage repair objective framework (graceful crash, bail-out, root cause).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit bullet points at the end of Section 1 state contributions: crash-site repair proposal, template-guided patch generation, WILLIAMT prototype evaluation, and combined cost/fixing rate results.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 provides background on LLM-based APR and OSS-Fuzz; direct comparisons with Agentless, CodeRover-S, and VulMaster situate the contribution clearly within SoTA.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "'We promise to fully release WILLIAMT upon paper acceptance' — this is a future promise, not a current release; the code is unavailable.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "ARVO is a publicly available benchmark; OSS-Fuzz bugs are publicly accessible. The evaluation data can be independently obtained.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The OS and hardware are described (Ubuntu 22.04, AMD EPYC 7302P, 64GB RAM, RTX 4090) but no requirements.txt, Dockerfile, or dependency specification for WILLIAMT itself is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Appendix C describes the regex logic at a high level and shows one prompt example, but no step-by-step reproduction instructions are provided and the code is not released.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are reported as single-run point estimates (46.1%, 54.5%, etc.) with no confidence intervals or error bars across any comparison.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are performed for any comparative claim between WILLIAMT and baselines.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported as percentage differences (46.1% vs 54.5% fix rate, 99.7% token reduction, 45.9% cost reduction) with baseline values provided for context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 358-bug subset is chosen by a 15-minute compilation filter 'following recommended practice [60]', but no power analysis or justification for why 358 is sufficient is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread is reported for any result; all figures show single-run point estimates with no indication of run-to-run stability.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Three baselines are compared: CodeRover-S (best SoTA), Agentless, and VulMaster, all evaluated on the same 358-bug ARVO subset with the same plausible fix metric.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All baselines are from 2024 publications (CodeRover-S arXiv Nov 2024, Agentless ISSTA 2024, VulMaster ICSE 2024), contemporary with the 2025 preprint.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study is conducted; the contributions of regex-based context retrieval vs. template-guided patch generation are never separated to measure individual impact.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: plausible fix rate, token cost ($), execution time, per-LLM performance breakdown (Figure 6), and manual review pass rate (Appendix A).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Appendix A includes manual review of all 165 plausible patches from WILLIAMT-GPT-4o, determining which preserve behavior with broader inputs beyond the PoC.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is a benchmark evaluation of a repair system, not a prediction model requiring train/test split; N/A.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Results are not broken down by memory corruption type (HBO, GBO, SBO, UAF) — Figures 4 and 6 show aggregate fixing rates only, despite the system being designed per-category with different templates.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 6 analyzes failure modes with specific examples; Appendix A quantifies 70 early-exit failures and 39 patches that block valid inputs beyond the PoC.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "VulMaster's near-complete failure (5 bugs fixed) is reported; WILLIAMT's ~37% 'No Patch' rate is acknowledged; Gemma3:1B fixes only 11/358 bugs.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model versions are stated: gpt-4o-2024-08-06, DeepSeek-V3, DeepSeek-R1, Claude 3.5-Haiku, Claude 3.7-Sonnet, Gemma3 1B/4B/12B/27B.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 15 shows the complete actual prompt used for crash-site variable analysis including structured output requirements, constraints, and an example for a global-buffer-overflow bug.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or other LLM sampling hyperparameters are reported for any model.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The two-component scaffolding (regex-based context retrieval and template-guided patch generation) is described with workflow diagrams, appendices showing regex logic, and template code (Figures 12–15).",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix C documents the preprocessing pipeline: PoC reproduction in Docker, ASan report parsing via regex, crash frame identification, source line extraction, and code window selection.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "ARVO benchmark is public, but WILLIAMT's per-bug patch outputs and intermediate results are not released; the code is promised but not yet available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The selection of 358 bugs from ARVO (all HBO, SBO, UAF, GBO bugs compilable within 15 minutes) is clearly stated with reference to ARVO's curation methodology.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participant recruitment; evaluation uses a static benchmark dataset.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from ClusterFuzz PoC → Docker reproduction → ASan report → regex extraction → LLM analysis → patch insertion → compilation → PoC re-execution is described in the paper and appendices.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The paper uses multiple LLMs but states no training data cutoffs, despite evaluating on public OSS-Fuzz bugs (with known CVEs and fix commits) that predate most model training cutoffs.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "ARVO bugs are real, public OSS-Fuzz reports with public fix commits; LLMs may have memorized these bugs and their fixes during training. This threat is never discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "ARVO bugs are from OSS-Fuzz reports predating model training cutoffs; the possibility that LLMs have seen these specific bug reports and reference fixes is not addressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Inference cost is a central focus: Figure 5a compares $/bug ($0.0026 vs $0.93), Figure 7 shows cost per model in cents, and total cost for 358 bugs ($0.68) is explicitly stated.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware specs are described (AMD EPYC 7302P, RTX 4090) and per-bug API costs are given, but the total compute budget across all experimental conditions is not summed or stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "WILLIAMT reduces token cost by 99.7% vs CodeRover-S while retaining over 86.7% of its fixing rate",
    375       "evidence": "WILLIAMT fixes 165/358 (46.1%) vs CodeRover-S 195/358 (54.5%) on gpt-4o; average cost $0.0026 vs $0.93 per bug (Section 5.1, Figure 5a)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Combined WILLIAMT+CodeRover-S pipeline achieves 29.6% more fixes and 45.9% lower total cost than CodeRover-S alone",
    380       "evidence": "Section 5.2 states 60 additional plausible fixes and a 45.9% cost reduction, though the conclusion erroneously swaps the 29.6% and 45.9% figures",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "WILLIAMT can fix all 358 benchmark bugs for under $0.68 using GPT-4o — less than the cost of fixing one bug with CodeRover-S",
    385       "evidence": "The $0.68 total is directly stated in Section 5.2 and is below CodeRover-S's $0.93/bug; note, however, that the reported average of $0.0026/bug × 358 ≈ $0.93 does not reproduce the $0.68 total",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Gemma3:27B local model achieves 96.4% of GPT-4o's fixing performance on consumer hardware (RTX 4090 or Mac Mini M4)",
    390       "evidence": "Figure 6: Gemma3:27B fixes 163 bugs vs GPT-4o's 165; Mac Mini M4 performance stated to be on par with RTX 4090",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Plausible fix metric significantly overestimates actual fix quality: only 56/165 WILLIAMT plausible patches pass manual review",
    395       "evidence": "Appendix A: 165 plausible → 95 avoid early exit → 56 pass manual review for broader inputs; 39 introduce early exits on non-PoC inputs",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Reasoning models cost 5-6x more than non-reasoning models without fixing more bugs on this task",
    400       "evidence": "Figure 7: DeepSeek-R1 costs 0.70 cent/bug vs Claude-haiku 0.09 cent/bug; fix rates are comparable (159 vs 170); Claude-haiku non-reasoning achieves the highest fix rate",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "WILLIAMT demonstrates that crash-site repair — inserting assertions directly before the crash point rather than fixing the root cause — achieves 86.7% of state-of-the-art APR performance at 0.3% of the token cost. When used as a pre-filter before CodeRover-S, the combined pipeline fixes 29.6% more bugs while reducing total cost by 45.9%. Local models (Gemma3:27B on RTX 4090 or Mac Mini M4) achieve near-parity with frontier models, and non-reasoning models outperform reasoning models on this template-constrained task. However, the plausible fix metric substantially overstates actual repair quality: only 34% (56/165) of 'plausible' patches pass manual review with broader inputs, meaning real-world utility requires human verification.",
    409   "red_flags": [
    410     {
    411       "flag": "Title exaggeration",
    412       "detail": "The title claims '7,400 bugs' but the evaluation covers only 358 bugs. The 7,400 figure is never derived or explained anywhere in the paper body."
    413     },
    414     {
    415       "flag": "Abstract-body numerical inconsistency",
    416       "detail": "The abstract states a '73.5%' combined fixing rate, but 195 (CodeRover-S) + 60 (additional) = 255/358 ≈ 71.2%. The 73.5% figure cannot be reconciled with the evidence in the body."
    417     },
    418     {
    419       "flag": "Conclusion swaps key results",
    420       "detail": "The conclusion states 'reduces token usage by 29.6% and improves fixing rate by 45.9%' — these figures are transposed relative to the correct values in Section 5.2 (45.9% cost reduction, 29.6% fixing rate improvement)."
    421     },
    422     {
    423       "flag": "No statistical significance testing",
    424       "detail": "No confidence intervals, significance tests, or variance estimates are reported across any comparison. All results are single-run point estimates on 358 bugs."
    425     },
    426     {
    427       "flag": "Benchmark contamination unaddressed",
    428       "detail": "ARVO bugs are real public OSS-Fuzz reports with known CVEs and fix commits predating model training cutoffs. LLMs may have seen these bug reports and reference patches during training. This threat is never discussed."
    429     },
    430     {
    431       "flag": "Code not released",
    432       "detail": "WILLIAMT is promised for open-source release upon paper acceptance but is currently unavailable, making independent reproduction impossible."
    433     },
    434     {
    435       "flag": "No ablation study",
    436       "detail": "The two core components (regex-based context retrieval, template-guided patch generation) are never evaluated independently to determine each component's contribution to the results."
    437     },
    438     {
    439       "flag": "Plausible metric acknowledged weak but still used for comparisons",
    440       "detail": "The paper itself shows only 56/165 (34%) plausible patches pass manual review, yet all SoTA comparisons use this metric — comparisons with CodeRover-S and Agentless use a metric the paper demonstrates is unreliable."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "Fixing Security Vulnerabilities with AI in OSS-Fuzz (CodeRover-S)",
    446       "relevance": "Primary baseline and the SoTA APR tool; WILLIAMT is benchmarked against it for both cost and fixing rate on ARVO"
    447     },
    448     {
    449       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 each using ChatGPT (Agentless)",
    450       "relevance": "Key baseline APR system compared on the same ARVO benchmark; also inspired the cost-per-bug framing"
    451     },
    452     {
    453       "title": "ARVO: Atlas of Reproducible Vulnerabilities for Open Source Software",
    454       "relevance": "The benchmark dataset used for all evaluation; provides 5,000+ reproducible OSS-Fuzz memory corruption bugs"
    455     },
    456     {
    457       "title": "Out of Sight, Out of Mind: Better Automatic Vulnerability Repair by Broadening Input Ranges and Sources (VulMaster)",
    458       "relevance": "Third baseline evaluated; its near-complete failure (5 bugs) provides important context for the difficulty of the task"
    459     },
    460     {
    461       "title": "AutoCodeRover: Autonomous Program Improvement",
    462       "relevance": "Foundation of the CodeRover-S system; describes the multi-iteration repair loop that WILLIAMT's one-shot approach simplifies"
    463     },
    464     {
    465       "title": "Template-Guided Program Repair in the Era of Large Language Models",
    466       "relevance": "Related prior work on template-guided repair that WILLIAMT extends to crash-site-specific repair"
    467     },
    468     {
    469       "title": "AddressSanitizer: A Fast Address Sanity Checker",
    470       "relevance": "Core dependency — ASan reports are the primary structured input to WILLIAMT's regex-based context retrieval"
    471     },
    472     {
    473       "title": "Code Repair with LLMs Gives an Exploration-Exploitation Tradeoff",
    474       "relevance": "Related APR work examining LLM-guided repair strategies; contextualizes the multi-attempt vs one-shot tradeoff"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "Directly applicable to any developer maintaining OSS with OSS-Fuzz integration; the Mac Mini M4 deployment and $0.68 total cost make it immediately accessible to individual developers."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "The core thesis — that crash-site repair (blocking exploitation) is sufficient and far cheaper than root-cause repair — is genuinely contrarian to the dominant APR research direction."
    485     },
    486     "fear_safety": {
    487       "score": 1,
    488       "justification": "Addresses security vulnerability fixing backlog at scale, which has clear security implications, but does not raise novel AI risk concerns."
    489     },
    490     "drama_conflict": {
    491       "score": 1,
    492       "justification": "Challenges expensive agentic APR tools with a template approach, but presents WILLIAMT as complementary to CodeRover-S rather than a replacement."
    493     },
    494     "demo_ability": {
    495       "score": 2,
    496       "justification": "The Mac Mini M4 local deployment with Gemma3:4b is concrete and reproducible in principle, though the code is not yet released."
    497     },
    498     "brand_recognition": {
    499       "score": 2,
    500       "justification": "Co-authors from Google DeepMind; evaluates GPT-4o, Claude 3.5-Haiku/3.7-Sonnet, DeepSeek — all major recognizable LLM brands with active developer audiences."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "45444062",
    507         "title": "Machine Learnability as a Measure of Order in Aperiodic Sequences",
    508         "points": 48,
    509         "comments": 5,
    510         "url": "https://news.ycombinator.com/item?id=45444062"
    511       },
    512       {
    513         "hn_id": "46697408",
    514         "title": "WildCAT3D: Appearance-Aware Multi-View Diffusion in the Wild",
    515         "points": 3,
    516         "comments": 0,
    517         "url": "https://news.ycombinator.com/item?id=46697408"
    518       },
    519       {
    520         "hn_id": "43401539",
    521         "title": "CriteoPrivateAd: RealWorld Bidding Dataset to Design Private Advertising Systems",
    522         "points": 2,
    523         "comments": 1,
    524         "url": "https://news.ycombinator.com/item?id=43401539"
    525       },
    526       {
    527         "hn_id": "43516923",
    528         "title": "UniHOPE: A Unified Approach for Hand-Only and Hand-Object Pose Estimation",
    529         "points": 2,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=43516923"
    532       },
    533       {
    534         "hn_id": "43496516",
    535         "title": "UniHOPE: A Unified Approach for Hand-Only and Hand-Object Pose Estimation",
    536         "points": 2,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=43496516"
    539       },
    540       {
    541         "hn_id": "36016970",
    542         "title": "Visual Question Answering: Techniques and Common Trends in Recent Literature",
    543         "points": 2,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=36016970"
    546       },
    547       {
    548         "hn_id": "44686218",
    549         "title": "The Heteronomy of Algorithms",
    550         "points": 1,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=44686218"
    553       },
    554       {
    555         "hn_id": "47380252",
    556         "title": "Show HN: Karpathy's Autoresearch with Evolutionary Database",
    557         "points": 1,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=47380252"
    560       },
    561       {
    562         "hn_id": "40515506",
    563         "title": "Evaluating AI-Generated Code for C++, Fortran, Go, Java, Julia, Matlab, etc.",
    564         "points": 1,
    565         "comments": 2,
    566         "url": "https://news.ycombinator.com/item?id=40515506"
    567       },
    568       {
    569         "hn_id": "43104988",
    570         "title": "Aide: AI-Driven Exploration in the Space of Code (Arxiv)",
    571         "points": 1,
    572         "comments": 1,
    573         "url": "https://news.ycombinator.com/item?id=43104988"
    574       }
    575     ],
    576     "top_points": 48,
    577     "total_points": 63,
    578     "total_comments": 9
    579   }
    580 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs