scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28623B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HAFixAgent: History-Aware Automated Program Repair Agent",
      6     "authors": [
      7       "Yu Shi",
      8       "Hao Li",
      9       "Bram Adams",
     10       "Ahmed E. Hassan"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2511.01047",
     15     "doi": "10.48550/arXiv.2511.01047"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The 212.3% and 29.9% improvement claims are supported by Tables 3a/3b, though the paper itself acknowledges these SOTA comparisons are confounded by different LLMs; the internal ablation (194 unique history fixes vs 32 non-history) uses a controlled same-LLM design.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The causal claim that history improves repair is supported by a controlled ablation comparing four configurations (non-history, fn_all, fn_pair, fl_diff) using identical LLM, parameters, and containerized runtime on the same 854 bugs.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 8 explicitly bounds results to Defects4J v3.0.1, Java, a single LLM, and perfect fault localization, with explicit statements that results may not transfer to other languages, benchmarks, or noisy FL settings.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 6.1.3 explicitly flags LLM differences as a confound in SOTA comparisons; Section 7.1 discusses how extra context can hurt (the 'lost in the middle' effect) as an alternative explanation for MFMH regressions.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper uses Plausible@1 (test-passing) and clearly distinguishes it from semantic correctness throughout, explicitly noting RepairAgent's correct vs plausible distinction and proposing patch quality assessment as future work (Section 7.3).",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 8 'Threats to Validity' contains dedicated subsections for Internal Validity (data leakage, fault localization) and External Validity (dataset/language, LLM/agent generalizability).",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Threats are concrete and specific: the undisclosed DeepSeek pretraining corpus cannot be verified against Defects4J, perfect FL is assumed (limiting practical applicability), results confined to 854 Java bugs from one benchmark, and single LLM/agent loop.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states what results do not show: generalization to other languages (Python, C), noisy fault localization, other benchmarks with different history profiles, or different agent configurations.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment appears anywhere in the paper text provided; there is no grants or funding section.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are identified as being from Queen's University, Canada, with individual email addresses provided.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed so this criterion cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosure, or financial interest declaration appears in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are explicitly defined: bug categories (SL, SH, SFMH, MFMH) defined in Section 3.2.2 with precise criteria; blameable/blameless defined; agent architecture components defined; Plausible@1 formalized in Equation 1.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly lists three contributions at the end of Section 1: the HAFixAgent design and implementation, an automated historical context construction method, and a large-scale empirical evaluation on all 854 Defects4J bugs.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 9 contains three subsections situating HAFixAgent relative to traditional APR, LLM/agent-based APR, and in-context learning APR; the paper builds directly on prior HAFix work [60] and compares against contemporary baselines.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The paper states 'We will release the complete replication package of code and the evaluation script after the revision is completed'—this is an explicit promise of future release, not a current release.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The evaluation uses Defects4J v3.0.1, a standard publicly available benchmark, unmodified as the primary dataset.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper describes Ubuntu 20.04, Docker containers, and DeepSeek-V3.2-Exp API in prose, but no Dockerfile, requirements.txt, or formal environment specification is provided or linked.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided; the code is not yet released and the description in Section 5.4 is a high-level architectural summary rather than executable steps.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Main effectiveness results in Tables 3 and 4 report only raw counts and Plausible@1 percentages with no confidence intervals; box plots in Figures 4-5 show distributions but no CIs on point estimates.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Friedman and Wilcoxon tests are used for cost/step comparisons in RQ2 (Table 5), but the primary comparative effectiveness claims in RQ1 (212.3%, 29.9% improvements, 194 unique fixes) have no significance tests.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage improvements are reported throughout (212.3% over RepairAgent, 29.9% over BIRCH-feedback, 194 unique history-only fixes), providing interpretable effect magnitudes.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The 854-bug sample is the entirety of Defects4J v3.0.1, chosen because it is the standard benchmark rather than via power analysis or formal sample size justification.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Figures 4 and 5 show box plot distributions for agent steps and inference costs across successful and failed repairs, with median values tabulated.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Two SOTA baselines are included: RepairAgent (agent-based APR) and BIRCH-feedback (multi-hunk-specialized APR), plus an internal non-history ablation baseline.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "RepairAgent was published at ICSE 2025 and BIRCH-feedback is a 2025 preprint, both contemporaneous with HAFixAgent.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Four configurations (non-history, fn_all, fn_pair, fl_diff) are systematically compared with identical LLM/parameters across all 854 bugs, isolating the contribution of each history heuristic.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper reports Plausible@1, #Pass, #Unique Pass, agent steps, and inference cost (USD) stratified by success/failure and bug category.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "APR evaluation via automated test suites is standard in this field; human evaluation of patch quality is explicitly identified as future work rather than a gap in current evaluation.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Defects4J provides developer-written test suites for each bug that serve as the fixed evaluation criteria; HAFixAgent's patches are validated against these test suites.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "All results in Tables 3, 4 and Figures 3, 4, 5, 6 are broken down across four bug categories (SL, SH, SFMH, MFMH).",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 7.1 explicitly discusses the 32 performance regression cases where history configurations underperform non-history, including MFMH where history sometimes distracts the agent.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that for MFMH bugs, all three history configurations achieve fewer total fixes than the non-history configuration (47, 43, and 48 vs. 50), and discusses this honestly as a limitation of the approach.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "DeepSeek-V3.2-Exp is specified with references to the technical report [11] and API documentation [10]; temperature = 0.0 is stated as per official DeepSeek recommendation.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix A provides the complete system prompt for HAFixAgent including all sections, tool descriptions, methodology steps, and the conditional history context block with Jinja2 placeholder syntax.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Temperature = 0.0, maximum 50 agent steps, $1 USD cost guard, 1-hour timeout per bug are all specified in Section 5.4.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Section 4 provides detailed description of the three-module architecture (Context Builder, Agent Execution Loop, Tools) with numbered workflow steps and Table 2 listing all bash tools with usage examples.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 5.1 documents bug metadata collection, handling of Chart project cases without issue links, fault localization derivation from developer patches, and the blame extraction pipeline including fallback strategy.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The replication package including generated patches and logs is promised for future release post-revision; raw run data is not currently publicly available.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 5.1 describes how bug reports were mined from issue trackers, how failing tests were collected from Defects4J, how blame commits were extracted, and how the three history heuristics were constructed.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; the dataset is the standard Defects4J benchmark.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from bug metadata → blame extraction (blameable/blameless/multi-commit cases) → heuristic construction (fn_all, fn_pair, fl_diff) → prompt injection → containerized execution is documented across Sections 4 and 5.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Section 8.1 explicitly acknowledges that 'the pretraining corpus for this model is not disclosed to the best of our knowledge,' so the training cutoff cannot be stated.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 8.1 devotes a full 'Data leakage' paragraph to this threat, arguing that relative comparisons between configurations mitigate the confound even if absolute numbers may be inflated by memorization.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "Section 8.1 directly addresses that Defects4J bugs are publicly available and may be in DeepSeek's training data, and argues the relative design isolates the benefit of historical context regardless.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Section 6.2 and Figure 5 report per-bug inference cost in USD stratified by configuration (non-history, fn_all, fn_pair, fl_diff), outcome (success/failure), and bug category with median values tabulated.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Per-bug median costs are reported but the total computational budget for running all 854 × 4 configurations (~3,416 runs) is never stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "HAFixAgent outperforms RepairAgent by an average of 212.3% across different context configurations on the 829-bug shared subset.",
    374       "evidence": "Table 3a shows 502–523 plausible fixes vs RepairAgent's 164 correct (186 plausible) fixes. Paper acknowledges confound: different LLMs used (GPT-3.5 vs DeepSeek-V3.2-Exp).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "HAFixAgent outperforms BIRCH-feedback by an average of 29.9% on 371 multi-hunk bugs.",
    379       "evidence": "Table 3b shows 171–175 fixes vs BIRCH-feedback's 133. Confounded by different LLMs (o4-mini vs DeepSeek-V3.2-Exp).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "History-aware configurations add 194 unique fixes that the non-history configuration never achieves, while non-history contributes only 32 unique fixes.",
    384       "evidence": "Table 4 and Figure 3 Venn diagrams show this with identical LLM/setup — the strongest controlled evidence in the paper.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "71.1% of Defects4J bugs are blameable and 70.7% have exactly one unique blame commit, making history-aware APR practical even for complex multi-hunk bugs.",
    389       "evidence": "Table 1 and Figure 1 report these measurements across all 854 Defects4J v3.0.1 bugs with category breakdowns.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Historical context does not significantly increase agent step counts or token costs.",
    394       "evidence": "Table 5 reports Friedman tests (p ≥ 0.05) for steps in all categories and for cost in SH/SFMH/MFMH; only SL shows significant cost increase for fn_all and fl_diff.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Combining two or three of the four context configurations is more cost-effective than using all four or just one.",
    399       "evidence": "Figure 6 shows diminishing returns when adding the fourth configuration, with the cost roughly doubling for small marginal gains across categories.",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "The three history heuristics are complementary, each solving distinct bugs the others miss.",
    404       "evidence": "Figure 3 Venn diagrams show nontrivial unique contributions from each heuristic in every category, especially SFMH (fn_all: 21, fn_pair: 19, fl_diff: 19 unique cases).",
    405       "supported": "strong"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "case-study"
    411   ],
    412   "key_findings": "A preliminary study of all 854 Defects4J bugs finds that 71.1% are blameable via git blame and 70.7% map to exactly one unique blame commit, making repository history a practical and concentrated signal even for complex multi-hunk bugs. HAFixAgent's controlled ablation shows history-aware configurations add 194 unique fixes unreachable without history, while the non-history configuration contributes only 32 unique fixes, with the largest gains in multi-hunk categories. Integrating history does not significantly increase agent steps or token costs — in the hardest multi-file-multi-hunk category, median success costs are actually lower with history. The three heuristics (fn_all, fn_pair, fl_diff) are complementary rather than redundant, and combining two or three delivers the best cost-effectiveness with diminishing returns beyond that.",
    413   "red_flags": [
    414     {
    415       "flag": "SOTA comparison confounded by different LLMs",
    416       "detail": "The headline 212.3% improvement over RepairAgent and 29.9% over BIRCH-feedback use different LLMs (GPT-3.5, o4-mini vs DeepSeek-V3.2-Exp). The paper acknowledges this but still leads with these numbers in abstract and title framing."
    417     },
    418     {
    419       "flag": "Perfect fault localization assumed",
    420       "detail": "All experiments assume oracle fault localization derived from developer patches, which is unavailable in practice. This significantly limits the claimed practical applicability of the approach."
    421     },
    422     {
    423       "flag": "No significance tests on primary effectiveness claims",
    424       "detail": "The main repair count comparisons (RQ1) lack statistical significance testing; only cost/step comparisons (RQ2) use Friedman+Wilcoxon tests. The 194 unique-fix difference is not tested for significance."
    425     },
    426     {
    427       "flag": "Code not yet released",
    428       "detail": "The replication package is promised 'after the revision is completed,' so the evaluation cannot currently be reproduced."
    429     },
    430     {
    431       "flag": "DeepSeek training data undisclosed",
    432       "detail": "The paper cannot verify whether Defects4J bugs are in DeepSeek-V3.2-Exp's training corpus; the relative comparison design partially mitigates but does not eliminate this contamination concern."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    438       "relevance": "Primary baseline comparison; agent-based APR system used as the main SOTA comparison target"
    439     },
    440     {
    441       "title": "Characterizing Multi-Hunk Patches: Divergence, Proximity, and LLM Repair Challenges",
    442       "relevance": "BIRCH-feedback baseline for multi-hunk APR; provides the competing approach for complex bug categories"
    443     },
    444     {
    445       "title": "HAFix: History-Augmented Large Language Models for Bug Fixing",
    446       "relevance": "Direct prior work by same authors; HAFixAgent extends HAFix's blame-based heuristics to agentic workflows"
    447     },
    448     {
    449       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    450       "relevance": "Foundation for HAFixAgent's execution loop via mini-swe-agent; key related agent-based SE work"
    451     },
    452     {
    453       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    454       "relevance": "Seminal benchmark for agentic APR; contextualizes the Defects4J evaluation approach"
    455     },
    456     {
    457       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    458       "relevance": "Key agent-based APR baseline and framework context for agentic software engineering"
    459     },
    460     {
    461       "title": "Defects4J: a database of existing faults to enable controlled testing studies for Java programs",
    462       "relevance": "The primary evaluation benchmark used in this study"
    463     },
    464     {
    465       "title": "When do changes induce fixes?",
    466       "relevance": "SZZ algorithm foundational paper; provides theoretical basis for git blame as bug-introducing commit heuristic"
    467     },
    468     {
    469       "title": "An Empirical Study on LLM-based Agents for Automated Bug Fixing",
    470       "relevance": "Survey of agent-based APR design space; provides context for comparing HAFixAgent's design choices"
    471     },
    472     {
    473       "title": "Out of Context: How important is Local Context in Neural Program Repair?",
    474       "relevance": "Studies context engineering for APR; directly motivates the historical context injection approach"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "Git blame is universally available in any version-controlled project, making the history injection approach directly adoptable without special infrastructure."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "The finding that 70.7% of bugs map to a single blame commit, with the signal holding even for complex multi-hunk bugs — contrary to the intuitive expectation that multi-hunk bugs would have scattered history — is genuinely surprising."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No AI safety or societal risk dimensions; this is a software engineering productivity tool."
    489     },
    490     "drama_conflict": {
    491       "score": 1,
    492       "justification": "Implicitly challenges the assumption that repository history is too noisy for complex bugs, but framed constructively rather than as a controversy."
    493     },
    494     "demo_ability": {
    495       "score": 1,
    496       "justification": "The system exists and was evaluated on a public benchmark, but code is not yet released and requires Defects4J infrastructure plus DeepSeek API access to replicate."
    497     },
    498     "brand_recognition": {
    499       "score": 1,
    500       "justification": "Queen's University and Ahmed Hassan are well-known in the MSR/software engineering research community but not a headline AI lab."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "46145180",
    507         "title": "How elites could shape mass preferences as AI reduces persuasion costs",
    508         "points": 704,
    509         "comments": 661,
    510         "url": "https://news.ycombinator.com/item?id=46145180"
    511       },
    512       {
    513         "hn_id": "42798811",
    514         "title": "Lossless Compression of Vector IDs for Approximate Nearest Neighbor Search",
    515         "points": 151,
    516         "comments": 6,
    517         "url": "https://news.ycombinator.com/item?id=42798811"
    518       },
    519       {
    520         "hn_id": "42286397",
    521         "title": "DynaSaur: Large Language Agents Beyond Predefined Actions",
    522         "points": 128,
    523         "comments": 31,
    524         "url": "https://news.ycombinator.com/item?id=42286397"
    525       },
    526       {
    527         "hn_id": "46287626",
    528         "title": "Detailed balance in large language model-driven agents",
    529         "points": 48,
    530         "comments": 5,
    531         "url": "https://news.ycombinator.com/item?id=46287626"
    532       },
    533       {
    534         "hn_id": "46127920",
    535         "title": "LST-1 follow-up of the exceptionally bright gamma-ray burst GRB 221009A",
    536         "points": 3,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=46127920"
    539       },
    540       {
    541         "hn_id": "39028645",
    542         "title": "Learning Unsupervised World Models for Autonomous Driving via Discrete Diffusion",
    543         "points": 2,
    544         "comments": 1,
    545         "url": "https://news.ycombinator.com/item?id=39028645"
    546       },
    547       {
    548         "hn_id": "33466177",
    549         "title": "Fluent APIs in Functional Languages",
    550         "points": 2,
    551         "comments": 1,
    552         "url": "https://news.ycombinator.com/item?id=33466177"
    553       },
    554       {
    555         "hn_id": "38252121",
    556         "title": "Fast unfolding of communities in large networks: 15 years later",
    557         "points": 2,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=38252121"
    560       },
    561       {
    562         "hn_id": "2332563",
    563         "title": "Statistically evaluating Internet auction authenticity",
    564         "points": 1,
    565         "comments": 1,
    566         "url": "https://news.ycombinator.com/item?id=2332563"
    567       },
    568       {
    569         "hn_id": "42230041",
    570         "title": "DynaSaur: Large Language Agents Beyond Predefined Actions",
    571         "points": 1,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=42230041"
    574       }
    575     ],
    576     "top_points": 704,
    577     "total_points": 1042,
    578     "total_comments": 706
    579   }
    580 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs