scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27441B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Hallucination to Consensus: Multi-Agent LLMs for End-to-End JUnit Test Generation",
      6     "authors": [
      7       "Qinghua Xu",
      8       "Guancheng Wang",
      9       "Lionel C. Briand",
     10       "Kui Liu"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2506.02943",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims (comparable coverage to EvoSuite, superior mutation score, ≥21.1pp gain over TOGLL in oracle correctness) are directly supported by Table 1 and Figure 4 with statistical testing.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Ablation studies in Section 5.3 systematically remove each key component (Planner, Requirement Engineer, panel discussion) with Wilcoxon significance tests to support causal claims about each component's contribution.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper explicitly limits scope to Java methods without external or user-defined class dependencies, excludes Defects4J and SF110, and acknowledges external validity threats in Section 6.2.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper attributes CANDOR's mutation score advantage to LLM semantic understanding without considering alternatives such as prompt engineering artifacts or dataset-specific patterns favoring LLM outputs.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly states mutation score is 'a proxy for bug-finding capability' (Section 6.2) and notes real bug detection on Defects4J was outside scope.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6.2 'Threats to Validity' covers construct, internal, external, and conclusion validity with dedicated discussion of specific threats.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats named include: scope limited to methods without external dependencies, LLM choice affecting results, data leakage from HumanEvalJava being in pretraining data, and only two benchmark datasets.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states CANDOR only handles Java methods without user-defined or external class dependencies and acknowledges this excludes Defects4J and SF110.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure or acknowledgment section appears in the paper text despite affiliation with Research Ireland Lero Centre and Huawei.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All author affiliations are clearly disclosed: Research Ireland Lero Centre/University of Limerick, University of Ottawa, and Huawei Software Engineering Application Technology Lab.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Co-author Kui Liu is affiliated with Huawei, which has direct commercial interests in automated software testing tools; no independence statement is provided.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests declaration is present anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined with citations: test prefix, test oracle, regression oracle, specification-based oracle, mutation score, oracle correctness, and cyclomatic complexity are all explicitly defined.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Four explicit contributions are enumerated in the introduction: first multi-agent Java end-to-end test framework, panel discussion strategy, dual-LLM pipeline, and experimental validation.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 7 provides detailed related work on both test prefix and oracle generation, explicitly positioning CANDOR relative to EvoSuite, TOGLL, LLM-Empirical, TOGA, and other approaches.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "Section 4.4 states 'we plan to release the code publicly upon paper acceptance' — a promise of future release, not actual release.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "HumanEvalJava is a public benchmark but LeetCodeJava (the novel contribution dataset) is not formally packaged and released; raw experimental outputs are unavailable.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Hardware specs are given (Precision 7960 Tower, dual RTX 6000 Ada GPUs) and LangChain is mentioned, but no requirements.txt, Dockerfile, or full dependency specification is provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions exist; with code unreleased, independent reproduction is not feasible from the paper alone.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Results are reported as averages over 3 runs across all tables and figures; no confidence intervals or error bars are provided.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Wilcoxon Signed Rank tests are applied throughout all RQs with significance level 0.05, as stated in Section 4.3.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Vargha and Delaney's A12 effect size is reported for EVO-CANDOR vs TOGLL comparisons (A12=0.920 on correct code, A12=0.960 on faulty code).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "LeetCodeJava sample of 100 methods is justified only by 'time and computational resource constraints' with no power analysis or principled sample size calculation.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Experiments are repeated 3 times and averages reported, but no standard deviations or variance measures appear in any result table or figure.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Three baselines are evaluated (EvoSuite, LLM-Empirical, and TOGLL), plus the EVO-CANDOR variant of CANDOR designed for fair oracle comparison.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "TOGLL (2024) is the acknowledged SOTA for oracle generation; EvoSuite is acknowledged as unmaintained since 2021 but remains the SOTA for coverage per Tang et al. 2024.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 5.3 ablates three components (w/o Planner, w/o Requirement Engineer, w/o Panel, plus w/ Voting variant) with statistical significance testing.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Four metrics used: line coverage, branch coverage, mutation score, and oracle correctness, measured independently.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not applicable; test quality is assessed via automated metrics (coverage, mutation killing, oracle correctness against known-correct implementations).",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Two dedicated evaluation datasets used (HumanEvalJava: 160 programs, LeetCodeJava: 100 programs); no training is performed — prompt engineering only.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results broken down by dataset (HumanEvalJava, Leetcode-Medium, Leetcode-Hard) and by condition (correct vs. faulty source code) throughout Tables 1-2 and Figure 4.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 6.1 discusses specific failure cases, including Panelists hallucinating max_element() as minimum, and over 70% of cases showing Panelist disagreements that required Curator correction.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that Requirement Engineer removal was not significant on Leetcode-Medium (p=0.17), and EvoSuite achieves slightly higher branch coverage on Leetcode-Medium than CANDOR.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model names provided: 'Llama 3.1 70B' as basic LLM and 'DeepSeek R1 Llama-distilled 70B' as reasoning LLM, with alternatives (CodeLlama 70B, Mistral 22B) reported in appendix.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Figure 3 provides full system prompts and user prompts for all 8 agents, with variable placeholders (e.g., {{source_code}}) explicitly labeled.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Key hyperparameters reported: max_attempts=3, number of Panelist pipelines=3, DeepSeek output token limit=2000, EvoSuite assertion_timeout=2min, and pipeline selection rationale given.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Section 3 describes all three pipeline steps and all 8 specialized agents in detail, including their roles, inputs, outputs, and interaction flow.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "LeetCodeJava construction described (random sample from LeetCode medium/hard, community-maintained solutions from doocs/leetcode); dataset statistics (LOC, CC) reported.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Generated test files, mutation results, and oracle evaluations are not released; code release is deferred to post-acceptance.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "LeetCodeJava collection described: 50 medium + 50 hard problems randomly sampled from LeetCode, solutions sourced from GitHub repository doocs/leetcode (cited as ref [21]).",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants — standard benchmark datasets used without human recruitment.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Evaluation pipeline documented: JaCoCo for coverage reporting (with uncovered lines/branches fed to Planner), PiTest for mutation generation, compilation/execution validation steps described.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff dates are stated for Llama 3.1 70B or DeepSeek R1 despite evaluating on HumanEvalJava, a publicly available benchmark.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 6.2 explicitly acknowledges data leakage risk and proposes mutation score as mitigation since mutated programs are unlikely to appear in pretraining corpora.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "HumanEvalJava contamination is acknowledged and the mutation score metric is proposed as a more reliable evaluation because mutants are unique at test time.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No systematic inference time or cost figures reported; only qualitative remarks that DeepSeek sometimes produced 10,000+ token outputs 'taking hours to complete a single test file'.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware specs are listed (Precision 7960 Tower, dual RTX 6000 Ada GPUs) but no total experiment runtime or compute budget is stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "CANDOR achieves comparable line and branch coverage to EvoSuite across all three datasets",
    374       "evidence": "Table 1: differences between CANDOR and EvoSuite in line/branch coverage are ≤0.031 and not statistically significant (p>0.05) on HumanEvalJava and LeetCode-Medium; LeetCode-Hard line difference also not significant",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "CANDOR significantly outperforms EvoSuite in mutation score by at least 4.9 percentage points on all datasets",
    379       "evidence": "Table 1: mutation scores 0.980 vs 0.858 (HumanEvalJava), 0.939 vs 0.845 (LeetCode-Medium), 0.937 vs 0.888 (LeetCode-Hard); all differences statistically significant (p<1e-4)",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "CANDOR outperforms fine-tuned SOTA oracle generator TOGLL by at least 21.1 percentage points",
    384       "evidence": "Figure 4: EVO-CANDOR vs TOGLL gaps range 0.211–0.255 on correct code and 0.211–0.254 on faulty code; A12 effect sizes 0.920 and 0.960 with p<1e-4",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Panel discussion is the most critical component for oracle correctness",
    389       "evidence": "Table 2: removing panel discussion reduces oracle correctness by 0.067–0.086 across datasets (all p<1e-2), larger than removing Requirement Engineer (0.007–0.028)",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "The Planner agent is critical for test prefix quality",
    394       "evidence": "Table 2: removing Planner reduces line coverage by 0.050–0.099, branch coverage by 0.046–0.130, and mutation score by 0.070–0.111; all differences statistically significant (p<1e-4)",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "CANDOR is robust to increasing code complexity with only slight performance drops from Medium to Hard LeetCode",
    399       "evidence": "Table 1: line coverage drops only 0.001 (0.990→0.989) and mutation score 0.002 (0.939→0.937) from Medium to Hard; authors attribute this to LLM pretraining breadth",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "case-study"
    406   ],
    407   "key_findings": "CANDOR, a multi-agent LLM framework using 8 specialized agents, achieves comparable line/branch coverage to EvoSuite (the SBST SOTA) while outperforming it in mutation score by at least 4.9pp across all datasets without fine-tuning or external tools. In oracle generation, CANDOR outperforms the fine-tuned SOTA baseline TOGLL by at least 21.1 percentage points using only off-the-shelf LLMs, demonstrating prompt-engineering-based multi-agent approaches can surpass fine-tuned models on this task. Ablation studies confirm the panel discussion mechanism accounts for 6.7–8.6pp of oracle accuracy, and the Planner agent accounts for up to 13pp of coverage; over 70% of oracle evaluations exhibited Panelist disagreements requiring Curator resolution.",
    408   "red_flags": [
    409     {
    410       "flag": "Code not released",
    411       "detail": "Authors state 'we plan to release the code publicly upon paper acceptance' — a standard promise, but with the code unavailable at time of publication, independent reproduction is not possible."
    412     },
    413     {
    414       "flag": "No variance reported",
    415       "detail": "Despite repeating experiments 3 times, only averages are reported with no standard deviations, confidence intervals, or error bars in any table or figure."
    416     },
    417     {
    418       "flag": "Table inconsistency",
    419       "detail": "Table 1 reports CANDOR HumanEvalJava branch coverage as 0.950, but Table 2 reports it as 0.970 — a 2pp discrepancy with no explanation."
    420     },
    421     {
    422       "flag": "HumanEvalJava contamination unresolved",
    423       "detail": "HumanEvalJava is publicly available and almost certainly in Llama 3.1 and DeepSeek R1 pretraining data; the proposed mitigation (mutation score) is incomplete since test prefixes and oracles may still be memorized."
    424     },
    425     {
    426       "flag": "Scope severely limited",
    427       "detail": "CANDOR only handles Java methods with no dependencies on user-defined or external classes — this excludes the vast majority of real-world production code, which the paper acknowledges but does not quantify."
    428     },
    429     {
    430       "flag": "No inference cost reported",
    431       "detail": "A framework involving 8 LLM agents per test file has significant computational overhead; no systematic latency, throughput, or API cost figures are provided despite practitioners needing this for adoption decisions."
    432     },
    433     {
    434       "flag": "Undeclared conflict of interest (Huawei affiliation)",
    435       "detail": "Co-author Kui Liu is from Huawei's Software Engineering Application Technology Lab, which has direct commercial interest in automated testing tools; no conflict-of-interest statement is provided."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "EvoSuite: Automatic Test Suite Generation for Object-Oriented Software (Fraser & Arcuri, 2011)",
    441       "relevance": "Primary baseline for test prefix generation; the paper positions CANDOR as achieving comparable coverage to EvoSuite while surpassing it in mutation score."
    442     },
    443     {
    444       "title": "TOGLL: Correct and Strong Test Oracle Generation with LLMs (Hossain & Dwyer, 2024)",
    445       "relevance": "Primary SOTA baseline for oracle generation; CANDOR's main claim is outperforming this fine-tuned approach by ≥21.1pp using only prompt engineering."
    446     },
    447     {
    448       "title": "Using Large Language Models to Generate JUnit Tests: An Empirical Study (Siddiq et al., 2024)",
    449       "relevance": "LLM-Empirical baseline representing prompt-engineering-only end-to-end test generation; most direct predecessor to CANDOR."
    450     },
    451     {
    452       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning (Guo et al., 2025)",
    453       "relevance": "The reasoning LLM used as Panelist agents in CANDOR's panel discussion; selected for strong reasoning capability among open-source models."
    454     },
    455     {
    456       "title": "TOGA: A Neural Method for Test Oracle Generation (Dinella et al., 2022)",
    457       "relevance": "First LLM-based (CodeBERT fine-tuned) oracle generator; positioned as foundational prior work motivating the specification-based oracle generation line of research."
    458     },
    459     {
    460       "title": "ChatGPT vs SBST: A Comparative Assessment of Unit Test Suite Generation (Tang et al., 2024)",
    461       "relevance": "Reports EvoSuite remains SOTA for coverage despite LLM advances; motivates CANDOR's multi-agent approach to close this gap."
    462     },
    463     {
    464       "title": "A Practical Guide for Using Statistical Tests to Assess Randomized Algorithms in Software Engineering (Arcuri & Briand, 2011)",
    465       "relevance": "Methodological reference for the Wilcoxon Signed Rank test used throughout the evaluation as per Section 4.3."
    466     },
    467     {
    468       "title": "Large Language Model Based Multi-Agents: A Survey of Progress and Challenges (Guo et al., 2024)",
    469       "relevance": "Background survey on multi-agent LLM systems that provides conceptual grounding for CANDOR's agent orchestration approach."
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 2,
    475       "justification": "Addresses a real developer pain point with a working system showing strong results, but scope limited to dependency-free Java methods and code not yet released reduces immediate adoptability."
    476     },
    477     "surprise_contrarian": {
    478       "score": 1,
    479       "justification": "The finding that prompt-engineering beats fine-tuned TOGLL is somewhat surprising and noteworthy, but the general trend of LLMs matching traditional tools was already established."
    480     },
    481     "fear_safety": {
    482       "score": 0,
    483       "justification": "No safety or risk implications; purely a software engineering productivity tool with no adversarial or alignment concerns."
    484     },
    485     "drama_conflict": {
    486       "score": 1,
    487       "justification": "The 'Hallucination to Consensus' framing and David Hume quote add narrative interest, but there is no real methodological controversy or surprising reversal."
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "Code not yet released; framework is described in enough detail to partially replicate but requires significant open-source LLM infrastructure and Java toolchain setup."
    492     },
    493     "brand_recognition": {
    494       "score": 0,
    495       "justification": "University of Limerick, University of Ottawa, and Huawei are not prominent AI research brands; Lero Centre is specialized but not widely known in the broader ML community."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "40584327",
    502         "title": "To Believe or Not Believe Your LLM",
    503         "points": 58,
    504         "comments": 17,
    505         "url": "https://news.ycombinator.com/item?id=40584327"
    506       },
    507       {
    508         "hn_id": "23512220",
    509         "title": "Ear2Face",
    510         "points": 3,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=23512220"
    513       },
    514       {
    515         "hn_id": "40641266",
    516         "title": "To Believe or Not to Believe Your LLM",
    517         "points": 2,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=40641266"
    520       },
    521       {
    522         "hn_id": "38617050",
    523         "title": "A Neural Corpus Indexer for Document Retrieval",
    524         "points": 2,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=38617050"
    527       },
    528       {
    529         "hn_id": "27441912",
    530         "title": "Cluster Monte Carlo: Modeling Dense Star Clusters in the Milky Way and Beyond",
    531         "points": 2,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=27441912"
    534       },
    535       {
    536         "hn_id": "23544629",
    537         "title": "Vulnerability Analysis of 2500 Docker Hub Images",
    538         "points": 2,
    539         "comments": 0,
    540         "url": "https://news.ycombinator.com/item?id=23544629"
    541       },
    542       {
    543         "hn_id": "23426018",
    544         "title": "Generate a face image of a subject given an ear image as the input [pdf]",
    545         "points": 1,
    546         "comments": 1,
    547         "url": "https://news.ycombinator.com/item?id=23426018"
    548       },
    549       {
    550         "hn_id": "44022405",
    551         "title": "Art of Repair: Optimizing Iterative Program Repair with Instruction-Tuned Models",
    552         "points": 1,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=44022405"
    555       }
    556     ],
    557     "top_points": 58,
    558     "total_points": 71,
    559     "total_comments": 18
    560   }
    561 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs