scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27040B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Empirical Evaluation of Large Language Models in Automated Program Repair",
      6     "authors": [
      7       "Jiajun Sun",
      8       "Fengjie Li",
      9       "Xinzhu Qi",
     10       "Hongyu Zhang",
     11       "Jiajun Jiang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2506.13186",
     16     "doi": "10.48550/arXiv.2506.13186"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims — CodeLlama outperforming larger LLaMA, non-linear scaling, early-stage correct patches, prompt sensitivity — are directly supported by Tables IV–VI and Figure 4.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The claim that 'fine-tuning on code-related tasks significantly enhances repair capabilities' is based on comparing CodeLlama-7B vs LLaMA-2-13B, which differ in both fine-tuning and parameter count; no controlled experiment isolates the fine-tuning variable.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Findings are stated broadly (e.g., 'Finding 6: Bugs of shorter length are more likely to be successfully repaired by LLMs') without consistently bounding claims to the four specific models and six datasets studied.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not systematically consider alternatives; for example, the large performance gap between algorithmic and enterprise bugs could be due to training data contamination rather than bug complexity, but this is not explored.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper clearly defines repair rate (correct patches / total bugs) and precision (correct patches / plausible patches) and uses these direct APR metrics consistently with its claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section V.B is a dedicated 'Limitation' section and Section V.C provides a 'Threats to Validity' section addressing both internal and external threats.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Threats are largely boilerplate; the external threat merely states 'generalizability remains an open question,' and the internal threat only notes manual patch verification without quantifying inter-rater agreement or disagreement rate.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly bounds its evaluation to four LLMs, six datasets, three languages, and single-function bugs, acknowledging that real-world bugs may be more complex and additional languages remain unexplored.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No acknowledgment or funding section is present anywhere in the paper; funding sources are entirely undisclosed.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed on the title page: Tianjin University, UESTC, and Chongqing University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, making independence assessment impossible.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosures of any kind appear in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "APR is defined, and 'plausible patch,' 'correct patch,' 'repair rate,' and 'precision' are all formally defined in Section III.E.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Five explicit contribution bullet points are listed at the end of the introduction, clearly stating the study scope, analysis dimensions, and practical implications.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II and the introduction explicitly compare this work to Xia et al. [37], Fan et al. [38], Xiang et al. [43], and others, articulating specific gaps (multi-language, modern large models, cost analysis) that this study addresses.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper states artifacts are released 'at our homepage' but provides no URL; this is functionally unverifiable and equivalent to 'available upon request.'",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All six evaluation datasets (Defects4J, BugsCpp, IntroClass-C/Java, ConDefects-Java/Py) are publicly available standard benchmarks.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Model versions are cited by name but no hardware specifications, Python version, framework dependencies, or environment files are provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions appear in the paper; the vague reference to 'our homepage' provides no actionable guidance.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables IV–VI are reported as point estimates (repair rate %, precision %) with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims; all differences between models and prompt conditions are reported as raw counts without p-values.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper reports relative changes (e.g., '206.7% increase in repair count,' '22.9% lower RRate') with baseline values, providing effective effect size context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Datasets were adopted from existing benchmarks without any power analysis or justification for why specific subset sizes (255, 228, 106, 297, 563 bugs) were selected.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or run-to-run variability is reported; LLM generation is stochastic but all results appear to be single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Four LLMs serve as mutual comparisons spanning general-purpose vs. code-specialized and 7B–33B parameter ranges, providing meaningful cross-model baselines.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All four evaluated models (CodeLlama-7B, LLaMA-2-13B, StarCoder-15.5B, DeepSeek-Coder-33B-instruct) are from 2023–2024 and are widely used in current research.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "RQ4 systematically ablates prompt components across all four models: zero-shot vs. one-shot vs. analysis-augmented prompts on two datasets.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The evaluation uses repair rate, precision (correct/plausible), complementarity (unique bugs per model), and patch ranking position analysis.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "All plausible patches are manually inspected by the first two authors to verify semantic equivalence to developer patches, as described in Section III.E.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Established bug benchmarks serve as evaluation sets, and the evaluated models were not trained specifically on these benchmarks. Separately, perfect fault localization is provided to isolate patch generation capability.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by dataset, programming language (Java vs. C/C++ vs. Python), bug type (enterprise vs. algorithmic), and prompt strategy across all tables.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figure 7 shows a concrete failure case where incorrect LLM-generated bug analysis misleads DeepSeek-Coder; Section IV-A analyzes BugsCpp failures attributing them to long bug functions.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that bug analysis hurts DeepSeek-Coder by 46.6%, that all LLMs perform poorly on BugsCpp (avg 3.5% RRate), and that LLaMA consistently underperforms across all settings.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Table II specifies exact model identifiers: CodeLlama-7B, LLaMA-2-13B, StarCoder-15.5B, DeepSeek-Coder-33B-instruct, with references to original papers.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 1 shows the full prompt template structure with actual example code and guidance text, and all four prompt variants are described in detail with their components.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No sampling hyperparameters (temperature, top-p, repetition penalty, beam search settings) are reported for any of the four models.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This is direct LLM inference for patch generation; no agentic scaffolding is used.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section III.B documents selection criteria: single-function bugs only, specific subset sizes, and random sampling of one submission per assignment for ConDefects to reduce overhead.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Generated patches are claimed to be available 'at our homepage' without a URL; while input benchmarks are public, the 600K+ generated patches are not verifiably accessible.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section III.B describes dataset selection criteria, subset sizes, random sampling methodology, and rationale for each dataset included in the study.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Standard public benchmarks are used; no human participant recruitment is involved.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Section III.E documents the full pipeline: patch generation (200 or 30 per bug) → deduplication → test suite validation → manual inspection for semantic equivalence.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the four evaluated models, despite the explicit concern about benchmark data appearing in training corpora.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section V.A explicitly discusses data leakage as a 'critical concern,' acknowledging that benchmark code may exist in training corpora, though the mitigation strategy (model diversity, dataset diversity) is weak.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "ConDefects [57] was specifically designed to address LLM data leakage concerns for fault localization and program repair, and the paper explicitly cites this as part of their contamination mitigation.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants; benchmark evaluation study only.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "While cost-effectiveness is discussed qualitatively (diminishing returns beyond 30 patches, smaller models with complementary value), no actual GPU hours, latency, or dollar costs are quantified.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Total computational budget (GPU type, hours, hardware configuration) is not stated anywhere in the paper.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Fine-tuned CodeLlama-7B consistently outperforms general-purpose LLaMA-2-13B despite having fewer parameters",
    375       "evidence": "Table IV: CodeLlama fixes 40/34 bugs on Defects4J v1.2/v2.0 vs LLaMA's 19/18; pattern holds across all 4 algorithmic datasets in Table V",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLMs perform significantly better on algorithmic assignment bugs than enterprise-grade project bugs",
    380       "evidence": "DeepSeek achieves 45.45% repair rate on IntroClass-C (Table V) vs 5.66% on BugsCpp; average RRate on Defects4J is 15.1% vs 3.5% on BugsCpp",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Correct patches predominantly emerge in early generations; 30 patches achieves comparable effectiveness to 200",
    385       "evidence": "Figure 4: 95.77% of StarCoder's correct patches on IntroClass-Java within first 30 generations; most LLMs have at most 1 correct patch beyond rank 30 on Defects4J",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "In-context repair examples substantially improve LLM repair performance over zero-shot",
    390       "evidence": "Table VI: average RRate on ConDefects-Java drops from 11.5% (one-shot) to 8.9% (zero-shot); LLaMA drops 85.7% with zero-shot",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Bug analysis prompts improve weaker models but degrade stronger models",
    395       "evidence": "Table VI: DeepSeek-Coder drops from 127 to 63 correct repairs on ConDefects-Java (-46.6%) with analysis; LLaMA increases from 1 to 32 (+3100%)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Shorter bugs are significantly more likely to be successfully repaired by LLMs",
    400       "evidence": "Figure 5: median length of successfully repaired bugs is consistently lower than unrepaired bugs across all 6 datasets; significant drop observed for functions exceeding 100 lines",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "All four LLMs exhibit complementary repair capabilities, each producing unique fixes unattainable by others",
    405       "evidence": "Figure 3: even LLaMA (weakest model) contributes 1 unique repair on Defects4J v2.0; CodeLlama fixes 9 unique bugs unmatched by any other model",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "empirical"
    412   ],
    413   "key_findings": "Four open-source LLMs spanning 7B–33B parameters were evaluated on 2,309 bugs across six benchmarks in three programming languages. Code-specialized fine-tuned models substantially outperform general-purpose models even at smaller parameter counts (CodeLlama-7B > LLaMA-2-13B), and doubling parameter count yields sublinear gains. LLMs achieve 3–8× higher repair rates on algorithmic assignment bugs vs. enterprise project bugs, likely driven by shorter function lengths and simpler bug patterns. A key practical finding is that 95%+ of correct patches emerge in the first 30 generations, enabling significant cost reduction without meaningful accuracy loss. Prompt design has large and heterogeneous effects: in-context examples universally improve performance, while bug analysis helps weak models (+3100% for LLaMA) but hurts strong ones (-46.6% for DeepSeek-Coder) due to sensitivity to inaccurate diagnostic content.",
    414   "red_flags": [
    415     {
    416       "flag": "No statistical significance testing",
    417       "detail": "All comparative claims between models and prompt conditions are made without significance tests; observed differences could reflect noise given stochastic LLM outputs."
    418     },
    419     {
    420       "flag": "No variance across runs",
    421       "detail": "LLM patch generation is stochastic but no run-to-run variance or confidence intervals are reported; all results appear to be single experimental runs."
    422     },
    423     {
    424       "flag": "Confounded fine-tuning causal claim",
    425       "detail": "The claim that fine-tuning improves APR compares CodeLlama-7B vs LLaMA-2-13B, which differ in both code-specific fine-tuning and parameter count; the effect of fine-tuning alone is not isolated."
    426     },
    427     {
    428       "flag": "Sampling hyperparameters undisclosed",
    429       "detail": "Temperature, top-p, and repetition penalty are not reported for any model, making exact replication impossible and preventing assessment of how generation settings affect results."
    430     },
    431     {
    432       "flag": "No comparison to existing APR tools",
    433       "detail": "The paper does not compare against traditional APR methods (GenProg, TBar) or recent LLM-based methods (ChatRepair, ThinkRepair) mentioned in related work, which prevents contextualization of absolute performance."
    434     },
    435     {
    436       "flag": "Unverifiable reproducibility claim",
    437       "detail": "Artifacts are claimed released 'at our homepage' with no URL provided; the claim cannot be verified and is functionally equivalent to 'available upon request.'"
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Automated program repair in the era of large pre-trained language models",
    443       "relevance": "First major study applying LLMs to APR on Defects4J/ManyBugs/QuixBugs; this paper compares itself directly to that study and positions its contribution as addressing the gap it leaves"
    444     },
    445     {
    446       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    447       "relevance": "Primary evaluation benchmark; used for both RQ1 enterprise-grade bug evaluation and patch ranking analysis"
    448     },
    449     {
    450       "title": "ConDefects: A new dataset to address the data leakage concern for LLM-based fault localization and program repair",
    451       "relevance": "Key benchmark specifically designed to mitigate LLM contamination; central to the study's validity argument for data leakage mitigation"
    452     },
    453     {
    454       "title": "DeepSeek-Coder: When the large language model meets programming",
    455       "relevance": "Best-performing evaluated model; represents state-of-the-art code-specialized open-source LLM at time of study"
    456     },
    457     {
    458       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    459       "relevance": "ChatRepair — representative recent LLM-based APR approach that motivated the study but focuses only on Defects4J"
    460     },
    461     {
    462       "title": "How far can we go with practical function-level program repair?",
    463       "relevance": "Recent LLM APR study with Java-only evaluation; identified as gap motivating multi-language coverage"
    464     },
    465     {
    466       "title": "An empirical study on fine-tuning large language models of code for automated program repair",
    467       "relevance": "Closely related ASE 2023 study on fine-tuning smaller LLMs for APR; directly compared in related work"
    468     },
    469     {
    470       "title": "Code llama: Open foundation models for code",
    471       "relevance": "One of the four evaluated models; fine-tuned from LLaMA on code tasks, enabling the fine-tuning comparison"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "Directly actionable guidance: generate 30 patches not 200, use code-specialized models, combine models for complementary coverage, include in-context examples."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "Two counterintuitive findings challenge scale-is-everything assumptions: bug analysis hurts stronger models (DeepSeek drops 46.6%), and the smaller 7B model produces unique fixes unavailable from the 33B model."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety or risk concerns raised; this is a capability evaluation for software maintenance."
    486     },
    487     "drama_conflict": {
    488       "score": 0,
    489       "justification": "Straightforward empirical comparison with no controversy or competing claims from other groups."
    490     },
    491     "demo_ability": {
    492       "score": 2,
    493       "justification": "Uses publicly available open-source models (DeepSeek-Coder, CodeLlama) and public benchmarks; anyone with GPU access can replicate the core experiments."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "DeepSeek-Coder has moderate recognition; work is from Chinese universities without major lab branding."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "44507887",
    504         "title": "Empirical Evaluation of Large Language Models in Automated Program Repair",
    505         "points": 5,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=44507887"
    508       },
    509       {
    510         "hn_id": "40876136",
    511         "title": "LLMMatDesign – Gen AI for Materials",
    512         "points": 4,
    513         "comments": 0,
    514         "url": "https://news.ycombinator.com/item?id=40876136"
    515       },
    516       {
    517         "hn_id": "44663723",
    518         "title": "Prompt Injection 2.0: Hybrid AI Threats – Paper and Open Source Testing Toolkit",
    519         "points": 3,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=44663723"
    522       },
    523       {
    524         "hn_id": "43293373",
    525         "title": "RingFormer: Rethinking Recurrent Transformer with Adaptive Level Signals",
    526         "points": 3,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=43293373"
    529       },
    530       {
    531         "hn_id": "44943311",
    532         "title": "NaN-propagation: a novel method for sparsity detection in black-box computationa",
    533         "points": 3,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=44943311"
    536       },
    537       {
    538         "hn_id": "44962664",
    539         "title": "Chain-of-Agents",
    540         "points": 2,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=44962664"
    543       },
    544       {
    545         "hn_id": "43914672",
    546         "title": "Questions to Fall in Love with ChatGPT: An Experimental Study",
    547         "points": 2,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=43914672"
    550       }
    551     ],
    552     "top_points": 5,
    553     "total_points": 22,
    554     "total_comments": 1
    555   }
    556 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs