scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27376B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Generalizable Automated Program Repair with Large Language Models",
      6     "authors": [
      7       "Viola Campos",
      8       "Ridwan Shariffdeen",
      9       "Adrian Ulges",
     10       "Yannic Noller"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2506.03283",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All three abstract claims — language-specific model specialization (Table 2), ensemble benefit (Table 5), and dramatic FL accuracy drop (Table 6) — are directly supported by experimental results.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims about test information improving performance and automated FL degrading it are supported by controlled prompt variation experiments where only one ingredient changes at a time.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper explicitly states 'we cannot claim generality beyond our experiments' and bounds conclusions to single-function repairs across four specific benchmarks and languages.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Python's poor performance is explained by indentation errors (Figure 1); data leakage is discussed as alternative explanation; test overfitting is acknowledged as alternative to correctness claims.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 3.4 explicitly distinguishes plausible patches (pass tests) from correct patches (semantic equivalence with developer patch), acknowledging test overfitting as a known limitation.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5 'Discussion & Threats to Validity' is a dedicated multi-subsection threats discussion covering multiple specific validity concerns.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats include: exact leaked ratios (Defects4J 0.41%, BugsInPy 11.0%), limitation to single-function bugs, plausibility as proxy metric, and FL tool coverage restricted to Java only.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly excludes agentic workflows, limits to single-function repairs, and states 'we cannot claim generality beyond our experiments.'",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No acknowledgment or funding section appears in the paper; no grants or funding sources are mentioned anywhere.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are disclosed in the header: RheinMain University of Applied Sciences, SonarSource (Singapore), and Ruhr University Bochum.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Section 8 disclaims that results don't represent SonarSource's official policies, but no formal competing interests or financial interests declaration is provided.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "APR is defined, open vs. closed model distinction is explicitly defined (footnote 1), plausible vs. correct patches are defined, and FL granularities (function-level vs. line-level) are explained.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The introduction provides three explicit bullet-point contributions targeting practitioners, researchers, and the community, with an open-source experimental setup.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 contextualizes against prior LLM-APR studies (Xia et al., Silva et al., Ouyang et al.) and specifically identifies four gaps in prior work that this study addresses.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "Section 7 states scripts will be released 'upon acceptance' — code is not yet available; only results and patches are on figshare.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Results and generated patches are openly available on figshare (https://figshare.com/s/947fd7030f10a67a1c9f); all four benchmark datasets are publicly available.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or dependency specifications are provided; temperature=1.0 is stated but the full computational environment is not described.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "A reproduction package is promised 'upon acceptance' but is not yet included; current artifact contains only results and patches without execution scripts.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported; the tables underline results that are not significantly different from the best (Wilcoxon signed-rank test, α=0.05), but no CIs accompany point estimates.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Wilcoxon signed-rank test at α=0.05 is applied throughout; tables mark best results bold and underline non-significantly-different results.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute pass@k differences are reported (e.g., 'up to almost +47% pass@1' for test prompt on Python; drops from ~20% to ~3% for automated FL in Table 6).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "n=15 generations is justified by reference to prior work's standard deviation analysis of pass@1 for LLM-based APR (Parasaram et al., 2024).",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Three independent runs were conducted but variance across runs is not reported; only the aggregated pass@k estimator is presented without spread.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "13 models are compared against each other; base prompt serves as baseline for test/localization prompt comparisons; direct comparison to prior work is enabled via shared benchmarks.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "All 13 models are recent (2023–2025), selected from 'recent code-focused leaderboards' (Aider polyglot, BigCodeBench, RepairBench); no outdated weak baselines.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Prompt components are systematically varied: base (code only) → test (+failing test info) → line-level localization (+line hints) → automated FL (realistic localization from FLACOCO).",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Both pass@1 and pass@5 are reported for all experiments; patch complexity breakdown (single-line, single-hunk, multi-hunk) adds further evaluation dimensions.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Evaluation is fully automated using test suite execution; manual patch review is explicitly noted as infeasible at scale.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Real-world benchmark bugs with pre-existing test suites are used for evaluation; prompt comparison experiments use a stratified 100-bug subset per benchmark.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results broken down by language (4), patch complexity (3 levels), prompt type (4 variants); Table 7 provides the full multi-dimensional breakdown.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Python indentation failures analyzed quantitatively (Figure 1); automated FL failures discussed (correct location in only 28/100 cases); test overfitting acknowledged as a failure mode.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Line-level localization decreases performance for 4/6 models on PHP; automated FL drops pass@1 from ~15–20% to ~1–4%; these negative results are explicitly highlighted and discussed.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Most models are identified by marketing names only (Claude 3.7 Sonnet, Gemini 2.0 Flash, etc.); only GPT-4o and o3-mini have explicit snapshot dates (Nov 11 2024, Jan 31 2025).",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "All four prompt templates are shown verbatim in Listings 1–4, including system messages and user prompts with placeholder markers.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Temperature=1.0 and n=15 generations per model are explicitly specified; models use 'standard settings' per their respective APIs.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used; the paper explicitly excludes iterative and agentic workflows, using single-turn prompts throughout.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3.3 describes benchmark selection criteria and filtering (single-function, reproducible, test-backed); Table 1 shows the full filtering funnel per benchmark.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "All results and generated patches are openly available on figshare (https://figshare.com/s/947fd7030f10a67a1c9f).",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Benchmark selection criteria are explicitly stated in Section 3.3 (reproducible real bugs, executable tests, human ground-truth patches, sufficient size); filtering shown in Table 1.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "Standard publicly available benchmarks are used; no participant recruitment involved.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from bug selection through prompting, patch generation (15 per model, 3 runs of 5), and pass@k evaluation is described in Sections 3.3–3.4.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training data cutoffs are not stated for any of the 13 models; only release dates are given for two (GPT-4o, o3-mini), not training cutoffs.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 5 explicitly discusses data leakage, citing Zhou et al. (2025) with specific leaked ratios (Defects4J 0.41%, BugsInPy 11.0%) and identifies it as a threat to internal validity.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "The paper acknowledges benchmark data 'may have been included in training corpora,' cites Ramos et al. (2025) on memorization, and notes BugsInPy's leakage paradoxically correlates with it being the hardest benchmark.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API costs or inference times are reported; only a qualitative note that DeepSeek R1 requires 'significantly more time' than other models.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Total compute budget is not reported despite generating ~195,000 patches across 13 models and 4 benchmarks.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "No single LLM consistently outperforms others across all four programming languages",
    374       "evidence": "Table 2 shows four different models achieving best pass@1 on four benchmarks: Claude 3.7 Sonnet (Java), Claude 3.5 Haiku (JavaScript), DeepSeek R1 (PHP), Gemini 2.0 Flash (Python)",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Model ensembles improve pass@5 in 14 of 16 evaluated combinations over the best single model",
    379       "evidence": "Table 5 shows ensemble gains across all languages; e.g., JavaScript pass@5 from 68.00% (o3-mini alone) to 71.68% (o3-mini + DeepSeek R1)",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Failing test case information is the most impactful prompt ingredient, improving pass@1 by up to +47%",
    384       "evidence": "Table 4 shows consistent improvements across all 6 models and 4 languages; Python shows largest average gain (+34.7% pass@1 over all models)",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Automated fault localization causes catastrophic APR performance drops, from ~15–20% to ~1–4% pass@1",
    389       "evidence": "Table 6 shows pass@1 drops from 19.02% to 2.76% (Claude 3.7), 17.16% to 0.31% (DeepSeek R1); attributed to correct location found in only 28/100 FLACOCO results",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Line-level fault localization adds less value than test information and can decrease performance",
    394       "evidence": "Table 4 shows LL underperforms Test prompt for all models; 4 of 6 models decrease on PHP with line-level localization vs. base prompt",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Open models are catching up to closed models, with DeepSeek R1 surpassing most closed models",
    399       "evidence": "Figure 4 shows DeepSeek R1 (dist.) at ~25.85% pass@5 average, exceeding all closed models except Claude 3.7 in the base prompt setting",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "LLMs handle multi-hunk bugs better than expected, with only moderate performance decline from single-line (45%) to multi-hunk (27.5%) pass@1",
    404       "evidence": "Table 7 shows averaged results across all 4 benchmarks; in 10/48 cases performance actually improves from single-hunk to multi-hunk",
    405       "supported": "strong"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "observational"
    411   ],
    412   "key_findings": "An empirical evaluation of 13 LLMs for automated program repair across Java, JavaScript, Python, and PHP (712 bugs, ~195,000 patches) shows that no single model performs best across all languages, so practitioners need language-specific model selection or ensembles. Adding failing test information yields the largest accuracy gains (up to +47% pass@1), while automated fault localization (tested on Java only, because FLACOCO supports only Java) causes catastrophic performance drops from ~20% to ~3% pass@1, since FLACOCO finds the correct fault location in only 28 of 100 cases. Ensembles of two complementary models improve pass@5 over the best single model in 14 of 16 evaluated combinations, and open models (particularly DeepSeek R1) are approaching parity with closed frontier models.",
    413   "red_flags": [
    414     {
    415       "flag": "Reproduction scripts not yet released",
    416       "detail": "Scripts for prompting LLMs are promised 'upon acceptance' but are not currently available; artifact contains only patches and results, preventing full reproduction."
    417     },
    418     {
    419       "flag": "Model versions lack snapshot identifiers",
    420       "detail": "Most models are identified by marketing names without explicit API version IDs or snapshot dates; only GPT-4o (Nov 11 2024) and o3-mini (Jan 31 2025) have explicit dates."
    421     },
    422     {
    423       "flag": "No variance reporting across runs",
    424       "detail": "Three independent runs of n=5 patches were conducted, but variance across runs is not reported; only the aggregated pass@k estimator is presented."
    425     },
    426     {
    427       "flag": "Plausibility-only evaluation acknowledged but unresolved",
    428       "detail": "Correctness is measured solely by test suite passage; semantic correctness (equivalence to developer patch) is not assessed at scale, and test overfitting risk is acknowledged as a threat."
    429     },
    430     {
    431       "flag": "Automated FL experiment limited to Java only",
    432       "detail": "FLACOCO only supports Java, so the most important finding (catastrophic FL accuracy drop) is tested on a single language benchmark, limiting generalizability."
    433     },
    434     {
    435       "flag": "No funding disclosed",
    436       "detail": "One author is affiliated with SonarSource, an industrial static analysis vendor, but no funding sources or financial interests are declared; Section 8 only disclaims that the results do not represent SonarSource's official policies."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    442       "relevance": "Key prior systematic LLM-APR evaluation; this paper extends it with more recent models and additional languages"
    443     },
    444     {
    445       "title": "The Fact Selection Problem in LLM-Based Program Repair",
    446       "relevance": "Established that test information substantially boosts APR performance; this paper confirms and extends across 13 models and 4 languages"
    447     },
    448     {
    449       "title": "RepairBench: Leaderboard of Frontier Models for Program Repair",
    450       "relevance": "Contemporary leaderboard used for model selection; prompt template for the 'test' prompt is adapted from this work"
    451     },
    452     {
    453       "title": "Benchmarking Automated Program Repair: An Extensive Study on Both Real-World and Artificial Bugs",
    454       "relevance": "Establishes that plausibility and TCE correlate with patch correctness, justifying plausibility as the primary metric"
    455     },
    456     {
    457       "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks",
    458       "relevance": "Provides specific benchmark contamination ratios (Defects4J 0.41%, BugsInPy 11.0%) used to bound the data leakage threat"
    459     },
    460     {
    461       "title": "Breaking the Silence: the Threats of Using LLMs in Software Engineering",
    462       "relevance": "Identifies key evaluation threats (output variability, data leakage, closed-source models) that this paper explicitly addresses"
    463     },
    464     {
    465       "title": "Evaluating Large Language Models Trained on Code",
    466       "relevance": "Establishes the pass@k metric with unbiased estimator used as primary evaluation criterion throughout"
    467     },
    468     {
    469       "title": "You Cannot Fix What You Cannot Find! An Investigation of Fault Localization Bias in Benchmarking Automated Program Repair Systems",
    470       "relevance": "Establishes the localization bias problem in APR benchmarking; motivates the automated FL experiment in RQ2"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 3,
    476       "justification": "Directly actionable for APR practitioners: specific model recommendations per language, the critical value of test information in prompts, and the danger of assuming perfect fault localization."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "Counterintuitive finding that automated FL causes catastrophic drops (20% → 3% pass@1) challenges the common research assumption of perfect FL; line-level hints sometimes hurt performance."
    481     },
    482     "fear_safety": {
    483       "score": 0,
    484       "justification": "No safety or AI risk concerns raised; paper focuses on software maintenance tool efficacy."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Open vs. closed model narrative creates mild tension; one author's SonarSource affiliation, covered only by a Section 8 disclaimer that the results do not represent SonarSource's official policies, adds a minor conflict-of-interest angle."
    489     },
    490     "demo_ability": {
    491       "score": 1,
    492       "justification": "Patches and results are on figshare, but execution scripts are not yet released; readers can inspect outputs but cannot reproduce the pipeline."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "One author from SonarSource (industrial static analysis vendor); no high-profile academic lab affiliation."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "44750462",
    503         "title": "Nonogram: Complexity of Inference and Phase Transition Behavior",
    504         "points": 16,
    505         "comments": 2,
    506         "url": "https://news.ycombinator.com/item?id=44750462"
    507       },
    508       {
    509         "hn_id": "44815351",
    510         "title": "The possibility of a giant impact on Venus",
    511         "points": 5,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=44815351"
    514       },
    515       {
    516         "hn_id": "31662569",
    517         "title": "NeMF: Neural Motion Fields for Kinematic Animation",
    518         "points": 4,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=31662569"
    521       },
    522       {
    523         "hn_id": "46021186",
    524         "title": "User Location Disclosure Amplifies Regional Divisions on Chinese Social Media",
    525         "points": 3,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=46021186"
    528       },
    529       {
    530         "hn_id": "27450354",
    531         "title": "Tabular Data: Deep Learning Is Not All You Need",
    532         "points": 3,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=27450354"
    535       },
    536       {
    537         "hn_id": "47690469",
    538         "title": "Frontier AI models are the most cost-efficient",
    539         "points": 2,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=47690469"
    542       },
    543       {
    544         "hn_id": "44003454",
    545         "title": "Twist: Teleoperated Whole-Body Imitation System",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=44003454"
    549       },
    550       {
    551         "hn_id": "43692092",
    552         "title": "Semantic Commit: Helping Users Update Intent Specifications for AI Memory",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=43692092"
    556       },
    557       {
    558         "hn_id": "32176051",
    559         "title": "Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks",
    560         "points": 2,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=32176051"
    563       },
    564       {
    565         "hn_id": "45293628",
    566         "title": "A Trustworthiness-Based Metaphysics of Artificial Intelligence Systems",
    567         "points": 1,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=45293628"
    570       }
    571     ],
    572     "top_points": 16,
    573     "total_points": 40,
    574     "total_comments": 2
    575   }
    576 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs