scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28171B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Data-Efficient Adaptation of Large Language Models for Code Generation",
      6     "authors": [
      7       "Xue Jiang",
      8       "Yihong Dong",
      9       "Zhiyuan Fan",
     10       "Zhi Jin",
     11       "Wenpin Jiao",
     12       "Ge Li"
     13     ],
     14     "year": 2024,
     15     "venue": "ACM Transactions on Software Engineering and Methodology",
     16     "arxiv_id": "2403.00046",
     17     "doi": "10.1145/3772721"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's '46.2% average relative improvement in Pass@1' matches the average of the five per-dataset improvements reported in Table 1 (29.5%, 33.0%, 27.1%, 37.6%, 103.8%); all major claims are supported by experimental results.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper makes causal claims about error-driven learning improving efficiency; these are backed by controlled ablation studies (RQ3 training data variants, RQ6 component ablations) that isolate the effect.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper claims 'broad applicability' but experiments are exclusively on Python benchmarks with 2B–7B models; no explicit scope boundary for programming language or model scale is stated despite the sweeping framing.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The gains are attributed solely to error-driven learning without considering alternative explanations such as quality-filtering effects (only passing revisions are kept), curriculum learning dynamics, or data augmentation from the iterative process.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Pass@k is measured via automated test case execution, directly evaluating functional correctness; no conflation between proxy metrics (BLEU, token overlap) and actual correctness is made.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 8 'Limitations' is a dedicated section listing two specific limitations: requirement for test cases during preprocessing and restriction to low-resource scenarios.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 7 discusses three threats with some specificity: dataset quality and generalizability, hyperparameter sensitivity with acknowledgment of 'small-range grid search,' and metric reliability justifying the unbiased Pass@k estimator.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "While limitations note the test case requirement and low-resource focus, the paper does not explicitly state what the results do NOT show — no mention of language scope, model scale limits, or inapplicability to other task types.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments disclose funding from National Key R&D Program No. 2023YFB4503801, National Natural Science Foundation of China Nos. 62192733/62192730/62192731, and Hubei Province Major Program No. 2023BAA024.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors are disclosed as affiliated with the Key Laboratory of High Confidence Software Technologies, School of Computer Science, Peking University.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funders are Chinese government research programs (NSFC, National Key R&D) with no direct financial stake in the DEED method's adoption or commercialization.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial interests declaration is included anywhere in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "'Data-efficient adaptation' is contextualized as adapting with limited training data, 'error-driven learning' is explained via the four-step process, and DEED's acronym is explicitly defined.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three contributions are explicitly enumerated: demonstrating error-driven learning effectiveness, proposing DEED, and showing outperformance over mainstream approaches on five benchmarks.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 6 engages substantively with fine-tuning variants, prompting approaches, and related code refinement methods (Self-Refine, Self-Debug, Self-Edit, CYCLE, ILF), distinguishing DEED's contribution from each.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No code repository URL or availability statement is provided anywhere in the paper.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "All five evaluation datasets (HumanEval, MBPP, HumanEval-ET, MBPP-ET, DS-1000/DataScience) are standard public benchmarks available independently of this work.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Only 'a single A6000 GPU' is mentioned; no requirements.txt, Dockerfile, Python version, or library versions are specified.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Algorithm 1 provides pseudocode and Section 4.2 lists hyperparameters, but no step-by-step reproduction instructions sufficient to rerun experiments without guessing implementation details are provided.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Results are averaged over five runs but no confidence intervals, standard deviations, or error bars are reported for any metric.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are performed despite making multiple comparative claims across methods, datasets, and LLMs.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Relative improvement percentages (e.g., ↑29.5%, ↑103.8%) are reported against the best-performing baseline, providing interpretable effect magnitudes.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The training split (min(200, 40%*D)) is stated but not justified; no power analysis or reasoning about whether sample sizes are sufficient to detect the reported effects is provided.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Results are 'averaged over five test runs' but no standard deviation, variance, or range across those runs is reported anywhere.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Six baselines are included: Direct Generation, Fine-tuning (Full), Fine-tuning (LoRA), Few-shot Prompting, Self-Refine, and Self-Debug.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include Self-Refine (NeurIPS 2023), Self-Debug (2023), and LoRA (ICLR 2022); all are relevant and contemporary for the submission period.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "RQ3 ablates training data variants, RQ4 studies iteration counts, RQ5 varies the revision model, and RQ6 ablates Self-Revise input components (correct solution, error messages, failed test cases).",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Pass@1, Pass@5, and Pass@10 are reported throughout; Pass@any is added for revision quality experiments.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "Code correctness is evaluated via automated test execution; human evaluation of generated code quality is not applicable to this setting.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Each dataset is split into training (min(200, 40%*D) problems) and a held-out test set (remaining problems); evaluation is performed on the test portion only.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": false,
    219           "justification": "Results are reported per dataset but no per-category, per-difficulty, or per-problem-type breakdowns are provided within datasets.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "Figure 4 provides qualitative success cases for Self-Revise; no systematic discussion of where DEED fails or under what conditions it underperforms is included.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports that ChatGPT/GPT-3.5-turbo as MRevise does not outperform Self-Revise (FT), that Fine-tuning (LoRA) underperforms Full fine-tuning, and that Llama-7B Fine-tuning underperforms Direct Generation — all reported without concealment.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Models are named (CodeGen-2B, CodeGen-6B, Llama-7B, CodeLlama-7B) but no checkpoint hashes, Hugging Face identifiers, or version dates are given; ChatGPT is cited without any version.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix C provides the exact instruction text for automatic code revision, and Figure 3 shows the full template structure with all five input components labeled.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Section 4.2 reports learning rates (5e-6 Full, 2e-4 LoRA), batch size (1), gradient accumulation (32), training epochs (10), temperature (0.8), LoRA rank (128), α (8), β1/β2 (0.9), and sampling counts (5 for errors, 30 for revisions).",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "DEED is a fine-tuning pipeline, not an agentic scaffold; the iterative training process is fully described in Algorithm 1 but does not constitute agentic scaffolding.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Sections 3.1 and 3.2 document error code collection via rejection sampling and revision via acceptance sampling with test execution filtering in sufficient detail.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "The generated error codes and revised training data produced during DEED's preprocessing are not released; only the public benchmark sources are available.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Sections 3.1 and 3.2 describe error code collection (rejection sampling by log-probability) and revision (acceptance sampling with minimum Levenshtein distance selection) in detail.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "Standard public benchmarks are used; no participant recruitment is involved.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Algorithm 1 documents the complete iterative pipeline from dataset input through error collection, revision, model optimization, and iteration termination.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Training data cutoffs for CodeGen, Llama-7B, and CodeLlama-7B are not stated, even though all of these models may have been trained after HumanEval and MBPP were publicly released.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "Contamination is not discussed for the main benchmarks; EvoCodeBench is used in Appendix B as a supplementary contamination-resistant evaluation but the main evaluation does not address overlap.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "HumanEval (2021) and MBPP (2021) were publicly available before training cutoffs of most evaluated models; this is not discussed despite being a known confound for code LLM evaluation.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "The paper qualitatively states DEED incurs no additional inference overhead compared to direct generation, but provides no quantitative latency or cost measurements.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Only 'a single A6000 GPU' is mentioned; total training time, GPU-hours, or compute budget for the full experimental suite is not reported.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "DEED achieves an average relative improvement of 46.2% in Pass@1 over the best-performing mainstream baseline across five code generation benchmarks under limited data.",
    376       "evidence": "Table 1 reports relative improvements over Fine-tuning (Full) of 29.5% (HumanEval), 33.0% (HumanEval-ET), 27.1% (MBPP), 37.6% (MBPP-ET), and 103.8% (DataScience), averaging to 46.2%.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Training on revised error codes (error-driven learning) is more data-efficient than training on original dataset samples.",
    381       "evidence": "Table 3 shows DEED (32.8% Pass@1) outperforms Raw D_train fine-tuning (25.8%) using fewer training examples; representational distance analysis shows revised codes are closer to error codes than the original dataset samples are (6.39 vs 12.35 Euclidean distance).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Self-Revise using the same base model (fine-tuning setting) yields better final model performance than using larger or more capable external models for revision.",
    386       "evidence": "Table 5 shows Self-Revise (FT) with CodeGen-2B achieves M_θ* Pass@1 of 32.8% vs 27.0% for ChatGPT-based revision, despite ChatGPT having far higher MRevise Pass@any (92.1% vs 24.6%).",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "DEED's iterative adaptation stabilizes after two iterations, capturing most achievable gains.",
    391       "evidence": "Table 4 shows Pass@1 of 31.6% (iter 1), 32.8% (iter 2), 33.0% (iter 3), 33.2% (iter 4), with diminishing returns and Pass@10 oscillation after iteration 2.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "DEED is broadly applicable across LLMs of varying sizes and architectures.",
    396       "evidence": "Table 2 shows 25–33% relative improvements over Fine-tuning across CodeGen-2B, CodeGen-6B, Llama-7B, and CodeLlama-7B, though all are 2B–7B models tested on Python-only tasks.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval"
    402   ],
    403   "key_findings": "DEED, which fine-tunes LLMs on automatically self-revised versions of their own error outputs rather than raw dataset samples, achieves 27–104% relative improvement in Pass@1 over mainstream adaptation approaches on five Python code generation benchmarks under limited data conditions. The core empirical finding is that error-driven training data is more data-efficient than standard dataset samples, supported by representational distance analysis and ablation experiments. Self-Revise performs best using the same model being adapted in a fine-tuning setting, and performance gains stabilize after two iterations.",
    404   "red_flags": [
    405     {
    406       "flag": "No variance reported",
    407       "detail": "Results are averaged over five runs but standard deviations are never reported, making it impossible to assess whether observed differences between methods exceed run-to-run variability."
    408     },
    409     {
    410       "flag": "No statistical significance tests",
    411       "detail": "Multiple comparative claims across six baselines, five datasets, and four LLMs are made with no statistical tests applied; numerical differences may not be meaningful."
    412     },
    413     {
    414       "flag": "Benchmark contamination unaddressed",
    415       "detail": "HumanEval and MBPP (both 2021) were publicly available before training cutoffs of CodeGen, Llama, and CodeLlama; this known confound is not discussed for the main evaluation."
    416     },
    417     {
    418       "flag": "Code not released",
    419       "detail": "No implementation code is provided despite the method having non-trivial implementation complexity (rejection/acceptance sampling, iterative training loop, revision filtering)."
    420     },
    421     {
    422       "flag": "Generalizability overclaimed",
    423       "detail": "Claims of 'broad applicability' are based solely on Python benchmark tasks and models ≤7B; no non-Python languages, larger models, or non-programming tasks are tested."
    424     },
    425     {
    426       "flag": "Model versions unspecified",
    427       "detail": "Model names are given without checkpoint hashes, Hugging Face identifiers, or snapshot dates, preventing exact reproduction and confounding cross-study comparisons."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    433       "relevance": "Foundational code LLM and benchmark (HumanEval); primary evaluation dataset and baseline comparison point."
    434     },
    435     {
    436       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    437       "relevance": "Direct competing baseline for iterative code improvement via prompting; DEED is evaluated against it."
    438     },
    439     {
    440       "title": "Teaching Large Language Models to Self-Debug",
    441       "relevance": "Direct competing baseline using execution feedback for code correction; compared against in main evaluation."
    442     },
    443     {
    444       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    445       "relevance": "Parameter-efficient fine-tuning method used as a baseline and within DEED for resource-constrained settings."
    446     },
    447     {
    448       "title": "CYCLE: Learning to Self-Refine the Code Generation",
    449       "relevance": "Concurrent work on test-driven self-refinement for code; explicitly contrasted with DEED's adaptation focus."
    450     },
    451     {
    452       "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models",
    453       "relevance": "Cited to contextualize data leakage in evaluation benchmarks; motivates supplementary EvoCodeBench experiment."
    454     },
    455     {
    456       "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
    457       "relevance": "Contamination-resistant benchmark used in Appendix B to validate DEED in a data-leakage-aware setting."
    458     },
    459     {
    460       "title": "Program Synthesis with Large Language Models (MBPP)",
    461       "relevance": "Primary benchmark dataset used for most experiments and preliminary representational distance analysis."
    462     },
    463     {
    464       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    465       "relevance": "Primary base model (CodeGen-2B) used in most experiments; default model for full fine-tuning comparisons."
    466     },
    467     {
    468       "title": "Self-Edit: Fault-Aware Code Editor for Code Generation",
    469       "relevance": "Related work training a separate editor model for code revision; contrasted with DEED's self-revision approach."
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 2,
    475       "justification": "Directly addresses the real-world scarcity of domain-specific training data with a deployable fine-tuning pipeline requiring no external resources beyond test cases."
    476     },
    477     "surprise_contrarian": {
    478       "score": 2,
    479       "justification": "Counterintuitive finding that using a weak base model for self-revision outperforms ChatGPT-based revision, and that error-focused training beats learning from correct examples."
    480     },
    481     "fear_safety": {
    482       "score": 0,
    483       "justification": "No safety, risk, or misuse concerns are raised; the paper is entirely focused on improving code generation performance."
    484     },
    485     "drama_conflict": {
    486       "score": 0,
    487       "justification": "No controversy with established methods or high-profile conflict; straightforward incremental improvement paper."
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "Method is conceptually implementable on public benchmarks and models, but no code is released, requiring substantial re-implementation effort before practitioners can try it."
    492     },
    493     "brand_recognition": {
    494       "score": 0,
    495       "justification": "Peking University research group; not a top-tier AI lab brand with mainstream tech community recognition."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "39651926",
    502         "title": "An all-optical general-purpose CPU and optical computer architecture",
    503         "points": 197,
    504         "comments": 103,
    505         "url": "https://news.ycombinator.com/item?id=39651926",
    506         "created_at": "2024-03-09T14:49:53Z"
    507       },
    508       {
    509         "hn_id": "33426789",
    510         "title": "Yoneda Hacking: The Algebra of Attacker Actions",
    511         "points": 9,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=33426789",
    514         "created_at": "2022-11-01T20:10:20Z"
    515       },
    516       {
    517         "hn_id": "42496507",
    518         "title": "Online Advertising Is a Regrettable Necessity",
    519         "points": 6,
    520         "comments": 2,
    521         "url": "https://news.ycombinator.com/item?id=42496507",
    522         "created_at": "2024-12-23T18:27:49Z"
    523       },
    524       {
    525         "hn_id": "41961564",
    526         "title": "Easy real-time collision detection",
    527         "points": 4,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=41961564",
    530         "created_at": "2024-10-27T11:06:23Z"
    531       },
    532       {
    533         "hn_id": "39610408",
    534         "title": "Polyamorous Scheduling is NP-hard",
    535         "points": 3,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=39610408",
    538         "created_at": "2024-03-05T23:27:01Z"
    539       },
    540       {
    541         "hn_id": "39329353",
    542         "title": "Training microrobots to swim by a large language model",
    543         "points": 2,
    544         "comments": 1,
    545         "url": "https://news.ycombinator.com/item?id=39329353",
    546         "created_at": "2024-02-10T19:21:39Z"
    547       },
    548       {
    549         "hn_id": "41537027",
    550         "title": "Towards Battery-Free Wireless Sensing via Radio-Frequency Energy Harvesting",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=41537027",
    554         "created_at": "2024-09-14T02:26:33Z"
    555       },
    556       {
    557         "hn_id": "39352140",
    558         "title": "Detecting Multimedia Generated by Large AI Models: A Survey",
    559         "points": 2,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=39352140",
    562         "created_at": "2024-02-12T23:36:45Z"
    563       },
    564       {
    565         "hn_id": "45763351",
    566         "title": "VaultDB: A Real-World Pilot of SMPC Within a Clinical Research Network",
    567         "points": 1,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=45763351",
    570         "created_at": "2025-10-30T18:24:42Z"
    571       },
    572       {
    573         "hn_id": "41981519",
    574         "title": "Easy real-time collision detection",
    575         "points": 1,
    576         "comments": 0,
    577         "url": "https://news.ycombinator.com/item?id=41981519",
    578         "created_at": "2024-10-29T09:41:11Z"
    579       }
    580     ],
    581     "top_points": 197,
    582     "total_points": 227,
    583     "total_comments": 106
    584   }
    585 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs