scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27878B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Automated Program Repair through Fine-tuning and Prompt Engineering",
      6     "authors": [
      7       "Rishov Paul",
      8       "Md. Mohib Hossain",
      9       "Mohammed Latif Siddiq",
     10       "Masum Hasan",
     11       "Anindya Iqbal"
     12     ],
     13     "year": 2023,
     14     "venue": "arXiv",
     15     "arxiv_id": "2304.07840",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims that fine-tuned pre-trained models notably outperform prior baselines are directly supported by Table II (e.g., CodeT5 +21.12pp on Tufano), and the manual analysis conclusion about practical limitations is supported by RQ3 results showing 40-60% failure rates.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims fine-tuning improves APR but runs no ablations isolating whether gains come from pre-training, architecture, or fine-tuning data composition; no controlled experiment separates these factors.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The threats-to-validity section explicitly bounds results to Java code with English-language code reviews, acknowledging that findings may not generalize to other programming languages or review styles.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "For fine-tuned model improvements, no alternative explanations (dataset-specific artifacts, metric sensitivity) are considered; contamination is acknowledged for LLMs but not systematically explored as an alternative to the main fine-tuning claims.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges that exact match may not capture all valid fixes and conducts RQ3 developer analysis specifically to assess actual review intention fulfillment beyond the automated metric.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section VI 'Threats to Validity' is a dedicated section addressing both internal and external validity threats.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Threats are specific: hyperparameter search is limited to avoid compute cost, datasets cover only Java code with English-language reviews, and LLM data leakage is flagged with the concrete knowledge cutoff date of September 2021.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states results are confined to Java code and English reviews, and notes that other programming languages and datasets would require further investigation.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (BUET, University of Notre Dame, University of Rochester) are clearly listed on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, so funder independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section II provides explicit definitions of code review, APR, LLMs, zero-shot learning, and few-shot learning with concrete examples in Listing 2 illustrating the distinction.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction lists six explicit bullet-point contributions including model validation, PLBART vs CodeT5 comparison, LLM prompting investigation, manual analysis, and a replication package.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section VII relates this work to prior APR approaches (SemFix, SequenceR, DeepFix, CoCoNut) and directly positions the study as extending Tufano et al. and Review4Repair by applying pre-trained models and LLM prompting to their datasets.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "A replication package with all scripts is published at https://doi.org/10.5281/zenodo.8122636, a public persistent Zenodo DOI.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both datasets (Tufano et al. and Review4Repair) are from prior published works with publicly available replication packages referenced in the paper.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Only the GPU model (NVIDIA GeForce RTX 2070-8GB) is mentioned; no requirements.txt, Dockerfile, Python version, or library versions are specified in the paper.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The Zenodo package is described as containing scripts, but the paper does not provide step-by-step instructions sufficient to reproduce results without additional reverse-engineering of the scripts.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables II and III report only point estimates for all metrics; no confidence intervals or error bars are reported for any quantitative results.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (t-tests, Wilcoxon, etc.) are applied to any model comparisons despite multiple comparative claims throughout the results sections.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Tables II and III report absolute percentage differences from baseline (e.g., '+20.82%', '+21.12%'), providing effect sizes in the context of baseline performance.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The human evaluation sample sizes (314 and 340) are explicitly justified as achieving a 95% confidence level with a 5% margin of error.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All results are from single runs with no standard deviation, variance, or variability across multiple runs reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper compares against R4R CC (Review4Repair baseline) and Tufano 2-encoder (Tufano et al. baseline), the state-of-the-art baselines from prior work on code-review-guided APR.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines are from 2021-2022 publications (ICSE 2021, IST 2022), the most recent prior works on code-review-guided APR at time of submission.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study is conducted; the paper compares different full models but does not isolate contributions of pre-training, architecture, or code-review input.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Evaluation uses Top-1/5/10 accuracy (exact match), BLEU-4, CodeBLEU, and human evaluation (fulfillment rate, Cohen's Kappa), covering both automated and human assessment.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Two software developers with one year of industry experience independently rated a sample from each dataset (314 and 340 samples) for alignment with code review intentions, with Cohen's Kappa reported for inter-rater agreement.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Both datasets have designated held-out test sets (2,955 samples for Review4Repair; 1,719 for Tufano et al.) and all reported accuracy values use these test sets.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Figure 2 provides per-category breakdowns across Insert, Delete, and Update fix classes for both datasets and all models, revealing differential strengths (PLBART better on Delete, CodeT5 better on Insert/Update).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section III-B-5 categorizes five systematic LLM failure patterns (including syntax errors, added explanations, and backtick wrapping), and Section V-A discusses dataset quality issues that cause model-reviewer misalignment.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table IV shows models fail to fulfill review intentions 40-62% of the time, and the conclusion explicitly states practical LLM-based APR 'is still a long way off.'",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "GPT-3.5-Turbo and Code-DaVinci-Edit-001 are named without API snapshot dates; as OpenAI models change over time, exact reproducibility is not possible.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Listing 2 provides the complete zero-shot and few-shot prompt template, and the system role content for GPT-3.5-Turbo ('You are a coding assistant. You generate only the source code.') is explicitly stated.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Batch size, gradient accumulation steps, epoch counts, beam sizes, input/target lengths, temperature (0), top-p (1), and frequency/presence penalties (0) are all reported in Section III.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This paper does not use agentic scaffolding; it directly calls fine-tuned models and OpenAI API endpoints without any orchestration layer.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section III-A-1 documents preprocessing for both datasets including token concatenation format, special token handling, split reorganization ratios, and filtering criteria for samples exceeding 512 tokens.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Both datasets are publicly available from their original publications, and the Zenodo replication package includes data gathering scripts.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The paper describes that datasets were collected from Gerrit and GitHub code reviews by prior works, and details the train/validation/test split reorganization for Review4Repair including exact sample counts.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "The two human raters are described as software developers with one year of industry experience at a Fortune 500 company with active code review participation as both submitter and reviewer.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The preprocessing pipeline from raw dataset to model input/output is documented in detail in Section III-A-1, including tokenization, filtering, format conversion, and special token insertion steps.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The paper states the knowledge cutoff for GPT-3.5-Turbo and Code-DaVinci-Edit-001 is September 2021.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "The threats section explicitly discusses that the Tufano et al. dataset was published before the September 2021 cutoff and may have been included in LLM pretraining, while noting this cannot be verified due to black-box models.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The paper specifically flags potential data leakage for the Tufano et al. dataset (pre-September 2021) as an alternative explanation for Code-DaVinci's strong zero-shot performance.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned for the developer evaluation study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB or ethics approval is mentioned despite involving human subjects in the developer analysis.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Only minimal background is provided (one year of industry experience, Fortune 500 company); no demographic details like age, gender, or educational background are reported.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Inclusion criteria are stated: developers with one year of industry experience and significant involvement in code review processes as both submitter and reviewer.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": true,
    340           "justification": "The paper states samples were 'randomly collected' from the test sets to achieve a 95% confidence level with a 5% margin of error (314 and 340 samples, one sample per dataset).",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No blinding procedure is described; it is unclear whether raters knew which model produced each repaired code output.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No attrition applicable; all collected samples were evaluated with no participant dropout.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No API costs, token usage, or inference latency for OpenAI model calls are reported; fine-tuning GPU-hours are also absent.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The GPU model (RTX 2070-8GB) is mentioned but total training time, GPU-hours, or compute budget are not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Fine-tuned PLBART and CodeT5 significantly outperform prior baseline models on both code review-based APR datasets.",
    375       "evidence": "Table II shows CodeT5 achieving 33.28% Top-1 vs 12.16% baseline on Tufano (+21.12pp) and 29.82% vs 19.59% on Review4Repair (+10.23pp).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "CodeT5 generally outperforms PLBART, particularly on Insert and Update fix categories.",
    380       "evidence": "Figure 2 and Table II show CodeT5 with higher Top-1/5/10 across Insert and Update classes for both datasets; PLBART performs better on Delete class.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Code-DaVinci-Edit-001 achieves state-of-the-art accuracy on the Tufano dataset via zero-shot prompting.",
    385       "evidence": "Table III shows Code-DaVinci at 40.70% vs 33.28% for fine-tuned CodeT5 on Tufano, but contamination is acknowledged as a plausible explanation since the dataset predates the model's September 2021 cutoff.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Heuristic post-processing substantially improves LLM exact match accuracy.",
    390       "evidence": "Table III shows GPT-3.5-Turbo zero-shot improving from 6.9% to 22.06% (+15.16pp) on Review4Repair and from 17.86% to 31.70% (+13.84pp) on Tufano after applying five heuristics.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Current language models fail to fully align with code review intentions in approximately 40-60% of cases.",
    395       "evidence": "Table IV shows 'Not Fulfilling' rates of 41-62% across all five models and both datasets in developer evaluation, with Cohen's Kappa 0.51-0.68 indicating moderate-to-substantial rater agreement.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "The performance improvement from fine-tuned models stems from learned NL+PL representations rather than architecture alone.",
    400       "evidence": "Both PLBART and CodeT5 improve substantially over baselines; authors attribute gains to pre-trained weights in the conclusion, but no ablation isolates pre-training from architecture effects.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "Fine-tuned PLBART and CodeT5 substantially outperform prior baselines on code-review-guided automated program repair, with CodeT5 achieving up to 25.65pp improvement in Top-10 accuracy on the Tufano dataset. Surprisingly, Code-DaVinci-Edit-001 matches or exceeds fine-tuned models via zero-shot prompting on the Tufano dataset, though acknowledged training data contamination (dataset pre-dates the September 2021 cutoff) may explain this. Manual analysis by two developers reveals that all models fail to fulfill code review intentions 40-62% of the time across both datasets, with Review4Repair's low-quality vague reviews further degrading performance. The practical conclusion is that LLM-based APR remains far from production-ready despite encouraging benchmark numbers.",
    409   "red_flags": [
    410     {
    411       "flag": "No statistical significance tests",
    412       "detail": "All model comparisons use point estimates without t-tests, Wilcoxon tests, or any significance measures, making it impossible to assess whether performance differences exceed noise."
    413     },
    414     {
    415       "flag": "Single-run results, no variance",
    416       "detail": "No standard deviation or confidence intervals are reported for model accuracy; all results are from single training/inference runs."
    417     },
    418     {
    419       "flag": "Unverifiable contamination for Code-DaVinci",
    420       "detail": "The Tufano et al. dataset was publicly available before the September 2021 cutoff; Code-DaVinci's state-of-the-art zero-shot performance may be memorization rather than generalization, and this cannot be ruled out."
    421     },
    422     {
    423       "flag": "Human study lacks blinding and IRB",
    424       "detail": "The developer evaluation (RQ3) does not describe blinding procedures or IRB/ethics approval, raising concerns about rater bias and ethical compliance for human subjects research."
    425     },
    426     {
    427       "flag": "OpenAI model versions not pinned",
    428       "detail": "GPT-3.5-Turbo and Code-DaVinci-Edit-001 are referenced without API snapshot dates; OpenAI silently updates these models, making exact reproduction over time impossible."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Towards automating code review activities",
    434       "relevance": "Primary baseline dataset and model (Tufano 2-encoder) for code-review-guided APR; directly extended by this work's fine-tuning approach."
    435     },
    436     {
    437       "title": "Review4Repair: Code review aided automatic program repairing",
    438       "relevance": "Second primary baseline dataset and model (R4R CC); provides the larger benchmark used for evaluation of all models."
    439     },
    440     {
    441       "title": "Unified pre-training for program understanding and generation (PLBART)",
    442       "relevance": "One of the two fine-tuned pre-trained models; BART-based architecture pretrained on NL and PL corpora."
    443     },
    444     {
    445       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    446       "relevance": "Best-performing fine-tuned model; identifier-aware pretraining key to superior Insert/Update performance."
    447     },
    448     {
    449       "title": "Evaluating large language models trained on code (Codex)",
    450       "relevance": "Foundation for Code-DaVinci-Edit-001 evaluation; establishes zero-shot code generation capabilities for GPT-3-based models."
    451     },
    452     {
    453       "title": "Language models are few-shot learners (GPT-3)",
    454       "relevance": "Foundation for GPT-3.5-Turbo; establishes few-shot prompting methodology and in-context learning paradigm used in this study."
    455     },
    456     {
    457       "title": "Exploring the effectiveness of large language models in generating unit tests",
    458       "relevance": "Prior work using GPT-3.5-Turbo with zero-shot prompting for code generation tasks; methodology and prompt design directly adapted for APR."
    459     }
    460   ],
    461   "engagement_factors": {
    462     "practical_relevance": {
    463       "score": 2,
    464       "justification": "Directly addresses automated code repair in code review workflows, with a released replication package and actionable findings about LLM vs fine-tuning tradeoffs."
    465     },
    466     "surprise_contrarian": {
    467       "score": 1,
    468       "justification": "Code-DaVinci outperforming fine-tuned models via zero-shot is mildly surprising, but the contamination caveat undermines the finding."
    469     },
    470     "fear_safety": {
    471       "score": 0,
    472       "justification": "No AI safety or risk concerns raised; focuses on software engineering productivity."
    473     },
    474     "drama_conflict": {
    475       "score": 0,
    476       "justification": "Incremental improvement paper with no controversial claims or conflicts with established findings."
    477     },
    478     "demo_ability": {
    479       "score": 2,
    480       "justification": "Replication package on Zenodo and standard OpenAI API make the experiments reconstructible; practitioners can immediately try similar prompting approaches."
    481     },
    482     "brand_recognition": {
    483       "score": 1,
    484       "justification": "Uses recognizable OpenAI models (GPT-3.5-Turbo, Codex) but authors are from BUET, Notre Dame, and Rochester rather than top AI labs."
    485     }
    486   },
    487   "hn_data": {
    488     "threads": [
    489       {
    490         "hn_id": "37215331",
    491         "title": "The Simplest Walking Robot: A bipedal robot with 1 actuator and 2 rigid bodies",
    492         "points": 59,
    493         "comments": 29,
    494         "url": "https://news.ycombinator.com/item?id=37215331"
    495       },
    496       {
    497         "hn_id": "37518075",
    498         "title": "Agents: An Open-Source Framework for Autonomous Language Agents",
    499         "points": 7,
    500         "comments": 1,
    501         "url": "https://news.ycombinator.com/item?id=37518075"
    502       },
    503       {
    504         "hn_id": "46100377",
    505         "title": "RIP Twitter API: A eulogy to its vast research contributions",
    506         "points": 4,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=46100377"
    509       },
    510       {
    511         "hn_id": "40117178",
    512         "title": "RIP Twitter API: A eulogy to its research contributions",
    513         "points": 4,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=40117178"
    516       },
    517       {
    518         "hn_id": "37478569",
    519         "title": "Brain-Inspired Computational Intelligence via Predictive Coding",
    520         "points": 4,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=37478569"
    523       },
    524       {
    525         "hn_id": "47717676",
    526         "title": "Your Agent Is Mine: Measuring Malicious Attacks on the LLM Supply Chain",
    527         "points": 3,
    528         "comments": 1,
    529         "url": "https://news.ycombinator.com/item?id=47717676"
    530       },
    531       {
    532         "hn_id": "37189091",
    533         "title": "Calypso: LLMs as Dungeon Masters' Assistants",
    534         "points": 3,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=37189091"
    537       },
    538       {
    539         "hn_id": "35613390",
    540         "title": "Nearby Stars' Close Encounters with the Brightest Earth Transmissions",
    541         "points": 2,
    542         "comments": 1,
    543         "url": "https://news.ycombinator.com/item?id=35613390"
    544       },
    545       {
    546         "hn_id": "36690558",
    547         "title": "AVX Timing Side-Channel Attacks Against Address Space Layout Randomization",
    548         "points": 2,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=36690558"
    551       },
    552       {
    553         "hn_id": "47732263",
    554         "title": "Measuring Malicious Intermediary Attacks on the LLM Supply Chain",
    555         "points": 2,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=47732263"
    558       }
    559     ],
    560     "top_points": 59,
    561     "total_points": 90,
    562     "total_comments": 32
    563   }
    564 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs