scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27872B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "F2A: An Innovative Approach for Prompt Injection by Utilizing Feign Security Detection Agents",
      6     "authors": [
      7       "Yupeng Ren"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2410.08776",
     12     "doi": "10.48550/arXiv.2410.08776"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Main claims about F2A's ability to bypass defenses and reduce success with defense prompts are empirically demonstrated in Tables 1-2. Claims about 'blind trust' are inferred from attack success rather than mechanistically proven.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Paper claims F2A's components (string obfuscation, fake detection, sequential instructions) cause attack success, but no ablation study isolates which components are necessary or individually sufficient. Causal mechanism is asserted, not validated.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Title states 'An Innovative Approach for Prompt Injection' (broad claim), but scope bounded only implicitly to 'mainstream LLMs available on the web' and specific models tested (8 total). Generality claims not explicitly limited in text.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Paper does not discuss alternative mechanisms. Attack success could result from semantic obfuscation alone, instruction-following confusion, or other factors beyond 'blind trust in detection agents.' No consideration of competing explanations.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Paper measures 'successful harmful output' and interprets as 'blind trust in safety agents,' but these are not equivalent. Output generation could result from other vulnerabilities (instruction-following, code execution, semantic confusion). Measurement not clearly distinguished from claimed mechanism.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated limitations or threats-to-validity section. Conclusion briefly reiterates findings but does not discuss scope boundaries or methodological limitations.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No specific threats discussed: (1) only 10 attack prompts tested, (2) single evaluator (GPT-4o as judge), (3) only 8 model families tested, (4) no discussion of parameter sensitivity or reproducibility challenges.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Paper does not explicitly state what results do NOT show: applicability to fine-tuned models, locally-deployed models, or models with different RLHF approaches. Scope implicitly bounded to tested API models only.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source disclosed. Paper appears unfunded independent research but not explicitly stated.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation clearly stated: 'Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China.' No conflicts with evaluated products.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": true,
     83         "justification": "Not applicable. No funding is mentioned, so there is no funder whose independence from the outcome could be assessed.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or declaration of financial relationships (patents, equity, consulting) provided.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Key terms used but not precisely defined: 'blind trust' inferred from experiments, not formally defined; 'safety detection agent' used without formal definition; 'feign' used colloquially without technical specification.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Three explicit contributions stated in introduction: (1) introduce/define F2A, (2) demonstrate vulnerabilities empirically, (3) provide defense recommendations. Intentions are clear.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "References prior work on injection attacks [1-10] but does not clearly differentiate F2A from prior attacks. No comparative analysis showing how F2A's mechanism differs from 'direct injection' or other indirect attacks. Related work scattered through introduction rather than synthesized.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "No code released for attack generation, defense mechanism, or evaluation harness. The paper is not accompanied by any software artifacts; reproducing the results requires reimplementing the methodology.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Attack prompts shown in Appendix are examples, not a reusable dataset. Model outputs not released (only binary hit/miss in tables). No raw evaluation data available.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Model names given (GPT-4o, DeepSeek-V2.5, Mistral-Large-2) but no version snapshots or timestamps. No API endpoint configurations, temperature settings, or top-p parameters specified. Paper states 'Evil-Users cannot arbitrarily adjust parameters' but does not specify defaults.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Methodology described at high level (3 steps: convert, feign, construct) with examples, but no step-by-step instructions to reproduce. Cannot reproduce without reimplementing entire pipeline against live APIs.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No confidence intervals or error bars. Results presented as binary (hit/miss) or aggregate counts (e.g., 2/10) with no uncertainty quantification.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No statistical significance tests applied. No p-values or hypothesis tests. Comparisons between models/prompts treated descriptively (e.g., Table 1 shows checkmarks but no statistical comparison).",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Success rates implicitly reported as proportions (e.g., 2/10 = 20% for GPT-4o without the defense prompt, 0/10 with it). Tables show attack rates per model × prompt combination.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "10 attack prompts chosen without justification for sample size. No power analysis. Model selection (7B to 72B parameters, 8 families) not justified.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Only single evaluation reported per model × prompt pair. No multiple runs, no mention of variance. Results appear deterministic but could be stochastic across API calls.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Implicit baseline: models without F2A defense. Table 2 explicitly compares defense-protected models vs. unprotected, showing reduction in attack success.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Models tested (GPT-4o, GLM-4-Plus, Mistral-Large-2, DeepSeek-V2.5, Qwen, Llama-3.1) are contemporary as of October 2024 arXiv submission.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "No ablation study. F2A has three components (string concatenation, fake detection, sequential instructions) but none tested independently to determine necessity or sufficiency.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "Single primary metric: binary attack success (hit/miss). No diversity metrics, no severity scales, no gradient of 'partial' success. 'Hit score' in Table 2 is just a count, not a multi-dimensional metric.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "Model outputs evaluated by GPT-4o (automated), not humans. GPT-4o judges whether content is 'dangerous' with no inter-rater reliability check or human agreement study.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": false,
    207           "answer": true,
    208           "justification": "Not applicable. This is an adversarial attack evaluation, not a prediction task. Concept of train/test split does not apply.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Table 1 breaks down results by attack type (death, weapons, racism, poison, fraud, tutorials, antisocial, mental_illness, political, terrorist) and model. Per-category analysis present.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "Some anecdotal failures mentioned (e.g., Llama3.1-8B-Instruct misinterpreting fraud prompt) but not systematically analyzed. No breakdown of failure modes, no categorization of why attacks fail.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "Failures shown in tables but not deeply reported. For example, why do GPT-4o and Qwen-72B show only 2 successful attacks while smaller models show more? Inversion (smaller = more vulnerable) mentioned but not explored.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "Model names provided (GPT-4o, DeepSeek-V2.5) but no version snapshots, no training cutoff dates, no API endpoint specification. GPT-4o is a moving target with continuous updates; results not timestamped to specific model version.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Full attack prompts shown in Appendix with three components (Instance A: string conversion, Instance B: fake detection, Instance C: task instructions). Methodology examples detailed.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No temperature, top-p, top-k, or other generation parameters specified. Paper states 'Evil-Users cannot arbitrarily adjust parameters' but does not specify what defaults were used.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "F2A scaffolding described in detail: 3-step process with examples, 'Sequential Strategy' shown for instruction construction, step-by-step prompt templates in Appendix.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": false,
    257           "answer": true,
    258           "justification": "Not applicable. No dataset preprocessing. Malicious content is prepared through F2A methodology (string splitting, code wrapping).",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "No raw model outputs released. Only summary tables (binary hit/miss for Table 1, aggregate scores for Table 2). Model conversation examples in Figures 3-4 but not comprehensive raw data.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Collection procedure is brief: '10 prompts' created, tested against models, evaluated by GPT-4o. No documentation of how the 10 prompts were selected, what criteria ensured coverage of attack types, or sampling methodology.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": true,
    278           "justification": "Not applicable. No human participants recruited.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": false,
    284           "justification": "High-level pipeline documented: (1) construct F2A prompt, (2) submit to LLM API, (3) collect output, (4) pass to GPT-4o for harm judgment. Intermediate details (API handling, output parsing, judging criteria) not documented.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "Training cutoff dates not stated for any model. GPT-4o's training cutoff (October 2023 per OpenAI's documentation; the model itself was released in May 2024) is not given in the paper, and DeepSeek-V2.5, Mistral, and the other models likewise have no stated cutoff. Unclear if attack prompts fall within or after training data.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "No discussion of whether attack prompts or F2A methodology could have been in training data. Attack is novel construction but potential for training data contamination not addressed.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": false,
    303           "answer": true,
    304           "justification": "Not applicable. Attack evaluation on live models, not benchmarks. No benchmark contamination concern.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": true,
    312           "justification": "Not applicable. No human participants.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": true,
    318           "justification": "Not applicable. No human participants, though paper explicitly warns of 'harmful contents' (ethical concern acknowledged but no review mentioned).",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": true,
    324           "justification": "Not applicable. No human participants.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": true,
    330           "justification": "Not applicable. No human participants.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": true,
    336           "justification": "Not applicable. No human participants.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": true,
    342           "justification": "Not applicable. No human participants.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": true,
    348           "justification": "Not applicable. No human participants.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference cost or latency reported. Paper conducts API calls to multiple commercial models but does not discuss cost per attack, token usage, or rate-limiting issues.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Total computational budget not stated. Number of API calls, total tokens consumed, or resource allocation not mentioned.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "LLMs exhibit 'blind trust' in fabricated safety detection agent results",
    371       "evidence": "Table 1 shows F2A attacks succeed on most models (GPT-4o 2/10, GLM-4-Plus 5/10, Mistral 3/10, DeepSeek 6/10, smaller models 4-7/10 hits). Paper interprets attack success as evidence of blind trust.",
    372       "supported": "weak"
    373     },
    374     {
    375       "claim": "F2A successfully bypasses LLM safety defense mechanisms across multiple models",
    376       "evidence": "Table 1 demonstrates attacks work on 8 different LLM services with 2-7 successful attacks per model across 10 prompts. Specific examples shown in Figures 3-4.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Defense prompt instructing models to critically evaluate detection results dramatically reduces F2A success",
    381       "evidence": "Table 2: With defense prompt, GPT-4o reduces from 2/10 to 0/10, GLM-4 from 5/10 to 1/10, Mistral from 3/10 to 0/10, DeepSeek from 6/10 to 1/10.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Attacks related to fraud, antisocial behavior, mental illness, and political topics are harder to defend against",
    386       "evidence": "Table 1 shows checkmarks for these categories across most models. Paper explains: 'more closely related to mental health treatment, academic discussions, or scenario simulations.'",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Smaller/weaker models are more vulnerable than larger/stronger models",
    391       "evidence": "GPT-4o and Qwen2.5-72B marked as 'least vulnerable' (2 hits each); Gemma2-9B and Llama3.1-8B show 4-6 hits; Qwen2.5-7B shows 4 hits.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Attack failures occur because models misunderstand instructions rather than refuse outright",
    396       "evidence": "Paper states: 'Llama3.1-8B-Instruct was attacked by Fraud, the injection prompt was regarded by the model as other ordinary content.' Anecdotal only.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval",
    402     "case-study"
    403   ],
    404   "key_findings": "The Feign Agent Attack (F2A) successfully exploits LLMs' reliance on embedded safety detection results by hiding malicious content in Python string concatenation and faking security verification scores. Across 8 major LLM services, F2A achieves 20-70% attack success rates on diverse harmful topics (weapons, fraud, illegal activities). A simple defense—prompting models to critically evaluate detection results—reduces attack success to 0-10%, suggesting models can resist F2A with appropriate scaffolding rather than architectural changes.",
    405   "red_flags": [
    406     {
    407       "flag": "No ablation study",
    408       "detail": "F2A has three components (string obfuscation, fake detection, sequential instructions) but no systematic testing of which are necessary. Cannot determine if attack works due to all three or just one component."
    409     },
    410     {
    411       "flag": "Single evaluator with potential bias",
    412       "detail": "GPT-4o used to judge whether outputs are 'dangerous' with no inter-rater agreement check, human verification, or alternative evaluation methods."
    413     },
    414     {
    415       "flag": "Limited sample size",
    416       "detail": "Only 10 attack prompts across 10 harm categories. No justification for why 10 is sufficient or whether coverage is representative of attack surface."
    417     },
    418     {
    419       "flag": "No statistical significance testing",
    420       "detail": "Results presented as raw counts (2/10, 5/10) with no confidence intervals, p-values, or hypothesis tests. Unclear if differences between models are statistically meaningful."
    421     },
    422     {
    423       "flag": "Model versions not pinned",
    424       "detail": "GPT-4o is continuously updated; DeepSeek-V2.5 may have been patched. No snapshot dates provided, limiting reproducibility."
    425     },
    426     {
    427       "flag": "No raw data or code release",
    428       "detail": "Cannot independently verify results. No attack code, model outputs, or evaluation harness provided."
    429     },
    430     {
    431       "flag": "Alternative mechanism not ruled out",
    432       "detail": "'Blind trust' is one interpretation of attack success, but semantic obfuscation (string concatenation working) or instruction-following confusion could be sufficient. Not distinguished."
    433     },
    434     {
    435       "flag": "Overgeneralized claims",
    436       "detail": "Title and abstract use broad language ('mainstream LLMs,' 'most LLM services') but evidence limited to 8 models tested."
    437     },
    438     {
    439       "flag": "Incomplete defense analysis",
    440       "detail": "Defense prompt reduces success but doesn't eliminate it (1/10 for GLM-4, DeepSeek). No exploration of why some attacks still succeed or what additional defenses are needed."
    441     },
    442     {
    443       "flag": "Inversion of vulnerability not explained",
    444       "detail": "Smaller models (7B, 8B) more vulnerable than larger ones, but paper does not explain why or test whether capability/alignment trade-offs are responsible."
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "Breaking Down the Defenses: A Comparative Survey of Attacks on Large Language Models",
    450       "relevance": "Comprehensive survey of LLM attack methods; F2A is a new indirect injection category."
    451     },
    452     {
    453       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    454       "relevance": "Foundational work on indirect prompt injection attacks; F2A extends this by targeting safety agents specifically."
    455     },
    456     {
    457       "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting",
    458       "relevance": "Defense mechanism against indirect injection; relevant to F2A mitigation strategies."
    459     },
    460     {
    461       "title": "ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors",
    462       "relevance": "Safety detection systems that F2A exploits; directly targeted by this attack."
    463     },
    464     {
    465       "title": "SafetyBench: Evaluating the Safety of Large Language Models",
    466       "relevance": "Benchmark for LLM safety; contextualizes F2A as vulnerability in evaluated safety mechanisms."
    467     },
    468     {
    469       "title": "Attack Prompt Generation for Red Teaming and Defending Large Language Models",
    470       "relevance": "Red teaming methodology for LLMs; F2A is a red team attack variant."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Reveals real vulnerability in deployed safety mechanisms that practitioners must address. However, defense is generic ('prompt to evaluate') rather than operationalized into system design."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "Attack vector is novel but builds on well-known instruction-following and prompt injection vulnerabilities. Claiming 'blind trust' is somewhat contrarian but not deeply surprising."
    481     },
    482     "fear_safety": {
    483       "score": 3,
    484       "justification": "Directly demonstrates that LLM safety infrastructure can be spoofed and bypassed. High relevance to AI safety concerns about adversarial robustness of alignment mechanisms."
    485     },
    486     "drama_conflict": {
    487       "score": 2,
    488       "justification": "Has conflict (hackers vs. defenders) and shows clear failures of widely-used systems. However, no real-world incidents documented, only controlled experiments."
    489     },
    490     "demo_ability": {
    491       "score": 3,
    492       "justification": "Highly reproducible with public model APIs. Users can test prompts from Appendix against ChatGPT/Claude/etc. immediately and see results."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "Single-author paper from Chinese Academy of Sciences. No famous authors or high-profile institution. Limited brand presence."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "24805792",
    503         "title": "Refinement Types: A Tutorial",
    504         "points": 3,
    505         "comments": 0,
    506         "url": "https://news.ycombinator.com/item?id=24805792",
    507         "created_at": "2020-10-16T22:55:03Z"
    508       },
    509       {
    510         "hn_id": "41913877",
    511         "title": "Bypassing the Popularity Bias: Repurposing Models for Long-Tail Recommendation",
    512         "points": 2,
    513         "comments": 0,
    514         "url": "https://news.ycombinator.com/item?id=41913877",
    515         "created_at": "2024-10-22T13:11:35Z"
    516       },
    517       {
    518         "hn_id": "24882449",
    519         "title": "The Nvidia PilotNet Experiments",
    520         "points": 2,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=24882449",
    523         "created_at": "2020-10-24T22:35:53Z"
    524       },
    525       {
    526         "hn_id": "42978639",
    527         "title": "DocVLM: Make Your VLM an Efficient Reader",
    528         "points": 2,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=42978639",
    531         "created_at": "2025-02-07T23:20:57Z"
    532       },
    533       {
    534         "hn_id": "42645393",
    535         "title": "Searching Latent Program Spaces",
    536         "points": 2,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=42645393",
    539         "created_at": "2025-01-09T13:46:21Z"
    540       },
    541       {
    542         "hn_id": "38004580",
    543         "title": "Gesture Recognition for FMCW Radar on the Edge",
    544         "points": 1,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=38004580",
    547         "created_at": "2023-10-24T19:59:44Z"
    548       },
    549       {
    550         "hn_id": "24800126",
    551         "title": "Refinement Types: A Tutorial",
    552         "points": 1,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=24800126",
    555         "created_at": "2020-10-16T12:29:06Z"
    556       }
    557     ],
    558     "top_points": 3,
    559     "total_points": 13,
    560     "total_comments": 0
    561   }
    562 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs