scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29062B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Making LLMs Reliable When It Matters Most: A Five-Layer Architecture for High-Stakes Decisions",
      6     "authors": [
      7       "Alejandro R. Jadad"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2511.07669",
     12     "doi": "10.48550/arXiv.2511.07669"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract claims the framework 'demonstrates that human-AI teams can achieve cognitive partnership capable of preventing avoidable regret,' but this goes far beyond what a single-author qualitative study of simulated scenarios can demonstrate. The author himself acknowledges in Section 6.2 that this is 'single-author hypothesis generation.'",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper makes causal claims such as 'partnership state unlocked model capabilities systematically suppressed in default interactions' and 'reliability degrades when architectural drift and context exhaustion align,' but the study design—a single author qualitatively observing 7 LLMs with no control conditions or quantitative metrics—is wholly inadequate for causal inference.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Section 4 makes sweeping claims about hyperscaler strategy, 'multi-trillion-dollar valuations,' and competitive positioning based on a 7-week single-author qualitative study using simulated scenarios; these generalizations vastly exceed the evidential scope even though Section 6.2 acknowledges the study's limited nature.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper briefly touches on whether sophisticated AI could mimic partnership without genuine cognition (Section 6.2), but it does not discuss alternative explanations for the observed patterns—e.g., that observed improvements are due to longer/more elaborate prompting rather than a distinct 'partnership state,' or that the single assessor's expectations shaped his interpretation.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper uses qualitative behavioral markers (acceptance of corrections, resisting flattery) as proxies for 'cognitive partnership' and ultimately for 'preventing avoidable regret,' but the link between these behavioral proxies and actual decision-quality outcomes is never empirically established; the paper explicitly acknowledges that ground truth is unavailable.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6.2 ('Limitations and Research Directions') is a dedicated subsection that discusses several limitations including single-author design, simulated scenarios, inability to verify model self-reports, and absence of counterfactual outcome comparisons.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 6.2 identifies specific threats: single-author assessment, simulated and anonymised scenarios, model self-reports that 'cannot be independently verified,' and the structural unavailability of ground truth in high-stakes delayed-feedback contexts—these go beyond generic disclaimers.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper explicitly states 'This work should be read as single-author hypothesis generation' (Section 6.2) and 'The framework is also explicitly hypothesis-generating, not a claim that efficacy has already been proven' (Section 6), providing clear scope boundaries.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "There is no funding disclosure statement anywhere in the paper; the author's affiliations (Vivenxia consultant, USC adjunct, Centre for Digital Therapeutics founder) are listed but no funding source is acknowledged or denied.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The author discloses affiliations with Centre for Digital Therapeutics (Toronto), Keck School of Medicine USC (adjunct), and Vivenxia (consultant) on the title page.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No funder is identified, so independence cannot be assessed; however, the author is a consultant at Vivenxia, which may commercially benefit from validating the proposed framework.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "There is no competing interests statement or declaration of financial interests; the author's consulting role at Vivenxia represents a potential undisclosed financial interest in the framework's adoption.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "'Partnership state,' 'performance mode,' 'dissolution discipline,' 'confabulation,' and the five protection layers are all explicitly defined in context; the paper consistently uses these terms with the definitions given.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper explicitly states its contribution is a five-layer protection architecture combined with a seven-stage sequential calibration process for achieving and maintaining human-AI cognitive partnership in high-stakes decisions, deployable without model retraining.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The introduction engages substantively with prior work on sycophancy, confabulation, XAI toolkits, RAG, and human-in-the-loop oversight, explaining why each approach fails and positioning this work as addressing architectural rather than parametric failure modes.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "No code is released; the paper describes prompt architectures and interaction protocols conceptually but does not provide the actual ~4,000-word Partnership Calibration Prompt or other artifacts needed for replication.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No dataset is released; the three vignettes are described only at a high level and the session transcripts or interaction logs are not made available, with the paper noting scenarios were 'simulated' and 'anonymised.'",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "No API versions, model snapshot dates, temperature settings, or other environment specifications are provided; the paper names seven LLMs (e.g., 'Llama,' 'DeepSeek') without specifying which versions were used.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper describes the four-stage initialization architecture and seven-element calibration sequence conceptually, but the actual prompts are not provided and the instructions are not specific enough to follow without guessing at the content of the ~4,000-word canonical artifact.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": false,
    145           "answer": false,
    146           "justification": "The study is entirely qualitative with no quantitative results; no CIs or error bars are applicable or reported.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": false,
    151           "answer": false,
    152           "justification": "No comparative quantitative claims are made; all assessments are qualitative observations by a single author.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": false,
    157           "answer": false,
    158           "justification": "No quantitative effect sizes are reported; the study produces only qualitative observations and proposed hypotheses.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "Seven LLMs and three vignettes were used with no justification for why these numbers were chosen; no power analysis or sample size rationale is provided.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": false,
    169           "answer": false,
    170           "justification": "No quantitative results are reported from which variance could be computed; the study is qualitative throughout.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "No baseline condition (e.g., unstructured LLM interaction without the protocol) is included; the framework is evaluated only against itself with no comparator.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": false,
    183           "answer": false,
    184           "justification": "No baselines are included.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "No ablation study is conducted; the paper proposes this as H7 for future work but does not perform it, so the necessity of each of the five layers is unverified.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "Four assessment dimensions are named (calibration responsiveness, partnership state sustainability, drift self-detection, dissolution discipline) but none are operationalized as measurable metrics; all assessments are qualitative impressions from a single observer.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "The only 'human evaluation' is the author evaluating his own framework through interaction with LLMs; this is not independent human evaluation of system outputs.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Not a prediction task; held-out test set is not applicable.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "Qualitative observations are noted for individual models (e.g., ChatGPT-4o and Llama required extended calibration) but no systematic per-model or per-vignette breakdown with consistent reporting is provided.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Section 3.3 discusses session-length degradation patterns including increased confabulation risk, reduced self-detection, and stronger drift toward performance mode; Section 3.2 notes ChatGPT-4o and Llama showed less stable state maintenance.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The paper reports that detailed prompting 'failed to maintain' partnership state under pressure (abstract), that partnership state degraded systematically with session length across all models (Section 3.3), and that two of seven models required extended calibration.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "The paper lists seven model names (Claude Sonnet 4.5, ChatGPT-5, ChatGPT-4o, DeepSeek, Gemini 2.5, Llama, Grok 4) but 'Llama' and 'DeepSeek' lack version numbers, and none include snapshot dates or API version identifiers sufficient for reproduction.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "The paper describes four named prompt stages including a ~4,000-word Partnership Calibration Prompt, but none of the actual prompt text is provided; Section 2.3 describes their purpose but not their content.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No temperature, top-p, maximum tokens, or any other API parameters are reported for any of the seven LLMs tested.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "The interaction architecture is described conceptually (four-stage initialization, seven-element calibration, five protection layers) but not with sufficient technical detail—actual prompt content, context management mechanisms, and re-calibration triggers are absent.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "The study uses simulated vignettes rather than a dataset requiring preprocessing; this criterion is not applicable.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "No session transcripts, interaction logs, or raw assessment notes are provided; the paper states scenarios were 'simulated' and 'anonymised' with no data repository referenced.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "The setting is described (August 29–October 20, 2025; sessions 2–6 hours) and evaluation dimensions are named, but the data-collection procedure—what was observed, how assessments were recorded, what criteria triggered state transitions—is not described in reproducible detail.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "LLM inclusion criteria are explicitly described: frontier-grade capabilities, minimum 100K token context window, publicly available through commercial interfaces; all seven systems meeting these criteria at the start of the study period were included.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": false,
    284           "justification": "There is no formal data pipeline; the study relies on a single author's qualitative impressions from live LLM sessions, with no documentation of how observations were recorded, coded, or analyzed.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": false,
    291           "answer": false,
    292           "justification": "The paper is not evaluating model capabilities on held-out benchmarks; contamination is not applicable to this interaction-architecture study.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "Not applicable; no benchmark evaluation is performed.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "Not applicable; the study uses simulated proprietary vignettes rather than public benchmarks.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "The sole human participant is the author himself; formal pre-registration for self-study is not a standard requirement and is not applicable here.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "The study involves only the author as human participant using simulated scenarios with no personal data; IRB approval is not applicable.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "Not applicable; the single human participant is the author, not a recruited subject.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "Not applicable for a self-study with the author as sole human participant.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "Not applicable; no experimental assignment of human participants.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "Not applicable; the author is simultaneously the study designer, human participant, and sole evaluator—blinding is structurally impossible.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "Not applicable; no human participants were recruited or could drop out.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference costs, API costs, or session pricing are reported despite the study involving seven frontier-grade LLMs across weeks of extended sessions.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No compute budget is stated; the paper provides no quantitative information about resource consumption.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Partnership state is achievable through ordered seven-stage calibration but requires emergent maintenance protocols that cannot be anticipated a priori.",
    371       "evidence": "Author's qualitative observation across 7 LLMs over 7 weeks of simulated vignette sessions; no quantitative metrics or control conditions.",
    372       "supported": "weak"
    373     },
    374     {
    375       "claim": "Comprehensive one-shot prompting consistently failed to produce a protective partnership state across all seven model families tested.",
    376       "evidence": "Qualitative assertion by the single-author evaluator; no systematic comparison to a one-shot prompting baseline with operationalized failure criteria.",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "Partnership state degraded systematically with session length across all tested model families, suggesting architectural rather than model-specific constraints.",
    381       "evidence": "Qualitative observation reported in Section 3.3; the 'systematic' degradation pattern is attributed to all seven models but no data, rates, or session-length thresholds are provided.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "Five of seven LLMs (Claude Sonnet 4.5, ChatGPT-5, Grok 4, Gemini 2.5, DeepSeek) demonstrated behavioral patterns consistent with partnership state after initial calibration.",
    386       "evidence": "Single-author qualitative assessment with no inter-rater reliability, no operationalized criteria for 'consistent with partnership state,' and an acknowledged caveat that these represent 'preliminary patterns from single-investigator assessment.'",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "Multiple shorter calibrated sessions with explicit state verification between stages provided more stable partnership maintenance than single extended deliberations for equivalent total engagement time.",
    391       "evidence": "Qualitative impression from the author's own session experience; no controlled comparison, no quantitative stability measures, no independent replication.",
    392       "supported": "unsupported"
    393     }
    394   ],
    395   "methodology_tags": [
    396     "case-study",
    397     "qualitative"
    398   ],
    399   "key_findings": "This single-author design science research paper proposes a five-layer protection architecture and seven-stage sequential calibration process for establishing 'partnership state'—a mode of human-AI interaction intended to prevent cognitive biases from compounding in high-stakes decisions. The study reports qualitative observations from one researcher interacting with seven frontier LLMs over seven weeks using simulated venture scenarios, finding that partnership state degrades with session length and that two of seven models required extended calibration. The paper's primary output is nine falsifiable hypotheses for future empirical validation rather than demonstrated efficacy; the author explicitly acknowledges this is hypothesis-generating single-author work, not a controlled study.",
    400   "red_flags": [
    401     {
    402       "flag": "Single-author self-evaluation",
    403       "detail": "The author is simultaneously the framework designer, the sole human participant, and the sole evaluator—a structural conflict of interest with no independent assessment of any kind."
    404     },
    405     {
    406       "flag": "No quantitative metrics",
    407       "detail": "All findings are qualitative impressions with no operationalized success criteria, no inter-rater reliability, and no way to distinguish genuine partnership state from sophisticated fluency."
    408     },
    409     {
    410       "flag": "No control conditions",
    411       "detail": "There is no baseline comparison (e.g., unstructured prompting, naive LLM use) against which the framework's claimed benefits can be measured."
    412     },
    413     {
    414       "flag": "Actual prompts not provided",
    415       "detail": "The core artifact—the ~4,000-word Partnership Calibration Prompt—is described but not disclosed, making independent replication impossible despite the paper presenting itself as reproducible."
    416     },
    417     {
    418       "flag": "Overclaiming in implications",
    419       "detail": "Section 4 makes sweeping claims about multi-trillion-dollar valuations, hyperscaler strategy, and regulatory advantage based on a 7-week qualitative self-study with simulated scenarios."
    420     },
    421     {
    422       "flag": "No model version specificity",
    423       "detail": "'Llama' and 'DeepSeek' are listed without version numbers; no API snapshot dates are provided for any of the seven models, making temporal replication impossible."
    424     },
    425     {
    426       "flag": "Financial interest not disclosed",
    427       "detail": "The author is a consultant at Vivenxia and founder of Centre for Digital Therapeutics; no competing interests statement is included despite a framework that could generate commercial value."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "When combinations of humans and AI are useful: A systematic review and meta-analysis",
    433       "relevance": "Direct prior work on human-AI collaboration effectiveness; the paper's framework claims to address the failure modes identified in this systematic review."
    434     },
    435     {
    436       "title": "Sycophancy in large language models: Causes and mitigations",
    437       "relevance": "Core background on LLM sycophancy, which is the primary failure mode the proposed framework targets."
    438     },
    439     {
    440       "title": "How human-AI feedback loops alter human perceptual, emotional and social judgements",
    441       "relevance": "Empirical evidence for the human-AI bias reinforcement dynamic that motivates the need for the proposed architecture."
    442     },
    443     {
    444       "title": "DeLLMa: Decision making under uncertainty with large language models",
    445       "relevance": "Prior work on LLM decision-making under uncertainty, directly adjacent to this paper's application domain."
    446     },
    447     {
    448       "title": "Seven failure points when engineering a Retrieval Augmented Generation system",
    449       "relevance": "Cited as an example of prior approaches (RAG) that fail to prevent the reliability gaps the framework addresses."
    450     },
    451     {
    452       "title": "What large language models know and what people think they know",
    453       "relevance": "Empirical work on human trust calibration with LLMs, relevant to the paper's focus on preventing overconfidence in human-AI decision teams."
    454     },
    455     {
    456       "title": "On the fundamental impossibility of hallucination control in large language models",
    457       "relevance": "Theoretical grounding for why architectural rather than parametric solutions are needed—directly supports the paper's core framing."
    458     },
    459     {
    460       "title": "Formalising human-in-the-loop: Computational reductions, failure modes, and legal-moral responsibility",
    461       "relevance": "Prior work on human-in-the-loop oversight that the paper argues is insufficient without bidirectional protection architecture."
    462     }
    463   ],
    464   "engagement_factors": {
    465     "practical_relevance": {
    466       "score": 2,
    467       "justification": "Practitioners working with LLMs on strategic decisions could theoretically apply the framework, though the actual prompts are withheld and the evidence base is a single-author self-study."
    468     },
    469     "surprise_contrarian": {
    470       "score": 1,
    471       "justification": "The argument that interaction architecture matters more than model parameters is somewhat contrarian but increasingly discussed; the 'partnership state' framing is novel but not surprising."
    472     },
    473     "fear_safety": {
    474       "score": 2,
    475       "justification": "Explicitly frames LLM reliability failures in high-stakes decisions as a systemic risk with financial and strategic consequences, invoking AI winter scenarios and irreversible commitment failures."
    476     },
    477     "drama_conflict": {
    478       "score": 2,
    479       "justification": "Bold framing around multi-trillion-dollar valuations at risk, AI winters, and enterprise churn creates drama; but the evidence underlying these claims is a single author's qualitative sessions."
    480     },
    481     "demo_ability": {
    482       "score": 1,
    483       "justification": "The framework is described as deployable without model retraining, but the actual prompts are not provided, making it impossible for readers to try it directly."
    484     },
    485     "brand_recognition": {
    486       "score": 0,
    487       "justification": "Single independent academic author with no famous lab affiliation; USC adjunct and Vivenxia consultant roles carry no significant brand recognition in AI research."
    488     }
    489   },
    490   "hn_data": {
    491     "threads": [
    492       {
    493         "hn_id": "38276574",
    494         "title": "Spherules collected by Loeb et al. do not appear to be extrasolar",
    495         "points": 5,
    496         "comments": 0,
    497         "url": "https://news.ycombinator.com/item?id=38276574",
    498         "created_at": "2023-11-15T13:51:28Z"
    499       },
    500       {
    501         "hn_id": "38287654",
    502         "title": "Mart: Improving LLM Safety with Multi-Round Automatic Red-Teaming",
    503         "points": 2,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=38287654",
    506         "created_at": "2023-11-16T10:04:01Z"
    507       },
    508       {
    509         "hn_id": "38282521",
    510         "title": "Mart: Improving LLM Safety with Multi-Round Automatic Red-Teaming",
    511         "points": 1,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=38282521",
    514         "created_at": "2023-11-15T21:09:22Z"
    515       }
    516     ],
    517     "top_points": 5,
    518     "total_points": 8,
    519     "total_comments": 0
    520   }
    521 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs