scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26646B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Very Long-Term Conversational Memory of LLM Agents",
      6     "authors": [
      7       "Adyasha Maharana",
      8       "Dong-Ho Lee",
      9       "Sergey Tulyakov",
     10       "Mohit Bansal",
     11       "Francesco Barbieri",
     12       "Yuwei Fang"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv",
     16     "arxiv_id": "2402.17753",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All major abstract claims—LOCOMO dataset statistics, LLM performance gaps vs humans, RAG/long-context improvements—are supported by Tables 2–4 and the documented pipeline.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims about RAG and observation-based retrieval are supported by ablations across retrieval units and top-k values in Tables 3 and 6; MiniGPT-5 ablations compare Base/+summary/+observation variants.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper makes broad claims about LLMs struggling with 'lengthy conversations' and 'temporal reasoning' without bounding generalizability beyond 50 synthetic English-only conversations from a single pipeline.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not consider alternative explanations for key findings—e.g., adversarial performance collapse in long-context models could reflect evaluation metric artifacts or context-position effects rather than hallucination per se.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Section 8 explicitly acknowledges that F1/ROUGE are imperfect proxies for memory capability due to LLM verbosity, distinguishing measured scores from claimed 'memory' capabilities.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 8 'Limitations' contains five distinct, substantive subsections covering data quality, multimodal coverage, language, closed-source API dependency, and evaluation metrics.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats are named: LLM-generated data may miss real-world nuance, images lack personal visual consistency, pipeline is English-only, GPT dependency limits reproducibility, and LLM verbosity confounds F1 evaluation.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly bounds scope to English, 50 conversations, specific evaluated models, and acknowledges LOCOMO 'may not fully reflect the nuances of real-world online conversations.'",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper text.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations (UNC Chapel Hill, USC, Snap Inc.) are clearly stated on the first page.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The project is hosted at snap-research.github.io suggesting Snap funding; however, the paper evaluates external LLMs (GPT, LLaMA, Mistral), not Snap's own products, so the funder is not a direct beneficiary of specific model results.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or declaration of financial interests is present.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "'Very long-term' is operationally defined (300 turns, 9K tokens, up to 35 sessions) and 'memory' is defined through three concrete evaluation tasks (QA, event summarization, dialogue generation).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper clearly states its three contributions: the LOCOMO dataset, the machine-human generation pipeline, and the multi-task evaluation benchmark.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 systematically situates LOCOMO against existing dialogue datasets in Table 1, discussing specific quantitative limitations of prior work (e.g., MSC's ~1K tokens over 4 sessions) and how this paper addresses them.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "Footnote 1 states code and data are 'to be available' at snap-research.github.io/locomo — a promise of future release, not a current one.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "LOCOMO is promised for future release under CC BY-NC 4.0; it was not available at time of submission.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions 'OpenAI API and Huggingface, as of January 2024' and an Nvidia A6000 server with FP32, but no requirements.txt, Dockerfile, or explicit package versions are provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are provided; Appendix C gives high-level experimental descriptions but lacks runnable workflow documentation.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "The paper explicitly states 'We report results from a single inference run for each model'; no confidence intervals or error bars appear anywhere.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are used despite multiple comparative claims across six models and three retrieval conditions.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Effect sizes are reported as absolute F1 differences and percentage improvements (22–66% for RAG; 56% gap vs human) with baseline values provided for context.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The choice of 50 conversations for LOCOMO is not justified through power analysis, coverage arguments, or comparison with what would be needed for reliable conclusions.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Single-run results are reported throughout; no variance, standard deviation, or run-to-run variability is reported for any experiment.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Multiple baseline models (Mistral-7B, LLaMA-70B, GPT-3.5, GPT-4) plus human performance are included across tasks.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "All evaluated models (GPT-4-turbo, GPT-3.5-turbo-16K, Mistral-7B, LLaMA-70B) were state-of-the-art as of January 2024.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "RAG ablations compare dialog/observation/summary retrieval units at multiple top-k values (Table 3); MiniGPT-5 ablations compare Base/+summary/+observation training variants (Table 6).",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "QA uses F1 and Recall@k; summarization uses ROUGE-1/2/L and FactScore precision/recall/F1; dialogue generation uses BLEU, ROUGE-L, BertScore, and MMRelevance.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Human performance is measured on the QA task (Table 2: 87.9 overall F1), providing an upper-bound benchmark across all five reasoning categories.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "LOCOMO serves as a held-out evaluation set; no evaluated model was trained on LOCOMO conversations (MiniGPT-5 trains on a separately generated set without human filtering).",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "QA results are broken down by five reasoning categories (single-hop, multi-hop, temporal, open-domain, adversarial) across all model conditions in Tables 2 and 3.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 6.2 identifies five error categories for event summarization with examples (Table 7); Section 6.1 analyzes adversarial question failure and hallucination patterns in long-context models.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "GPT-3.5-16K scores only 2.1% on adversarial questions vs 12.8% for the base model, and underperforms on event summarization despite larger context — both are highlighted and analyzed rather than downplayed.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Models are referenced by the general names 'gpt-3.5-turbo' and 'gpt-4-turbo', which change over time, with links to general documentation pages rather than specific versioned snapshots.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix A includes actual prompts for persona generation, event graph generation, session summarization, observation extraction, and image sharing/reaction behaviors (Figures 5–10).",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Temperature=0 and top_p=1 are reported for evaluation; MiniGPT-5 is trained for 10 epochs (~30 hours on A6000) using original codebase defaults for remaining hyperparameters.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The reflect-and-respond memory architecture (short-term session summary, long-term observation database, retrieval during generation) is described in detail in Section 3.3 and Appendix A.2.1.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Image-to-caption conversion using BLIP-2 for QA/summarization tasks, F1 answer normalization, and RAG retrieval procedures are documented in Sections 4–5 and Appendix C.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw LOCOMO data is promised for future release but not available at time of paper submission.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "The full collection pipeline is documented with quantitative annotation statistics (15% of turns edited, 19% images removed/substituted) and specific annotator task descriptions in Section 3.4.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "Annotators are described only as 'in-house annotators'; demographics and recruitment methods are withheld due to 'confidential nature of such information' (Appendix B.3).",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The complete pipeline from persona selection through event graph generation, dialogue synthesis, human annotation, and benchmark construction is documented with prompts and worked examples.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Training data cutoffs for GPT-3.5-turbo, GPT-4-turbo, LLaMA-70B, and Mistral-7B are not stated anywhere in the paper.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper does not discuss whether MSC source personas (which seed LOCOMO) or LLM-generated conversation patterns could overlap with model training distributions.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No analysis of whether the MSC personas or evaluation QA question patterns may have been encountered during pretraining of evaluated models.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No formal human subjects study; human participation is limited to in-house annotation and performance benchmarking on the created dataset.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "Not applicable; in-house annotators are employees, not external research participants requiring IRB review.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "Not applicable; this is not a formal human subjects study, and annotator demographics are withheld as confidential per Appendix B.3.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "Not applicable; no formal external participant recruitment.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "Not applicable; no human subjects experiment with randomization.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "Not applicable; annotation tasks do not require blinding.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "Not applicable; no longitudinal human participants study.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "The paper acknowledges using 'strongest commercial LLMs available through a paid API' but reports no actual API costs, token counts, or per-query pricing.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Only MiniGPT-5 training time (30 hours on a single A6000) is stated; no compute budget for the main API-based evaluation experiments is reported.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Long-context LLMs and RAG improve QA memory performance by 22–66% over base models",
    376       "evidence": "Table 2: GPT-3.5-16K reaches 37.8% overall F1 vs 22.4% for GPT-3.5-4K; Table 3: RAG with observations achieves 41.4% overall F1",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "All evaluated models significantly lag behind human performance (56% gap on QA overall F1)",
    381       "evidence": "Table 2 shows human performance at 87.9 vs best model GPT-3.5-16K at 37.8, a 50-point gap",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Long-context LLMs show dramatically degraded performance on adversarial questions vs base models",
    386       "evidence": "Table 2: GPT-3.5-16K scores 2.1% on adversarial at 16K context vs 12.8% for GPT-3.5-4K base",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "RAG with observation-based retrieval outperforms session-summary retrieval on temporal reasoning",
    391       "evidence": "Table 3: Observations achieve 41.9% F1 on temporal at top-5 vs summaries at 31.0% for top-5; however both are well below human performance of 92.6%",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Long-context models underperform base models on event summarization despite larger context windows",
    396       "evidence": "Table 4: GPT-3.5-16K achieves FactScore F1 of 39.9 vs GPT-3.5-4K base at 45.9",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "LOCOMO conversations are substantially longer (9x tokens, 4x sessions) than prior state-of-the-art long-term dialogue datasets",
    401       "evidence": "Table 1: LOCOMO averages 9,209 tokens and 19.3 sessions vs MSC's 1,226 tokens and 4 sessions",
    402       "supported": "strong"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval"
    407   ],
    408   "key_findings": "LOCOMO is the first benchmark for very long-term dialogues (~300 turns, 9K tokens, up to 35 sessions), substantially exceeding prior datasets. Current LLMs achieve only 22–38% F1 on QA vs 87.9% human performance, with temporal reasoning the hardest category (73% below human). Counterintuitively, long-context LLMs degrade severely on adversarial questions (2.1% vs 12.8% baseline) and underperform base models on event summarization, suggesting extended context may amplify hallucination rather than reduce it. RAG with speaker observations offers the best accuracy-comprehension tradeoff.",
    409   "red_flags": [
    410     {
    411       "flag": "Single-run evaluation, no variance",
    412       "detail": "All results are from a single inference run with no error bars, CIs, or repeated trials, making effect sizes unreliable for the comparative claims made."
    413     },
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "No significance tests are used despite multiple comparative claims across models and retrieval conditions in a 50-conversation corpus."
    417     },
    418     {
    419       "flag": "Code and data not released at submission",
    420       "detail": "Both code and LOCOMO data are promised for future release; reproduction is impossible at time of publication."
    421     },
    422     {
    423       "flag": "GPT model versions unspecified",
    424       "detail": "GPT-3.5-turbo and GPT-4-turbo are referenced by marketing names linked to general documentation pages, not specific versioned snapshots."
    425     },
    426     {
    427       "flag": "Human baseline may be inflated by familiarity",
    428       "detail": "Human QA performance (87.9%) is measured using the same in-house annotators who created and verified the dataset, potentially inflating the upper bound."
    429     },
    430     {
    431       "flag": "Contamination not addressed",
    432       "detail": "No analysis of whether MSC source personas or LLM-generated content could overlap with training data of evaluated models; training cutoffs are unstated."
    433     },
    434     {
    435       "flag": "Small corpus (n=50 conversations)",
    436       "detail": "All benchmark evaluations are grounded in only 50 synthetic conversations; this limits statistical confidence in findings and subgroup analyses."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Beyond Goldfish Memory: Long-Term Open-Domain Conversation",
    442       "relevance": "Primary predecessor dataset (MSC) that LOCOMO extends; also provides source personas for pipeline initialization"
    443     },
    444     {
    445       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    446       "relevance": "Memory architecture (reflect-and-respond with observations) directly adapted for LOCOMO's dialogue generation pipeline"
    447     },
    448     {
    449       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    450       "relevance": "Cited to explain long-context model failures on adversarial questions and event summarization in LOCOMO"
    451     },
    452     {
    453       "title": "FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation",
    454       "relevance": "Key evaluation metric adopted for the event summarization task to measure factual precision and recall"
    455     },
    456     {
    457       "title": "Conversation Chronicles: Towards Diverse Temporal and Relational Dynamics in Multi-Session Conversations",
    458       "relevance": "Prior multi-session dialogue dataset directly compared to LOCOMO in Table 1; prior art for temporal dialogue"
    459     },
    460     {
    461       "title": "How to Train Your DRAGON: Diverse Augmentation Towards Generalizable Dense Retrieval",
    462       "relevance": "Retrieval model used in all RAG experiments"
    463     },
    464     {
    465       "title": "BooookScore: A Systematic Exploration of Book-Length Summarization in the Era of LLMs",
    466       "relevance": "Cited for incremental summarization approach applied in event summarization experiments"
    467     },
    468     {
    469       "title": "MiniGPT-5: Interleaved Vision-and-Language Generation via Generative Vokens",
    470       "relevance": "Base model for all multimodal dialogue generation experiments and ablations"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "LOCOMO provides a concrete benchmark for developers building long-term conversational agents, directly measuring memory failure modes that affect deployed chatbots."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "Long-context LLMs performing worse than constrained-context base models on adversarial questions and event summarization directly contradicts the intuition that more context always helps."
    481     },
    482     "fear_safety": {
    483       "score": 1,
    484       "justification": "Broader Impacts section raises parasocial relationship risks from realistic long-term agents and misinformation risks from multimodal generation, but these are brief rather than central findings."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "No major controversy or conflict angle; standard benchmark paper despite the counterintuitive long-context degradation finding."
    489     },
    490     "demo_ability": {
    491       "score": 1,
    492       "justification": "Code and data are promised but not yet released; practitioners cannot currently reproduce or try the benchmark."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "Snap Inc. is a recognizable company but not a primary AI research lab; UNC and USC are respected but not top-tier AI institutions by brand alone."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "39568622",
    503         "title": "ArtPrompt: ASCII Art-Based Jailbreak Attacks Against Aligned LLMs",
    504         "points": 145,
    505         "comments": 55,
    506         "url": "https://news.ycombinator.com/item?id=39568622",
    507         "created_at": "2024-03-02T00:30:06Z"
    508       },
    509       {
    510         "hn_id": "39465357",
    511         "title": "LongRoPE: Extending LLM Context Window Beyond 2M Tokens",
    512         "points": 142,
    513         "comments": 46,
    514         "url": "https://news.ycombinator.com/item?id=39465357",
    515         "created_at": "2024-02-22T10:44:35Z"
    516       },
    517       {
    518         "hn_id": "39811319",
    519         "title": "Rose: Efficient and Extensible Autodiff on the Web",
    520         "points": 3,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=39811319",
    523         "created_at": "2024-03-24T23:03:03Z"
    524       },
    525       {
    526         "hn_id": "39462835",
    527         "title": "Microsoft's LongRoPE: Extending LLM Context Window Beyond 2M Tokens",
    528         "points": 3,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=39462835",
    531         "created_at": "2024-02-22T03:24:29Z"
    532       },
    533       {
    534         "hn_id": "47203853",
    535         "title": "Show HN: Engram – Memory for AI coding agents (2.5K installs, 80% on LOCOMO)",
    536         "points": 1,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=47203853",
    539         "created_at": "2026-03-01T04:55:07Z"
    540       },
    541       {
    542         "hn_id": "45007581",
    543         "title": "Evaluating Long-Term Conversational Memory of LLM Agents",
    544         "points": 1,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=45007581",
    547         "created_at": "2025-08-24T20:42:16Z"
    548       }
    549     ],
    550     "top_points": 145,
    551     "total_points": 295,
    552     "total_comments": 101
    553   }
    554 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs