calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (21196B)
      1 {
      2   "calibration_metadata": {
      3     "paper_slug": "agentic-memory-learning-2026",
      4     "calibration_model": "opus",
      5     "scan_model": "sonnet",
      6     "calibration_date": "2026-02-28",
      7     "schema_version": "scan.schema.json"
      8   },
      9   "opus_checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No GitHub link or code archive is provided for AgeMem itself. The paper references third-party frameworks (AgentScope, Trinity-RFT) but the AgeMem code is not released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All five benchmarks (ALFWorld, SciWorld, PDDL, BabyAI, HotpotQA) are publicly available datasets. The paper does not collect new proprietary data."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions '8 NVIDIA RTX 4090 GPUs with 48GB memory each' (Appendix C.4) and names the frameworks (AgentScope, Trinity-RFT), but no requirements.txt, Dockerfile, or detailed dependency/version listing is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The appendix gives algorithmic pseudocode and hyperparameters, but there is no README or script-level guidance to reproduce experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table 2 and all other result tables report only point estimates (e.g., 41.96% average). No confidence intervals, error bars, or standard deviations are reported."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Comparative claims such as 'AgeMem improves by 4.82 and 8.57 percentage points on average' are made without any statistical significance tests (no p-values, t-tests, etc.)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage point improvements with baseline context throughout (e.g., 'relative gains of 49.59% and 23.52% over no-memory' in Section 4.2, absolute differences like '+13.9%' in ablations with both baseline and new values). Per the schema, '12% improvement over baseline (from 45% to 57%)' provides enough context for YES — the paper does this consistently."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for evaluation set sizes. HotpotQA has ~90k training questions but the exact test split size used is not stated, and no power analysis is provided."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "All results are single-point estimates. Despite using K=8 rollouts per task and stochastic RL training, no standard deviations, variance across seeds, or spread measures are reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against four baselines (LangMem, A-Mem, Mem0, Mem0g) plus a no-memory baseline and an AgeMem-noRL ablation. Links to official implementations are in Appendix C.3."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "All baselines are from 2025: LangMem, A-Mem (arXiv:2502.12110), Mem0 (arXiv:2504.19413). These represent current state-of-the-art memory management for LLM agents."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Figures 4 and 9 present systematic ablations isolating LTM tools, RL training, and STM tools (+LT, +LT/RL, +LT/ST/RL). Tables 4 and 5 provide reward function ablations (All-Returns vs Answer-Only)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: Success Rate (SR) for ALFWorld/SciWorld/BabyAI, Progress Rate (PR) for PDDL, LLM-as-a-Judge (J) for HotpotQA, Memory Quality (MQ), and token count for context efficiency."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a benchmark evaluation paper using automated task completion metrics and LLM-as-a-Judge. Human evaluation of system outputs is not clearly relevant to the claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.1 states the model is 'fine-tuned with RL only on the HotpotQA training set and then evaluated directly on all datasets.' This establishes separation between training and evaluation data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-dataset breakdowns across all five benchmarks for both model backbones. Ablation figures also show per-dataset results."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The case studies in Appendix B compare trained vs. untrained agents in curated scenarios, but no systematic error analysis of where AgeMem fails on the benchmarks is provided. No failure mode analysis appears."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The ablation in Figure 9b shows that adding LTM alone to Qwen3-4B on SciWorld hurts performance (-4.4%). This is reported transparently."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims AgeMem 'consistently outperforms strong memory-augmented baselines across multiple LLM backbones, achieving improved task performance, higher-quality long-term memory, and more efficient context usage.' Table 2, Figure 2, and Figure 3 support these claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Ablation studies isolate individual components (LTM, STM, RL) through controlled single-variable manipulation, supporting causal claims like 'RL training contributes 8.53 percentage points improvement.' These are standard ablation-based causal inferences."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests on two specific Qwen model backbones (7B and 4B) on five benchmarks, but the conclusion claims results 'highlight the importance of unified, agent-centric memory policies' and 'suggest a promising direction for building scalable and adaptive LLM agents.' The title 'Agentic Memory' and broad framing do not bound generalization to the tested models and tasks."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The limitations section mentions fixed tool set and limited benchmark coverage but does not discuss alternative explanations for observed improvements (e.g., whether gains come from additional RL training compute rather than the architecture, or whether benchmark selection favors the approach)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper specifies 'Qwen2.5-7B-Instruct' and 'Qwen3-4B-Instruct' as base models, and 'Qwen-Max' as the LLM evaluator. However, per the schema, marketing names without a snapshot date or API version do NOT count. No specific model checkpoint dates, snapshot identifiers, or API versions are provided for any of these models. 'Qwen-Max' is especially vague — it is a marketing name with no version identifier."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The full system prompt for AgeMem is provided verbatim in Appendix A.1, including the tool-calling format and response structure. The summarization system prompt is also fully provided. LLM-as-a-Judge and Memory Quality evaluation prompts are given in Appendix C.2."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Key hyperparameters are reported in Appendix C.4: K=8 rollouts, KL coefficient β=0.1, uniform reward weights (1/3 each), FILTER threshold θ=0.6, max context length 8,192 tokens, max response length 2,048 tokens, penalty coefficients (P_rounds=-1, P_overflow=-0.5)."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding is described in detail: three-stage trajectory structure, tool interface (6 tools with exact signatures in Figures 6-7), step-wise GRPO mechanism, and complete rollout algorithms (Algorithms 1-5)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper describes how HotpotQA supporting facts are used as Stage 1 contextual information and mentions a DISTRACTORGEN function for generating distractors, but does not detail the distractor generation process. The exact evaluation subsets (sizes, selection criteria) for each benchmark are not documented. The data pipeline from raw benchmarks to actual training/evaluation instances is underspecified."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section follows the conclusion, discussing the fixed tool set and limited benchmark coverage."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations section is generic: it notes 'broader coverage of tasks and environments may further strengthen the empirical understanding' and the tool set 'could be extended.' No specific threats to validity are discussed (e.g., LLM judge reliability, reward hacking, same-family model bias, contamination risk)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not note that results are limited to Qwen model families, that RL training was only on HotpotQA, or that the token reduction (3-5%) may not be practically significant."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Evaluation logs, model outputs, and raw trajectory data are not released. Only aggregated performance numbers in tables and figures are available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "All five datasets are established public benchmarks with documented collection procedures from their original papers. The paper describes their structure in Appendix C.1."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The paper evaluates on standard benchmarks, so recruitment methods are not applicable."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "While the three-stage trajectory structure is described algorithmically, the DISTRACTORGEN function for generating distractors is mentioned but not fully specified. The exact evaluation subsets and how benchmark data is converted into the three-stage format are not fully documented. The data pipeline has unexplained steps."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "There is no acknowledgments section or funding disclosure in the paper. Authors include Alibaba Group employees but no funding sources are mentioned."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Yi Yu, Liuyi Yao, Yuexiang Xie, Yaliang Li from Alibaba Group; others from Wuhan University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Multiple authors are from Alibaba Group. The system uses Alibaba's AgentScope framework, is evaluated with Qwen models (Alibaba), and uses Qwen-Max as the LLM judge. The funder/employer has a direct interest in the outcome."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement appears anywhere in the paper. The Alibaba affiliation is listed but no explicit declaration of financial interests or conflicts is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses Qwen2.5-7B-Instruct and Qwen3-4B-Instruct but does not state their training data cutoff dates. HotpotQA (2018) and other benchmarks predate these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether benchmark test examples appeared in Qwen model training data. Given the age of these benchmarks, contamination risk exists but is not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HotpotQA (2018), ALFWorld (2020), SciWorld (2022), BabyAI (2018) all predate the Qwen models. Contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants; this is a benchmark evaluation paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Token counts per episode are reported in Figure 3 (~2,117-2,310 tokens), but no API costs, wall-clock inference time per example, or cost per episode are provided."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper states '8 NVIDIA RTX 4090 GPUs with 48GB memory each' (Appendix C.4), specifying hardware. However, no training wall-clock time, total GPU-hours, or total compute budget is stated. Knowing the GPU model without knowing how long training took does not constitute a stated compute budget."
    280       }
    281     }
    282   },
    283   "comparison": {
    284     "total_questions": 50,
    285     "agreements": 45,
    286     "disagreements": 5,
    287     "agreement_rate": 0.90,
    288     "disagreements_detail": [
    289       {
    290         "category": "statistical_methodology",
    291         "question": "effect_sizes_reported",
    292         "sonnet_applies": true,
    293         "sonnet_answer": false,
    294         "opus_applies": true,
    295         "opus_answer": true,
    296         "direction": "opus_generous",
    297         "explanation": "Sonnet answered false, noting raw percentage differences but no standardized effect sizes. Opus answered true because the schema says 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper consistently reports improvements with baseline context (e.g., 'relative gains of 49.59%', '+13.9%' over clearly stated baselines). This is an interpretive boundary — Sonnet was stricter by requiring formal effect size measures like Cohen's d, while Opus followed the schema's example more literally."
    298       },
    299       {
    300         "category": "setup_transparency",
    301         "question": "model_versions_specified",
    302         "sonnet_applies": true,
    303         "sonnet_answer": true,
    304         "opus_applies": true,
    305         "opus_answer": false,
    306         "direction": "sonnet_generous",
    307         "explanation": "Sonnet accepted 'Qwen2.5-7B-Instruct' and 'Qwen3-4B-Instruct' as sufficiently specified versions. However, the schema explicitly states 'Marketing names like Gemini-2.5 or GPT-4o without a snapshot date or API version do NOT count as specified versions.' The Qwen model names are marketing names — there is no snapshot date, checkpoint identifier, or API version. 'Qwen-Max' (the evaluator model) is even more vague. Opus applied the schema strictly and answered false."
    308       },
    309       {
    310         "category": "setup_transparency",
    311         "question": "data_preprocessing_documented",
    312         "sonnet_applies": true,
    313         "sonnet_answer": true,
    314         "opus_applies": true,
    315         "opus_answer": false,
    316         "direction": "sonnet_generous",
    317         "explanation": "Sonnet credited the paper for describing how HotpotQA facts are used and how DISTRACTORGEN works 'conceptually.' Opus noted that DISTRACTORGEN is mentioned but not specified, the exact evaluation subsets (sizes, selection criteria) for each benchmark are not documented, and the pipeline from raw benchmarks to training/evaluation instances has unexplained steps. The schema requires that if the paper goes from 'we collected data' to 'here are the results' without describing intermediate processing, the answer is NO."
    318       },
    319       {
    320         "category": "data_integrity",
    321         "question": "data_pipeline_documented",
    322         "sonnet_applies": true,
    323         "sonnet_answer": true,
    324         "opus_applies": true,
    325         "opus_answer": false,
    326         "direction": "sonnet_generous",
    327         "explanation": "Sonnet credited the algorithmic pseudocode (Algorithms 3-5) as documenting the data pipeline. Opus noted that while the three-stage algorithm is described, the DISTRACTORGEN function is not fully specified, evaluation subset sizes are not stated, and the conversion from benchmark data to the three-stage format is not fully documented. The schema says 'if there are unexplained jumps... NO.' The distractor generation process is an unexplained step."
    328       },
    329       {
    330         "category": "cost_and_practicality",
    331         "question": "compute_budget_stated",
    332         "sonnet_applies": true,
    333         "sonnet_answer": true,
    334         "opus_applies": true,
    335         "opus_answer": false,
    336         "direction": "sonnet_generous",
    337         "explanation": "Sonnet credited '8 NVIDIA RTX 4090 GPUs with 48GB memory each' as the compute budget. However, the schema asks for 'GPU hours, total API spend, hardware used, training time.' Stating the hardware without training time or total GPU-hours does not constitute a 'total computational budget.' The reader knows the GPU model but not how long training took or the total compute consumed."
    338       }
    339     ]
    340   },
    341   "summary": {
    342     "agreement_rate": 0.90,
    343     "sonnet_generous_count": 4,
    344     "opus_generous_count": 1,
    345     "applies_boundary_count": 0,
    346     "interpretive_count": 0,
    347     "dominant_error_type": "sonnet_generous",
    348     "notes": "Of 5 disagreements, 4 are sonnet_generous (Sonnet credited the paper where Opus found it insufficient under strict schema reading) and 1 is opus_generous (Opus credited effect sizes based on the schema's example of contextual percentage improvements). The pattern continues the known Sonnet generosity bias: Sonnet tends to credit partial or conceptual descriptions (data_preprocessing_documented, data_pipeline_documented) and incomplete specifications (model_versions_specified, compute_budget_stated) as satisfying the criteria. Key issues: (1) model_versions_specified — Sonnet accepted marketing-style model names without snapshot dates, contradicting the schema's explicit prohibition; (2) compute_budget_stated — hardware alone without GPU-hours or training time is not a budget; (3) data pipeline items — mentioning a function name (DISTRACTORGEN) without specifying it does not count as documentation."
    349   }
    350 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs