scan.json (27879B)
1 { 2 "paper": { 3 "title": "ArcMemo: Abstract Reasoning Composition with Lifelong LLM Memory", 4 "authors": ["Matthew Ho", "Chen Si", "Zhaoxiang Feng", "Fangxu Yu", "Yichi Yang", "Zhijian Liu", "Zhiting Hu", "Lianhui Qin"], 5 "year": 2025, 6 "venue": "Preprint (arXiv)", 7 "arxiv_id": "2509.04439" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "A GitHub repository URL is provided: https://github.com/matt-seb-ho/arc_memo (footnote 1 on page 1). The paper also mentions releasing a concept-annotation dataset and configurable puzzle synthesis pipeline." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses the publicly available ARC-AGI-1 benchmark (Chollet, 2019) and reuses Li et al. (2024)'s 160 manually authored Python solutions as seed data. The paper also mentions releasing a hand-annotated concept dataset and puzzle synthesis pipeline (Appendix D.2)." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing is provided in the paper. The paper mentions models used (o4-mini, GPT-4.1) and some hyperparameters but does not specify the software environment or library versions needed to reproduce the work." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While the paper describes the method in detail and provides code via GitHub, there are no step-by-step reproduction instructions in the paper itself (no README walkthrough, no specific commands to run). The GitHub link is provided but the paper text does not include reproduction steps." 
30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper reports standard deviations across 3 runs in parentheses (e.g., '59.33 (0.29)'), but it does not construct confidence intervals or formal error bars from them." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims ArcMemo-PS outperforms the baseline and other memory methods but provides no statistical significance tests (no p-values, t-tests, bootstrap tests, etc.). Comparisons are based solely on comparing point estimates with standard deviations." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports a '7.5% relative gain over a strong no-memory baseline' (abstract), and provides absolute scores with context: 'improves the official score from 55.17 to 59.33' (Section 5.1). This gives both relative and absolute magnitudes." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper justifies using a 100-puzzle subset: 'Following Akyürek et al. (2025), we evaluate a randomly selected 100-puzzle subset of the public val split. This makes repeated runs for more stable estimates feasible given cost and the sampling variance we observed' (Section 4). The limitation of this sample size is also acknowledged in Appendix D.1." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Standard deviations across 3 runs are reported in Tables 1, 2, 3, 4, and 5. For example, Table 4 shows individual run scores and aggregate standard deviations. The paper explicitly states 'we sample 3 runs for each setting and report the average single run score, average oracle@2 score, and their standard deviation' (Appendix B)." 
57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares against a no-memory baseline (vanilla o4-mini) and a re-implementation of Dynamic Cheatsheet (Suzgun et al., 2025), labeled 'cheatsheet'. Table 1 also includes results for other models (DeepSeek R1, Claude Sonnet 4, Gemini 2.5 Flash, Qwen3)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The primary baseline (Dynamic Cheatsheet, Suzgun et al. 2025) is very recent. The backbone model (o4-mini) is described as second only to Grok 4 on the ARC-AGI leaderboard. Other compared models (Claude Sonnet 4, Gemini 2.5 Flash) are also contemporary." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5.2 presents a selection ablation removing the reasoning-based selection mechanism from ArcMemo-PS (Table 2). The system also compares two different memory format variants (OE vs PS), which functions as an ablation of the memory design choices." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports Oracle@1, Oracle@2 (official), and Oracle@3 scores across 0, 1, and 2 retries. Table 5 also reports strict scoring (requiring a single program to solve all test cases). Token efficiency is additionally analyzed in Figure 4." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 5.2 includes manual qualitative analysis: 'We manually analyzed these puzzles and found that only 40% of new solves in the cheatsheet setting were related to memory elements actually in the generated cheatsheet.' Manual inspection of reasoning-based selection results is also described." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The evaluation uses a randomly selected 100-puzzle subset of the ARC-AGI-1 public validation split, which has 'a difficulty distribution matching that of the private evaluation' (Section 4). The seed data for memory comes from the train split (160 puzzles), maintaining separation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Tables 1 and 4 break results down by retry count (0, 1, 2 retries) and by oracle@k level (k=1, 2, 3). Table 4 shows individual run scores for each setting. The qualitative analysis in Section 5.2 also examines specific puzzle-level differences." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses embedding-based retrieval failures (Appendix C: 'lowering the score from 0.26 to 0.22, marking a 15% reduction'), discusses where ArcMemo-OE underperforms ('situationally improve over the baseline but still underperform it in some regimes'), and discusses selection inaccuracies ('some irrelevant concepts still appear within the selection')." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports embedding-based retrieval was ineffective (Appendix C), DeepSeek R1's output token limit yielded unfinished solutions (Section 4), ArcMemo-OE underperforms the baseline in some regimes, and the continual update setting sometimes hurts at lower retry counts (Table 3, retry 0 and 1 for ArcMemo-OE + continual)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims a '7.5% relative gain over a strong no-memory baseline', supported by Table 1 (55.17 → 59.33). 
The claim that 'abstract concepts [are] the most consistent memory design, outscoring the baseline at all tested inference compute scales' is supported by Table 4. The claim about continual updates is supported by Table 3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about memory improving performance, supported by controlled ablation (Table 2, removing selection) and controlled comparisons holding the backbone model constant. The qualitative analysis in Section 5.2 linking solved puzzles to memory contents provides additional causal evidence, though the paper appropriately hedges: 'we cannot definitively conclude ArcMemo changes directly induced each of the new solves.'" 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper acknowledges evaluation is on a 100-task subset of ARC-AGI-1 only (Appendix D.1), notes that ARC-AGI-2 evaluation was infeasible, and frames ARC-AGI as 'a benchmark that stresses compositional generalization and abstract reasoning, making it a natural fit for concept memory' rather than claiming general applicability. The title uses 'Abstract Reasoning' not 'all reasoning.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 5.2 explicitly considers that improvements could be due to sampling variance rather than memory: '100% of new solves from ArcMemo-PS (-selection) can be linked to concept memory contents' vs only 40% for cheatsheet. The paper also discusses that token usage increases may explain some variance (Appendix C) and that evaluation order creates confounds (Section 3.5)." 
126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper specifies 'OpenAI's o4-mini' and 'OpenAI's GPT-4.1' but does not provide snapshot dates, API versions, or model version identifiers beyond the marketing names. No version suffixes like '-2025xxxx' are given." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes prompts in natural language ('few-shot demonstrations, example-rich templates, and comprehensive instructions' in Section 3.3) but does not include the actual prompt text used. The method descriptions in Sections 3.2-3.4 describe what prompts do but not their exact content. A GitHub repository is linked which may contain prompts, but the paper itself does not provide them." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix A.2 specifies: 'o4-mini (max tokens=32000, reasoning_effort=medium)' and 'GPT-4.1 (temperature=0.3, max tokens=1000)'. The memory update interval for continual learning is stated as 'every 10 problems' (Section 5.3). The number of parallel samples scored under oracle@k (k=2 for the official score, k=3 for extra runs) is also specified." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The method involves multi-step scaffolding: VLM preprocessing for OE selection (Section 3.4), reasoning-based exploration for PS selection, concept abstraction pipeline with pseudocode preprocessing (Section 3.3), and execution feedback-based retry. Algorithm 1 formally describes the overall pipeline. The paper describes the workflow in detail across Sections 3.2-3.5." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4 describes: 100-puzzle random subset of ARC-AGI-1 public validation split, seed memory from Li et al. 
(2024)'s 160 manually authored Python solutions from the train split. Section 3.3 describes the PS abstraction preprocessing (solutions → pseudocode → concepts). The evaluation procedure is formalized in Appendix B." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Appendix D.1 is titled 'Limitations' and provides a substantive discussion of limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Appendix D.1 identifies specific threats: (1) evaluation on only 100 tasks limits observable gains, (2) 'substantial sampling variance' requiring multiple runs, (3) 'relatively small frontier of puzzles where memory augmentation can yield new solves', (4) inability to evaluate on ARC-AGI-2 due to cost constraints. Section 3.5 also discusses order dependency as a specific confound." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states what was NOT tested: ARC-AGI-2 evaluation was infeasible (Appendix D.1), DeepSeek R1 was abandoned due to token limits (Section 4), and the system requires test-time feedback ('some feedback is available at test-time', Section 3.1 Problem Assumptions). Future work sections explicitly list hierarchical designs and consolidation mechanisms as untested directions." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The underlying benchmark (ARC-AGI-1) is publicly available. Individual run scores are reported in Table 4 (not just aggregates). The paper releases code via GitHub and mentions releasing a concept-annotation dataset (Appendix D.2). The seed data (Li et al. 2024's solutions) is from published work." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4 describes: ARC-AGI-1 public validation split with 400 puzzles, random selection of 100 puzzles, 160 seed solutions from Li et al. (2024)'s train split. Appendix B describes the scoring procedure in formal mathematical notation." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants were recruited. The study uses a public benchmark (ARC-AGI-1) and existing seed solutions." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline is documented: seed solutions → concept abstraction (Section 3.3) → memory store → concept selection (Section 3.4) → puzzle solving with retries. Algorithm 1 formalizes the inference pipeline. Appendix B specifies the scoring procedure. The continual update interval (every 10 problems) is stated." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding sources are disclosed. The Acknowledgements section thanks two individuals for early contributions but does not mention any funding or grants." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: University of California, San Diego (7 authors) and University of Maryland (1 author). The paper evaluates OpenAI models but the authors are not affiliated with OpenAI." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed, so independence cannot be assessed. The paper uses OpenAI models extensively; if OpenAI provided API credits this would be relevant but is not mentioned." 
209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state the training data cutoff dates for o4-mini or GPT-4.1. This is relevant because ARC-AGI-1 puzzles and solutions could potentially be in the training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper does not discuss whether ARC-AGI-1 puzzles or their solutions appeared in the training data of o4-mini or GPT-4.1. ARC-AGI-1 has been public since 2019 and could be in training data." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "ARC-AGI-1 was published in 2019 and is widely available online. The paper does not address the possibility that o4-mini may have been trained on ARC-AGI puzzle solutions. The paper does note that ARC-AGI was 'explicitly designed to evaluate intelligence as efficient acquisition of new skills instead of fixed possession/memorization of skills' (Section 4), but this addresses task design rather than data contamination." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants were involved in this study. It is a benchmark evaluation of an LLM memory system." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants were involved." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants were involved." 
248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants were involved." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants were involved." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants were involved." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants were involved." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Figure 4 plots token efficiency (official score vs tokens used by the reasoning model). The paper discusses token usage across settings: 'memory-augmented runs tend to increase output token usage' (Appendix C). The paper mentions o4-mini is on 'the Pareto frontier of cost and performance' (Section 4)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total API spend, total token count, or total computational budget is stated. The paper mentions cost constraints as a reason for using a 100-puzzle subset and o4-mini over more expensive models, but does not quantify the actual expenditure." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "ArcMemo-PS achieves a 7.5% relative gain over the no-memory baseline on ARC-AGI-1 official scoring (from 55.17 to 59.33).", 286 "evidence": "Table 1 shows baseline oracle@2 = 55.17 (std 3.18) vs ArcMemo-PS oracle@2 = 59.33 (std 0.29), both using o4-mini medium with 0 retries. 
Table 4 provides individual run scores confirming this.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "ArcMemo-PS is the only memory design that consistently outperforms the baseline at all tested inference compute scales.", 291 "evidence": "Table 4 shows ArcMemo-PS outperforming the baseline across all retry levels (0, 1, 2) and oracle@k levels (k=1, 2, 3). Other methods (Cheatsheet, ArcMemo-OE) underperform the baseline in some regimes.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Reasoning-based concept selection improves downstream performance compared to including all memory.", 296 "evidence": "Table 2 shows the selection ablation: with selection, ArcMemo-PS scores 59.33 (0.29) vs without selection 55.17 (2.02) at 0 retries, oracle@2. However, at higher retry counts the gap narrows and sometimes reverses.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Continually updating memory during evaluation improves performance at higher retry depths.", 301 "evidence": "Table 3 shows ArcMemo-OE + continual update achieving 70.00 (1.73) vs fixed ArcMemo-OE 67.67 (2.52) at 2 retries, oracle@2. At 0 retries, continual updates slightly hurt performance (56.00 vs 56.67).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "100% of new solves from ArcMemo-PS (-selection) can be linked to concept memory contents, compared to only 40% for the cheatsheet method.", 306 "evidence": "Section 5.2 describes manual analysis of puzzles solved by one method but not the other (10 puzzles, 5 unique to each). This is qualitative analysis on a very small sample.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "ArcMemo introduces concept-level memory (abstract, modular) for LLM reasoning, evaluated on ARC-AGI-1. 
The program synthesis (PS) memory format achieves a 7.5% relative gain over a no-memory o4-mini baseline (55.17 to 59.33 official score) and is the only memory design that consistently outperforms the baseline across all tested inference compute scales. Reasoning-based concept selection improves over including all memory, and continual memory updates show benefits at higher retry depths, supporting the hypothesis that accumulated patterns enable further solutions.", 312 "red_flags": [ 313 { 314 "flag": "Small evaluation sample with high variance", 315 "detail": "Only 100 puzzles are used for evaluation with 3 runs per setting. Standard deviations in Table 1 show the baseline at 55.17 (std 3.18) and ArcMemo-PS at 59.33 (std 0.29) — the 4.16-point gain is only about 1.3 times the baseline's standard deviation. No statistical significance tests are reported to distinguish signal from noise." 316 }, 317 { 318 "flag": "No benchmark contamination analysis", 319 "detail": "ARC-AGI-1 has been publicly available since 2019. The paper does not discuss whether o4-mini or GPT-4.1 may have been trained on ARC-AGI puzzles or solutions. While ARC-AGI is designed to resist memorization, contamination of the specific puzzle structures or solution patterns remains a concern." 320 }, 321 { 322 "flag": "Qualitative attribution based on tiny sample", 323 "detail": "The claim that ArcMemo memories are more attributable to actual problem-solving (100% vs 40% for cheatsheet) is based on manual analysis of only 10 puzzles (5 per method). This is too small for reliable conclusions about attribution." 324 }, 325 { 326 "flag": "No total cost disclosure", 327 "detail": "The paper acknowledges cost constraints drove design decisions (100-puzzle subset, o4-mini over more expensive models, inability to run ARC-AGI-2) but never quantifies the actual API expenditure, making it impossible to assess practical feasibility." 
328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Dynamic Cheatsheet: Test-Time Learning with Adaptive Memory", 333 "authors": ["Mirac Suzgun", "Mert Yuksekgonul", "Federico Bianchi", "Dan Jurafsky", "James Zou"], 334 "year": 2025, 335 "arxiv_id": "2504.07952", 336 "relevance": "Primary memory baseline compared against; represents prior state-of-the-art in test-time memory for reasoning tasks." 337 }, 338 { 339 "title": "Buffer of Thoughts: Thought-Augmented Reasoning with Large Language Models", 340 "authors": ["Ling Yang", "Zhaochen Yu", "Tianjun Zhang", "Shiyi Cao", "Minkai Xu", "Wentao Zhang", "Joseph E. Gonzalez", "Bin Cui"], 341 "year": 2024, 342 "arxiv_id": "2406.04271", 343 "relevance": "Prior work on storing problem-specific reasoning templates in memory for LLM reasoning, a key comparison point for instance-level vs abstract memory." 344 }, 345 { 346 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 347 "authors": ["Noah Shinn", "Federico Cassano", "Edward Berman", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"], 348 "year": 2023, 349 "arxiv_id": "2303.11366", 350 "relevance": "Foundational work on test-time self-reflection in LLM agents, relevant to understanding parameter-free test-time learning approaches." 351 }, 352 { 353 "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models", 354 "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang", "Ajay Mandlekar", "Chaowei Xiao", "Yuke Zhu", "Linxi Fan", "Anima Anandkumar"], 355 "year": 2023, 356 "arxiv_id": "2305.16291", 357 "relevance": "Open-ended agent with growing skill library, directly relevant to agentic memory and skill acquisition research." 358 }, 359 { 360 "title": "MemGPT: Towards LLMs as Operating Systems", 361 "authors": ["Charles Packer", "Sarah Wooders", "Kevin Lin", "Vivian Fang", "Shishir G. Patil", "Ion Stoica", "Joseph E. 
Gonzalez"], 362 "year": 2024, 363 "arxiv_id": "2310.08560", 364 "relevance": "Hierarchical memory system for LLMs enabling long-term information management, relevant to LLM augmentation and agentic capabilities." 365 }, 366 { 367 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 368 "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"], 369 "year": 2023, 370 "arxiv_id": "2304.03442", 371 "relevance": "Foundational work on LLM agents with memory systems for storing and retrieving observations, directly relevant to agentic AI architectures." 372 }, 373 { 374 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 375 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta", "Skyler Hallinan", "Luyu Gao", "Sarah Wiegreffe", "Uri Alon", "Nouha Dziri", "Shrimai Prabhumoye", "Yiming Yang", "Shashank Gupta", "Bodhisattwa Prasad Majumder", "Katherine Hermann", "Sean Welleck", "Amir Yazdanbakhsh", "Peter Clark"], 376 "year": 2023, 377 "arxiv_id": "2303.17651", 378 "relevance": "Iterative self-feedback mechanism for LLMs, relevant to test-time improvement and self-correction capabilities." 379 }, 380 { 381 "title": "Large Language Models Cannot Self-Correct Reasoning Yet", 382 "authors": ["Jie Huang", "Xinyun Chen", "Swaroop Mishra", "Huaixiu Steven Zheng", "Adams Wei Yu", "Xinying Song", "Denny Zhou"], 383 "year": 2024, 384 "arxiv_id": "2310.01798", 385 "relevance": "Critical analysis of LLM self-correction capabilities, relevant to understanding limitations of test-time learning without external feedback." 
386 }, 387 { 388 "title": "The Surprising Effectiveness of Test-Time Training for Few-Shot Learning", 389 "authors": ["Ekin Akyürek", "Mehul Damani", "Adam Zweiger", "Linlu Qiu", "Han Guo", "Jyothish Pari", "Yoon Kim", "Jacob Andreas"], 390 "year": 2025, 391 "arxiv_id": "2411.07279", 392 "relevance": "Test-time training for ARC-AGI evaluation, directly comparable approach that uses weight adaptation rather than external memory." 393 }, 394 { 395 "title": "A Survey of Self-Evolving Agents: On Path to Artificial Super Intelligence", 396 "authors": ["Huanang Gao"], 397 "year": 2025, 398 "arxiv_id": "2507.21046", 399 "relevance": "Survey of self-evolving agent systems relevant to understanding the broader landscape of continual learning and self-improvement in AI agents." 400 }, 401 { 402 "title": "Self-Improving Language Models for Evolutionary Program Synthesis: A Case Study on ARC-AGI", 403 "authors": ["Julien Pourcel", "Cédric Colas", "Pierre-Yves Oudeyer"], 404 "year": 2025, 405 "arxiv_id": "2507.14172", 406 "relevance": "Self-improving system for ARC-AGI that exploits generation-verification asymmetry, directly comparable approach to continual learning on the same benchmark." 407 } 408 ] 409 }