scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26149B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "IntroLM: Introspective Language Models via Prefilling-Time Self-Evaluation",
      6     "authors": [
      7       "Hossein Hosseini Kasnavieh",
      8       "Gholamreza Haffari",
      9       "Chris Leckie",
     10       "A. N. Toosi"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.03511",
     15     "doi": "10.48550/arXiv.2601.03511"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims ROC-AUC of 90% (Table 1: CHAT=90.1), outperforming DeBERTa-V3-Large by 14pp (HOTPOTQA: 86.3 vs 71.8=14.5pp), and routing improvements of up to 33% latency reduction (paper body reports up to 34%) and 50% large-model reduction — all reported in the results. Minor cherry-picking in abstract (90% is from CHAT, not QA), but claims are technically supported.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper makes causal claims that token-conditional LoRA and [CPX] tokens cause improved performance. Ablation studies in Tables 3 and 4 isolate the contribution of each component by removing them, providing adequate support for causal claims within this architectural context.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The limitations section explicitly states the method has only been evaluated on QA and chat benchmarks and that extension to 'creative generation, multi-turn dialogue, code generation, or domain-specific applications may require additional adaptation.' Scope boundaries are acknowledged.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss alternative explanations for IntroLM's superiority, such as whether the performance gap is attributable to the 8B parameter advantage over DeBERTa's 435M parameters rather than the introspective mechanism itself. Model size confound is not controlled or discussed.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper clearly defines that 'output quality' is a binary correctness label derived from LLM-as-judge evaluation (Figures 6 and 7), and the routing metrics (reliability, latency, large-model call rate) are explicitly defined in Section 5.3. Measurement and claims are aligned.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 9 is a dedicated Limitations section spanning several paragraphs covering task generalization, training cost relative to BERT, backbone size coverage, and supervision source.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Limitations are specific: only two backbone sizes were evaluated (1.7B, 8B), the higher training cost relative to BERT classifiers is discussed qualitatively, and the reliance on labeled datasets is noted with specific alternatives suggested. Not generic boilerplate.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The limitations section explicitly enumerates out-of-scope task families (creative generation, multi-turn dialogue, code generation, domain-specific applications) and states these require future work.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The acknowledgments section only mentions AI-tool use for proofreading. No funding source, grant, or institutional support is disclosed anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed on the title page: University of Melbourne and Monash University. No affiliation with any of the evaluated commercial products.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funder is disclosed, so this criterion is not applicable.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "There is no competing interests or financial interests declaration anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: 'output quality' as binary correctness (Section 3.1), 'prefilling phase' and 'decoding phase' explained in Section 3.2, 'prompt complexity' formalized as P(ℓ=1|x), routing metrics (reliability, large-model call rate, latency) defined in Section 5.3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states its contribution as a method (IntroLM) enabling causal LMs to predict output quality during prefilling via [CPX] tokens and token-conditional LoRA, without external evaluators or affecting generation.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Related work (Section 2 and Appendix A) clearly positions IntroLM against post-execution methods (FrugalGPT, AutoMix), pre-execution BERT classifiers (HybridLLM, BEST-Route, RouteLLM), and the closest work (Chuang et al. 2025 confidence tokens), explaining specific differences.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL, GitHub link, or code release is mentioned anywhere in the paper.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All three evaluation datasets are built from publicly available standard benchmarks: MMLU, MMLU-PRO, GSM8K, HotpotQA, and LMSYS-Chat-1M.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Appendix C provides training hyperparameters but no requirements.txt, Dockerfile, or Python/library version specifications are provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "While Appendix C describes training hyperparameters, there are no step-by-step reproduction instructions. The dataset construction pipeline requires running inference on QWEN3-8B to generate labels, but this process is not reproducibly documented.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported in any table or figure. All results are point estimates only.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to any of the comparative claims between IntroLM and baselines.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute ROC-AUC and PR-AUC values with differences are reported in tables (e.g., 89.1 vs 75.8 = +13.3pp for General QA), and routing improvements are reported as percentages with both peak and average values.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or justification for the sample sizes is provided. The dataset sizes (136,515 / 97,074 / 100K prompts) appear to reflect the available data rather than any formal justification.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or results across multiple training runs are reported. All metrics are single-run point estimates.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "DeBERTa-V3-Base and DeBERTa-V3-Large are used as the primary baselines for complexity prediction; BERT-based routing and a random routing baseline are used for the routing evaluation.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "DeBERTa-V3 is the state-of-the-art encoder classifier used in the prior work the paper targets (RouteLLM, HybridLLM, BEST-Route). However, no RouteLLM or other LLM-based router is included as direct comparison.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 6.3 and Appendix D provide ablations on backbone model capacity (Table 2), effect of [CPX] tokens vs backbone-only (Table 3), LoRA target modules (Table 4), and layer-wise introspection depth (Table 5).",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Both ROC-AUC and PR-AUC are reported for complexity prediction; reliability, large-model call rate, and end-to-end latency are all reported for routing evaluation.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not applicable to this routing/complexity prediction task; automated LLM-as-judge is used for output quality labeling.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Appendix B.2 specifies a consistent 80/10/10 train/validation/test split at the prompt level, fixed across all experiments.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down across three distinct dataset categories: General QA (MMLU+MMLU-PRO+GSM8K), HotpotQA (long-context reasoning), and LMSYS-Chat (conversational), each reported separately in Table 1 and Figure 5.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "The limitations section mentions domains where the method may not generalize, but no specific failure cases or examples of misrouted prompts are shown.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The chat dataset (LMSYS-Chat-1M) shows smaller gains for IntroLM vs DeBERTa (90.1 vs 86.3), and the ablation showing 'No LoRA' performs substantially worse is reported honestly in Table 4.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "QWEN3-8B, DEBERTA-V3-BASE, DEBERTA-V3-LARGE are specified with explicit size parameters. QWEN3-8B is referenced via Yang et al. 2025 (arXiv:2505.09388). No snapshot dates, but specific model names are given.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "The LLM-as-judge prompts are provided verbatim in Appendix B (Figures 6 and 7), including the full template with instruction and fill-in fields for question, ground-truth, and prediction.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Appendix C reports: context window 2048, batch size 64, cosine LR scheduling with 10% warmup, LoRA rank 32, alpha 64, target modules, max gradient norm 0.3, weight decay 0.002, learning rates 4-8×10^-5.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "This paper presents the IntroLM system itself rather than using agentic scaffolding; not applicable.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Appendix B documents dataset construction (combining MMLU/MMLU-PRO/GSM8K, HotpotQA filtering, LMSYS-Chat sampling and trivial input removal), labeling procedures, and judge prompts used.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The constructed labeled datasets (prompts + binary success labels generated by running QWEN3-8B inference) are not released; only the source benchmark names are provided.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Appendix B describes how each dataset was constructed: combining sources, filtering criteria (removing trivial/context-dependent inputs for chat), labeling via LLM-as-judge, and class balance statistics.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; uses standard benchmarks and automatically constructed datasets.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline is documented: raw benchmark → run QWEN3-8B → judge output correctness with LLM-as-judge → binary label → 80/10/10 split. Judge prompts are shown in Figures 6 and 7.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "QWEN3-8B's training data cutoff is never stated in the paper, despite evaluating it on MMLU and HotpotQA benchmarks that predate 2026 and likely appear in training data.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "The paper does not discuss whether QWEN3-8B may have seen MMLU, HotpotQA, or GSM8K examples during training, which would inflate success rates and affect the quality of binary labels.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "MMLU, GSM8K, and HotpotQA are widely-used benchmarks almost certainly present in QWEN3's pretraining data; the paper does not acknowledge or address this contamination risk.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "The paper reports end-to-end latency measurements using vLLM on two H100 GPUs, with TTFT and TPOT formulations, and shows latency reduction curves in Figure 5.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Total training compute budget (GPU-hours, FLOPs, or wall-clock training time) is never stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "IntroLM applied to QWEN3-8B achieves ROC-AUC of 90% for success prediction on question-answering benchmarks",
    374       "evidence": "Table 1 reports 89.1 for General QA and 86.3 for HotpotQA; 90.1 is achieved only on chat (LMSYS-Chat-1M). The abstract selects the best single number without specifying the dataset.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "IntroLM outperforms DeBERTa-V3-Large by 14% in ROC-AUC",
    379       "evidence": "Table 1 shows 13.3pp gap on General QA (89.1 vs 75.8) and 14.5pp on HotpotQA (86.3 vs 71.8). The 14pp figure is approximately accurate for HotpotQA.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "IntroLM reduces end-to-end latency by up to 33% at matched reliability compared to BERT-based routing",
    384       "evidence": "Figure 5b and 5d show routing curves. The paper body reports up to 34% on General QA and 30% on HotpotQA. Results are derived from analytical latency formulations using measured TTFT/TPOT on vLLM with H100 GPUs.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Token-conditional LoRA is essential for IntroLM performance, especially on long-context inputs",
    389       "evidence": "Table 3 shows [CPX] without LoRA drops ROC-AUC from 86.3 to 81.0 on HotpotQA and from 89.1 to 87.1 on General QA, with larger PR-AUC drops. Ablation clearly supports this.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "IntroLM preserves generation behavior of the base model unchanged",
    394       "evidence": "The paper describes architectural guarantees: [CPX] excluded from KV cache, decoding initialized from last original prompt token, backbone frozen. This is a design claim, not empirically verified by measuring generation distribution.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "IntroLM reduces large-model usage by up to 50% at matched reliability",
    399       "evidence": "Figure 5a and 5c show routing curves. The paper body states 'up to 50% (30% on average)' for General QA and 'up to 49% (41% on average)' for HotpotQA. Supported by curve plots.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "empirical"
    406   ],
    407   "key_findings": "IntroLM enables causal language models to predict their own output quality during the prefilling phase by appending special [CPX] tokens and applying token-conditional LoRA that selectively modifies [CPX] representations without affecting generation. Applied to QWEN3-8B, it achieves ROC-AUC of 86-90% across QA and chat benchmarks, outperforming DeBERTa-V3-Large by 13-14pp on QA tasks and demonstrating particular strength on long-context HotpotQA where BERT's 512-token limit is a structural disadvantage. When integrated into a routing system, IntroLM reduces large-model invocations by up to 50% and end-to-end latency by up to 34% at matched reliability versus BERT-based routing. The model-size confound (8B vs 435M parameters) is not fully controlled, but ablations confirm that both [CPX] tokens and LoRA adaptation each contribute meaningfully to performance.",
    408   "red_flags": [
    409     {
    410       "flag": "Model size confound unaddressed",
    411       "detail": "IntroLM uses QWEN3-8B (8 billion parameters) while baselines use DeBERTa-V3-Large (435M). The paper does not control for this 18x parameter difference; improved performance could be largely attributable to model capacity rather than the introspective mechanism."
    412     },
    413     {
    414       "flag": "No confidence intervals or significance tests",
    415       "detail": "All comparative results in Tables 1-5 are single-run point estimates with no variance, confidence intervals, or statistical significance tests despite making direct performance comparisons."
    416     },
    417     {
    418       "flag": "Benchmark contamination not discussed",
    419       "detail": "MMLU, GSM8K, and HotpotQA are well-known benchmarks likely present in QWEN3's pretraining data. The paper does not acknowledge that the model's success labels may be inflated by training data contamination."
    420     },
    421     {
    422       "flag": "No code or labeled dataset release",
    423       "detail": "Neither the IntroLM code nor the constructed labeled datasets (which required running QWEN3-8B inference on 300K+ prompts) are released, making reproduction very costly."
    424     },
    425     {
    426       "flag": "Abstract ROC-AUC cherry-picks dataset",
    427       "detail": "The abstract states 'ROC-AUC of 90%' without specifying this is from the chat dataset; QA benchmarks achieve 86-89%, a meaningful difference given that the chat comparison is against a stronger DeBERTa baseline."
    428     },
    429     {
    430       "flag": "No funding disclosed",
    431       "detail": "No funding source, grant number, or competing interests statement is present, reducing transparency about potential conflicts."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "RouteLLM: Learning to route LLMs from preference data",
    437       "relevance": "Key routing system baseline; explores multiple routing strategies including BERT classifiers and LLM-based classifiers from which IntroLM differentiates itself"
    438     },
    439     {
    440       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    441       "relevance": "Foundational post-execution cascade routing system used as comparison for motivation"
    442     },
    443     {
    444       "title": "Hybrid LLM: Cost-efficient and quality-aware query routing",
    445       "relevance": "BERT-based pre-execution routing approach that IntroLM directly improves upon"
    446     },
    447     {
    448       "title": "BEST-Route: Adaptive LLM routing with test-time optimal compute",
    449       "relevance": "Contemporary BERT-based routing baseline used in comparisons"
    450     },
    451     {
    452       "title": "Learning to route LLMs with confidence tokens",
    453       "relevance": "Most closely related work — confidence tokens generated at end of decoding vs IntroLM's prefilling-time approach"
    454     },
    455     {
    456       "title": "LoRA: Low-rank adaptation of large language models",
    457       "relevance": "Foundation technique that IntroLM extends with token-conditional masking"
    458     },
    459     {
    460       "title": "HotpotQA: A dataset for diverse, explainable multi-hop question answering",
    461       "relevance": "Primary long-context evaluation benchmark used to demonstrate IntroLM's advantage over BERT's 512-token limit"
    462     },
    463     {
    464       "title": "MMLU-Pro: A more robust and challenging multi-task language understanding benchmark",
    465       "relevance": "Part of the General QA evaluation corpus"
    466     },
    467     {
    468       "title": "AutoMix: Automatically mixing language models",
    469       "relevance": "Post-execution routing baseline using same-model verification"
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 3,
    475       "justification": "LLM routing is a commercially critical problem; reducing large-model API calls by 50% at matched reliability is directly actionable for any multi-model deployment."
    476     },
    477     "surprise_contrarian": {
    478       "score": 2,
    479       "justification": "Using the model's own prefilling representations for self-evaluation instead of external classifiers is a non-obvious architectural choice that challenges the BERT-router orthodoxy."
    480     },
    481     "fear_safety": {
    482       "score": 0,
    483       "justification": "No safety or risk concerns raised; purely an efficiency/performance paper."
    484     },
    485     "drama_conflict": {
    486       "score": 1,
    487       "justification": "The paper positions against BERT-based routers (dominant industry practice) but frames the contribution constructively rather than controversially."
    488     },
    489     "demo_ability": {
    490       "score": 2,
    491       "justification": "The method uses open-source models (QWEN3-8B) and standard benchmarks, making it replicable in principle, though no code is released."
    492     },
    493     "brand_recognition": {
    494       "score": 1,
    495       "justification": "University of Melbourne and Monash University authors; no famous lab affiliation, but evaluates against OpenAI and Anthropic models."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [],
    500     "top_points": 0,
    501     "total_points": 0,
    502     "total_comments": 0
    503   }
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs