scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25714B)
      1 {
      2   "paper": {
      3     "title": "Rethinking the Reliability of Multi-agent System: A Perspective from Byzantine Fault Tolerance",
      4     "authors": [
      5       "Lifan Zheng",
      6       "Jiawei Chen",
      7       "Qinghong Yin",
      8       "Jingyuan Zhang",
      9       "Xinyi Zeng",
     10       "Yu Tian"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2511.10400"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "LLM-based agents demonstrate stronger Byzantine fault tolerance than traditional agents, maintaining consensus even when 6 of 7 nodes are malicious (85.7% fault rate), exceeding the classical f < n/3 bound. The proposed CP-WBFT mechanism, particularly the Hidden-level Confidence Probe (HCP), achieves 100% round-level accuracy across all topologies under extreme Byzantine conditions. Network topology significantly impacts consensus effectiveness, with complete graphs enabling optimal performance while constrained topologies like chains limit it.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "GitHub repository link provided in the abstract: https://github.com/Z1ivan/Byzantine-Fault-Tolerance-in-LLM-MAS"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses publicly available benchmarks (GSM8K, XSTest, CommonsenseQA) and lists the exact questions used in Appendix Tables 4, 5, 7, 8, 9, 10."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. Generation parameters are listed (Tables 6, 11) but not the full software environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 1, 2, 17, 18 are point estimates with no confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims LLM-based agents 'outperform' traditional agents and HCP 'outperforms' PCP based solely on comparing numbers without any statistical tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "BFTI (Byzantine Fault Tolerance Improvement) is reported as the percentage-point improvement from IAA to FAA, providing context for the magnitude of improvement (e.g., '+85.71% BFTI from 14.29% baseline')."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Only 10 questions per dataset are used, described as 'specifically chosen to create a performance gap.' No justification for why 10 is sufficient, no power analysis."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs with a fixed seed (1234)."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Traditional agents serve as baselines in Table 1. PCP and HCP are compared against each other in Table 2. The paper also compares against the classical f < n/3 bound."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Models used include GPT-4o-mini, GPT-3.5-turbo, LLaMA3.1-8B-Instruct, and LLaMA3-8B-Instruct, all contemporary at time of writing."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table 3 compares three extraction strategies (pooled, answer, query) across layers and models. PCP vs HCP comparison also serves as an ablation of the confidence probe approach."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Four metrics are used: IAA (Initial Agent Accuracy), FAA (Final Agent Accuracy), BFTI (Byzantine Fault Tolerance Improvement), and RA (Round-level Accuracy)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated. No human evaluation of consensus quality, safety assessments, or mathematical reasoning outputs."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The 10 questions per dataset are specifically curated, but there is no separation into dev/test splits for the main evaluation. The HCP probe training uses train/test splits (Tables 14-16) but the system-level evaluation does not."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by network topology (7 topologies), task domain (GSM8K vs XSTest), and probe method (PCP vs HCP) in Tables 1, 2, 17, 18."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper discusses PCP's negative BFTI on XSTest tree structures (-11.43%) and star-center-malicious configurations (-12.86%), noting 'PCP exhibits neutral or even negative BFTI on several XSTest topologies.'"
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "PCP shows negative BFTI on multiple XSTest topologies (Table 2b: tree -11.43%, star center malicious -12.86%, random -1.43%). These negative results are explicitly discussed."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 85.7% fault rate tolerance, superior performance across topologies, and 'remarkable accuracy' — these are supported by Tables 1 and 2 showing HCP at 100% RA across topologies."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper claims LLM-based agents' 'superior reliability derives from advanced inherent reflective and discriminative capabilities' — this is a causal claim about mechanism, but the evidence is only correlational (LLM agents perform better on Byzantine tasks). The ablation of extraction strategies partially supports the probe design claims."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper claims to 'enhance the reliability of LLM-based agents' broadly, but tests only on 10 questions each from GSM8K and XSTest with specific model pairs. The title suggests general multi-agent system reliability, but results are limited to these narrow settings."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No discussion of alternative explanations. The 'malicious' agents are simply weaker models (GPT-3.5-turbo, LLaMA3), not truly adversarial. The performance gap could stem from model capability differences rather than Byzantine fault tolerance properties. This confound is not discussed."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper equates model performance on 10 curated questions with 'Byzantine fault tolerance' and 'reliability' of multi-agent systems. Using weaker models as 'malicious/Byzantine' agents is a significant proxy gap — real Byzantine faults involve adversarial, arbitrary behavior, not just lower capability. This distinction is never acknowledged."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are specified as 'GPT-4o-mini', 'GPT-3.5-turbo', 'LLaMA3-8B-Instruct', 'LLaMA3.1-8B-Instruct'. These are marketing names without snapshot dates or API versions."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Full prompt text is provided in Appendix B (Sections B.3.1, B.3.2, B.3.3 for PCP) and Appendix C (Section C.2.2 for HCP)."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Generation parameters are reported in Tables 6 and 11 (temperature, seed, max tokens, top-p, repetition penalty). Probe training parameters in Tables 12 and 13."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The CP-WBFT framework is described in detail: two-stage process (confidence probing + weighted consensus), information flow between agents, refinement protocol with equations (1-4), and Figure 3/4 illustrating the workflow."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Questions are described as 'specifically chosen to create a performance gap between strong and weak agents' but the selection criteria and process are not documented. For HCP, PCA preprocessing is described but the question curation process is opaque."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No limitations, threats to validity, or similar section exists in the paper. The conclusion mentions computational cost briefly but does not constitute a limitations section."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No threats to validity are discussed anywhere in the paper."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings or scenarios are excluded from its claims."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Raw model outputs, confidence scores, and consensus traces are not made available. Only aggregated results in tables are shown."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The paper states questions were 'specifically chosen to create a performance gap' but does not describe the selection procedure, how many candidates were screened, or what criteria defined a 'performance gap.'"
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Uses standard public benchmarks (GSM8K, XSTest, CommonsenseQA)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The pipeline from question selection to final results has gaps. How the 10 questions were selected from each dataset is not explained. HCP probe training pipeline is better documented (PCA, z-score, logistic regression) but question curation is opaque."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information is disclosed anywhere in the paper. No acknowledgments section mentioning grants or sponsors."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are listed: Zhejiang University, East China Normal University, Zhongguancun Academy, Beijing University of Posts and Telecommunications, Kuaishou Technology, Tsinghua University."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding source disclosed, so independence cannot be assessed. One author is affiliated with Kuaishou Technology (a commercial entity), creating a potential undisclosed conflict."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the four models used (GPT-4o-mini, GPT-3.5-turbo, LLaMA3, LLaMA3.1), despite evaluating them on GSM8K and XSTest benchmarks."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of whether the 10 GSM8K or XSTest questions appeared in the training data of the models used."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "GSM8K was published in 2021 and XSTest in 2023. All four models were likely trained on data including these benchmarks. This contamination risk is not discussed, and is especially concerning given that only 10 questions per dataset are used."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference cost, API cost, or latency reported despite using GPT-4o-mini and GPT-3.5-turbo APIs and running multi-round consensus across 7-node and 15-node networks."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No compute budget stated. The paper mentions 'significant computational cost' of the multi-round mechanism but does not quantify it."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "A single fixed seed (1234) is used across all experiments. No analysis of how results vary across different seeds."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is not explicitly stated. Results appear to be from single runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "For the main experiments, no hyperparameter search budget is reported. The HCP appendix shows top-10 configurations across layers but does not state total configurations searched."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Appendix C Tables 14-16 show top-10 configurations ranked by test accuracy across layers and extraction strategies, with test accuracy and AUC reported for selection justification."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors compare their CP-WBFT against traditional agents and between PCP/HCP variants, all implemented by themselves. No acknowledgment of self-comparison bias."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "HCP requires local model deployment and hidden state extraction while PCP uses API calls — fundamentally different compute costs. This compute difference is not quantified or discussed in relation to performance gains."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether GSM8K and XSTest actually measure Byzantine fault tolerance capability. Using weaker models as 'Byzantine' agents conflates model capability with adversarial behavior — a fundamental construct validity issue."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "The CP-WBFT framework IS the system being evaluated — the scaffold is the product under test, not a confound."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "GSM8K (2021) and XSTest (2023) predate the training of all models used. No discussion of whether models memorized benchmark answers."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information. The confidence prompts explicitly ask models to rate their certainty, which could interact with memorized answers."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Only 10 questions are used per dataset, hand-selected to create performance gaps. No discussion of whether these specific questions are independent or representative."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is used despite evaluating on public benchmarks with models likely trained on them."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "LLM-based agents demonstrate stronger Byzantine fault tolerance than traditional agents, maintaining consensus with up to 6 malicious nodes (85.7%) across most topologies.",
    371       "evidence": "Table 1 shows traditional agents collapse at 2-3 malicious nodes (RA=0%) while LLM-based agents maintain satisfactory IAA, FAA, and RA with 6/7 malicious nodes.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "HCP achieves +85.71% Byzantine Fault Tolerance Improvement on complete graphs across both mathematical reasoning and safety assessment tasks while maintaining 100% round-level accuracy.",
    376       "evidence": "Table 2 shows HCP achieving 100% FAA and 100% RA on complete graphs for both GSM8K and XSTest, with BFTI of +85.71% from 14.29% baseline.",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "CP-WBFT surpasses the classical Byzantine fault tolerance bound of f < n/3.",
    381       "evidence": "Tables 1 and 2 show LLM-based agents maintaining consensus with 6/7 nodes malicious (85.7%), compared to the classical limit of tolerating fewer than 1/3 malicious nodes.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "Pooled hidden state extraction consistently outperforms answer and query extraction strategies.",
    386       "evidence": "Table 3 shows pooled achieving 85.29% (GSM8K) and 95.24% (XSTest) on LLaMA3.1, compared to answer (84.23%, 80.16%) and query (71.27%, 80.95%).",
    387       "supported": "moderate"
    388     }
    389   ],
    390   "red_flags": [
    391     {
    392       "flag": "Extremely small evaluation set",
    393       "detail": "Only 10 questions per dataset are used for all experiments. This is far too small to draw reliable conclusions about Byzantine fault tolerance. A single question changing its outcome could shift results by 10 percentage points."
    394     },
    395     {
    396       "flag": "Cherry-picked evaluation questions",
    397       "detail": "Questions are 'specifically chosen to create a performance gap between strong and weak agents.' This means the evaluation is designed to show the method works, not to test it under realistic conditions. The selection criteria are not documented."
    398     },
    399     {
    400       "flag": "Conflation of model weakness with Byzantine adversarial behavior",
    401       "detail": "Byzantine agents are simulated using weaker models (GPT-3.5-turbo, LLaMA3) rather than truly adversarial agents. Real Byzantine faults involve arbitrary/malicious behavior, not just lower capability. The paper's claims about 'Byzantine fault tolerance' are overstated for what is essentially a strong-weak model consensus mechanism."
    402     },
    403     {
    404       "flag": "No uncertainty quantification",
    405       "detail": "All results are point estimates from single runs with a fixed seed. No error bars, confidence intervals, or multi-run analysis despite the stochastic nature of LLM outputs."
    406     },
    407     {
    408       "flag": "No limitations section",
    409       "detail": "The paper entirely lacks a limitations or threats-to-validity section, a significant omission for a paper making strong claims about reliability and fault tolerance."
    410     },
    411     {
    412       "flag": "Contamination risk unaddressed",
    413       "detail": "Using only 10 questions from public benchmarks (GSM8K published 2021, XSTest 2023) with models likely trained on this data creates severe contamination risk. The 'strong' model may simply have memorized answers, inflating the apparent Byzantine fault tolerance."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Large language model based multi-agents: A survey of progress and challenges",
    419       "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"],
    420       "year": 2024,
    421       "arxiv_id": "2402.01680",
    422       "relevance": "Survey of LLM-based multi-agent systems covering progress and challenges."
    423     },
    424     {
    425       "title": "Evil geniuses: Delving into the safety of LLM-based agents",
    426       "authors": ["Yu Tian", "Xiao Yang", "Jingyuan Zhang"],
    427       "year": 2023,
    428       "arxiv_id": "2311.11855",
    429       "relevance": "Investigates safety vulnerabilities in LLM-based agent systems."
    430     },
    431     {
    432       "title": "NetSafe: Exploring the topological safety of multi-agent networks",
    433       "authors": ["Miao Yu", "Shilong Wang", "Guibin Zhang"],
    434       "year": 2024,
    435       "arxiv_id": "2410.15686",
    436       "relevance": "Studies how network topology affects safety in multi-agent LLM systems."
    437     },
    438     {
    439       "title": "Scaling large language model-based multi-agent collaboration",
    440       "authors": ["Chen Qian", "Zihao Xie", "Yifei Wang"],
    441       "year": 2024,
    442       "arxiv_id": "2406.07155",
    443       "relevance": "Investigates scaling properties of LLM-based multi-agent collaboration."
    444     },
    445     {
    446       "title": "Multi-agent collaboration mechanisms: A survey of LLMs",
    447       "authors": ["Khanh-Tung Tran", "Dung Dao", "Minh-Duc Nguyen"],
    448       "year": 2025,
    449       "arxiv_id": "2501.06322",
    450       "relevance": "Survey of collaboration mechanisms in LLM-based multi-agent systems."
    451     },
    452     {
    453       "title": "From LLM reasoning to autonomous AI agents: A comprehensive review",
    454       "authors": ["Mohamed Amine Ferrag", "Norbert Tihanyi", "Merouane Debbah"],
    455       "year": 2025,
    456       "arxiv_id": "2504.19678",
    457       "relevance": "Comprehensive review of the progression from LLM reasoning to autonomous AI agents."
    458     },
    459     {
    460       "title": "Can LLMs express their uncertainty? An empirical evaluation of confidence elicitation in LLMs",
    461       "authors": ["Miao Xiong", "Zhiyuan Hu", "Xinyang Lu"],
    462       "year": 2023,
    463       "arxiv_id": "2306.13063",
    464       "relevance": "Empirical evaluation of LLM confidence calibration, directly relevant to the confidence probing approach."
    465     },
    466     {
    467       "title": "Survey on evaluation of LLM-based agents",
    468       "authors": ["Asaf Yehudai", "Lilach Eden", "Alan Li"],
    469       "year": 2025,
    470       "arxiv_id": "2503.16416",
    471       "relevance": "Survey on evaluation methodologies for LLM-based agents."
    472     },
    473     {
    474       "title": "Cut the crap: An economical communication pipeline for LLM-based multi-agent systems",
    475       "authors": ["Guibin Zhang", "Yanwei Yue", "Zhixun Li"],
    476       "year": 2024,
    477       "arxiv_id": "2410.02506",
    478       "relevance": "Proposes efficient communication methods for LLM-based multi-agent systems."
    479     },
    480     {
    481       "title": "Root defence strategies: Ensuring safety of LLM at the decoding level",
    482       "authors": ["Xinyi Zeng", "Yuying Shang", "Jiawei Chen"],
    483       "year": 2024,
    484       "arxiv_id": "2410.06809",
    485       "relevance": "Decoding-level safety mechanisms for LLMs, related to hidden-state confidence probing."
    486     }
    487   ]
    488 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs