scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25907B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "MatPlotAgent: Method and Evaluation for LLM-Based Agentic Scientific Data Visualization",
      6     "authors": [
      7       "Zhiyu Yang",
      8       "Zihan Zhou",
      9       "Shuo Wang",
     10       "Xin Cong",
     11       "Xu Han",
     12       "Yukun Yan",
     13       "Zhenghao Liu",
     14       "Zhixing Tan",
     15       "Pengyuan Liu",
     16       "Dong Yu",
     17       "Zhiyuan Liu",
     18       "Xiaodong Shi",
     19       "Maosong Sun"
     20     ],
     21     "year": 2024,
     22     "venue": "Annual Meeting of the Association for Computational Linguistics",
     23     "arxiv_id": "2402.11453",
     24     "doi": "10.48550/arXiv.2402.11453"
     25   },
     26   "checklist": {
     27     "claims_and_evidence": {
     28       "abstract_claims_supported": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Claims about MatPlotAgent improving LLM performance, the 100-example human-verified benchmark, and the strong correlation between automatic and human scores (r=0.876/0.836) are all supported by experimental results in Tables 1–4 and Figure 2.",
     32         "source": "haiku"
     33       },
     34       "causal_claims_justified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The claim that visual feedback causes improvement is supported by ablation studies (Tables 3–4) that isolate the visual feedback module, showing +7.72 and +5.94 point improvements for GPT-4 and GPT-3.5 respectively.",
     38         "source": "haiku"
     39       },
     40       "generalization_bounded": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper claims applicability to 'scientific data visualization' broadly but the benchmark is sourced entirely from Matplotlib Gallery and OriginLab GraphGallery; domain-specific scientific visualization (genomics, astronomy, etc.) is untested and only briefly flagged in the limitations.",
     44         "source": "haiku"
     45       },
     46       "alternative_explanations_discussed": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The observed improvements from MatPlotAgent could stem from additional inference compute and iterations rather than specifically the visual feedback mechanism; the paper does not discuss whether a second code-only iteration pass would yield similar gains.",
     50         "source": "haiku"
     51       },
     52       "proxy_outcome_distinction": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper explicitly validates the GPT-4V score proxy against human-annotated scores (r=0.876, p=7.41e-33), clearly distinguishing the automated measure from the underlying construct of visualization quality.",
     56         "source": "haiku"
     57       }
     58     },
     59     "limitations_and_scope": {
     60       "limitations_section_present": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 8 'Limitations' is present, noting the benchmark may not cover domain-specific requirements.",
     64         "source": "haiku"
     65       },
     66       "threats_to_validity_specific": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The limitations section only offers generic statements about domain coverage; specific threats such as the small benchmark size (100 examples), GPT-4V acting as both system component and evaluator, or lack of variance reporting are not discussed.",
     70         "source": "haiku"
     71       },
     72       "scope_boundaries_stated": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No explicit statement of what the results do not show; the single limitation mentioned is generic domain coverage without bounding which specific claims may not transfer.",
     76         "source": "haiku"
     77       }
     78     },
     79     "conflicts_of_interest": {
     80       "funding_disclosed": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No funding acknowledgment or grant information appears anywhere in the paper.",
     84         "source": "haiku"
     85       },
     86       "affiliations_disclosed": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Author affiliations are listed on the title page (Tsinghua University, Beijing Language and Culture University, Xiamen University, Northeastern University, Zhongguancun Laboratory).",
     90         "source": "haiku"
     91       },
     92       "funder_independent_of_outcome": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No funding is disclosed, making this criterion not applicable.",
     96         "source": "haiku"
     97       },
     98       "financial_interests_declared": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No competing interests statement or financial disclosure appears in the paper.",
    102         "source": "haiku"
    103       }
    104     },
    105     "scope_and_framing": {
    106       "key_terms_defined": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 formally defines the scientific data visualization task with an equation (V = f(x, D)) and specifies what x and D represent; 'agent' is described structurally via the three-module architecture.",
    110         "source": "haiku"
    111       },
    112       "intended_contribution_clear": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The introduction and abstract clearly state two contributions: the MatPlotBench benchmark and the MatPlotAgent framework, with bullet-pointed summaries.",
    116         "source": "haiku"
    117       },
    118       "engagement_with_prior_work": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 6 covers Code LLMs and LLM Agents, and Section 3 explicitly contrasts MatPlotBench with DS-1000 (average 3-line solutions too simple for agent evaluation), showing substantive engagement with related work.",
    122         "source": "haiku"
    123       }
    124     }
    125   },
    126   "type_checklist": {
    127     "empirical": {
    128       "artifacts": {
    129         "code_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Footnote 1 states both MatPlotAgent and MatPlotBench are publicly available at https://github.com/thunlp/MatPlotAgent.",
    133           "source": "haiku"
    134         },
    135         "data_released": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "MatPlotBench (100 test cases with queries, data, and ground-truth figures) is released through the GitHub repository.",
    139           "source": "haiku"
    140         },
    141         "environment_specified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "The paper mentions using vLLM for inference and OpenAI API but provides no requirements.txt, Dockerfile, or explicit dependency version list.",
    145           "source": "haiku"
    146         },
    147         "reproduction_instructions": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No step-by-step reproduction instructions are provided in the paper; the GitHub repository may contain them but the paper itself does not.",
    151           "source": "haiku"
    152         }
    153       },
    154       "statistical_methodology": {
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Tables 1–4 report only raw scores and absolute differences with no confidence intervals or error bars.",
    159           "source": "haiku"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No statistical significance tests are applied to the main performance comparisons; only the human-automatic correlation reports a p-value.",
    165           "source": "haiku"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Tables 1–4 report absolute score improvements (e.g., +12.30 for GPT-4, +9.48 for GPT-3.5) in the context of baseline scores, providing interpretable effect sizes.",
    171           "source": "haiku"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The choice of 100 benchmark examples is not justified with power analysis or sampling rationale.",
    177           "source": "haiku"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Temperature is set to 0.0 for code LLMs (deterministic), but GPT-4V evaluation variance is not reported; no standard deviations appear in any results table.",
    183           "source": "haiku"
    184         }
    185       },
    186       "evaluation_design": {
    187         "baselines_included": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Direct decoding and zero-shot chain-of-thought are used as baselines for all seven models tested.",
    191           "source": "haiku"
    192         },
    193         "baselines_contemporary": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Baselines include GPT-4, GPT-3.5, DeepSeekCoder, Magicoder, WizardCoder, and CodeLlama — all contemporary competitive models at time of writing.",
    197           "source": "haiku"
    198         },
    199         "ablation_study": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Section 5.4 and Tables 3–4 provide ablation by removing the visual feedback module, isolating its contribution.",
    203           "source": "haiku"
    204         },
    205         "multiple_metrics": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "The primary evaluation uses only the GPT-4V score (0-100); Table 3 uses code execution accuracy for the Qwen benchmark but this is a separate external benchmark, not applied consistently across main experiments.",
    209           "source": "haiku"
    210         },
    211         "human_evaluation": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Human annotators score model-generated plots; Section 3.2 uses k=100 sampled subsets to compute Pearson correlation between human and automatic scores.",
    215           "source": "haiku"
    216         },
    217         "held_out_test_set": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "MatPlotBench serves as a held-out test set; no LLMs are fine-tuned on it and data replacement was performed to mitigate memorization.",
    221           "source": "haiku"
    222         },
    223         "per_category_breakdown": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Table 3 provides Hard/Easy breakdown for the Qwen-Agent benchmark; however, no per-plot-type breakdown is provided for the main MatPlotBench results.",
    227           "source": "haiku"
    228         },
    229         "failure_cases_discussed": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Figure 5 presents a case study with an 'extremely challenging' example where all three models fail, and Section 5.5 explicitly discusses failure modes.",
    233           "source": "haiku"
    234         },
    235         "negative_results_reported": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Table 1 shows zero-shot CoT hurts or only negligibly helps most models; MatPlotAgent fails to improve CodeLlama-34B-Instruct (−2.36); both results are reported without downplaying.",
    239           "source": "haiku"
    240         }
    241       },
    242       "setup_transparency": {
    243         "model_versions_specified": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "GPT-4, GPT-3.5, and GPT-4V are referenced only by marketing name without snapshot dates or API version identifiers; open-source models have version numbers but commercial models do not.",
    247           "source": "haiku"
    248         },
    249         "prompts_provided": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Appendix A provides the complete prompts for automatic evaluation, query expansion, code generation, self-debugging, and the visual agent (Figures 6–10).",
    253           "source": "haiku"
    254         },
    255         "hyperparameters_reported": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Decoding temperature is set to 0.0 for all code LLMs and the maximum self-debugging iterations is set to 3; these are explicitly stated.",
    259           "source": "haiku"
    260         },
    261         "scaffolding_described": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 4 describes the three-module agentic scaffold (query expansion, code agent with iterative self-debugging, visual agent) with workflow diagram in Figure 3.",
    265           "source": "haiku"
    266         },
    267         "data_preprocessing_documented": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Section 3.1 documents the full benchmark construction pipeline: source selection, LLM-assisted query generation, data replacement, human modification, ground-truth figure generation, and final human verification.",
    271           "source": "haiku"
    272         }
    273       },
    274       "data_integrity": {
    275         "raw_data_available": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "The benchmark data (queries, raw data files, ground-truth figures) is released on GitHub.",
    279           "source": "haiku"
    280         },
    281         "data_collection_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Section 3.1 describes selection of 75 examples from Matplotlib Gallery and 25 from OriginLab GraphGallery with explicit criteria and modification procedures.",
    285           "source": "haiku"
    286         },
    287         "recruitment_methods_described": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "Appendix B states annotators were recruited from computer science departments at various universities via social media and were compensated above market rate.",
    291           "source": "haiku"
    292         },
    293         "data_pipeline_documented": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The complete pipeline from original example selection through five distinct stages (query generation, data replacement, human modification, ground-truth generation, human verification) is documented in Section 3.1.",
    297           "source": "haiku"
    298         }
    299       },
    300       "contamination": {
    301         "training_cutoff_stated": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No training data cutoff is stated for GPT-4, GPT-3.5, or any model; the paper acknowledges memorization but does not cite training cutoffs.",
    305           "source": "haiku"
    306         },
    307         "train_test_overlap_discussed": {
    308           "applies": true,
    309           "answer": true,
    310           "justification": "The paper explicitly notes 'memorization by GPT-4' as a motivation for data replacement in Matplotlib examples, directly addressing train/test overlap.",
    311           "source": "haiku"
    312         },
    313         "benchmark_contamination_addressed": {
    314           "applies": true,
    315           "answer": true,
    316           "justification": "Data replacement was performed for all 75 Matplotlib Gallery examples specifically because GPT-4 exhibited memorization; OriginLab examples were excluded from replacement because GPT-4 showed no memorization on them.",
    317           "source": "haiku"
    318         }
    319       },
    320       "human_studies": {
    321         "pre_registered": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "Human annotators are used for benchmark construction and metric validation, not as study participants; pre-registration is not applicable.",
    325           "source": "haiku"
    326         },
    327         "irb_or_ethics_approval": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "Annotation work for benchmark construction does not constitute human subjects research; IRB approval is not applicable.",
    331           "source": "haiku"
    332         },
    333         "demographics_reported": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "Not applicable; annotators are workers, not study participants.",
    337           "source": "haiku"
    338         },
    339         "inclusion_exclusion_criteria": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "Not applicable for annotation workers; minimum 3 years coding/NLP experience is mentioned as a practical requirement.",
    343           "source": "haiku"
    344         },
    345         "randomization_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "Not applicable for annotation workers.",
    349           "source": "haiku"
    350         },
    351         "blinding_described": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "Not applicable; human evaluators are scoring plots against ground truth, not participating in a controlled experiment.",
    355           "source": "haiku"
    356         },
    357         "attrition_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "Not applicable for annotation work.",
    361           "source": "haiku"
    362         }
    363       },
    364       "cost_and_practicality": {
    365         "inference_cost_reported": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No API cost, latency, or token count is reported for any experiment, despite multiple GPT-4 and GPT-4V API calls per benchmark example.",
    369           "source": "haiku"
    370         },
    371         "compute_budget_stated": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No total computational budget is mentioned for running the benchmark across seven models.",
    375           "source": "haiku"
    376         }
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "MatPlotAgent improves performance of various LLMs including GPT-4 (+12.30), GPT-3.5 (+9.48), and most open-source models",
    383       "evidence": "Table 1 shows absolute score improvements over direct decoding for 5 of 7 tested models",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "GPT-4V-based automatic evaluation strongly correlates with human evaluation (r=0.876)",
    388       "evidence": "Figure 2 and Section 3.2 report Pearson r=0.876, p=7.41e-33 for GPT-4 and r=0.836, p=2.67e-27 for GPT-3.5",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Visual feedback mechanism is necessary — removing it significantly degrades performance",
    393       "evidence": "Table 4 shows GPT-4 drops from 61.16 to 53.44 and GPT-3.5 from 47.51 to 41.57 without visual feedback",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Zero-shot chain-of-thought does not effectively improve code LLM performance on visualization tasks",
    398       "evidence": "Table 1 shows CoT degrades performance for 6 of 7 models, with only Deepseek-coder-33B showing improvement",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Open-source Magicoder-S-DS-6.7B with MatPlotAgent surpasses GPT-4 direct decoding",
    403       "evidence": "Table 1: Magicoder+MatPlotAgent scores 51.70 vs GPT-4 direct 48.86",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "MatPlotAgent generalizes across visual agents — Gemini Pro Vision also yields improvement",
    408       "evidence": "Table 2 shows +7.87 and +5.45 improvements for GPT-4 and GPT-3.5 with Gemini Pro Vision as visual agent",
    409       "supported": "moderate"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval",
    414     "case-study"
    415   ],
    416   "key_findings": "MatPlotAgent, a three-module agentic framework (query expansion, iterative code generation, visual feedback), consistently improves LLM performance on scientific data visualization, with gains over direct decoding of +12.30 points for GPT-4 and +9.48 points for GPT-3.5. The visual feedback mechanism is the key driver of improvement: removing it reduces scores by 7.72 points for GPT-4 and 5.94 points for GPT-3.5. Zero-shot chain-of-thought actually hurts performance on most models. The proposed GPT-4V-based automatic evaluation metric correlates strongly with human judgments (r=0.876), validating its use as a benchmark metric.",
    417   "red_flags": [
    418     {
    419       "flag": "Circular evaluation: GPT-4V evaluates systems that use GPT-4V",
    420       "detail": "GPT-4V serves as both the visual agent within MatPlotAgent and as the sole automatic evaluator for all experiments. This creates a potential circularity in which outputs shaped by GPT-4V feedback may receive inflated scores from the GPT-4V evaluator. The paper does not discuss or control for this bias."
    421     },
    422     {
    423       "flag": "Benchmark too small for reliable ranking",
    424       "detail": "With only 100 test examples, score differences of 1–12 points, and no confidence intervals, it is impossible to determine whether observed rankings are statistically meaningful."
    425     },
    426     {
    427       "flag": "No variance or confidence intervals anywhere",
    428       "detail": "All main results tables report single point estimates with no standard deviations, confidence intervals, or significance tests, making reliability of comparisons unknown."
    429     },
    430     {
    431       "flag": "Commercial model version underspecification",
    432       "detail": "GPT-4, GPT-3.5, and GPT-4V are referenced only by marketing names without snapshot dates or API version strings, making exact reproduction impossible."
    433     },
    434     {
    435       "flag": "No metric for core task success",
    436       "detail": "The paper uses only a subjective visual similarity score (0-100 from GPT-4V) with no code execution success rate as a primary metric in the main experiments, despite code execution success being a more objective measure."
    437     },
    438     {
    439       "flag": "Additional compute confound not addressed",
    440       "detail": "MatPlotAgent runs multiple passes (query expansion + code generation + visual feedback + additional debugging), while direct decoding is a single pass. The improvement could reflect more inference compute rather than the specific visual feedback mechanism."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    446       "relevance": "Prior benchmark for Matplotlib code generation that MatPlotBench directly extends and positions against"
    447     },
    448     {
    449       "title": "AgentBench: Evaluating LLMs as Agents",
    450       "relevance": "Related benchmark for evaluating LLM agent capabilities across diverse tasks"
    451     },
    452     {
    453       "title": "Teaching Large Language Models to Self-Debug",
    454       "relevance": "Self-debugging mechanism adopted directly in MatPlotAgent's code agent module"
    455     },
    456     {
    457       "title": "Communicative Agents for Software Development (ChatDev)",
    458       "relevance": "Related LLM-based coding agent framework for comparison and context"
    459     },
    460     {
    461       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    462       "relevance": "Key open-source baseline model evaluated in experiments"
    463     },
    464     {
    465       "title": "Magicoder: Source Code Is All You Need",
    466       "relevance": "Key open-source baseline that outperforms larger models; central to open-source findings"
    467     },
    468     {
    469       "title": "Code Llama: Open Foundation Models for Code",
    470       "relevance": "Tested baseline; the 34B variant notably fails to improve with MatPlotAgent"
    471     },
    472     {
    473       "title": "GPT-4 Technical Report",
    474       "relevance": "Primary commercial model backbone and evaluator; central to all experiments"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "Directly usable tool for researchers who need to automate data visualization; code is released and model-agnostic."
    481     },
    482     "surprise_contrarian": {
    483       "score": 2,
    484       "justification": "Finding that 6.7B open-source Magicoder with agent scaffolding beats GPT-4 direct decoding challenges the assumption that larger proprietary models always win; CoT hurting performance is also counterintuitive."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No AI safety or risk concerns raised; purely a productivity/capability paper."
    489     },
    490     "drama_conflict": {
    491       "score": 0,
    492       "justification": "No controversy or competing claims; straightforward benchmark and agent contribution."
    493     },
    494     "demo_ability": {
    495       "score": 3,
    496       "justification": "Code and benchmark are publicly released on GitHub; users can run MatPlotAgent on their own visualization queries immediately."
    497     },
    498     "brand_recognition": {
    499       "score": 2,
    500       "justification": "Tsinghua University (Liu lab) has high recognition in NLP/AI research; paper published at ACL 2024."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "39568622",
    507         "title": "ArtPrompt: ASCII Art-Based Jailbreak Attacks Against Aligned LLMs",
    508         "points": 145,
    509         "comments": 55,
    510         "url": "https://news.ycombinator.com/item?id=39568622",
    511         "created_at": "2024-03-02T00:30:06Z"
    512       },
    513       {
    514         "hn_id": "43761387",
    515         "title": "Should We Respect LLMs? A Study on Influence of Prompt Politeness on Performance",
    516         "points": 48,
    517         "comments": 105,
    518         "url": "https://news.ycombinator.com/item?id=43761387",
    519         "created_at": "2025-04-22T12:35:12Z"
    520       },
    521       {
    522         "hn_id": "43175799",
    523         "title": "The Influence of Prompt Politeness on LLM Performance",
    524         "points": 17,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=43175799",
    527         "created_at": "2025-02-25T18:53:26Z"
    528       },
    529       {
    530         "hn_id": "39449100",
    531         "title": "VoltSchemer: Use Voltage Noise to Manipulate a Wireless Charger",
    532         "points": 3,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=39449100",
    535         "created_at": "2024-02-21T01:15:44Z"
    536       },
    537       {
    538         "hn_id": "39552114",
    539         "title": "Should We Respect LLMs? Studying the Influence of Politeness on LLM Performance",
    540         "points": 1,
    541         "comments": 2,
    542         "url": "https://news.ycombinator.com/item?id=39552114",
    543         "created_at": "2024-02-29T17:07:42Z"
    544       }
    545     ],
    546     "top_points": 145,
    547     "total_points": 214,
    548     "total_comments": 162
    549   }
    550 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs