scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27809B)
      1 {
      2   "paper": {
      3     "title": "SIDiffAgent: Self-Improving Diffusion Agent",
      4     "authors": ["Shivank Garg", "Ayush Singh", "Gaurav Kumar Nayak"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.02051"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "SIDiffAgent, a training-free multi-agent framework for text-to-image generation, achieves state-of-the-art VQA scores on GenAIBench (0.884) and DrawBench by combining prompt refinement, adaptive negative prompts, evaluation-driven editing, and a trajectory-based self-improvement memory. The framework yields a 16.77% VQA score improvement over the base Qwen-Image model across two episodes, and outperforms proprietary models like Imagen 3 by 5.36%. A human evaluation with 50 participants showed 69% preference for SIDiffAgent over T2I-Copilot, though inter-annotator agreement was only fair (Cohen's κ = 0.286).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available benchmarks: GenAIBench, DrawBench, DPG, and GenEval. Seeds from the datasets were used for generation."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions 8×A100 GPUs, vLLM, and LangGraph but does not provide a requirements.txt, Dockerfile, or detailed dependency list with library versions."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the method but not how to replicate the experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results in Tables 1, 8, and 9 are point estimates without confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims SIDiffAgent outperforms multiple baselines based solely on comparing VQA score numbers without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Percentage improvements over baselines are reported with context: e.g., '+8.73% over T2I-Copilot', '+5.36% over Imagen 3', '+15.70% over SD 3.5' (Section 5). Baseline values are available in Table 1."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for the benchmark sizes used or the 50-participant human evaluation sample size."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. All tables show single-run point estimates."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Table 1 compares against 14 methods including proprietary (Imagen 3, Recraft V3, FLUX1.1-pro, Midjourney, DALL-E 3) and open-source models plus T2I-Copilot."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include recent models: Imagen 3 (2024), Recraft V3 (2024), FLUX.1-dev (2024), T2I-Copilot (2025), Lumina-Image 2.0 (2025), Janus Pro-7B (2025)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 4.2 presents the Qwen-Image base model plus five ablation variants (Qwen-Agents, Qwen-Agentneg, Qwen-Agentvneg, SIDiffAgent, and SIDiffAgentep2), isolating contributions of agentic workflow, negative prompts, adaptive negative prompts, guidance, and self-improvement."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Results are reported on VQAScore across GenAIBench and DrawBench (Table 1), DPG benchmark (Table 8), and GenEval (Table 9), each with distinct evaluation criteria."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 5 reports a human evaluation with 50 participants from varied geographical backgrounds, showing 69% preference for SIDiffAgent over T2I-Copilot, with Cohen's κ = 0.286."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The evaluation uses standard public benchmarks (GenAIBench, DrawBench, DPG, GenEval) that are separate from any training data, and the guidance memory was built during Episode 1 and tested on Episode 2."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 breaks down GenAIBench into Basic/Advanced/Overall and subcategories (Attribute, Scene, Relation, Count, Differ, Compare, Logical, Negate, Universal). DrawBench also has per-category breakdown."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Appendix G discusses three types of failure cases with examples in Figures 4 and 5: conflicting memory guidance, iterative regeneration degradation, and unnecessary correction loops from rare attribute proposals."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix G explicitly discusses cases where the multi-agent framework leads to decreased image quality, and Figure 5 shows examples where iterative regeneration worsens results."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims 0.884 VQA score on GenAIBench, outperforming open-source, proprietary, and agentic methods. Table 1 shows SIDiffAgentep2 at 0.884 overall on GenAIBench, exceeding all baselines listed."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The ablation study (Section 4.2) uses controlled single-variable manipulation: each ablation adds one component (agents, negative prompts, adaptive negative prompts, guidance, episodes), providing adequate evidence for causal claims about component contributions."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title and abstract make broad claims about 'Self-Improving Diffusion Agent' without bounding to the tested Qwen model family and specific benchmarks. While Appendix B tests Flux-dev, the general framing overstates the evidence. The conclusion claims the framework demonstrates potential for 'scalable deployment in real-world creative and professional applications' based on benchmark evaluations."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative explanations for the improvements. For example, the gains could partly be attributed to the additional compute (2.31 min vs 0.78 min per image) rather than the agentic architecture, but this confound is not analyzed."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper uses VQAScore as the primary metric and claims 'superior text-to-image alignment and realism' and 'enhanced generation quality.' VQAScore is a proxy for human-perceived quality, and while a human eval is included, the paper does not discuss the gap between VQAScore and actual user satisfaction or real-world deployment quality."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 4.1 specifies 'Qwen-2.5-72B-VL' with a HuggingFace link, and names Qwen-Image, Qwen-Edit, and Qwen-Embedding-0.6B. The models used are identifiable specific versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix K provides full prompt text for all agents and sub-agents: Creativity Analysis, Intention Analysis, Prompt Refinement, Adaptive Negative Prompt, Evaluation Agent, Trajectory Analysis, and Guidance generation prompts."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix J reports key hyperparameters: evaluation threshold 8.0, guidance extraction threshold 200 samples, similarity search k=5, max edits ≤2, guidance scale 4.0, negative prompt weight 1.0."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3 provides detailed description of the multi-agent architecture: AORC with 5 sub-agents, AEVAL, AGUID, including workflow diagrams (Figure 2), the editing loop, memory/RAG system, and the full algorithm in Appendix L."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 4.1 describes the setup: generation seeds from the dataset, 4-bit quantization for Qwen-Image/Edit, vLLM hosting, SQLite storage for trajectories, FAISS indexing. The benchmarks are used as-is without modification."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Appendix C is a dedicated 'Limitations' section discussing missing comparisons with proprietary models, lack of human evaluation for generated images, computational overhead of multiple sub-agents, and privacy considerations of stored prompts."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix C raises specific threats: not benchmarking against NanoBanana, GPT-Image, and Flux-Kontext-Pro due to resource constraints; the framework's reliance on multiple LLM calls increasing overhead; and privacy risks from stored trajectory prompts."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the specific benchmarks tested or acknowledge that the framework has only been validated on the Qwen family (with a brief Flux-dev test). The conclusion broadly claims potential for 'real-world creative and professional applications.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw generated images, evaluation scores, or trajectory data are released for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 4.1 describes using GenAIBench and DrawBench benchmarks with provided generation seeds, and the human evaluation involved 50 participants from varied geographical backgrounds."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The human evaluation mentions '50 participants from varied geographical backgrounds' but does not describe how they were recruited, what platforms were used, or whether this introduces selection bias."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline is documented: input prompt → SCRE → SINT → SREF → SNEG → SGEN → AEVAL → editing loop → AGUID trajectory storage. Algorithm 1 in Appendix L formalizes the complete flow."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information, acknowledgments section, or grant numbers are provided in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors are listed as affiliated with Indian Institute of Technology, Roorkee. They are not evaluating their own institution's product."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is provided in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state the training data cutoff for Qwen-2.5-72B-VL or the Qwen-Image models, despite evaluating on public benchmarks."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether GenAIBench, DrawBench, DPG, or GenEval prompts appeared in the training data of the Qwen models."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "DrawBench (2022), GenEval (2023), and DPG (2024) were all published before Qwen-2.5-VL's training. No discussion of contamination risk."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No pre-registration is mentioned for the human evaluation study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics approval is mentioned for the 50-participant human evaluation."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "Only 'varied geographical backgrounds' is mentioned. No details on age, expertise, experience with image generation, or other demographics."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No inclusion or exclusion criteria for participants are described."
    262       },
    263       "randomization_described": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No description of how image pairs were presented to evaluators or whether presentation order was randomized."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No mention of whether participants knew which images came from SIDiffAgent vs T2I-Copilot."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No information about whether any participants dropped out or how many evaluations each participant completed."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table 2 (Appendix A) reports inference time per prompt: Qwen-Image 0.78 min, T2I-Copilot 1.50 min, SIDiffAgent 2.31 min on an A6000 GPU at 1024×1024 resolution."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 4.1 states the hardware: 8×80GB NVIDIA A100 GPUs, 4 for the VLM and 4 for generation. Qwen-Image/Edit require ~47GB VRAM under 4-bit quantization."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper uses seeds 'provided in the dataset' for reproducibility but does not report results across multiple random seeds to assess sensitivity."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not state how many runs produced the reported results. It appears to be a single run per benchmark."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Hyperparameters like threshold=8.0, k=5, and guidance extraction threshold=200 are reported but no search budget or tuning methodology is described."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Table 5 reports VQA scores for different k values (3, 5, 7, 10), showing k=5 was selected as best. Table 4 evaluates retrieval quality across k values. The selection is transparent."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, let alone multiple comparison corrections, despite comparing against 14+ baselines."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors compare their system against baselines using results from T2I-Copilot's paper for most baselines but do not acknowledge potential bias in evaluating their own system."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Table 2 reports a latency comparison: SIDiffAgent takes 2.31 min vs 0.78 min for base Qwen-Image (roughly 3x the latency). The paper discusses the cost-quality tradeoff in Appendix A."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper uses VQAScore as the primary metric because Imagen3 identified it as more human-aligned than CLIPScore/PickScore/ImageReward, but does not independently discuss whether VQAScore actually measures text-to-image alignment quality."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "SIDiffAgent uses Qwen-Image as the base generator within a complex multi-agent scaffold, but comparisons with other models (Imagen 3, DALL-E 3, etc.) compare different models in different scaffolds. The paper does not address this confound—improvements could be due to the scaffold rather than the approach's novelty."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "DrawBench (2022), GenEval (2023), DPG (2024) all predate the Qwen models used. The Qwen VLM evaluator may have seen benchmark prompts during training. This is not discussed."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The pipeline uses Qwen-2.5-72B-VL as its reasoning backbone, so the VLM is effectively part of the generation system. The VLM may have encoded knowledge about benchmark-expected outputs. This is not discussed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether benchmark prompts share structural similarities with Qwen's training data."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "SIDiffAgent achieves an average VQA score of 0.884 on GenAIBench, outperforming all baselines.",
    364       "evidence": "Table 1 shows SIDiffAgentep2 achieves 0.884 overall on GenAIBench, compared to 0.813 for T2I-Copilot and 0.839 for Imagen 3 (task completion rate). Section 5 notes this is from the Episode 2 run.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "SIDiffAgent outperforms Imagen 3 by 5.36% in VQA Score.",
    369       "evidence": "Table 1: SIDiffAgent (ep1) achieves 0.852 DrawBench overall vs Imagen 3's 0.839. The 5.36% figure is from GenAIBench (0.884 vs 0.839). However the Imagen 3 comparison uses task completion rate scores from a different evaluation setup.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "The self-improvement mechanism (Episode 2) yields the largest gains, achieving +16.77% improvement over Qwen-Image.",
    374       "evidence": "Table 1 ablation: Qwen-Image 0.757 → SIDiffAgentep2 0.884 on GenAIBench overall. This is a 16.8% relative improvement. The ablation progression is shown clearly.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "SIDiffAgent is preferred by humans in 69% of cases compared to 31% for T2I-Copilot.",
    379       "evidence": "Section 5 reports 50 participants, but Cohen's κ = 0.286 (only 'fair' agreement). No details on study design, blinding, or participant selection. The low κ undermines confidence in the preference figure.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "The framework generalizes beyond the Qwen model family to Flux-dev.",
    384       "evidence": "Table 3 (Appendix B) shows Flux-dev improving from 0.775 to 0.865 across episodes on DrawBench. Only one alternative model tested on one benchmark.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "Memory generalizes to unseen datasets (DrawBench performance using GenAI-Bench memory).",
    389       "evidence": "Table 6 shows GenAI-memory achieves 0.8725 on DrawBench vs Episode-1 0.860 and DrawBench-native memory 0.901. A single benchmark pair is tested.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "No statistical tests on any comparison",
    396       "detail": "All claims of outperformance are based on comparing point estimates without confidence intervals, error bars, or significance tests. With single-run results, observed differences could be within noise."
    397     },
    398     {
    399       "flag": "Scaffold confound in cross-model comparisons",
    400       "detail": "SIDiffAgent wraps Qwen-Image in a complex multi-agent scaffold with evaluation, editing, and memory. Comparisons with Imagen 3, DALL-E 3, and other models compare different models in different scaffolds. The improvement may be primarily from the iterative refinement and editing loop (which any model could benefit from) rather than the novel components."
    401     },
    402     {
    403       "flag": "Self-evaluating system",
    404       "detail": "Qwen-2.5-72B-VL serves as both the reasoning backbone in the pipeline AND part of the system being evaluated. The AEVAL agent uses the same VLM to score images that the pipeline produced, creating a circular evaluation dynamic."
    405     },
    406     {
    407       "flag": "Weak human evaluation design",
    408       "detail": "The human evaluation (50 participants, κ=0.286) has no described recruitment method, no demographics, no blinding, no randomization, and low inter-annotator agreement. A κ of 0.286 is 'fair' at best and insufficient for strong claims."
    409     },
    410     {
    411       "flag": "Baseline results taken from another paper",
    412       "detail": "Results for most baselines are taken from T2I-Copilot (Chen et al., 2025a) rather than reproduced. While SIDiffAgent used identical seeds, differences in evaluation setup could affect comparability."
    413     },
    414     {
    415       "flag": "No contamination analysis",
    416       "detail": "The benchmarks (DrawBench 2022, GenEval 2023, DPG 2024) predate the Qwen models. The Qwen VLM evaluator may have seen these benchmarks during training, potentially inflating VQAScore evaluations."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "T2I-Copilot: A Training-Free Multi-Agent Text-to-Image System for Enhanced Prompt Interpretation and Interactive Generation",
    422       "authors": ["C.-Y. Chen", "M. Shi", "G. Zhang", "H. Shi"],
    423       "year": 2025,
    424       "arxiv_id": "2507.20536",
    425       "relevance": "Prior agentic framework for text-to-image generation that SIDiffAgent directly compares against and improves upon."
    426     },
    427     {
    428       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    429       "authors": ["S. Hong", "M. Zhuge", "J. Chen"],
    430       "year": 2024,
    431       "relevance": "Multi-agent LLM framework for software development, relevant to agentic AI architectures."
    432     },
    433     {
    434       "title": "ChatDev: Communicative Agents for Software Development",
    435       "authors": ["C. Qian", "W. Liu", "H. Liu"],
    436       "year": 2023,
    437       "arxiv_id": "2307.07924",
    438       "relevance": "Multi-agent system for code generation through communicative agents, relevant to agentic AI workflows."
    439     },
    440     {
    441       "title": "Why Do Multi-Agent LLM Systems Fail?",
    442       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    443       "year": 2025,
    444       "arxiv_id": "2503.13657",
    445       "relevance": "Analysis of failure modes in multi-agent LLM systems, directly relevant to understanding agentic system limitations."
    446     },
    447     {
    448       "title": "Automated Design of Agentic Systems",
    449       "authors": ["S. Hu", "C. Lu", "J. Clune"],
    450       "year": 2025,
    451       "arxiv_id": "2408.08435",
    452       "relevance": "Meta-agent approach to improving task-specific agents through automated exploration of design spaces."
    453     },
    454     {
    455       "title": "A Self-Improving Coding Agent",
    456       "authors": ["M. Robeyns", "M. Szummer", "L. Aitchison"],
    457       "year": 2025,
    458       "arxiv_id": "2504.15228",
    459       "relevance": "Self-improving coding agent that can edit its own codebase, relevant to self-improvement in agentic systems."
    460     },
    461     {
    462       "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery",
    463       "authors": ["A. Novikov"],
    464       "year": 2025,
    465       "arxiv_id": "2506.13131",
    466       "relevance": "Meta-agent system that uses coding agents for discovery, relevant to self-improving AI agent architectures."
    467     },
    468     {
    469       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Language Model Society",
    470       "authors": ["G. Li", "H. Hammoud", "H. Itani"],
    471       "year": 2023,
    472       "relevance": "Multi-agent communication framework for LLMs, foundational work on collaborative agent systems."
    473     },
    474     {
    475       "title": "Large Language Model Agent: A Survey on Methodology, Applications and Challenges",
    476       "authors": ["J. Luo", "W. Zhang", "Y. Yuan"],
    477       "year": 2025,
    478       "arxiv_id": "2503.21460",
    479       "relevance": "Comprehensive survey on LLM agents covering methodology and applications."
    480     },
    481     {
    482       "title": "GenAI-Bench: A Holistic Benchmark for Compositional Text-to-Visual Generation",
    483       "authors": ["B. Li", "Z. Lin", "D. Pathak"],
    484       "year": 2024,
    485       "relevance": "Primary evaluation benchmark used in this paper for assessing text-to-image generation quality."
    486     },
    487     {
    488       "title": "Evaluating Text-to-Visual Generation with Image-to-Text Generation (VQAScore)",
    489       "authors": ["Z. Lin", "D. Pathak", "B. Li"],
    490       "year": 2024,
    491       "relevance": "Defines VQAScore, the primary evaluation metric used throughout this paper."
    492     },
    493     {
    494       "title": "AgentSquare: Automatic LLM Agent Search in Modular Design Space",
    495       "authors": ["Y. Shang", "Y. Li", "K. Zhao"],
    496       "year": 2024,
    497       "arxiv_id": "2410.06153",
    498       "relevance": "Automated modular agent design, relevant to understanding agentic system architectures."
    499     }
    500   ]
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs