scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25305B)
      1 {
      2   "paper": {
      3     "title": "Self-Organized Agents: A LLM Multi-Agent Framework toward Ultra Large-Scale Code Generation and Optimization",
      4     "authors": ["Yoichi Ishibashi", "Yoshimasa Nishimura"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2404.02183",
      8     "doi": "10.48550/arXiv.2404.02183"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "The paper proposes Self-Organized Agents (SoA), a hierarchical multi-agent framework where Mother agents decompose tasks and spawn Child agents for sub-functions. Evaluated on HumanEval, SoA achieves 71.4% Pass@1 vs 66.5% for Reflexion using GPT-3.5-turbo. The analysis shows each SoA agent handles less code individually while total output is larger, but the evaluation only tests function-level problems, not the claimed ultra-large-scale code generation.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states 'Our code will be available at https://github.com/tsukushiAI/self-organized-agent' — this is a future promise, not a current release. Per criteria, a promise of future release counts as NO."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses HumanEval (Chen et al., 2021), a publicly available benchmark. No custom data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements files, or dependency details are provided. The paper only mentions using GPT-3.5-turbo."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The algorithmic description (Algorithm 1) is given but no runnable scripts, commands, or reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., 71.4% Pass@1) with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims 'SoA outperforms Reflexion by 5% in Pass@1' based solely on comparing two numbers (71.4% vs 66.5%) with no statistical test."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports the improvement with baseline context — SoA 71.4% vs Reflexion 66.5% (Table 1), a 4.9-percentage-point gap that the paper states as '5%'. The magnitude of improvement is clear from the comparison."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "HumanEval has 164 problems. No justification is given for why this sample size is sufficient for the claims made, nor any power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or multiple-run results are reported. Results appear to be from single runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 1 compares against AlphaCode, Incoder, Codex, Gemini Pro, CoT, ChatGPT, Self-Edit, and Reflexion — a comprehensive set of baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include Gemini Pro (2023), Reflexion (2023), and Self-Edit (2023), which were contemporary at the time of writing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The SoA framework has multiple components (Mother/Child hierarchy, self-organization, code modification protocol) but no ablation study isolates their individual contributions."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The only quality metric is Pass@1. The paper also shows code volume metrics (characters, tokens) in Figure 5, but these measure quantity not quality."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of generated code quality. Evaluation is entirely automated via unit test pass/fail. Human evaluation could have assessed code readability, maintainability, or design quality."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "HumanEval is a standard benchmark with predefined test cases. The agent generates internally-created unit tests for self-debugging, and final evaluation uses HumanEval's held-out test suite."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No breakdown by problem difficulty, category, or type. Only aggregate Pass@1 across all 164 HumanEval problems is reported."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No failure case analysis. The paper does not discuss which problems SoA fails on or why."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results are reported. Every comparison and analysis shows SoA performing better."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims SoA enables 'ultra large-scale code generation and optimization' and 'the overall code volume to be increased indefinitely.' The evidence only shows function-level HumanEval results — the scalability claim for large-scale codebases is unsupported."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims self-organization 'enables' superior code generation, but the comparison is confounded: SoA uses 1 unit test while Reflexion uses 6, and other framework-level differences are not controlled. No ablation isolates the causal contribution of self-organization."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Ultra Large-Scale Code Generation' but results are on HumanEval (individual function-level problems, ~10-50 lines each). The paper does not bound claims to the tested setting — individual Python functions with GPT-3.5-turbo."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed. The 5% improvement could stem from the different unit test configuration (1 vs 6), the additional iteration budget, or increased total compute rather than self-organization."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures Pass@1 on HumanEval (individual functions) but frames the contribution as 'ultra large-scale code generation and optimization.' The gap between small-function correctness and large-scale codebase generation is not acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper specifies 'GPT3.5-turbo' with a footnote 'gpt3.5-turbo-1106' — a specific model version."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No actual prompt text is provided. The paper describes the process (skeleton generation, code modification) but never shows the actual prompts sent to the LLM. It mentions 'following (Shinn et al., 2023), we provided a few-shot trajectory' without showing it."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Framework parameters are given (max iterations=8, max tree depth=2, unit tests: 6 for Reflexion, 1 for SoA) but LLM-level hyperparameters (temperature, top-p, max tokens) are not reported."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The Mother/Child agent architecture is described in detail in Section 3, including code generation and modification protocols, memory management, hierarchical delegation, and the self-organization process. Algorithm 1 provides pseudocode."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The evaluation methodology is described: HumanEval Python set, unit test creation (n randomly selected test cases), and for the analysis 'we removed comments and docstrings from the HumanEval results and focused on the number of characters and tokens of pure code.'"
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. Limitations are briefly mentioned in the final paragraph of the Conclusion (§6) — the schema requires 'a dedicated section or subsection' and states 'a single sentence buried in the conclusion does not count.'"
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The limitations mentioned in the conclusion are generic: 'performance may be affected by the choice of LLM,' 'evaluated on a limited set of programming tasks,' and 'communication and collaboration mechanisms… can be further optimized.' None of these identifies a specific threat to the validity of the reported results."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "While the conclusion mentions the framework has only been tested on limited tasks, the title and abstract claim 'ultra large-scale code generation' without explicitly bounding what the HumanEval results do NOT show about large-scale scenarios."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (per-problem results, agent outputs, generated code) is released. Only aggregate numbers are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data source is clearly stated: HumanEval benchmark (Chen et al., 2021), Python language set, following Reflexion's evaluation methodology with unit test construction process described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data source is a standard benchmark (HumanEval)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper does not fully document the pipeline from raw agent outputs to final Pass@1 numbers. It references Reflexion's methodology but does not detail how many problems were solved, aggregation steps, or handling of edge cases."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Both authors list their affiliation as TsukushiAI with email addresses."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The authors are affiliated with TsukushiAI, which may have commercial interest in multi-agent frameworks, but this is not discussed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided. The authors work for TsukushiAI, which may have commercial interests in the proposed framework."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The training data cutoff for gpt3.5-turbo-1106 is not stated. This is relevant since HumanEval (published 2021) predates the model."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GPT-3.5-turbo may have seen HumanEval problems during training. HumanEval was published in 2021 and is widely used, making contamination plausible."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HumanEval was published in 2021, well before GPT-3.5-turbo's training data was collected. The paper does not discuss this contamination risk at all."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or latency is reported. The paper notes GPT-4 was not used 'due to the high experimental cost required' but does not report the actual cost of the GPT-3.5-turbo experiments. Multi-agent systems inherently use more API calls."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget (API spend, wall-clock time, total tokens consumed) is reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or results across multiple seeds. Results appear to be from a single run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. It is unclear whether results represent a single run or an average."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Framework hyperparameters (tree depth=2, max iterations=8) appear selected without justification, and no search budget is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The configuration choices (tree depth=2, iterations=8, 1 unit test for SoA vs 6 for Reflexion) are stated but not justified. No explanation for why these specific values were chosen."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement and run their own framework against baselines. Some baseline numbers come from prior papers, but the Reflexion comparison appears to be their own run. Author-evaluation bias is not discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "SoA uses multiple agents making parallel LLM calls (more total compute than single-agent Reflexion), but performance is not reported as a function of compute budget. The additional compute cost of the multi-agent approach is not quantified or discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper claims to address 'ultra large-scale code generation' but evaluates on HumanEval, which tests individual function-level problems (typically 10-50 lines). The paper does not discuss whether HumanEval actually measures the claimed capability of large-scale code generation."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "SoA and Reflexion use entirely different scaffolding approaches AND different configurations (1 vs 6 unit tests). The paper attributes the 5% improvement to self-organization without controlling for the scaffold and configuration differences."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "HumanEval was published in 2021. GPT-3.5-turbo (November 2023 version) was trained on data collected well after HumanEval's publication. This temporal leakage is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks answer information. The self-debugging loop provides unit test feedback which is part of the design, but no discussion of potential leakage through this mechanism."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether HumanEval problems or similar problems appeared in GPT-3.5-turbo's training data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used (no canary strings, membership inference, or decontamination)."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SoA outperforms Reflexion by 5% in Pass@1 accuracy on HumanEval (71.4% vs 66.5%)",
    365       "evidence": "Table 1 (§4.1) shows Pass@1 comparisons across multiple baselines. SoA achieves 71.4% vs Reflexion's 66.5%.",
    366       "supported": "weak"
    367     },
    368     {
    369       "claim": "Each agent in SoA handles significantly less code than the single agent in Reflexion, yet the overall generated code is substantially greater",
    370       "evidence": "Figure 5 (§4.2) shows average character count and token count per function vs per final code for both SoA and Reflexion.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "The overall code volume can be increased indefinitely according to the number of agents while the amount of code managed by each agent remains constant",
    375       "evidence": "Theoretical argument in §3.3 and §4.2 analysis. Not empirically demonstrated beyond HumanEval's small functions.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Self-organized agents can generate code that functions well as a whole without needing to oversee the entire code",
    380       "evidence": "HumanEval Pass@1 results (Table 1, §4.1). However, HumanEval tests individual functions (not large-scale codebases), so the claim of scale is not validated.",
    381       "supported": "weak"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Unfair comparison configuration",
    387       "detail": "SoA uses 1 unit test while Reflexion uses 6 unit tests. This asymmetry could significantly affect results — fewer test constraints may make it easier for SoA to pass, or harder for self-debugging. The paper does not justify or analyze this difference."
    388     },
    389     {
    390       "flag": "Claims far exceed evidence",
    391       "detail": "Title claims 'Ultra Large-Scale Code Generation and Optimization' but evaluation is on HumanEval, which tests individual function-level problems (typically 10-50 lines). No large-scale code generation is demonstrated."
    392     },
    393     {
    394       "flag": "No error bars or statistical tests",
    395       "detail": "A 5% improvement (71.4% vs 66.5%) on 164 problems with no statistical significance test and no variance reporting. The difference may not be statistically significant."
    396     },
    397     {
    398       "flag": "No ablation study",
    399       "detail": "The framework has multiple components (Mother/Child hierarchy, self-organization, code modification feedback propagation) but no ablation determines which components contribute to the improvement."
    400     },
    401     {
    402       "flag": "Benchmark contamination risk",
    403       "detail": "HumanEval was published in 2021 and is widely available online. GPT-3.5-turbo (November 2023) may have been trained on HumanEval solutions. This contamination risk is not discussed."
    404     },
    405     {
    406       "flag": "Compute cost ignored",
    407       "detail": "Multi-agent SoA makes substantially more LLM API calls than single-agent Reflexion. Claiming superiority without reporting relative compute cost is misleading — the improvement may simply come from spending more tokens."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Reflexion: language agents with verbal reinforcement learning",
    413       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    414       "year": 2023,
    415       "relevance": "Key baseline for single-agent code generation with self-debugging, which SoA aims to surpass."
    416     },
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["Mark Chen", "Jerry Tworek"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduces HumanEval benchmark and Codex, foundational to LLM code generation evaluation."
    423     },
    424     {
    425       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    426       "authors": ["Sirui Hong", "Xiawu Zheng"],
    427       "year": 2023,
    428       "arxiv_id": "2308.00352",
    429       "relevance": "Multi-agent software development framework using role-based agent collaboration."
    430     },
    431     {
    432       "title": "Communicative agents for software development",
    433       "authors": ["Chen Qian", "Xin Cong"],
    434       "year": 2023,
    435       "arxiv_id": "2307.07924",
    436       "relevance": "ChatDev multi-agent framework for end-to-end software development."
    437     },
    438     {
    439       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    440       "authors": ["Dong Huang", "Qingwen Bu", "Jie M. Zhang"],
    441       "year": 2023,
    442       "arxiv_id": "2312.13010",
    443       "relevance": "Multi-agent code generation system with iterative testing, directly comparable approach."
    444     },
    445     {
    446       "title": "Self-collaboration code generation via ChatGPT",
    447       "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"],
    448       "year": 2023,
    449       "arxiv_id": "2304.07590",
    450       "relevance": "Multi-agent collaboration approach using ChatGPT for code generation."
    451     },
    452     {
    453       "title": "ReAct: Synergizing reasoning and acting in language models",
    454       "authors": ["Shunyu Yao", "Jeffrey Zhao"],
    455       "year": 2023,
    456       "relevance": "Foundational LLM agent framework combining reasoning and action."
    457     },
    458     {
    459       "title": "Self-Edit: Fault-aware code editor for code generation",
    460       "authors": ["Kechi Zhang", "Zhuo Li", "Jia Li"],
    461       "year": 2023,
    462       "relevance": "Self-debugging code generation method used as a baseline."
    463     },
    464     {
    465       "title": "Language agent tree search unifies reasoning acting and planning in language models",
    466       "authors": ["Andy Zhou", "Kai Yan"],
    467       "year": 2023,
    468       "arxiv_id": "2310.04406",
    469       "relevance": "Tree-search approach for LLM agents, related tree-structured agent methodology."
    470     },
    471     {
    472       "title": "LDB: A large language model debugger via verifying runtime execution step-by-step",
    473       "authors": ["Lily Zhong", "Zilong Wang", "Jingbo Shang"],
    474       "year": 2024,
    475       "arxiv_id": "2402.16906",
    476       "relevance": "LLM-based debugging approach for code generation, related self-improvement technique."
    477     }
    478   ]
    479 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs