ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25049B)


      1 {
      2   "paper": {
      3     "title": "TextResNet: Decoupling and Routing Optimization Signals in Compound AI Systems via Deep Residual Tuning",
      4     "authors": ["Suizhi Huang", "Mei Li", "Han Yu", "Xiaoxiao Li"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.08306"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "TextResNet introduces a framework for decoupling and routing textual gradients in Compound AI Systems via four innovations: Additive Semantic Deltas, Semantic Projector, Causal Routing, and Density-Aware Scheduling. On HotpotQA, it surpasses TextGrad by +21.37 F1 (46.23 vs 24.86). The framework achieves ~3x token efficiency (21k vs 63k tokens) while maintaining stability in deep chains where baselines collapse. Ablation studies confirm each component contributes monotonically to performance.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "A GitHub URL is provided in the abstract: https://github.com/JeanDiable/TextResNet."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available benchmarks: HotpotQA, BigCodeBench (HuggingFace), PubMedQA, and STaRK-PRIME (HuggingFace). Data sources and splits are documented in Appendix B.1."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper lists models used but not library versions or dependencies."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper. While hyperparameters and configurations are detailed in Appendix B, there are no specific commands or scripts referenced for reproducing results."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Table 1 reports ± notation for all methods across all benchmarks (e.g., '46.23 ± 1.15' for TextResNet on HotpotQA)."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests (p-values, t-tests, etc.) are reported despite claims that TextResNet 'outperforms' baselines. Comparisons rely solely on comparing means with standard deviations."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports absolute improvements with baseline context (e.g., '+21.37 F1' over TextGrad on HotpotQA, from 24.86 to 46.23). This provides sufficient context for understanding magnitude."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for the choice of train/dev/test split sizes (e.g., 1000/250/100 for HotpotQA, 500/25/70 for BigCodeBench). No power analysis."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Table 1 reports standard deviations across runs (e.g., ± 1.15). Appendix B.4 states 'Test repeats: 3' and 'Repeated dev trials: 3'."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Five baselines are compared: CoT, HBC, DSPy (MIPRO), TextGrad, and TextGrad+Sum (Table 1)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "TextGrad (2025), DSPy/MIPRO (2024), and TextGrad+Sum (2025a) are recent. OPTIMAS (2025) is discussed but excluded from comparison with justification."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 2 provides a progressive ablation study incrementally adding each of the four components (Res, Proj, Route, Sched) on HotpotQA and BigCodeBench."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Four different metrics across benchmarks: F1 (HotpotQA), Pass% (BigCodeBench), Accuracy (PubMedQA), MRR (STaRK-PRIME)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation is included. All evaluation is automated (F1, pass rate, accuracy, MRR)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Appendix B.1 describes explicit train/dev/test splits for all four benchmarks. The dev set is used for validation during optimization, and separate test sets are used for final evaluation."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per benchmark (Table 1). Additional per-component analysis in Figures 4-6 and per-scheduling-strategy breakdowns in Figure 9."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 1 (Figure 1) provides detailed failure case analysis showing three failure modes (Signal Blockage, Downstream Over-correction, Upstream Pollution). Section 6.3 tests robustness under batch shuffling intervention."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "TextGrad+Sum is shown to often underperform vanilla TextGrad (Section 5.2). Figure 7 shows TextGrad+Sum performs worst in deep chains. Ablation variants show degraded performance."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims of 'superior performance' and 'remarkable stability' are supported by Table 1 (performance) and Figure 7 (stability across chain depths)."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims about each component's contribution are supported by controlled ablation studies (Table 2) with single-variable manipulation. The batch shuffling intervention (Section 6.3) provides a counterfactual test of causal attribution."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims optimization for 'Compound AI Systems' broadly, but results are on four specific benchmarks with specific pipeline configurations. The paper does not bound its generalization claims to these tested settings. Appendix G discusses limitations of window control but not generalization boundaries."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No substantive discussion of alternative explanations. For instance, the gains could partly come from the additional LLM calls for the Semantic Projector, or from the structured prompt format rather than the routing mechanism. Appendix G discusses limitations but not alternative explanations for the observed results."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures specific metrics (F1, pass rate, accuracy, MRR) on specific benchmarks and frames claims at the level of these metrics. It does not overclaim broader capabilities beyond the measured proxies."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Table 4 specifies exact model names: 'openai/gpt-4o-mini', 'anthropic/claude-3-haiku', 'anthropic/claude-3-haiku-20240307'. These include version identifiers."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix C provides full prompt templates for the Backward Semantic Projector (Listing 1), Prompt Optimizer (Listing 2), and auxiliary formatting constraints including the Causal Routing and Context Construction templates."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Tables 3-6 provide comprehensive hyperparameter details: temperatures, max tokens, batch sizes, training steps, evaluation frequency, random seed, and resource limits for code execution."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The full pipeline architecture is described in detail: Appendix B.2 documents all four system pipelines with component names and dataflow. Figure 2 provides an architectural overview. The three-stage optimization process is thoroughly described in Section 4."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix B.1 documents data splits, sampling procedures, and preprocessing for each benchmark. Appendix B.4 describes trace/stderr truncation limits and retrieval top-k settings."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Appendix G 'Limitations and Future Directions' provides a dedicated limitations discussion covering context window control limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "Appendix G discusses only one limitation (window control) and frames it as a future research direction. It does not discuss threats specific to the experimental validity of the current results (e.g., limited benchmark diversity, reliance on specific LLMs, sensitivity to prompt formatting)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the tested pipeline configurations, models, or benchmark types."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (individual predictions, per-example scores, optimization trajectories) is made available for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix B.1 describes how data is sourced for each benchmark (HuggingFace datasets, standard splits, random seed-based split generation)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data comes from standard benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Appendix B.1 documents the pipeline from data source to final splits including split ratios, random seeds, and subset sizes. Appendix B.4-B.6 document the forward execution pipeline."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Nanyang Technological University, Shanghai Jiao Tong University, University of British Columbia, Vector Institute."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses GPT-4o-mini and Claude 3 Haiku but does not state their training data cutoff dates. These models may have been trained on data including some benchmark solutions."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of potential overlap between model training data and benchmark test sets. HotpotQA (2018) and PubMedQA (2019) are old benchmarks that likely appear in LLM training data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of benchmark contamination risk despite using well-known benchmarks (HotpotQA, PubMedQA) that predate the models' training cutoffs."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Figure 10 reports token consumption: TextResNet uses ~21k feedback tokens vs TextGrad's ~63k tokens over 100 steps. Token efficiency is analyzed in Appendix F.4."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total API cost, GPU hours, or wall-clock time is reported. Only feedback token counts are shown, which is a partial cost measure (does not include forward pass tokens or API pricing)."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Table 3 states 'Random seed: 42' (single seed). While ± values are reported in Table 1, these appear to be from 3 test repeats (Table 3), not multiple seeds. No seed sensitivity analysis."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Table 3 states 'Test repeats: 3' and 'Repeated dev trials: 3'."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The Boltzmann temperature τ and other hyperparameters appear tuned but no search process is described."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Appendix B.3 describes validation-based selection: 'evaluate on a fixed held-out validation set every two optimization steps, and select the best-performing prompt.'"
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No significance tests are performed at all, let alone corrections for multiple comparisons across four benchmarks and multiple baselines."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors implement their own framework and compare against their own implementations of baselines. No acknowledgment of self-comparison bias per Lucic et al. (2018). OPTIMAS is excluded from comparison with a justification that could be seen as avoiding a harder baseline."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 10(b) directly compares total feedback token consumption vs final F1 performance for TextResNet vs TextGrad."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether HotpotQA, BigCodeBench, PubMedQA, or STaRK-PRIME actually measure the claimed capability of 'optimization in Compound AI Systems.' The benchmarks are used because OPTIMAS used them, without questioning construct validity."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Section 5.1 states 'we adopt the SCG configurations of OPTIMAS to ensure topological consistency.' All methods use the same pipeline architecture, controlling for scaffold differences."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage despite using benchmarks (HotpotQA 2018, PubMedQA 2019) created well before the models' training data was collected."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information through context that wouldn't be available in real usage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether train/dev/test splits share structural similarities or whether examples are truly independent."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention methods are applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "TextResNet surpasses TextGrad by +21.37 F1 on HotpotQA",
    364       "evidence": "Table 1: TextResNet 46.23 ± 1.15 vs TextGrad 24.86 ± 1.19 on HotpotQA F1",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "TextResNet achieves 37.86% pass rate on BigCodeBench, improving over DSPy (33.81%) and TextGrad (35.71%)",
    369       "evidence": "Table 1: TextResNet 37.86 ± 0.45 vs DSPy 33.81 ± 2.75 and TextGrad 35.71 ± 0.10",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "TextResNet maintains performance stability as chain depth increases while TextGrad collapses",
    374       "evidence": "Figure 7 shows TextResNet stable from L=5 to L=20 while TextGrad degrades. However, the experiment uses synthetic 'Identity Node' padding.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "TextResNet consumes approximately 3x fewer tokens than TextGrad while achieving nearly double the performance",
    379       "evidence": "Figure 10(b): 21,909 vs 63,073 total feedback tokens; F1 46.23 vs 24.86",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Each of the four components contributes monotonically to performance",
    384       "evidence": "Table 2 ablation: 24.86 → 32.15 → 36.69 → 37.71 → 46.23 F1 on HotpotQA as components are added",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "TextResNet correctly identifies batch-shuffled upstream errors in 96% of cases",
    389       "evidence": "Figure 5 and Section 6.3: counterfactual batch shuffling intervention on HotpotQA",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "OPTIMAS excluded from comparison",
    396       "detail": "The most relevant baseline (OPTIMAS, which uses the same benchmark suite and pipeline configurations) is excluded from Table 1 with justification about 'fundamental divergence in supervision.' However, OPTIMAS results would provide important context. The stated reason (RL-based training vs training-free) is a valid methodological difference, but readers cannot assess relative performance."
    397     },
    398     {
    399       "flag": "No significance tests",
    400       "detail": "Claims of 'outperformance' across all benchmarks rely on comparing means ± std without any statistical tests. Some improvements are within overlapping error ranges (e.g., PubMedQA: 60.31 ± 1.51 vs DSPy 60.26 ± 0.40)."
    401     },
    402     {
    403       "flag": "Single random seed",
    404       "detail": "All experiments use seed=42 with only 3 test repeats. No seed sensitivity analysis despite the stochastic nature of LLM-based optimization."
    405     },
    406     {
    407       "flag": "Synthetic depth scalability test",
    408       "detail": "The depth scalability experiment (Figure 7) adds artificial 'Identity Nodes' to extend chains. This may not represent real-world deep chains where each node has meaningful computation."
    409     },
    410     {
    411       "flag": "No contamination analysis",
    412       "detail": "HotpotQA (2018) and PubMedQA (2019) are old benchmarks. GPT-4o-mini and Claude 3 Haiku were trained after these benchmarks were published. Contamination could differentially affect methods that better exploit memorized information."
    413     },
    414     {
    415       "flag": "Marginal improvements on some benchmarks",
    416       "detail": "On PubMedQA (60.31 vs 60.26) and STaRK-PRIME (41.75 vs 41.40), improvements over DSPy are within error bars. The strong narrative is driven primarily by HotpotQA results."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Optimizing generative AI by backpropagating language model feedback",
    422       "authors": ["Mert Yuksekgonul", "Federico Bianchi", "Joseph Boen"],
    423       "year": 2025,
    424       "relevance": "TextGrad — the primary baseline; pioneered textual differentiation for compound AI systems."
    425     },
    426     {
    427       "title": "The shift from models to compound AI systems",
    428       "authors": ["Matei Zaharia", "Omar Khattab"],
    429       "year": 2024,
    430       "relevance": "Defines the Compound AI Systems paradigm that TextResNet optimizes."
    431     },
    432     {
    433       "title": "Demonstrate-search-predict: Composing retrieval and language models for knowledge-intensive NLP",
    434       "authors": ["Omar Khattab"],
    435       "year": 2022,
    436       "arxiv_id": "2212.14024",
    437       "relevance": "DSPy framework for programmatic prompt compilation, used as a baseline."
    438     },
    439     {
    440       "title": "OPTIMAS: Optimizing compound AI systems with globally aligned local rewards",
    441       "authors": ["Shuai Wu"],
    442       "year": 2025,
    443       "arxiv_id": "2507.03041",
    444       "relevance": "RL-based compound AI optimization; provides the pipeline configurations used in this paper."
    445     },
    446     {
    447       "title": "Why do multi-agent LLM systems fail?",
    448       "authors": ["Mert Cemri"],
    449       "year": 2025,
    450       "arxiv_id": "2503.13657",
    451       "relevance": "Analyzes failure modes in multi-agent LLM systems, directly relevant to attribution ambiguity."
    452     },
    453     {
    454       "title": "Reflexion: Language agents with verbal reinforcement learning",
    455       "authors": ["Noah Shinn"],
    456       "year": 2023,
    457       "relevance": "Verbal self-reflection approach for LLM agents; related optimization paradigm."
    458     },
    459     {
    460       "title": "Self-refine: Iterative refinement with self-feedback",
    461       "authors": ["Aman Madaan"],
    462       "year": 2023,
    463       "relevance": "Self-refinement via LLM feedback; baseline approach for iterative improvement."
    464     },
    465     {
    466       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    467       "authors": ["Shunyu Yao"],
    468       "year": 2023,
    469       "relevance": "Structured reasoning for LLMs; related compound reasoning approach."
    470     },
    471     {
    472       "title": "Automated design of agentic systems",
    473       "authors": ["Shengran Hu"],
    474       "year": 2024,
    475       "arxiv_id": "2408.08435",
    476       "relevance": "Automated agentic system design, related to compound AI system optimization."
    477     },
    478     {
    479       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    480       "authors": ["Terry Yue Zhuo"],
    481       "relevance": "One of the four evaluation benchmarks used in this paper."
    482     },
    483     {
    484       "title": "Are more LLM calls all you need? Towards the scaling properties of compound AI systems",
    485       "authors": ["Lingjiao Chen"],
    486       "year": 2024,
    487       "relevance": "Studies scaling properties of compound AI systems; directly relevant to optimization challenges."
    488     }
    489   ]
    490 }

Impressum · Datenschutz