ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18724B)


      1 {
      2   "paper": {
      3     "title": "Specifications: The missing link to making the development of LLM systems an engineering discipline",
      4     "authors": ["Ion Stoica", "Matei Zaharia", "Joseph Gonzalez", "Ken Goldberg", "Koushik Sen", "Hao Zhang", "Anastasios N. Angelopoulos", "Shishir G. Patil", "Lingjiao Chen", "Wei-Lin Chiang", "Jared Q. Davis"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2412.05299",
      8     "doi": "10.48550/arXiv.2412.05299"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical"],
     13   "key_findings": "The paper argues that clear specifications (both statement and solution specifications) are the key enabler for five engineering properties — verifiability, debuggability, modularity, reusability, and automated decision making — in LLM-based systems. It proposes a framework distinguishing statement specifications (what a task should do) from solution specifications (how to verify outputs), and outlines research directions including iterative disambiguation, proof-carrying outputs, process supervision, and structured outputs. The paper draws analogies from automotive and software engineering to argue that reducing specification ambiguity will enable the transition from monolithic to modular LLM systems.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository or archive is mentioned. The paper is theoretical but could have released illustrative code or frameworks."
     20       },
     21       "data_released": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a theoretical position paper with no dataset to release."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No experiments are conducted, so no environment specification is needed."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "No experiments to reproduce; this is a conceptual framework paper."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No experiments or quantitative results are reported."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative empirical claims requiring statistical tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative results are reported."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Theoretical paper with no samples."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are conducted."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No empirical evaluation is conducted that would require baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No baselines used; theoretical paper."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation metrics used; conceptual framework paper."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No datasets or test sets used."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No quantitative results to break down."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 1 discusses real-world LLM failure cases (wrong policy, wrong offer, wrong health advice, bad code, data leakage, security vulnerability, legal misinformation) with descriptions and proposed fixes."
    104       },
    105       "negative_results_reported": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "No experiments from which to report negative results."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims the paper discusses progress through structured outputs, process supervision, and test-time compute, and outlines future directions. The paper body delivers on these claims in Sections 7-8."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims like 'specifications enable' modularity, verifiability, etc. (Section 4, Figure 1), and that 'lack of clear specifications makes it challenging to build reliable systems' (Section 1). These causal claims are argued by analogy to other engineering disciplines (automotive, software) but no empirical evidence is provided to demonstrate the causal link in the LLM context."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes very broad claims about LLM systems generally. The title claims specifications are 'the missing link' for the entire field of LLM development, but the arguments are primarily by analogy to other disciplines without bounding which types of LLM systems or tasks these arguments apply to."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Appendix D addresses several alternative viewpoints: whether rapid LLM progress makes modularity unnecessary, whether human ambiguity tolerance already suffices, and whether ambiguity is inherent to the tasks rather than the specifications. Each is addressed substantively."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "Theoretical paper with no measurements."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models are used in experiments. The paper shows example ChatGPT interactions in figures but these are illustrative, not experimental."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting experiments are conducted."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No experiments requiring hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No data preprocessing; theoretical paper."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No limitations section or subsection is present. The paper has a Summary (Section 9) but does not discuss limitations of the proposed framework."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The paper does not address potential weaknesses of the specification-centric framework."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what its framework does NOT cover or where the analogy to traditional engineering may break down. Some tasks are acknowledged as inherently ambiguous (e.g., 'Write a poem') but no systematic scope boundary is drawn."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No empirical data collected; theoretical paper."
    187       },
    188       "data_collection_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No data collection; theoretical paper."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No participants or samples recruited."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No data pipeline; theoretical paper."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: UC Berkeley, UC San Diego, Stanford University, Microsoft Research."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. Several authors are affiliated with companies that build LLM systems (e.g., co-author Lingjiao Chen is at Microsoft Research), and the paper advocates for approaches that align with industry product directions."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. Several authors have commercial interests (e.g., Matei Zaharia co-founded Databricks, Ion Stoica co-founded Databricks and Anyscale) that are relevant to the paper's advocacy for modular LLM systems."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation conducted."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation conducted."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Theoretical paper; no method with inference cost."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Theoretical paper; no computation performed."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Specifications (statement and solution) are the foundation enabling five key engineering properties: verifiability, debuggability, modularity, reusability, and automated decision making.",
    296       "evidence": "Argued by analogy to automotive engineering (Section 2, Figure 1) and software engineering (Sections 3-4, SQL query engine example in Figure 4). No empirical validation.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The lack of clear specifications is significantly hindering the growth of the LLM ecosystem.",
    301       "evidence": "Supported by Table 1 listing six real-world LLM failure cases (Air Canada chatbot, Chevrolet chatbot, health advice, code hallucinations, data leakage, legal misinformation). These are anecdotal examples, not systematic evidence.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "LLM prompts trade unambiguity for ease of specification in natural language, and this ambiguity can be reduced through techniques like iterative disambiguation, domain-specific rules, and structured outputs.",
    306       "evidence": "Section 7 proposes these techniques with illustrative examples (Figures 3, 7, 8) but provides no empirical evaluation of their effectiveness.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Modular AI systems offer advantages in accuracy, cost, speed of improvement, and integration with software systems compared to monolithic approaches.",
    311       "evidence": "Appendix B argues these points conceptually, citing FrugalML, FrugalGPT, and the compound AI systems blog post, but provides no new empirical comparison.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "red_flags": [
    316     {
    317       "flag": "No empirical validation",
    318       "detail": "The paper proposes a conceptual framework and research agenda but provides zero empirical evidence that specification-driven approaches improve LLM system reliability, modularity, or any other claimed property. All arguments are by analogy to other engineering disciplines."
    319     },
    320     {
    321       "flag": "Undisclosed conflicts of interest",
    322       "detail": "Several authors are founders of companies building LLM infrastructure (Ion Stoica and Matei Zaharia co-founded Databricks; Stoica co-founded Anyscale). The paper advocates for modular, specification-driven LLM development which aligns with their commercial interests. No competing interests statement is provided."
    323     },
    324     {
    325       "flag": "No limitations section",
    326       "detail": "A position paper advocating for a particular approach to LLM development provides no discussion of limitations, potential failure modes of the specification-centric approach, or cases where the framework may not apply."
    327     },
    328     {
    329       "flag": "Cherry-picked failure examples",
    330       "detail": "Table 1's failure examples are selected to support the specification narrative. No systematic analysis of LLM failures is provided, and no counterexamples (cases where specifications are impractical or counterproductive) are discussed."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "DSPy: Compiling declarative language model calls into self-improving pipelines",
    336       "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"],
    337       "year": 2023,
    338       "relevance": "Automated prompt engineering using solution specifications (examples), directly relevant to LLM programming techniques."
    339     },
    340     {
    341       "title": "Constitutional AI: Harmlessness from AI feedback",
    342       "authors": ["Yuntao Bai"],
    343       "year": 2022,
    344       "relevance": "Foundational work on rule-based alignment of LLMs, relevant to AI safety and specification-driven development."
    345     },
    346     {
    347       "title": "The shift from models to compound AI systems",
    348       "authors": ["Matei Zaharia", "Omar Khattab", "Lingjiao Chen"],
    349       "year": 2024,
    350       "relevance": "Core reference for modular/compound AI systems architecture, directly relevant to agentic AI development."
    351     },
    352     {
    353       "title": "CodeHalu: Investigating code hallucinations in LLMs via execution-based verification",
    354       "authors": ["Yuqi Tian"],
    355       "year": 2024,
    356       "relevance": "Benchmark for code hallucination detection in LLMs, relevant to code generation quality evaluation."
    357     },
    358     {
    359       "title": "Competition-level code generation with AlphaCode",
    360       "authors": ["Yujia Li", "David Choi"],
    361       "year": 2022,
    362       "relevance": "Large-scale code generation with diversity-based solution search, foundational for LLM code generation benchmarks."
    363     },
    364     {
    365       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    366       "authors": ["Qingyun Wu", "Gagan Bansal"],
    367       "year": 2023,
    368       "relevance": "Multi-agent framework for task decomposition, relevant to agentic AI workflows."
    369     },
    370     {
    371       "title": "GoEx: Perspectives and designs towards a runtime for autonomous LLM applications",
    372       "authors": ["Shishir G. Patil"],
    373       "year": 2024,
    374       "relevance": "Runtime for autonomous LLM execution with reversibility guarantees, relevant to agentic safety."
    375     },
    376     {
    377       "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference",
    378       "authors": ["Wei-Lin Chiang", "Lianmin Zheng"],
    379       "year": 2024,
    380       "relevance": "Major LLM evaluation platform using human preference, relevant to LLM benchmarking methodology."
    381     },
    382     {
    383       "title": "RouteLLM: Learning to route LLMs with preference data",
    384       "authors": ["Isaac Ong"],
    385       "year": 2024,
    386       "relevance": "LLM routing/selection based on prompt difficulty, relevant to compound AI system design."
    387     },
    388     {
    389       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    390       "authors": ["Jason Wei", "Xuezhi Wang"],
    391       "year": 2023,
    392       "relevance": "Foundational prompting technique for LLM reasoning, relevant to LLM programming methodology."
    393     },
    394     {
    395       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    396       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    397       "year": 2023,
    398       "relevance": "Cost-optimization for LLM usage, relevant to practical LLM deployment economics."
    399     },
    400     {
    401       "title": "Code generation with AlphaCodium: From prompt engineering to flow engineering",
    402       "authors": ["Tal Ridnik"],
    403       "year": 2024,
    404       "relevance": "Flow-based code generation combining solution specification and verification, relevant to LLM code generation."
    405     }
    406   ]
    407 }

Impressum · Datenschutz