scan.json (23601B)
1 { 2 "paper": { 3 "title": "Agint: Agentic Graph Compilation for Software Engineering Agents", 4 "authors": ["Abhi Chivukula", "Jay Somasundaram", "Vijay Somasundaram"], 5 "year": 2025, 6 "venue": "NeurIPS 2025 Workshop: Deep Learning For Code in the Agentic Era (DL4C)", 7 "arxiv_id": "2511.19635" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "The paper's own NeurIPS checklist (item 5) explicitly answers 'No' to open access to code: 'The paper describes a system architecture and implementation approach but does not provide open-source code.' A web demo at flow.AgintAI.com and API docs are referenced but no source code repository is provided." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No datasets are released. The paper contains no quantitative experiments and no evaluation data. One example output link to a GitHub repository (AgintHub/nifty-wilson) is provided but this is a single generated code example, not evaluation data." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, dependency lists, or requirements files are provided. The paper describes system architecture but does not specify what dependencies are needed to run any component." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. The paper shows CLI usage examples (Section 4) but these are illustrative; without the actual software being released, they cannot be followed to reproduce results." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": false, 35 "answer": false, 36 "justification": "The paper presents no quantitative experiments or statistical results. It is a systems/architecture paper with no empirical evaluation. 
The paper's own NeurIPS checklist marks items 6-8 (experimental setting, statistical significance, compute resources) as NA." 37 }, 38 "significance_tests": { 39 "applies": false, 40 "answer": false, 41 "justification": "No comparative experiments are conducted. The paper makes no quantitative comparative claims backed by data, so significance tests are structurally inapplicable." 42 }, 43 "effect_sizes_reported": { 44 "applies": false, 45 "answer": false, 46 "justification": "No quantitative experiments are reported. The one numerical claim ('reducing generation latency by 3-10x' in Section 2.3) is presented without any supporting experiment or measurement, making effect size reporting structurally inapplicable since there are no experiments to report effect sizes for." 47 }, 48 "sample_size_justified": { 49 "applies": false, 50 "answer": false, 51 "justification": "No experiments with samples are conducted. This is a systems architecture paper." 52 }, 53 "variance_reported": { 54 "applies": false, 55 "answer": false, 56 "justification": "No experimental runs are conducted. This is a systems architecture paper with no empirical results." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": false, 63 "justification": "No baselines are compared against. The paper acknowledges this in Section 5.1 (Evaluation Gaps): 'comprehensive evaluation against established benchmarks remains future work.' SWE-bench, ML-Bench, and Commit0 are mentioned as future evaluation targets." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "No baselines are included at all, so the question of whether they are contemporary is moot. The paper does not compare against any existing system." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is conducted. 
The system has multiple components (type floors, execution modes, Hydantic decomposition) but none are evaluated individually to show their contribution." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "No evaluation metrics of any kind are reported. The paper contains no quantitative evaluation." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is conducted. The paper claims the system enables 'reliable, scalable, and user-friendly workflows' but provides no user study or human assessment of the outputs." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "No experiments are conducted, so the concept of held-out test sets is structurally inapplicable." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": false, 93 "justification": "No quantitative results are reported, so no per-category breakdown exists. Even qualitative examples are limited to a few CLI usage demonstrations." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases or error analysis is presented. Section 5.1 discusses limitations at a high level (model dependency, scalability constraints, type system limitations) but does not show specific cases where the system fails." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": false, 103 "justification": "No negative results are reported. Every aspect of the system is presented positively. The limitations section discusses potential weaknesses but does not report any attempted approach that failed." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": false, 110 "justification": "The abstract makes several strong claims -- 'enabling dynamic graph refinement, reproducible and optimizable execution, speculative evaluation,' 'accelerated development using smaller, faster models for lower latency, more efficient context use, and higher throughput' -- but none of these are supported by quantitative evidence in the paper. The paper provides architectural descriptions and usage examples but no measurements demonstrating these benefits." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper makes causal claims such as 'reducing generation latency by 3-10x for large structured outputs' (Section 2.3) and that the graph structure 'reduces context overhead, increases throughput, accuracy' (Section 1). These are causal claims about the effect of the architecture but no controlled experiments or measurements support them." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract claim broad applicability to 'Software Engineering Agents' and promise 'scalable, collaborative coding agents.' The paper acknowledges in Section 5.1 that 'The current implementation has been tested primarily on workflows with hundreds of nodes' but the framing throughout suggests general applicability without bounding claims to the tested settings." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations are discussed. The paper does not consider whether other architectural approaches (e.g., monolithic generation with retry, existing orchestration frameworks) might achieve similar benefits. The related work discussion (Introduction) identifies problems but does not compare against alternative solutions for those problems." 
126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "No specific model versions are mentioned anywhere in the paper. The system is described as multi-provider with 'multi-provider routing' through Flyte, but no specific LLM model names or versions are stated for any demonstration or example." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "No actual prompts are provided. The paper describes a prompt registry system in Flyte (Section 2.3) and references 'multi-shot' prompting but does not include any actual prompt text used in the system." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported. The CLI examples show an '--intelligence' flag and '--seed' flag but their semantics and default values are not documented." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The agentic scaffolding is described in substantial detail. Section 2 describes the compilation pipeline, type floors (TEXT->TYPED->SPEC->STUB->SHIM->PURE), execution modes (prefine, dynamic, predict), effect-aware execution with rollback, and the interaction between components (Dagify, Dagent, Flyte, Schemagin, Datagin). Figure 1 shows the architecture overview." 148 }, 149 "data_preprocessing_documented": { 150 "applies": false, 151 "answer": false, 152 "justification": "No data is collected or preprocessed in this paper. It is a systems architecture paper with no empirical evaluation." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 5.1 is titled 'Limitations' and discusses model dependency, scalability constraints, type system limitations, and evaluation gaps." 
160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 5.1 includes specific threats: model dependency ('Performance degrades when models lack domain-specific knowledge or when rate limits restrict concurrent execution'), scalability ('tested primarily on workflows with hundreds of nodes'), type system ('PrimitiveType system constrains data to basic types'), and evaluation gaps ('comprehensive evaluation against established benchmarks remains future work')." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "While limitations are discussed, the paper does not explicitly state what the results do NOT show or what specific claims are NOT being made. The limitations section acknowledges weaknesses but the abstract and conclusion still make broad claims about enabling 'scalable, collaborative coding agents' without bounding these to what was actually demonstrated." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": false, 175 "answer": false, 176 "justification": "No data is collected or analyzed in this paper. It is a systems architecture paper with no empirical evaluation data." 177 }, 178 "data_collection_described": { 179 "applies": false, 180 "answer": false, 181 "justification": "No data collection occurs in this paper. It is a systems architecture paper." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No participants or data sources are recruited. This is a systems architecture paper with no human subjects and no benchmark evaluation." 187 }, 188 "data_pipeline_documented": { 189 "applies": false, 190 "answer": false, 191 "justification": "No data pipeline exists to document. This is a systems architecture paper with no empirical evaluation." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "The Acknowledgments section states: 'This work was supported solely by internal research efforts at Agint and received no external financial support.'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All three authors list 'Agint' as their affiliation with @AgintAI.com email addresses. The paper is describing the authors' own company's product." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "The work is funded internally by Agint, the company whose product is being described and promoted. The funder (Agint) has a direct financial interest in the positive portrayal of the Agint system." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present. The authors are employees (or founders) of Agint, the company whose product is being described, but no explicit declaration of financial interests (equity, patents, etc.) is made." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper does not evaluate any pre-trained model on any benchmark. It describes a system architecture without conducting benchmark evaluations." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "No benchmark evaluation is conducted, so train/test overlap is structurally inapplicable." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "No benchmark evaluation is conducted. Benchmark evaluation is listed as future work." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved. The paper's own NeurIPS checklist confirms this (items 14, 15 marked NA)." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in this systems architecture paper." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants or experimental conditions are involved." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants or experimental conditions are involved." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference costs are reported despite the system making multiple LLM API calls per workflow. The paper claims latency benefits ('reducing generation latency by 3-10x') but provides no actual cost or latency measurements." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No computational budget is stated. The paper describes a system that heavily uses LLM APIs but provides no information about API costs, compute resources, or time required to run workflows." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Hydantic's hierarchical decomposition of structured outputs reduces generation latency by 3-10x for large structured outputs with multiple independent fields.", 286 "evidence": "Section 2.3 states this claim in passing: 'reducing generation latency by 3-10× for large structured outputs with multiple independent fields.' No experimental data, benchmarks, or measurements are provided to support this claim.", 287 "supported": "unsupported" 288 }, 289 { 290 "claim": "Agint's DAG structure enables parallel compilation where independent subgraphs resolve concurrently without global synchronization.", 291 "evidence": "Section 2.1 describes the locality-preserving transformation mechanism and Section 2.2 describes parallel execution. These are architectural descriptions without empirical validation.", 292 "supported": "weak" 293 }, 294 { 295 "claim": "Agint addresses fundamental limitations of current LLM-based coding assistants: context management, reliability, and composability.", 296 "evidence": "Section 6.1 (Conclusion) makes this claim. The paper describes architectural mechanisms (type floors, DAG structure, effect-aware execution) that could address these, but no comparative evaluation against existing systems is provided.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "The system has been tested on workflows with hundreds of nodes.", 301 "evidence": "Section 5.1 states: 'The current implementation has been tested primarily on workflows with hundreds of nodes.' No details about these tests are provided.", 302 "supported": "weak" 303 } 304 ], 305 "methodology_tags": ["case-study"], 306 "key_findings": "Agint presents an architecture for compiling natural language specifications into typed, effect-aware directed acyclic graphs (DAGs) through a six-tier type floor system (TEXT, TYPED, SPEC, STUB, SHIM, PURE). 
The system includes a hybrid JIT runtime with three execution modes (prefine, dynamic, predict) and a Unix-style CLI toolchain for workflow orchestration and data management. The paper is entirely descriptive -- it provides no quantitative evaluation, no benchmark comparisons, and acknowledges that 'comprehensive evaluation against established benchmarks remains future work.'", 307 "red_flags": [ 308 { 309 "flag": "No empirical evaluation", 310 "detail": "The paper presents a system with multiple strong claims about latency, reliability, and scalability but contains zero quantitative experiments. The paper's own NeurIPS checklist marks experimental setting, statistical significance, and compute resources all as NA. The '3-10x latency reduction' claim for Hydantic is completely unsupported." 311 }, 312 { 313 "flag": "Company evaluating its own product", 314 "detail": "All three authors are affiliated with Agint (the company whose product is being described). The work is internally funded by Agint. No independent evaluation or third-party assessment is included. The paper reads as a product description rather than a research contribution." 315 }, 316 { 317 "flag": "Broad claims without evidence", 318 "detail": "The abstract and conclusion make sweeping claims about enabling 'scalable, collaborative coding agents' and solving 'fundamental limitations of current LLM-based coding assistants' without any comparative evaluation or quantitative evidence." 319 }, 320 { 321 "flag": "No code or artifacts released", 322 "detail": "Despite describing a complex software system, no source code is released. Only a web demo and API documentation URL are provided. This prevents independent verification of any claims." 323 }, 324 { 325 "flag": "No baseline comparisons", 326 "detail": "The paper does not compare Agint against any existing tool (e.g., ChatDev, MetaGPT, MASAI, or even vanilla LLM code generation). 
All claimed advantages are stated architecturally without empirical support." 327 } 328 ], 329 "cited_papers": [ 330 { 331 "title": "Masai: Modular architecture for software-engineering ai agents", 332 "authors": ["Daman Arora"], 333 "year": 2024, 334 "arxiv_id": "2406.11638", 335 "relevance": "Directly relevant as a modular multi-agent architecture for software engineering, which Agint positions itself against." 336 }, 337 { 338 "title": "Graph of thoughts: Solving elaborate problems with large language models", 339 "authors": ["Maciej Besta"], 340 "year": 2024, 341 "arxiv_id": "2308.09687", 342 "relevance": "Graph-based reasoning framework for LLMs that provides foundational concepts for Agint's DAG approach." 343 }, 344 { 345 "title": "Evaluating large language models trained on code", 346 "authors": ["Mark Chen"], 347 "year": 2021, 348 "arxiv_id": "2107.03374", 349 "relevance": "Foundational work on LLM code generation (Codex/HumanEval) that established the benchmark evaluation paradigm for coding agents." 350 }, 351 { 352 "title": "Challenges and paths towards ai for software engineering", 353 "authors": ["Alex Gu"], 354 "year": 2025, 355 "arxiv_id": "2503.22625", 356 "relevance": "Position paper on challenges in AI for software engineering, directly relevant to the survey scope." 357 }, 358 { 359 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 360 "authors": ["Sirui Hong"], 361 "year": 2023, 362 "arxiv_id": "2308.00352", 363 "relevance": "Multi-agent framework for software development, directly comparable to Agint's multi-agent coordination claims." 364 }, 365 { 366 "title": "Swe-bench: Can language models resolve real-world github issues?", 367 "authors": ["Carlos E. Jimenez"], 368 "year": 2023, 369 "arxiv_id": "2310.06770", 370 "relevance": "Major benchmark for evaluating software engineering agents, mentioned by Agint as future evaluation target." 
371 }, 372 { 373 "title": "ChatDev: Communicative agents for software development", 374 "authors": ["Chen Qian"], 375 "year": 2024, 376 "arxiv_id": "2307.07924", 377 "relevance": "Multi-agent software development framework, directly relevant as a baseline comparison for Agint's approach." 378 }, 379 { 380 "title": "Large language model-based agents for software engineering: A survey", 381 "authors": ["Junwei Liu"], 382 "year": 2024, 383 "arxiv_id": "2409.02977", 384 "relevance": "Survey of LLM-based agents for software engineering, directly relevant to the survey scope." 385 }, 386 { 387 "title": "Agents in software engineering: Survey, landscape, and vision", 388 "authors": ["Yanlin Wang"], 389 "year": 2024, 390 "arxiv_id": "2409.09030", 391 "relevance": "Another survey of agents in software engineering, relevant for understanding the landscape." 392 }, 393 { 394 "title": "Tree of thoughts: Deliberate problem solving with large language models", 395 "authors": ["Shunyu Yao"], 396 "year": 2023, 397 "arxiv_id": "2305.10601", 398 "relevance": "Tree-based reasoning framework for LLMs, related to Agint's graph-based decomposition approach." 399 }, 400 { 401 "title": "Diversity empowers intelligence: Integrating expertise of software engineering agents", 402 "authors": ["Kexun Zhang"], 403 "year": 2024, 404 "arxiv_id": "2408.07060", 405 "relevance": "Multi-agent coordination approach for software engineering, relevant to Agint's claims about agent collaboration." 406 }, 407 { 408 "title": "Commit0: Library generation from scratch", 409 "authors": ["Wenting Zhao"], 410 "year": 2025, 411 "relevance": "Benchmark for library-level code generation, mentioned as future evaluation target for Agint." 412 } 413 ] 414 }