scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23719B)
      1 {
      2   "paper": {
      3     "title": "LLM Agent for Fire Dynamics Simulations",
      4     "authors": ["Leidong Xu", "Danyal Mohaddes", "Yi Wang"],
      5     "year": 2024,
      6     "venue": "Foundation Models for Science Workshop, NeurIPS 2024",
      7     "arxiv_id": "2412.17146",
      8     "doi": "10.48550/arXiv.2412.17146"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["case-study"],
     13   "key_findings": "FoamPilot, an LLM agent using GPT-4o with RAG, shell, and Python tools, achieves consistent success (5/5) on simple FireFOAM tasks (code insight, parameter modification) but success drops sharply with complexity: serial job execution 4/5, HPC job submission 1/5, multi-functionality queries 2/5. The agent's limited domain-specific knowledge of FireFOAM/OpenFOAM is identified as the primary bottleneck for complex tasks. RAG-based code navigation is argued to be superior to keyword search for semantic retrieval, though no empirical comparison is reported, and it remains vulnerable to ambiguous queries.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL for FoamPilot is provided. The paper mentions FireFOAM's GitHub repository (fireFoam-v1912) but does not release FoamPilot's source code."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No test cases, evaluation data, or agent interaction logs are released. The tutorial cases used (poolFireMcCaffrey, burningBoxSuppression) are part of FireFOAM but the specific evaluation tasks and expected outputs are not published."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions LangChain/LangGraph 0.2, FAISS, GPT-4o, and text-embedding-ada-002, but provides no requirements.txt, Dockerfile, or detailed dependency listing with versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions. The paper describes the system architecture and prompts but not how to set up and run FoamPilot from scratch."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are reported as raw success counts (e.g., 5/5, 4/5, 1/5) in Table 1 without confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are performed. Differences between task success rates are presented as raw counts without any testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes reported. Results are only success/failure counts across 5 repetitions per task."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Each experiment was repeated 5 times with no justification for why 5 repetitions were chosen. No power analysis or discussion of whether 5 runs is sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures reported. The paper notes 'the agent still exhibits some variability in success rates' but does not quantify this beyond success/failure counts."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines included. FoamPilot is not compared against any alternative approach (e.g., manual configuration, other LLM agents, MetaOpenFOAM). Only absolute success rates are reported."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines at all. The paper cites MetaOpenFOAM (Chen et al. 2024) as prior work but does not compare against it."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The system has three tools (Shell, Python, RAG) and three functionalities, but no ablation study is performed to measure the contribution of individual components. The multi-functionality test is not framed as an ablation."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only a single metric is used: binary success/failure rate. No other metrics such as number of agent-tool loops, token usage, time to completion, or partial correctness are systematically reported."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 3 states: 'a successful outcome is determined by comparing the agent's actions to that of an experienced FireFOAM user.' An expert manually assessed correctness of each run."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No separation between development and evaluation tasks. The prompts were iteratively developed (e.g., case configuration approach was changed after initial failure), and the same tasks appear to be used for both development and final evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides per-task breakdown across all seven evaluation tasks, separated by functionality (Code Insight, Case Configuration, Job Execution, combined)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Failure cases are discussed extensively throughout Section 3: the agent hallucinated file existence, made incorrect RAG queries, failed to set up the HPC environment correctly, and retrieved irrelevant information that polluted the context window."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "HPC job execution succeeded only 1/5 times. Multi-functionality queries succeeded 2/5. The paper also reports that 'on multi-functionality tests that were more complex than the relatively simple one presented here, the agent failed consistently.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract states 'Promising results were achieved for each functionality, particularly for simple tasks, and opportunities were identified for significant further improvement for more complex tasks.' This is appropriately hedged and supported by Table 1."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims RAG 'offers a significant advantage over traditional user-driven keyword-based searches' (Section 2.1) and that the case configuration functionality 'makes FireFOAM more accessible' (Section 2.2), but no controlled comparison against keyword search or manual configuration is performed."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims 'the challenges identified and the solution approaches developed are broadly applicable to other scientific simulation tools' (Section 1 and Section 4) without testing on any other simulation tool. Only FireFOAM was tested."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are considered for the results. For example, the paper does not discuss whether the high success on simple tasks reflects GPT-4o's general capability vs. FoamPilot's specific design, or whether the failures stem from LLM limitations vs. prompt engineering issues."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures binary success/failure on specific tasks and frames results accordingly. It does not overclaim — it calls FoamPilot a 'proof-of-concept' and the measurements match the granularity of claims."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3 specifies: 'Azure/OpenAI's GPT-4o, version 2024-05-13' for the LLM and 'text-embedding-ada-002' for embeddings. Specific version identifiers are provided."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt texts are provided: system prompt (Fig. 6), case configuration prompt (Fig. 4), and job execution prompts for both serial and HPC modes (Fig. 5). These are actual prompts, not just descriptions."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3 states 'temperature setting of 0.0' and the embedding model's 'maximum input sizes' of 8192 tokens. Temperature is the most critical LLM hyperparameter and is explicitly reported."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The agent architecture is described in detail in Section 2 with Figure 2: three-node graph (user, LLM, tools), static conditional edges, tool descriptions (Shell, Python Interpreter, RAG), RAG preprocessing pipeline (Fig. 3), termination conditions, and iterative loop structure."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 2.1 documents RAG preprocessing: combining .H and .C files, stripping boilerplate headers and licenses, prepending file paths, embedding with text-embedding-ada-002, and FAISS vector store creation. Section 2.2 describes case file compression into a single string."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4 ('Conclusions and future work') contains substantive discussion of limitations: inability to handle complex tasks, lack of asynchronous execution, challenges with human feedback in LangChain, single LLM tested, limited domain knowledge."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper discusses specific threats: non-deterministic outputs even at temperature 0.0 (Section 3), ambiguous queries leading to incorrect RAG retrieval (Section 2.1), agent hallucinating file existence, and the risk that 'less experienced users would struggle to' detect incorrect but non-crashing configurations (Section 3)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4 explicitly states: 'Our experiments considered only GPT-4o for the agent's LLM,' lists unaddressed functionalities (asynchronous simulation, robust human feedback), and Section 3 notes testing was 'within a secure sandbox environment' on AWS."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No agent interaction logs, LLM outputs, or raw experimental data are released. Only summary success/failure counts in Table 1 are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3 describes the experimental setup: 'each experiment was repeated five times,' same GPT-4o version and temperature, dedicated AWS EC2 instance and parallelCluster, single user prompt per experiment."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The evaluation tasks are manually designed by the authors for a specific simulation code."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper does not document how evaluation tasks were selected, what criteria defined the correct outcomes, or why these specific tasks were chosen over others. The ground truth ('actions of an experienced FireFOAM user') is not formally documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No explicit funding disclosure or acknowledgments section. The first author's footnote notes 'Work performed during internship at FM' but no funding sources are listed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Leidong Xu at University of Connecticut, Danyal Mohaddes and Yi Wang at FM, Research Division. The internship relationship is also disclosed."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Two of three authors are from FM (FM Global), which develops FireFOAM. FM has a direct interest in FoamPilot succeeding, as it would enhance the usability of their own simulation tool. The funder is not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper tests an agent system on practical simulation tasks, not a pre-trained model's capability on a benchmark. The LLM is used as a reasoning engine within a tool-augmented agent, not evaluated on benchmark knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — the evaluation involves interactive agent tasks with a live codebase, not benchmark problems that could be in training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — no standardized benchmarks are used. The tasks are custom-designed for FireFOAM."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, token usage, or wall-clock time per experiment are reported, despite using a commercial API (Azure/OpenAI GPT-4o) with potentially significant costs across 35+ experimental runs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget stated. The paper mentions using AWS EC2 and parallelCluster but provides no information on instance types, costs, or total compute used."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "FoamPilot's Code Insight functionality consistently retrieves and summarizes relevant source code, succeeding 5/5 times on both test tasks.",
    296       "evidence": "Table 1 shows 5/5 success rate for both solid energy equation and non-spherical drag equation tasks. Figure 7 shows a correct example output.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Case Configuration functionality consistently succeeds at modifying simulation parameters, achieving 5/5 on both test tasks.",
    301       "evidence": "Table 1 shows 5/5 for burner size modification and cardboard thickness modification. Figure 8 shows correct file modifications.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Job Execution success drops significantly for HPC environments (1/5) compared to serial execution (4/5).",
    306       "evidence": "Table 1 reports 4/5 for serial jobs and 1/5 for HPC jobs. Section 3 describes specific failure modes including incorrect SLURM commands and environment setup failures.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Multi-functionality queries combining Code Insight and Case Configuration succeed 2/5 times, and more complex multi-functionality tasks fail consistently.",
    311       "evidence": "Table 1 shows 2/5 for the droplet drag model modification task. Section 3 states 'on multi-functionality tests that were more complex than the relatively simple one presented here, the agent failed consistently.'",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "RAG-based code navigation offers significant advantage over traditional keyword-based searches like grep and find.",
    316       "evidence": "Section 2.1 describes the conceptual advantages (semantic relationships, contextual relevance) but provides no empirical comparison against keyword search methods.",
    317       "supported": "weak"
    318     },
    319     {
    320       "claim": "The case configuration functionality makes FireFOAM more accessible to non-expert users.",
    321       "evidence": "Section 2.2 argues this conceptually, but no user study or comparison with manual configuration was performed. Only expert evaluation of correctness is provided.",
    322       "supported": "weak"
    323     }
    324   ],
    325   "red_flags": [
    326     {
    327       "flag": "Very small sample size",
    328       "detail": "Only 5 repetitions per task and only 2 tasks per functionality plus one combined multi-functionality task (7 total). With N=5, a single run difference changes the success rate by 20 percentage points. The paper acknowledges non-determinism but does not address whether 5 runs provides statistical power."
    329     },
    330     {
    331       "flag": "No baseline comparisons",
    332       "detail": "FoamPilot is not compared against any alternative (manual process, other LLM agents like MetaOpenFOAM, simpler automation scripts). Impossible to assess whether the LLM agent adds value over existing approaches."
    333     },
    334     {
    335       "flag": "Company evaluating enhancement to own product",
    336       "detail": "Two of three authors are from FM (FM Global), which develops FireFOAM. The paper evaluates FoamPilot's ability to enhance FireFOAM usability — a direct benefit to FM. This conflict is not acknowledged."
    337     },
    338     {
    339       "flag": "Possible cherry-picked tasks",
    340       "detail": "Only 'relatively simple' tasks are evaluated with full results. The paper notes complex tasks 'failed consistently' but these are not formally reported in Table 1, creating survivorship bias in the reported results."
    341     },
    342     {
    343       "flag": "No code or data release",
    344       "detail": "FoamPilot's implementation, evaluation tasks, expected outputs, and agent interaction logs are not released, preventing independent verification or reproduction."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "GPT-4 technical report",
    350       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    351       "year": 2023,
    352       "arxiv_id": "2303.08774",
    353       "relevance": "Foundational LLM capability paper; GPT-4o used as the agent's LLM in this work."
    354     },
    355     {
    356       "title": "Code LLaMA: Open foundation models for code",
    357       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    358       "year": 2023,
    359       "arxiv_id": "2308.12950",
    360       "relevance": "Open-source code generation model relevant to LLM code generation capabilities."
    361     },
    362     {
    363       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    364       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    365       "year": 2023,
    366       "arxiv_id": "2308.00352",
    367       "relevance": "Multi-agent LLM framework for collaborative software development."
    368     },
    369     {
    370       "title": "ChatDev: Communicative agents for software development",
    371       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    372       "year": 2024,
    373       "relevance": "Multi-agent system for software development using communicative LLM agents."
    374     },
    375     {
    376       "title": "RepairAgent: An autonomous, LLM-based agent for program repair",
    377       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    378       "year": 2024,
    379       "arxiv_id": "2403.17134",
    380       "relevance": "Autonomous LLM agent for program repair, directly relevant to agentic code workflows."
    381     },
    382     {
    383       "title": "A survey on large language models for code generation",
    384       "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen"],
    385       "year": 2024,
    386       "arxiv_id": "2406.00515",
    387       "relevance": "Survey of LLM code generation capabilities and limitations."
    388     },
    389     {
    390       "title": "Large language model-based agents for software engineering: A survey",
    391       "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen"],
    392       "year": 2024,
    393       "arxiv_id": "2409.02977",
    394       "relevance": "Comprehensive survey of LLM-based agents for software engineering tasks."
    395     },
    396     {
    397       "title": "MetaOpenFOAM: an LLM-based multi-agent framework for CFD",
    398       "authors": ["Yuxuan Chen", "Xu Zhu", "Hua Zhou", "Zhuyin Ren"],
    399       "year": 2024,
    400       "arxiv_id": "2407.21320",
    401       "relevance": "Directly comparable system: LLM multi-agent framework for the same simulation toolbox (OpenFOAM)."
    402     },
    403     {
    404       "title": "A survey on large language model based autonomous agents",
    405       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    406       "year": 2024,
    407       "relevance": "Survey of autonomous LLM agent architectures and capabilities."
    408     },
    409     {
    410       "title": "LLM experiments with simulation: Large language model multi-agent system for process simulation parametrization in digital twins",
    411       "authors": ["Yuchen Xia", "Daniel Dittler", "Nasser Jazdi"],
    412       "year": 2024,
    413       "arxiv_id": "2405.18092",
    414       "relevance": "LLM agents for scientific simulation parametrization, directly relevant to agentic simulation workflows."
    415     },
    416     {
    417       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    418       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    419       "year": 2020,
    420       "arxiv_id": "2002.08155",
    421       "relevance": "Pre-trained model for code understanding, foundational for code-related NLP."
    422     },
    423     {
    424       "title": "Language models are few-shot learners",
    425       "authors": ["Tom B Brown"],
    426       "year": 2020,
    427       "arxiv_id": "2005.14165",
    428       "relevance": "GPT-3 paper establishing few-shot LLM capabilities foundational to agentic workflows."
    429     }
    430   ]
    431 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs