ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21317B)


      1 {
      2   "paper": {
      3     "title": "Chat-of-Thought: Collaborative Multi-Agent System for Generating Domain Specific Information",
      4     "authors": [
      5       "Christodoulos Constantinides",
      6       "Shuxin Lin",
      7       "Nianjun Zhou",
      8       "Dhaval Patel"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2506.10086"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code repository URL is provided in the paper. The IBM Generative AI Python SDK is cited as a reference [IBM, 2024] with a GitHub link, but this is a general IBM SDK, not the Chat-of-Thought system code."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset or FMEA data is released. The paper references domain-specific knowledge repositories and the Uptake Asset Strategy Library but does not provide any publicly available data."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, or software versions are provided anywhere in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The paper describes the system architecture conceptually but provides no steps for recreating or running the system."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper presents no quantitative results at all, let alone confidence intervals or error bars. Section 4 ('Result and Demonstration') contains only qualitative descriptions of system capabilities."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are performed. The paper makes claims about system effectiveness ('reliably identifies failure modes') without any statistical testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes, percentage improvements, or any quantitative measures of performance are reported anywhere in the paper."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No sample size is stated. The paper does not report how many assets, FMEA tables, or test cases were evaluated, let alone justify such a number."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or any measure of variability is reported. There are no quantitative experimental results in the paper."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included. The paper does not compare Chat-of-Thought against any alternative system, manual FMEA creation, single-agent approaches, or prior work."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines of any kind are included, so there are none whose recency could be assessed."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is provided. The system has multiple components (facilitator, reliability engineer, quality engineer, SME validator, summarizer, quality check) but none are individually evaluated for their contribution."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No evaluation metrics of any kind are reported. The paper mentions self-BLEU for duplicate filtering (Section 2.5) but does not report any metric values for system evaluation."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Section 4 states 'Validation by SMEs confirms that the system reliably identifies failure modes, root causes, and potential effects' but provides no details: no number of SMEs, no evaluation criteria, no structured evaluation protocol, no results. This is an unsubstantiated claim, not a human evaluation."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No test set of any kind is mentioned. There is no separation of data into training/development/test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No per-category or per-asset breakdown of results is provided. The paper mentions 'standard and out-of-scope assets' in Section 4 but shows no breakdown."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No failure cases are discussed. The paper presents only positive claims about the system's capabilities."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results are reported. Every aspect of the system is described in exclusively positive terms."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims the system 'optimize[s] the generation and validation of FMEA tables' and 'demonstrates the potential of Chat-of-Thought in addressing these challenges.' However, the results section (Section 4) provides no quantitative evidence that the system optimizes anything or successfully addresses the stated challenges — only prose assertions."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes implicit causal claims throughout, e.g., that the multi-agent architecture 'ensures accuracy and scalability' (Section 1) and that iterative refinement 'enhance[s] performance' (Section 3). No experimental evidence supports these causal claims. The ablation-like progression across Rounds 1-4 (Section 3) is described but never quantitatively evaluated."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper's title says 'Domain Specific Information' but the abstract and introduction make broad claims about 'revolutioniz[ing] traditional workflows' and 'enabling industries to harness artificial intelligence for enhanced reliability and operational efficiency' (Section 1). No bounds are placed on what settings the system has been tested in."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed. The paper does not consider whether simpler approaches, single-agent systems, or template-based methods without LLMs could achieve similar results."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No specific LLM versions are specified. The paper references GPT-4, LLaMA, and Mistral 7B in the introduction as general LLM citations but never states which model(s) the Chat-of-Thought system actually uses."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No prompts or system messages are provided. The paper mentions agents have 'predefined roles, skills, and contextual system messages' (Section 2) but never shows the actual prompt text."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No hyperparameters are reported: no temperature, no top-p, no max tokens, no self-BLEU threshold value, no number of rounds."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The scaffolding is described at a high level (Sections 2.1-2.5: agent roles, context discovery, multi-round interactions, template-driven routing, quality check) but critical details are missing: no concrete tool descriptions, no retry logic details, no memory management specifics, no actual template examples. The description is too abstract to reproduce."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No data preprocessing steps are documented. Section 2.2 mentions 'dynamically extracted from domain-specific knowledge repositories and historical data' but provides no details on how this extraction works or what transformations are applied."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "There is no limitations, threats-to-validity, or future-work section in the paper. The paper ends with the references section."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No threats to validity are discussed anywhere in the paper."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No explicit scope boundaries are stated. While Section 1.1 mentions the target domain (industrial equipment), the paper does not state what the system has NOT been tested on or what claims it is NOT making."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No raw data is made available. No FMEA outputs, agent conversation logs, or evaluation data are provided."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No data collection procedure is described. The paper references 'domain-specific knowledge repositories and historical data' (Section 2.2) and the Uptake Asset Strategy Library without describing how data was collected or selected."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are recruited for a study. The SME validation mentioned in Section 4 is not described as a formal study with recruited participants."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No data pipeline is documented. The flow from raw inputs to FMEA outputs is described only at a conceptual level without specifying filtering, transformation, or processing steps."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding information is disclosed anywhere in the paper. All authors are from IBM/IBM Research."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly stated: Christodoulos Constantinides is from IBM, and Shuxin Lin, Nianjun Zhou, and Dhaval Patel are from IBM Research."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "All authors are IBM employees. IBM has a commercial interest in demonstrating AI capabilities for industrial applications. The funder (IBM, implicitly) is not independent of the outcome. No funding independence statement is provided."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial interests statement is provided in the paper."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It describes a multi-agent system for FMEA generation; contamination is not a relevant concern."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is performed. The system generates FMEA documents; there is no train/test split or benchmark to be contaminated."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmarks are used. The paper describes a system demonstration, not a benchmark evaluation."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human subjects study is conducted. The SME validation mentioned in Section 4 is not described as a formal study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects study is conducted."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study is conducted."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study is conducted."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study is conducted."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study is conducted."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study is conducted."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No inference cost, latency, or token consumption is reported. The system calls LLMs across multiple rounds with multiple agents, but no cost information is provided."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No computational budget, GPU hours, API costs, or hardware specifications are stated."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "Chat-of-Thought optimizes the generation and validation of FMEA tables through multi-agent collaboration.",
    291       "evidence": "Section 1 and abstract describe the system architecture; Section 4 states 'Validation by SMEs confirms that the system reliably identifies failure modes, root causes, and potential effects' but provides no quantitative evidence.",
    292       "supported": "unsupported"
    293     },
    294     {
    295       "claim": "The system demonstrates capability to generate detailed and accurate FMEA tables for both standard and out-of-scope assets.",
    296       "evidence": "Section 4 makes this assertion but provides no quantitative metrics, no examples of generated FMEA tables, and no comparison with manually created FMEAs or other systems.",
    297       "supported": "unsupported"
    298     },
    299     {
    300       "claim": "The Chat-of-Thought mechanism extends beyond traditional Chain-of-Thought by fostering dynamic, context-aware dialogue across specialized personas.",
    301       "evidence": "Section 2.3 describes the multi-round interaction process conceptually. No empirical comparison with standard Chain-of-Thought or single-agent approaches is provided.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "The quality check mechanism effectively filters useless questions using a pre-trained classifier and removes duplicates using self-BLEU scores.",
    306       "evidence": "Section 2.5 describes this mechanism but provides no evaluation of the classifier's accuracy, no self-BLEU threshold value, and no results showing how many questions were filtered.",
    307       "supported": "unsupported"
    308     }
    309   ],
    310   "methodology_tags": [
    311     "case-study"
    312   ],
    313   "key_findings": "This paper presents Chat-of-Thought, a multi-agent LLM-based system designed to automate FMEA document generation for industrial assets through collaborative agent personas. The paper is a system description and demonstration without any quantitative evaluation. No baselines, metrics, ablations, or statistical analysis are provided. SME validation is claimed but not documented with any specifics.",
    314   "red_flags": [
    315     {
    316       "flag": "No quantitative evaluation",
    317       "detail": "The paper's 'Result and Demonstration' section (Section 4) contains only qualitative prose descriptions. No metrics, numbers, tables, or figures showing system performance are provided."
    318     },
    319     {
    320       "flag": "Unsubstantiated SME validation claim",
    321       "detail": "Section 4 states 'Validation by SMEs confirms that the system reliably identifies failure modes' but provides zero details: no number of SMEs, no evaluation protocol, no criteria, no results."
    322     },
    323     {
    324       "flag": "No baselines or comparisons",
    325       "detail": "The system is not compared against any baseline: no manual FMEA creation, no single-agent approach, no prior automated FMEA tools. It is impossible to assess whether the multi-agent approach adds value."
    326     },
    327     {
    328       "flag": "Overclaiming in abstract and introduction",
    329       "detail": "The paper uses strong language ('optimize', 'revolutionize traditional workflows', 'ensures accuracy and scalability') without any evidence to support these claims."
    330     },
    331     {
    332       "flag": "No model specification",
    333       "detail": "The paper never states which LLM(s) are actually used in the system. GPT-4, LLaMA, and Mistral are cited as general references but the actual model used for the demo is not disclosed."
    334     },
    335     {
    336       "flag": "Corporate conflict of interest undisclosed",
    337       "detail": "All authors are from IBM/IBM Research. IBM has commercial interest in demonstrating AI for industrial applications. No conflict of interest statement is provided."
    338     },
    339     {
    340       "flag": "No limitations discussion",
    341       "detail": "The paper contains no limitations section, no threats to validity, and no discussion of when or where the system might fail."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "GPT-4 Technical Report",
    347       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    348       "year": 2023,
    349       "arxiv_id": "2303.08774",
    350       "relevance": "Foundational LLM referenced as the basis for multi-agent systems; relevant to LLM capability evaluation."
    351     },
    352     {
    353       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    354       "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai"],
    355       "year": 2023,
    356       "relevance": "Foundational work on LLM-based multi-agent simulation, directly relevant to agentic AI research."
    357     },
    358     {
    359       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    360       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    361       "year": 2023,
    362       "relevance": "Key multi-agent LLM framework; directly relevant to agentic AI and multi-agent system evaluation."
    363     },
    364     {
    365       "title": "LLaMA: Open and Efficient Foundation Language Models",
    366       "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"],
    367       "year": 2023,
    368       "arxiv_id": "2302.13971",
    369       "relevance": "Open-source LLM used in agentic systems; relevant to LLM capability and open-source model evaluation."
    370     },
    371     {
    372       "title": "Mistral 7B",
    373       "authors": ["Albert Q Jiang", "Alexandre Sablayrolles", "Arthur Mensch"],
    374       "year": 2023,
    375       "arxiv_id": "2310.06825",
    376       "relevance": "Open-source LLM relevant to evaluating smaller model capabilities in agentic settings."
    377     },
    378     {
    379       "title": "Large Language Models are Zero-Shot Reasoners",
    380       "authors": ["Takeshi Kojima", "Shixiang (Shane) Gu", "Machel Reid"],
    381       "year": 2022,
    382       "relevance": "Foundational prompting technique (zero-shot CoT) referenced in the multi-round system design; relevant to LLM prompting methodology."
    383     },
    384     {
    385       "title": "Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision",
    386       "authors": ["Zhiqing Sun", "Yikang Shen", "Qinhong Zhou"],
    387       "year": 2023,
    388       "relevance": "LLM alignment technique relevant to AI safety and alignment research."
    389     },
    390     {
    391       "title": "An Open Source Data Contamination Report for Large Language Models",
    392       "authors": ["Yucheng Li"],
    393       "year": 2023,
    394       "relevance": "Directly relevant to benchmark contamination concerns in LLM evaluation."
    395     },
    396     {
    397       "title": "Auto-Q: Automated Domain Questions Generation for Industrial Assets",
    398       "authors": ["Christodoulos Constantinides", "Vivek Sharma", "Shuxin Lin"],
    399       "year": 2025,
    400       "relevance": "Companion work from the same group on automated question generation for industrial domains; relevant to LLM application in domain-specific tasks."
    401     }
    402   ]
    403 }

Impressum · Datenschutz