scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26640B)
      1 {
      2   "paper": {
      3     "title": "Think Locally, Explain Globally: Graph-Guided LLM Investigations via Local Reasoning and Belief Propagation",
      4     "authors": [
      5       "Saurabh Jha",
      6       "Rohan Arora",
      7       "Bhavya",
      8       "Noah Zheutlin",
      9       "Paulina Toro Isaza",
     10       "Laura Shwartz",
     11       "Yu Deng",
     12       "Daby Sow",
     13       "Ruchi Mahindru",
     14       "Ruchir Puri"
     15     ],
     16     "year": 2026,
     17     "venue": "arXiv",
     18     "arxiv_id": "2601.17915",
     19     "doi": "10.48550/arXiv.2601.17915"
     20   },
     21   "scan_version": 2,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval"],
     24   "key_findings": "EoG, a disaggregated architecture separating LLM-based local abductive reasoning from deterministic graph traversal and belief propagation, achieves 7× higher Majority@k F1 than ReAct baselines on ITBench SRE diagnostic scenarios. The reliability gap between Pass@k and Majority@k is nearly eliminated (e.g., GPT-5.1 Pass@3/Majority@3 F1 goes from 22.9/8.6 under ReAct to 88.9/86.1 under EoG). Semantic Belief Propagation provides 7-64% additional improvement over structured traversal alone, with the largest gains on weaker models.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper states the implementation is 'available as part of the open-source Codex project' at https://github.com/openai/codex (§B.8, §5). ITBench is also described as open-source."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "ITBench is described as an open-source benchmark (§1 footnote, §3). The evaluation scenarios are part of the public benchmark."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions ~5,000 lines of Rust and Codex CLI v0.76 but no dependency or environment setup details."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture and prompts but does not include a README or commands to replicate experiments."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Tables 2-4 report point estimates (F1, Recall) with no confidence intervals or error bars despite running 3 trials per scenario."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No statistical significance tests are used despite claiming EoG outperforms ReAct across multiple models. Comparisons are made by directly comparing numbers."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Effect sizes are reported with baseline context: '7× average gain in Majority@k F1' (abstract), and Table 2 shows absolute values for both systems (e.g., GPT-5.1: 8.6→86.1 maj@3 F1). Table 4 reports relative improvement percentages (e.g., +6.9% to +64.3%)."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No justification for why 35 scenarios with 3 runs was chosen. No power analysis or discussion of whether this sample is sufficient for the claims."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "While the paper uses Majority@k vs Pass@k to illustrate consistency, no standard deviation, IQR, or variance across the 3 runs is reported in any table."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "ReAct baselines are included across 4 models (Table 2). The ReAct baseline 'incorporates best practices from prior work' and achieves 'higher performance than published baselines on ITBench' (§5)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Baselines use contemporary models (GPT-5.1, Gemini 3 Flash/Pro, Claude Opus 4.5, Kimi K2) and the ReAct implementation uses the current Codex CLI v0.76 framework."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Two ablation studies: (1) Prompting ablation showing ReAct prompted with EoG algorithm (Table 3), and (2) SBP ablation isolating belief propagation contribution (Table 4). Also ablation of distributed traces as data source."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Multiple metrics reported: Pass@k F1, Pass@k Recall, Majority@k F1, Majority@k Recall for RC Entity, plus Majority@k Recall for RC Reasoning (Table 2). Also input/output token counts."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Evaluation uses LLM-as-a-Judge (Appendix D) rather than human evaluation. No human expert validation of the diagnoses is reported."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "ITBench scenarios serve as the test set. The EoG system is evaluated on 35 ITBench SRE scenarios that are independent of the system's development."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results broken down per model (Table 2), with separate metrics for RC Entity and RC Reasoning. Failure mode analysis broken down by model (Figures 1, 2, 7). MAST failure taxonomy with per-category prevalence rates (§C.1)."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Extensive failure mode analysis in §3 (exploration failures, controller failures, path dependence). Figures 2a-c show GT discovery funnel, evidence discovery failures. §C analyzes plan abandonment, tool repetition, syntactic failures."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Table 3 shows ReAct prompted with EoG algorithm collapses to near-zero Majority@k, a negative result. The distributed traces ablation showed 'no statistically significant change in results' (§5)."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims '7× average gain in Majority@k F1 score' and 'improves accuracy and run-to-run consistency' are supported by Table 2 results across multiple models."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Causal claims about EoG's design improvements are justified through ablation studies: SBP ablation (Table 4) isolates belief propagation contribution, prompting ablation (Table 3) shows the controller is necessary. These are controlled single-variable manipulations."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper tests only on ITBench SRE scenarios but makes broad claims about 'diagnostic investigations' generalizing to 'medical diagnosis', 'security threat analysis', 'FinOps, DevOps, healthcare, and forensics' (§2). While the methodology section references these domains, no evidence is provided outside IT SRE. The title says 'Graph-Guided LLM Investigations' broadly."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No discussion of alternative explanations for the observed improvements. Could the gains be due to the specific prompts used rather than the architecture? Could the ITBench scenarios be particularly suited to graph traversal? These alternatives are not considered."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper clearly distinguishes what it measures (RC Entity F1/Recall, RC Reasoning accuracy on ITBench scenarios) from broader diagnostic capability. The Pass@k vs Majority@k distinction explicitly addresses the gap between sporadic success and consistent reasoning."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Models listed as 'GPT-5.1', 'GPT-OSS-120B', 'Gemini 3 Flash', 'Gemini 3 Pro', 'Kimi K2', 'Claude Opus 4.5', etc. — marketing names without specific API versions or snapshot dates."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Full prompt text provided for all system components: ReAct agent prompt (§A.2), and all EoG prompts — select_investigation_window, filter_events, filter_spec_changes, bootstrap, explore (πabd), and finalize (§B.5-B.7)."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "§B.8 states 'All LLM calls use temperature 0 for determinism.' Also reports kthresh for damping, kmax=5 max visits, kcool=2 cooldown, >80K token chunking threshold, and 'high reasoning effort settings' for experiments."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The EoG scaffolding is described in extensive detail: Deterministic Controller, Context Contract (CxC), Abductive Policy, event-driven architecture (§4, §B). MCP tool interfaces documented (Table 1, Table 6). Algorithm pseudocode provided (§4.3)."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Data preprocessing described: Context Contract implements token-aware pagination, tournament-style reduction for relevance filtering (§B.8). Event and spec change filtering with explicit criteria (§B.5.2-B.5.3). Entity normalization format documented."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No dedicated limitations section. The paper has an 'Impact Statement' (§8) but it contains only a generic disclaimer: 'There are many potential societal consequences of our work, none of which we feel must be specifically highlighted here.'"
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No threats to validity discussed. The paper does not address threats like benchmark representativeness, LLM-as-a-Judge reliability, or the small number of runs (k=3)."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No explicit scope boundaries stated. The paper claims generalizability to healthcare, forensics, FinOps (§2) without stating what the results do NOT show. §B.9.1 lists correctness assumptions but these are formal properties, not scope limitations."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No raw agent trajectories, detailed per-scenario results, or raw evaluation data are released. Only aggregate metrics in tables."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "ITBench is described as containing 35 SRE scenarios 'spanning misconfigurations, resource exhaustion, and cascading failures' (§5). The evaluation uses 3 runs per scenario with LLM-as-a-Judge evaluation (Appendix D)."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants. The study uses benchmark scenarios from ITBench."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The evaluation pipeline is documented: scenarios run → agent produces JSON output → LLM-as-a-Judge evaluates against ground truth with entity mapping (Appendix D). Entity normalization, alias handling, and scoring rubrics are specified."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding or acknowledgments section. All authors are from IBM Research but no funding source is disclosed."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All authors listed as 'IBM Research, Yorktown Heights, New York, USA' on page 1."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "All authors are IBM Research employees. IBM has commercial interests in IT operations tools (IBM Instana is cited). The funder (IBM) has a financial interest in demonstrating AI-driven IT diagnostics."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement. IBM Instana (IBM's commercial observability product) is cited but no conflict is acknowledged."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No training data cutoff dates stated for any of the LLMs used (GPT-5.1, Gemini, etc.), despite evaluating on ITBench which could be in training data."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether ITBench scenarios or solutions could appear in the training data of the evaluated models."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "ITBench was published in 2025. Models like GPT-5.1 and Gemini 3 could have been trained on data including ITBench papers and solutions. This is not discussed."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Table 2 reports average input and output token counts per trial per scenario for each agent/model combination (e.g., EoG GPT-5.1: 1837K input, 1.7K output). Note: these counts exclude reasoning tokens."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total compute budget, API costs, or wall-clock time reported. Token counts are per-scenario averages but total cost across all experiments is not stated."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Temperature 0 is used for determinism, but no seed sensitivity analysis is provided. The paper uses k=3 runs to measure Pass@k vs Majority@k, but does not report variance across seeds."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Explicitly stated: '3 runs per scenario to measure both accuracy and consistency' (§5). k=3 is used throughout for Pass@k and Majority@k."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget reported. The damping threshold kthresh, kmax=5, kcool=2 appear to be set without describing how they were selected."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No justification for how hyperparameters (kthresh, kmax, kcool, token budget thresholds) were selected. No mention of validation set or selection process."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Comparisons are made across 8+ models with no multiple-comparison correction applied. Because no significance tests are used at all, there is nothing to correct, but the underlying multiplicity issue remains."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors built both EoG and the ReAct baseline. No acknowledgment that their implementation of the baseline may systematically underperform. While they note the baseline 'achieves higher performance than published baselines,' the ReAct implementation choices could still favor EoG."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Table 2 reports token usage for both EoG and ReAct. EoG uses more input tokens (e.g., 1837K vs 779K for GPT-5.1) but far fewer output tokens (1.7K vs 9.3K). The compute difference is visible, though performance is not formally analyzed as a function of compute."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether ITBench scenarios are representative of real-world SRE incidents. No analysis of construct validity — whether success on ITBench implies capability in actual production diagnosis."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The scaffold confound is directly addressed: EoG vs ReAct comparisons use the same models, explicitly isolating the scaffolding effect. The prompting ablation (Table 3) further controls for this by giving ReAct the EoG algorithm as instructions."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether models could have seen ITBench solutions or similar diagnostic patterns during training."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the evaluation setup provides hints not available in real SRE usage. The MCP tools abstract away real production complexity."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether the 35 ITBench scenarios are independent or share structural similarities that could inflate performance."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention methods applied."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "EoG achieves 7× higher Majority@k F1 than ReAct baselines on ITBench SRE scenarios.",
    376       "evidence": "Table 2 shows EoG maj@3 F1 scores ranging from 64.2% to 92.1%, vs 5.7% to 74.3% for ReAct. Average gain computed across models (§5, abstract).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "EoG near-eliminates the reliability gap between Pass@k and Majority@k.",
    381       "evidence": "Table 2: GPT-5.1 gap shrinks from 14.3pp (22.9 − 8.6) under ReAct to 2.8pp (88.9 − 86.1) under EoG. Similar narrowing across models.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Prompting ReAct with EoG algorithm instructions is insufficient — the deterministic controller is necessary.",
    386       "evidence": "Table 3: ReAct prompted with EoG algorithm achieves 40-71.4% pass@3 but 0-2.9% maj@3, showing instruction-following cannot substitute for external control.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Semantic Belief Propagation provides 7-64% additional improvement over structured traversal alone.",
    391       "evidence": "Table 4: SBP ablation shows relative improvements from +6.9% (GPT-5.1) to +64.3% (GPT-OSS-20B) on maj@3 Recall.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "ReAct agents fail primarily due to exploration failures (not reasoning failures).",
    396       "evidence": "Figure 2b shows high 'Evidence not in context' rates across models, while 'Evidence ignored' is small. Figure 2c shows 14-70% improvement from oracle reordering without adding new information (§3.2).",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "The EoG formalism generalizes to SWE tasks for diagnosing production bugs.",
    401       "evidence": "Only a citation to Cui et al. (2025) is provided (§4 contributions). No empirical evidence in this paper.",
    402       "supported": "weak"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "Company evaluating its own system",
    408       "detail": "All authors are from IBM Research. IBM has commercial observability products (IBM Instana is cited). The paper proposes an architecture that could be productized. No conflict of interest statement is provided."
    409     },
    410     {
    411       "flag": "No limitations section",
    412       "detail": "The paper lacks any limitations discussion despite making broad generalization claims to healthcare, forensics, FinOps, etc. The Impact Statement is a generic non-answer."
    413     },
    414     {
    415       "flag": "Small k for consistency claims",
    416       "detail": "Consistency claims (Majority@k) are based on k=3 runs. With 3 runs, Majority@3 requires 2/3 success — a small sample for claiming 'consistent reasoning.' No confidence intervals are provided."
    417     },
    418     {
    419       "flag": "LLM-as-a-Judge evaluation",
    420       "detail": "The evaluation uses an LLM judge (Appendix D) with complex entity mapping and semantic scoring. No human validation of the judge's accuracy is reported, despite the authors noting the evaluation has non-trivial complexity (alias groups, regex matching)."
    421     },
    422     {
    423       "flag": "No error bars despite stochastic claims",
    424       "detail": "The paper's central argument is about reliability and consistency, yet no standard deviations, confidence intervals, or significance tests are reported across the 3 runs."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "ITBench: Evaluating AI agents across diverse real-world IT automation tasks",
    430       "authors": ["Saurabh Jha", "Rohan R. Arora"],
    431       "year": 2025,
    432       "relevance": "Primary benchmark used for evaluation; tests AI agents on real-world IT diagnostic tasks."
    433     },
    434     {
    435       "title": "ReAct: Synergizing reasoning and acting in language models",
    436       "authors": ["Shunyu Yao"],
    437       "year": 2023,
    438       "relevance": "Primary baseline agent paradigm evaluated and critiqued in this paper."
    439     },
    440     {
    441       "title": "Executable code actions elicit better LLM agents",
    442       "authors": ["Xingyao Wang"],
    443       "year": 2024,
    444       "relevance": "CodeAct framework used as basis for the ReAct baseline implementation."
    445     },
    446     {
    447       "title": "Why do multi-agent LLM systems fail?",
    448       "authors": ["Mert Cemri"],
    449       "year": 2025,
    450       "arxiv_id": "2503.13657",
    451       "relevance": "Characterizes failure modes of multi-agent LLM systems including tool invocation failures."
    452     },
    453     {
    454       "title": "From local to global: A graph RAG approach to query-focused summarization",
    455       "authors": ["Darren Edge"],
    456       "year": 2024,
    457       "relevance": "Graph-RAG approach for knowledge graph reasoning, compared against EoG in Table 5."
    458     },
    459     {
    460       "title": "Exploring LLM-based agents for root cause analysis",
    461       "authors": ["Devjeet Roy"],
    462       "year": 2024,
    463       "relevance": "Prior work on LLM-based IT diagnostics; ReAct baseline incorporates best practices from this work."
    464     },
    465     {
    466       "title": "Lost in the middle: How language models use long contexts",
    467       "authors": ["Nelson F. Liu"],
    468       "year": 2024,
    469       "relevance": "Documents how LLMs lose information in long contexts, motivating EoG's bounded context design."
    470     },
    471     {
    472       "title": "GALA: Can graph-augmented large language model agentic workflows elevate root cause analysis?",
    473       "authors": ["Yining Tian"],
    474       "year": 2025,
    475       "relevance": "Graph-augmented LLM diagnostic agent compared in Table 5."
    476     },
    477     {
    478       "title": "STRATUS: A multi-agent system for autonomous reliability engineering of modern clouds",
    479       "authors": ["Yuchen Chen"],
    480       "year": 2025,
    481       "relevance": "Multi-agent cloud reliability system compared in Table 5."
    482     },
    483     {
    484       "title": "AgentCompass: Towards reliable evaluation of agentic workflows in production",
    485       "authors": ["Nishant Kartik"],
    486       "year": 2025,
    487       "relevance": "Addresses reliability evaluation of agentic systems in production, directly relevant to the reliability gap concept."
    488     },
    489     {
    490       "title": "Theorem-of-thought: A multi-agent framework for abductive, deductive, and inductive reasoning in language models",
    491       "authors": ["Sara Abdaljalil"],
    492       "year": 2025,
    493       "relevance": "Uses Bayesian belief propagation over agent reasoning graphs; contrasted with EoG's sequential belief revision approach."
    494     },
    495     {
    496       "title": "When agents go astray: Course-correcting SWE agents with PRMs",
    497       "authors": ["Sarthak Gandhi"],
    498       "year": 2025,
    499       "relevance": "Addresses plan revision failures in SWE agents, related to plan abandonment analysis in this paper."
    500     }
    501   ]
    502 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs