scan.json (25165B)
1 { 2 "paper": { 3 "title": "Process-Centric Analysis of Agentic Software Systems", 4 "authors": ["Shuyang Liu", "Yang Chen", "Rahul Krishna", "Saurabh Sinha", "Jatin Ganhotra", "Reyhaneh Jabbarvand"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2512.02393", 8 "doi": "10.48550/arXiv.2512.02393" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": false, 17 "justification": "The paper mentions 'artifacts' and providing a 'rich dataset' for the community but does not provide a repository URL or download link for source code." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper states it provides 'the community with a rich dataset to explore future process-centric analyses algorithms on top of Graphectory and Langutory' (§1) and uses SWE-bench Verified which is publicly available. However, no explicit download link is given for the 4000 trajectories. The benchmark itself (SWE-bench Verified) is public." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned. The paper describes experimental setup but not the software environment needed to reproduce." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions or README are provided. The paper describes the experimental setup (§3.1) but not how to reproduce the pipeline." 
33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper reports median and Interquartile Range (IQR) for all process-centric metrics in Figure 3: 'each heatmap cell shows the median of the metric and Interquartile Range (IQR), representing the spread around the median.'" 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper uses the Mann-Whitney U test (§3.2.3, Figure 5-a) to test whether metric distributions differ between resolved and unresolved repairs, and Kendall's τ_b test (§3.2.4, Figure 5-b) to test for a trend between metrics and problem difficulty. P-values are reported with significance threshold p ≤ 0.05." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": false, 49 "justification": "While the paper reports p-values from statistical tests and median/IQR values, it does not report effect sizes (e.g., Cohen's d, rank-biserial correlation) for the Mann-Whitney U or Kendall's τ_b tests." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The sample size of 500 issues × 8 configurations = 4000 trajectories is stated but not justified. No power analysis or justification for why 500 issues is sufficient." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "IQR is reported for all metrics in Figure 3. The paper states 'reporting the median and interquartile range values' (§3.1)." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper compares two agents (SWE-agent, OpenHands) across four LLMs, providing cross-agent and cross-model comparisons throughout §3." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "All agents and LLMs are contemporary: SWE-agent, OpenHands, DeepSeek-V3, DeepSeek-R1, Devstral-small-2505, Claude Sonnet 4. All are 2024-2025 systems."
72 }, 73 "ablation_study": { 74 "applies": false, 75 "answer": false, 76 "justification": "Graphectory is an analysis framework, not a system with components to ablate. The paper's contribution is the representation and metrics, not a system being optimized." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Six process-centric metrics are defined and used: Node Count, Temporal Edge Count, Loop Count, Average Loop Length, Structural Edge Count, Structural Breadth (Table 1, §2.2)." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": true, 86 "justification": "Two human annotators independently labeled actions with phases for the phase map (§2.1). Manual review of 15% of Graphectory (600 samples) was performed for anti-pattern identification (§3.4). Manual investigation of trajectories is referenced throughout." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "This is an analysis study, not a prediction task. There is no train/test split to evaluate." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down by agent, by LLM, by repair status (resolved/unresolved), and by problem difficulty (easy/medium/hard/very hard) throughout §3." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Extensive failure analysis is provided: §3.4 identifies nine anti-patterns with qualitative examples (Figures 10-13), and unresolved trajectories are analyzed throughout." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper reports that stronger models (Claude Sonnet 4) exhibit more inefficiencies despite higher success rates (§3.4), and that even successful runs contain significant inefficiencies. DeepSeek-R1's format compliance issues are noted (§3.2.3)." 
107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The three main abstract claims (richer prompts/stronger LLMs → more complex Graphectory; strategies vary with difficulty; successful runs still inefficient) are all supported by results in §3.2-3.4." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper makes implicit causal claims (e.g., 'agents using richer prompts or stronger LLMs exhibit more complex Graphectory, reflecting deeper exploration'). This conflates correlation with causation — richer prompts and stronger LLMs are confounded with agent design differences. The study is observational, not controlled." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "The title says 'Agentic Software Systems' broadly, and the paper claims Graphectory 'applies broadly to any agentic system' (§1), but results are only on two programming agents on SWE-bench Verified. No non-SE agents are evaluated." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper does not discuss alternative explanations for its findings. For example, the association between metric complexity and repair status could be an artifact of the agents' cost and iteration limits rather than of agent behavior quality. No threats to validity section exists." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper uses graph metrics (NC, TEC, LC, etc.) as proxies for 'effort', 'complexity', and 'efficiency' without discussing whether these metrics actually capture those constructs. Higher node count is equated with more effort without validation."
134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Models are specified: 'DeepSeek-V3 671B MoE', 'DeepSeek-R1', 'Devstral-small-2505 24B', 'Claude Sonnet 4' (§3.1). Devstral includes the version suffix '2505'." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper references SWE-agent and OpenHands system prompts via URLs to GitHub (refs [38], [45]) but does not include the actual prompt text. The GPT-5 annotation prompt is also not provided." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": false, 150 "justification": "No temperature, top-p, or other LLM hyperparameters are reported. Only agent-level configs are mentioned: '$2 cost cap for SWE-agent, 100 iterations for OpenHands' (§3.1)." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "Both SWE-agent and OpenHands scaffolding are described: their tool usage, system prompts, and workflow differences are discussed in §2.1 and §3.1. The agents' default configurations and tools are specified." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "The Graphectory construction pipeline is documented: trajectory log parsing, node/edge construction, phase labeling via Algorithm 1, and Langutory construction via Algorithm 2 (§2.1)." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "There is no limitations or threats-to-validity section in the paper. The conclusion (§5) briefly mentions future directions but does not discuss limitations." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "No threats to validity are discussed. 
Specific concerns such as the impact of cost/iteration limits, the representativeness of SWE-bench, or the reliability of automated phase labeling are not addressed." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper claims broad applicability ('Graphectory applies broadly to any agentic system') without stating specific scope boundaries. It does not explicitly state what the results do NOT show." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "The paper mentions providing a 'rich dataset' but no download link or data repository is provided for the 4000 trajectories or constructed Graphectory." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Data collection is described: 500 SWE-bench Verified issues, 8 agent-model pairs, default configurations, Pass@1 results (§3.1)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. The data source is SWE-bench Verified, a standard benchmark." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline from trajectory logs to Graphectory to metrics/analyses is fully documented with formal definitions (§2) and algorithms (Algorithms 1-3)." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding acknowledgment section is present in the paper." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly stated: UIUC and IBM Research. IBM Research authors are evaluating agents that are not IBM products, so no direct product conflict." 
212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding is disclosed, so independence cannot be assessed. IBM Research affiliations could imply corporate interest in agent analysis tooling." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper evaluates LLMs on SWE-bench Verified but does not state training cutoff dates for any of the four models used." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "No discussion of whether the SWE-bench Verified issues appeared in the training data of DeepSeek-V3, DeepSeek-R1, Devstral, or Claude Sonnet 4." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "SWE-bench was published in 2023 and SWE-bench Verified in 2024. All models were potentially trained after these dates. No contamination discussion is provided." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in the study. The human annotators for phase labeling are part of the methodology, not study subjects." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 
266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "SWE-agent has a $2 cost cap per instance but actual costs incurred are not reported. No aggregate cost figures are given for the 4000 trajectories." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No total compute budget, GPU hours, or API spend is reported for running 4000 trajectories across 4 LLMs." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "The paper acknowledges LLMs are 'inherently non-deterministic' (§3.1) but reports only Pass@1 single-run results. No multi-seed analysis is performed." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": true, 299 "justification": "The paper explicitly states Pass@1 results (first submission per issue) and 500 issues × 8 configurations = 4000 trajectories (§3.1)." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": false, 303 "answer": false, 304 "justification": "No hyperparameter search was performed; default configurations are used for both agents." 305 }, 306 "best_config_selection_justified": { 307 "applies": false, 308 "answer": false, 309 "justification": "No configuration selection was performed; the paper uses default settings for each framework." 310 }, 311 "multiple_comparison_correction": { 312 "applies": true, 313 "answer": false, 314 "justification": "Multiple Mann-Whitney U and Kendall's τ_b tests are performed across 8 agent-model pairs × 6 metrics (Figures 5a-5b) without any multiple comparison correction mentioned." 
315 }, 316 "self_comparison_bias_addressed": { 317 "applies": false, 318 "answer": false, 319 "justification": "The paper does not propose a new agent system; it analyzes existing agents. No self-comparison bias applies." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": true, 323 "answer": false, 324 "justification": "Different LLMs likely have very different compute costs, but performance is not reported as a function of compute budget. The $2 cost cap for SWE-agent is mentioned but not analyzed." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": false, 329 "justification": "The paper uses SWE-bench Verified without discussing its construct validity — whether it represents real-world software engineering tasks, or whether Pass@1 on human-validated issues measures agent quality." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": true, 333 "answer": true, 334 "justification": "The paper explicitly compares agents across the same LLMs and LLMs across the same agents, treating both agent scaffold and LLM as variables. Results are broken down by agent-model pair throughout §3." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "SWE-bench issues are from real GitHub repositories. No discussion of whether these issues/solutions appeared in model training data." 342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether the evaluation setup leaks information (e.g., issue descriptions containing solution hints)." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "SWE-bench issues come from 12 repositories but no discussion of whether code from these repositories appeared in training data." 
352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No leakage detection or prevention method is applied." 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "Agents using richer prompts or stronger LLMs exhibit more complex Graphectory, reflecting deeper exploration and more thorough validation", 363 "evidence": "Figure 3 shows OpenHands (richer prompts) has consistently higher metric values than SWE-agent. Claude Sonnet 4 produces more complex Graphectory than other LLMs (§3.2.1-3.2.2).", 364 "supported": "moderate" 365 }, 366 { 367 "claim": "Process-centric metrics effectively distinguish successful and unsuccessful repairs", 368 "evidence": "Mann-Whitney U tests show significant differences (p ≤ 0.05) for most agent-model pairs (Figure 5-a), except Claude Sonnet 4 which shows non-significant differences.", 369 "supported": "moderate" 370 }, 371 { 372 "claim": "Trajectory complexity grows with task difficulty, aligned with human difficulty ratings", 373 "evidence": "Kendall's τ_b test shows significant positive trends between metrics and difficulty (Figure 5-b), with qualitative examples in Figure 6.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Even successful agents exhibit significant inefficiency during problem solving", 378 "evidence": "Anti-pattern analysis (§3.4, Figures 14-15) shows localization and patching inefficiencies are prevalent even in resolved instances, though at lower rates than unresolved ones.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Stronger models (Claude Sonnet 4) exhibit more inefficiency patterns despite higher success rates", 383 "evidence": "Figures 14-15 show Claude Sonnet 4 has higher rates of localization inefficiency patterns. 
Manual analysis suggests extended internal reasoning (§3.4).", 384 "supported": "moderate" 385 } 386 ], 387 "methodology_tags": ["benchmark-eval", "observational"], 388 "key_findings": "Graphectory, a graph-based representation of agentic trajectories, enables systematic process-centric analysis beyond outcome-centric evaluation. Analysis of 4000 trajectories from SWE-agent and OpenHands across four LLMs reveals that unsuccessful runs are longer with more repetitions, trajectory complexity correlates with task difficulty, and even successful agents exhibit significant inefficiencies. Nine anti-patterns are identified in localization and patching phases, with stronger models paradoxically showing more inefficiency despite higher success rates.", 389 "red_flags": [ 390 { 391 "flag": "No limitations section", 392 "detail": "The paper lacks any limitations or threats-to-validity section despite making broad claims about agentic systems generally from only two programming agents on one benchmark." 393 }, 394 { 395 "flag": "Overclaimed generalizability", 396 "detail": "The paper claims Graphectory 'applies broadly to any agentic system' but only evaluates two programming agents on SWE-bench Verified. No evidence for web navigation, robotics, or scientific discovery domains mentioned in the introduction." 397 }, 398 { 399 "flag": "No contamination discussion", 400 "detail": "Uses SWE-bench Verified with four LLMs that could have been trained on SWE-bench data. Resolution rates may be inflated by data contamination, which would affect the process-centric analysis of resolved vs. unresolved trajectories." 401 }, 402 { 403 "flag": "Single-run results with acknowledged non-determinism", 404 "detail": "The paper acknowledges LLMs are 'inherently non-deterministic' but reports only Pass@1 single-run results. Trajectory patterns could vary significantly across runs of the same problem." 
405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "SWE-agent: agent-computer interfaces enable automated software engineering", 410 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 411 "year": 2025, 412 "relevance": "One of the two primary agentic programming systems analyzed in this paper." 413 }, 414 { 415 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 416 "authors": ["Xingyao Wang"], 417 "year": 2025, 418 "arxiv_id": "2407.16741", 419 "relevance": "The other primary agentic programming system analyzed in this paper." 420 }, 421 { 422 "title": "Swe-bench: Can language models resolve real-world github issues?", 423 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 424 "year": 2023, 425 "relevance": "The benchmark used for evaluation (SWE-bench Verified subset)." 426 }, 427 { 428 "title": "Agentless: Demystifying llm-based software engineering agents", 429 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 430 "year": 2024, 431 "relevance": "Pipeline-based alternative to agentic approaches for automated issue resolution." 432 }, 433 { 434 "title": "TRAIL: Trace Reasoning and Agentic Issue Localization", 435 "authors": ["Darshan Deshpande", "Varun Gangal", "Hersh Mehta"], 436 "year": 2025, 437 "relevance": "Prior work on taxonomy of errors in agent trajectories from GAIA and SWE-Bench Lite." 438 }, 439 { 440 "title": "Why do multi-agent llm systems fail?", 441 "authors": ["Mert Cemri"], 442 "year": 2025, 443 "relevance": "Proposes MAST failure taxonomy for multi-agent systems, requiring human annotations." 
444 }, 445 { 446 "title": "Understanding Software Engineering Agents Through the Lens of Traceability", 447 "authors": ["Ira Ceka", "Saurabh Pujar"], 448 "year": 2025, 449 "relevance": "Manual analysis of agent action flow graphs for software engineering agents." 450 }, 451 { 452 "title": "An Empirical Study on Failures in Automated Issue Solving", 453 "authors": ["Simiao Liu", "Fang Liu"], 454 "year": 2025, 455 "relevance": "Develops taxonomy of LLM failure modes for SWE tasks with manual analysis of 150 issues." 456 }, 457 { 458 "title": "ReAct: Synergizing reasoning and acting in language models", 459 "authors": ["Shunyu Yao"], 460 "year": 2022, 461 "relevance": "Foundational ReAct paradigm used by the agentic systems analyzed." 462 }, 463 { 464 "title": "AgentBench: Evaluating LLMs as Agents", 465 "authors": ["Xiao Liu"], 466 "year": 2023, 467 "relevance": "Benchmark for evaluating LLMs as agents across diverse tasks." 468 }, 469 { 470 "title": "DeepSeek-V3 Technical Report", 471 "authors": ["DeepSeek-AI"], 472 "year": 2025, 473 "arxiv_id": "2412.19437", 474 "relevance": "One of the four backbone LLMs evaluated in the study." 475 }, 476 { 477 "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning", 478 "authors": ["Daya Guo"], 479 "year": 2025, 480 "relevance": "Reasoning model used as one of the four backbone LLMs." 481 } 482 ] 483 }