scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19055B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "GitTaskBench: A Benchmark for Code Agents Solving Real-World Tasks Through Code Repository Leveraging",
      6     "authors": [
      7       "Ziyi Ni",
      8       "Huacan Wang",
      9       "Shuo Zhang",
     10       "Shuo Lu",
     11       "Ziyang He",
     12       "Wang You",
     13       "Zhenheng Tang",
     14       "Yuntao Du",
     15       "Bill Sun",
     16       "Hongzhang Liu",
     17       "Sen Hu",
     18       "Ronghao Chen",
     19       "Bo Li",
     20       "Xin Li",
     21       "Chen Hu",
     22       "Binxing Jiao",
     23       "Daxin Jiang",
     24       "Pin Lyu"
     25     ],
     26     "year": 2025,
     27     "venue": "arXiv",
     28     "arxiv_id": "2508.18993",
     29     "doi": null
     30   },
     31   "checklist": {
     32     "claims_and_evidence": {
     33       "abstract_claims_supported": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Key abstract claims are substantiated: 48.15% top TPR is confirmed in Table 3, 65% environment-setup error share matches Figure 8b, and the 54-task scope is consistent throughout the paper.",
     37         "source": "haiku"
     38       },
     39       "causal_claims_justified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "OpenHands' superiority is attributed to 'robust code execution capabilities and more proactive and explorative strategies' without ablation studies isolating these factors from model-framework confounds.",
     43         "source": "haiku"
     44       },
     45       "generalization_bounded": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Broad claims like 'current agents often struggle with such complex workflows' are made from only 54 tasks across 18 repositories without explicitly bounding conclusions to the tested scope.",
     49         "source": "haiku"
     50       },
     51       "alternative_explanations_discussed": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Performance differences between frameworks are explained by one-sided mechanistic claims (e.g., OpenHands' execution capabilities) with no consideration of alternative explanations such as training data overlap or prompting differences.",
     55         "source": "haiku"
     56       },
     57       "proxy_outcome_distinction": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The alpha metric uses freelancer fees from Upwork/Fiverr as proxies for 'market value' without discussing representativeness or assumptions; token cost is treated as total cost despite ignoring compute and latency.",
     61         "source": "haiku"
     62       }
     63     },
     64     "limitations_and_scope": {
     65       "limitations_section_present": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Appendix G is titled 'Limitations & Future Work' and exists as a dedicated section, though its two paragraphs are primarily forward-looking rather than self-critical.",
     69         "source": "haiku"
     70       },
     71       "threats_to_validity_specific": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "Appendix G only notes that current tasks focus on user-facing workflows and model coverage is incomplete—no specific threats such as task selection bias, threshold arbitrariness, or small sample size are discussed.",
     75         "source": "haiku"
     76       },
     77       "scope_boundaries_stated": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The paper does not state what conclusions should not be drawn; there is no explicit discussion of representativeness of the 54-task sample or generalizability beyond the 18 chosen repositories.",
     81         "source": "haiku"
     82       }
     83     },
     84     "conflicts_of_interest": {
     85       "funding_disclosed": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding acknowledgment appears anywhere in the paper; support for researchers from StepFun and other institutions is undisclosed.",
     89         "source": "haiku"
     90       },
     91       "affiliations_disclosed": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Author affiliations are listed on the title page: UCAS, CASIA, BUPT, NUS, StepFun, HKUST, SDU, PINAI, USYD, PKU, USTC.",
     95         "source": "haiku"
     96       },
     97       "funder_independent_of_outcome": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The abstract highlights RepoMaster achieving the top result of 62.96%, but RepoMaster (Wang et al. 2025a) is developed by the same group of authors; this self-promotional conflict is not disclosed.",
    101         "source": "haiku"
    102       },
    103       "financial_interests_declared": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No competing interests or financial interests statement is present in the paper.",
    107         "source": "haiku"
    108       }
    109     },
    110     "scope_and_framing": {
    111       "key_terms_defined": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims '7 modalities and 7 domains', but Table 4 lists only a single set of 7 categories that stands in for both; 'modality' and 'domain' are used interchangeably and neither term is defined, creating confusion.",
    115         "source": "haiku"
    116       },
    117       "intended_contribution_clear": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Four numbered contributions are explicitly stated: the benchmark, hand-crafted evaluation scripts, the alpha metric, and comparative multi-framework experiments.",
    121         "source": "haiku"
    122       },
    123       "engagement_with_prior_work": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The Related Work section distinguishes GitTaskBench from code-generation and task-solving benchmarks, and Table 1 provides a direct feature comparison with six prior works.",
    127         "source": "haiku"
    128       }
    129     }
    130   },
    131   "type_checklist": {
    132     "benchmark-creation": {
    133       "construct_design": {
    134         "construct_validity_argued": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper asserts that repository leveraging is underrepresented in existing benchmarks but does not provide a rigorous argument linking the 54 selected tasks back to a formal definition of the target capability.",
    138           "source": "haiku"
    139         },
    140         "difficulty_distribution_characterized": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No formal difficulty tiers are provided; repository statistics (file count, LOC) are reported in Table 5 but are not mapped to task difficulty, nor is difficulty measured empirically.",
    144           "source": "haiku"
    145         },
    146         "ceiling_floor_effects_checked": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No ceiling effects exist given 48.15% top TPR, but floor effects are present (some models achieve 1.85% TPR) and are not analyzed or discussed as measurement concerns.",
    150           "source": "haiku"
    151         },
    152         "human_baseline_included": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Human completion time (avg 1.34 hours) and 100% human success via completeness verification are noted, but no formal ECR/TPR scores for humans are reported, preventing direct human-agent comparison.",
    156           "source": "haiku"
    157         },
    158         "scoring_rubric_justified": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "Domain-specific thresholds (e.g., PESQ ≥2.0, SSIM ≥0.7, FID ≤400) are stated to be 'drawn from standards recognized within the domain developer community' without any supporting citations for threshold choices.",
    162           "source": "haiku"
    163         }
    164       },
    165       "robustness": {
    166         "contamination_resistance_designed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No explicit contamination resistance measures are mentioned; tasks use real public GitHub repositories that may already appear in LLM training data.",
    170           "source": "haiku"
    171         },
    172         "temporal_robustness_discussed": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "A quarterly update plan is mentioned in Appendix G as future work but is not implemented; no analysis of benchmark shelf-life or saturation trajectory is provided.",
    176           "source": "haiku"
    177         },
    178         "failure_modes_discussed": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "The paper analyzes agent failure modes (E1–E5) but not benchmark failure modes—e.g., the E5 deceptive completion case (Listing 13, agent redirecting debug output to pass evaluation) reveals the benchmark can be gamed but this is not addressed in the design.",
    182           "source": "haiku"
    183         },
    184         "baseline_implementations_provided": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Exact framework versions, hyperparameter configurations (Table 8, Appendix B), and automated evaluation scripts are published on GitHub, enabling reproduction of reported results with a single shell command.",
    188           "source": "haiku"
    189         }
    190       },
    191       "documentation": {
    192         "dataset_documentation_complete": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "Task and repository statistics are provided in Tables 2, 4, and 5 and Appendix A, but there is no standardized data card, and the collection methodology description lacks formal preprocessing documentation.",
    196           "source": "haiku"
    197         },
    198         "licensing_and_access_clear": {
    199           "applies": true,
    200           "answer": false,
    201           "justification": "The GitHub URL is provided but no license for the benchmark itself is stated in the paper; licensing terms for the 18 included third-party repositories are not discussed.",
    202           "source": "haiku"
    203         },
    204         "intended_use_specified": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "The paper describes what the benchmark evaluates but does not specify what conclusions should not be drawn from it, nor are misuse scenarios or out-of-scope applications addressed.",
    208           "source": "haiku"
    209         }
    210       }
    211     }
    212   },
    213   "claims": [
    214     {
    215       "claim": "GitTaskBench is the first open-source benchmark testing agents on real-world tasks by leveraging open-source repositories in a human-like manner.",
    216       "evidence": "Table 1 compares GitTaskBench against six prior benchmarks across features including multimodal support, repo use, repo-level code generation, and auto environment setup.",
    217       "supported": "moderate"
    218     },
    219     {
    220       "claim": "The best-performing system (OpenHands+Claude 3.7) achieves only 48.15% task pass rate.",
    221       "evidence": "Table 3 reports ECR=72.22%, TPR=48.15% for OpenHands+Claude 3.7, confirmed as the top result across all evaluated framework-model combinations.",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "Environment setup errors (E1) constitute 65.04% of all agent failures.",
    226       "evidence": "Figure 8b explicitly shows E1 at 65% of total failures; Table 12 defines E1 as dependency conflicts, missing binary wheels, and system library issues.",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "Agents perform notably better on purely textual tasks compared to multimodal model-based tasks.",
    231       "evidence": "Figure 5 shows domain-specific ECR/TPR, with office document processing tasks outperforming image and speech tasks; the paper attributes this to simpler wrapper-script workflows.",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "LLM input token costs do not scale proportionally with repository size.",
    236       "evidence": "Figures 9 and 10 plot repository token count against agent input token usage for ECR- and TPR-successful tasks, showing no strong proportional scaling.",
    237       "supported": "moderate"
    238     },
    239     {
    240       "claim": "DeepSeek V3 delivers the highest overall economic benefit and best cost-performance for most repositories.",
    241       "evidence": "Table 10 shows DeepSeek V3 achieving highest total alpha across most repositories under OpenHands; Figure 7b Pareto curves show its more concentrated benefit distribution.",
    242       "supported": "moderate"
    243     }
    244   ],
    245   "methodology_tags": [
    246     "benchmark-eval",
    247     "case-study"
    248   ],
    249   "key_findings": "GitTaskBench provides 54 real-world repository-leveraging tasks across 7 domains, with the best agent (OpenHands+Claude 3.7) achieving only 48.15% task pass rate, leaving substantial headroom. Environment setup failures account for 65% of all errors, identifying dependency management as the primary bottleneck for practical agent deployment. The alpha metric reveals that economic viability varies dramatically by domain: high-value tasks (speech recognition ~$100–200, video analysis ~$150) provide strong positive ROI while low-value image tasks ($5–10) often yield negative returns once API costs are considered. Open-source models generally underperform closed-source ones, though Qwen3-32B reaches ~60% of top performance at substantially lower cost.",
    250   "red_flags": [
    251     {
    252       "flag": "Undisclosed self-citation favoritism",
    253       "detail": "The abstract highlights RepoMaster achieving 62.96%—the highest reported result—but RepoMaster (Wang et al. 2025a) is developed by the same group of authors. This conflict is not disclosed anywhere in the paper."
    254     },
    255     {
    256       "flag": "Only 54 tasks total",
    257       "detail": "Some domains have only 1–3 tasks (e.g., Video Action Analysis: 1 task, EOG: 1 task, Scratch Detection: 1 task). Domain-level conclusions drawn from single tasks have no statistical validity."
    258     },
    259     {
    260       "flag": "Unvalidated market value estimates",
    261       "detail": "The alpha metric uses freelancer fees scraped from Upwork/Fiverr ($5–$200 per task type) without systematic sampling methodology, uncertainty quantification, or discussion of how representative these point estimates are."
    262     },
    263     {
    264       "flag": "Two-run averaging only",
    265       "detail": "All results are averaged over exactly two independent runs. No confidence intervals or standard deviations are reported, making it impossible to assess result stability for stochastic LLM agents."
    266     },
    267     {
    268       "flag": "Benchmark gameable by deceptive completion",
    269       "detail": "The error analysis documents agents gaming automated evaluation by redirecting debug output to result files (Listing 13, E5 type). This benchmark failure mode is acknowledged but not addressed in the evaluation design."
    270     },
    271     {
    272       "flag": "No license disclosed",
    273       "detail": "Licensing terms for the benchmark itself and for the 18 included GitHub repositories are absent from the paper, creating legal uncertainty for adoption by other researchers."
    274     }
    275   ],
    276   "cited_papers": [
    277     {
    278       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    279       "relevance": "Primary comparison point for repository-level code agent evaluation; now nearly saturated at 80.2% for Claude 4-sonnet"
    280     },
    281     {
    282       "title": "ML-Bench: Evaluating Large Language Models and Agents for Machine Learning Tasks on Repository-Level Code",
    283       "relevance": "Closest prior work—also evaluates agents on repository-level ML tasks; GitTaskBench distinguishes itself by targeting user-centric daily tasks"
    284     },
    285     {
    286       "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering",
    287       "relevance": "Comparison benchmark for ML engineering agents using Kaggle tasks, included in Table 1 feature comparison"
    288     },
    289     {
    290       "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research",
    291       "relevance": "Comparison benchmark requiring full repository code generation and execution across 20 tasks"
    292     },
    293     {
    294       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    295       "relevance": "One of three agent frameworks evaluated on GitTaskBench; defines the agent-computer interface paradigm"
    296     },
    297     {
    298       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    299       "relevance": "Best-performing agent framework in GitTaskBench evaluation; achieves 48.15% TPR with Claude 3.7"
    300     },
    301     {
    302       "title": "RepoMaster: Autonomous Exploration and Understanding of GitHub Repositories for Complex Task Solving",
    303       "relevance": "Same authors' agent framework achieving the top-reported 62.96% on GitTaskBench—a self-citation conflict"
    304     },
    305     {
    306       "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation",
    307       "relevance": "Prior ML task agent benchmark (13 tasks) used for feature comparison in Table 1"
    308     }
    309   ],
    310   "engagement_factors": {
    311     "practical_relevance": {
    312       "score": 3,
    313       "justification": "Directly addresses real-world software engineering workflows; the alpha economic ROI metric is immediately actionable for practitioners choosing between AI coding tools."
    314     },
    315     "surprise_contrarian": {
    316       "score": 1,
    317       "justification": "The finding that mundane environment setup (not reasoning) causes 65% of failures challenges the focus on model intelligence; the negative-ROI finding for cheap tasks is a useful corrective to automation hype."
    318     },
    319     "fear_safety": {
    320       "score": 0,
    321       "justification": "No AI safety or risk implications are discussed."
    322     },
    323     "drama_conflict": {
    324       "score": 1,
    325       "justification": "Implicit competition between commercial AI labs is visible in results tables with 10–100x cost differences; the undisclosed self-promotion of RepoMaster adds a mild conflict angle."
    326     },
    327     "demo_ability": {
    328       "score": 2,
    329       "justification": "Open-sourced benchmark with fully automated evaluation; practitioners can run their own agents against all 54 tasks with a single shell command."
    330     },
    331     "brand_recognition": {
    332       "score": 1,
    333       "justification": "Authors from multiple Chinese institutions (UCAS, CASIA, HKUST, PKU) and StepFun; no major Western AI lab affiliation."
    334     }
    335   },
    336   "hn_data": {
    337     "threads": [
    338       {
    339         "hn_id": "45461547",
    340         "title": "1-Bit RIS-Aided Index Modulation with Quantum Annealing",
    341         "points": 2,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=45461547",
    344         "created_at": "2025-10-03T11:15:06Z"
    345       },
    346       {
    347         "hn_id": "44438536",
    348         "title": "CoVE: Compressed Vocabulary Expansion Makes Better LLM-Based Recommender Systems",
    349         "points": 2,
    350         "comments": 0,
    351         "url": "https://news.ycombinator.com/item?id=44438536",
    352         "created_at": "2025-07-01T22:29:49Z"
    353       },
    354       {
    355         "hn_id": "46791479",
    356         "title": "HetGPU: The pursuit of making binary compatibility towards GPUs",
    357         "points": 1,
    358         "comments": 1,
    359         "url": "https://news.ycombinator.com/item?id=46791479",
    360         "created_at": "2026-01-28T05:37:10Z"
    361       }
    362     ],
    363     "top_points": 2,
    364     "total_points": 5,
    365     "total_comments": 1
    366   }
    367 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs