scan-v4.json (33763B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DeepCode: Open Agentic Coding", 6 "authors": [ 7 "Zongwei Li", 8 "Zhonghang Li", 9 "Zirui Guo", 10 "Xubin Ren", 11 "Chao Huang" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2512.07921", 16 "doi": "10.48550/arXiv.2512.07921" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": false, 23 "justification": "The abstract claims 'surpassing PhD-level human experts' but the human comparison is on only 3 papers (75.9±4.5 vs Best@3 of 72.4), where the difference (3.5 points) is within the standard error. Additionally, Figures 1 and 4 show inconsistent numbers for the same comparisons (e.g., 73.5 vs 73.6, 84.8 vs 85.4, 75.9 vs 76.7), undermining confidence in the reported results.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The ablation studies use controlled single-variable manipulation (adequate for component claims). However, the paper's main causal claim — 'the advantage is directly attributable to the superior agentic architecture' — compares against commercial agents without controlling for compute budget, time budget, prompting effort, or the fact that Cursor/Claude Code weren't designed for paper reproduction. Same base model ≠ same conditions.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The title claims 'Open Agentic Coding' and Section 2.1 defines 'Software System Generation' as a target task, but evaluation is solely on PaperBench (20 ML papers). The abstract claims 'new foundations for autonomous scientific reproduction that can accelerate research evaluation and discovery,' far exceeding the tested scope.", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "No alternative explanations for results are discussed. Could the improvement be from better prompting rather than architecture? Could it be from more compute time? Could the specific task (ML paper reproduction) favor DeepCode's design? None of these alternatives are considered.", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper measures Replication Score via static code analysis (SimpleJudge), but frames results as 'reproduction' and 'surpassing human experts' at replication. Section 4.1 acknowledges grading 'does not include post-submission reproduction' but the abstract and conclusion still frame static code quality matching as scientific reproduction. The gap between code structure quality (measured) and actual reproduction (claimed) is not adequately acknowledged.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 6 'Discussion: Challenges and Future Directions' discusses three challenges: computational efficiency and reliance on proprietary LLMs, episodic vs evolving agents, and dynamic planning limitations.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 6 discusses general challenges for the field (e.g., 'SOTA performance currently relies on massive, proprietary LLMs') but not specific threats to this study's validity. No mention of the small evaluation set (20 papers), the 3-paper human comparison, benchmark-specific overfitting risk, or the automated judge's reliability.", 62 "source": "opus" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "No explicit statements about what the results do NOT show. The paper does not acknowledge that results are limited to ML paper reproduction on PaperBench, does not state that software system generation (mentioned in Section 2.1) was not tested, and does not bound claims to the specific models and benchmark used.", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper.", 76 "source": "opus" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All authors are listed as affiliated with The University of Hong Kong. They are not affiliated with any of the commercial products being evaluated (Cursor, Claude Code, Codex).", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "No funding information is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of conflict.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or financial disclosure is present in the paper.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are defined: 'document-to-repository synthesis' is formalized with equations (Fgen: D→P), 'replication score' is defined via the hierarchical rubric aggregation procedure, and 'information-flow management' is explained via the channel optimization framing.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Three explicit contributions are listed: (1) information-theoretic characterization of doc-to-repo synthesis, (2) the DeepCode framework with four orchestrated information operations, (3) SOTA performance on PaperBench surpassing human experts.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 5 engages substantively with two categories of related work (general coding agents and scientific coding agents), explaining how DeepCode differs from ChatDev, MetaGPT, SWE-agent, PaperCoder, CodeScientist, AlphaEvolve, and AI Scientist.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "GitHub repository URL is provided at the top of the paper: 'Source Code: https://github.com/HKUDS/DeepCode'.", 125 "source": "opus" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The evaluation uses PaperBench Code-Dev, a publicly available benchmark from OpenAI (cited as [7]).", 131 "source": "opus" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper mentions 'Ubuntu 22.04 LTS-based sandboxed environment' with 'standard Python development stack and essential dependencies' but provides no requirements.txt, Dockerfile, or detailed library versions. Not enough to recreate the environment.", 137 "source": "opus" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link exists but the paper itself contains no commands, scripts, or specific instructions for reproducing the experiments.", 143 "source": "opus" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": true, 150 "justification": "Table 2 reports ± values (e.g., '73.6 ± 5.3') and Table 3 reports standard errors for each paper across 3 runs.", 151 "source": "opus" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used. Claims like 'decisively outperforming' are based on comparing point estimates without any formal tests (no p-values, t-tests, bootstrap tests, etc.).", 157 "source": "opus" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "The paper reports relative improvements with baseline context, e.g., '70% relative improvement over the best LLM agent baseline' and absolute score differences with baseline values (e.g., 73.5 vs 43.3).", 163 "source": "opus" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The benchmark has 20 papers total, with key comparisons on 3-paper and 5-paper subsets. No justification for these small sample sizes and no power analysis is provided.", 169 "source": "opus" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": true, 174 "justification": "Table 3 reports standard errors across 3 runs per paper. Table 2 reports ± values for aggregate scores.", 175 "source": "opus" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Extensive baselines across four categories: LLM agents (6 models × 2 scaffolds), scientific code agents (PaperCoder), commercial agents (Cursor, Claude Code, Codex), and human experts.", 183 "source": "opus" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include very recent systems: Cursor v1.7.52, Claude Code v2.0.22, Codex with GPT-5, Claude 4.5 Sonnet, and Gemini-2.5-Pro. These are contemporary and competitive.", 189 "source": "opus" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 4.4 presents ablation studies for three core components: CodeRAG, CodeMem, and Automated Verification, each showing contribution to performance.", 195 "source": "opus" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": false, 200 "justification": "Only a single metric is used: Replication Score. While it is a composite metric (aggregated from hierarchical rubric), no other metrics like execution success rate, time-to-completion, or code quality metrics are reported.", 201 "source": "opus" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "No human evaluation of DeepCode's outputs is performed. The human experts in PaperBench serve as a baseline comparison, not as evaluators. All evaluation is automated via SimpleJudge (o3-mini).", 207 "source": "opus" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": false, 212 "justification": "All 20 PaperBench papers are used for evaluation. There is no indication of a separate development set for tuning the framework's design decisions. Ablations and final results are reported on overlapping subsets.", 213 "source": "opus" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Table 1 provides per-paper scores for the 5-paper subset. Table 3 provides per-paper, per-run scores for all 20 papers.", 219 "source": "opus" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "No failure analysis is provided. The paper does not discuss where DeepCode fails, which papers it struggles with, or what types of errors persist after verification. Every result is presented positively.", 225 "source": "opus" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": false, 230 "justification": "No negative results are reported. All ablations show the components help. All comparisons show DeepCode winning. No mention of approaches tried and abandoned or configurations that failed.", 231 "source": "opus" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "Models are referenced by marketing names: 'Claude Sonnet 4.5-thinking', 'GPT-5 Codex-high', 'Claude-4.5-Sonnet', 'Gemini-2.5-Pro', 'DeepSeek-R1'. No API versions, snapshot dates, or specific model identifiers are provided.", 239 "source": "opus" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "No actual prompt text is provided. Table 4 and Appendix A.3 describe sub-agent responsibilities in natural language (e.g., 'Conducts semantic parsing of natural language inputs to extract functional requirements') but the actual prompts sent to the LLMs are not included.", 245 "source": "opus" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for any of the models or agents used. No learning rates or sampling settings mentioned.", 251 "source": "opus" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "The paper's main contribution is the scaffolding architecture. Sections 3.1-3.3 describe the three-phase pipeline in detail: Blueprint Generation (content segmentation, multi-agent analysis), Code Generation (CodeMem, CodeRAG), and Automated Verification (static analysis, sandbox execution). Table 4 details all sub-agents. Table 5 describes the MCP tool stack.", 257 "source": "opus" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 3.1.1 describes hierarchical content segmentation (structural parsing, keyword-chunk association). Section 4.1 describes the source code blacklist, input formats (PDF and Markdown), and the grading pipeline.", 263 "source": "opus" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Per-paper per-run scores are in Table 3, but the generated repositories, SimpleJudge outputs, and detailed rubric scores are not released. No way to independently verify the scoring.", 271 "source": "opus" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "The evaluation procedure is well-documented in Section 4.1: PaperBench benchmark, three independent trials per paper, SimpleJudge grading protocol, hierarchical rubric aggregation.", 277 "source": "opus" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study. The human baseline data comes from PaperBench [7], and the evaluation benchmark is a standard public benchmark.", 283 "source": "opus" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Section 4.1 describes the pipeline: paper input → DeepCode autonomous workflow → SimpleJudge evaluation → leaf node scoring → weighted aggregation → final Replication Score. Three trials averaged.", 289 "source": "opus" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training data cutoff dates are stated for any of the models used (Claude 4.5 Sonnet, GPT-5, Gemini-2.5-Pro, DeepSeek-R1). The models likely have training data including ICML 2024 papers (which PaperBench uses).", 297 "source": "opus" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of whether models' training data includes the PaperBench papers or related content. A source code blacklist prevents accessing original repos during execution, but training data contamination is not addressed.", 303 "source": "opus" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "PaperBench uses ICML 2024 papers which were published before the training cutoffs of the models used. The paper enforces a 'source code blacklist' at runtime but does not discuss whether models have already seen the papers or solutions during training.", 309 "source": "opus" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study. The human baseline data comes from PaperBench [7].", 317 "source": "opus" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "opus" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "opus" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "opus" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in this study.", 341 "source": "opus" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "opus" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in this study.", 353 "source": "opus" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Table 3 reports average cost per paper, ranging from $7.06 (FTRL) to $11.90 (BBOX), providing concrete dollar costs for each of the 20 benchmark tasks.", 361 "source": "opus" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Per-paper API costs are in Table 3 but no total compute budget is stated. GPU requirements are not quantified for DeepCode. Baseline agents had A10 GPUs and 12-hour time limits; DeepCode's time budget is not specified, making fair comparison impossible.", 367 "source": "opus" 368 } 369 }, 370 "experimental_rigor": { 371 "seed_sensitivity_reported": { 372 "applies": true, 373 "answer": true, 374 "justification": "Table 3 reports three independent runs per paper with mean and standard error, showing sensitivity to stochastic variation (e.g., RICE ranges from 0.609 to 0.761 across runs).", 375 "source": "opus" 376 }, 377 "number_of_runs_stated": { 378 "applies": true, 379 "answer": true, 380 "justification": "Section 4.1 explicitly states: 'For each target paper, three independent replication trials are performed, and each resulting repository is scored separately.'", 381 "source": "opus" 382 }, 383 "hyperparameter_search_budget": { 384 "applies": true, 385 "answer": false, 386 "justification": "No hyperparameter search budget is reported. The framework has many design parameters (agent configurations, prompt designs, memory structures) but no discussion of how these were tuned or how many configurations were tried.", 387 "source": "opus" 388 }, 389 "best_config_selection_justified": { 390 "applies": true, 391 "answer": false, 392 "justification": "No discussion of how the final configuration was selected. The paper presents one configuration without explaining what alternatives were tried or how design decisions were validated.", 393 "source": "opus" 394 }, 395 "multiple_comparison_correction": { 396 "applies": false, 397 "answer": false, 398 "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable.", 399 "source": "opus" 400 }, 401 "self_comparison_bias_addressed": { 402 "applies": true, 403 "answer": false, 404 "justification": "No acknowledgment of author-evaluation bias. The authors designed, implemented, and evaluated DeepCode. They compare against commercial tools not designed for this specific task. Lucic et al.'s finding about author implementations of baselines systematically underperforming is not discussed.", 405 "source": "opus" 406 }, 407 "compute_budget_vs_performance": { 408 "applies": true, 409 "answer": false, 410 "justification": "Baseline agents had strict 12-hour time limits (36 hours for select o1 runs). DeepCode's time budget is never stated. Per-paper API costs are reported but not compared against baseline compute costs. If DeepCode uses significantly more compute, the comparison is unfair.", 411 "source": "opus" 412 }, 413 "benchmark_construct_validity": { 414 "applies": true, 415 "answer": false, 416 "justification": "No discussion of whether PaperBench's Replication Score (static code analysis by SimpleJudge) actually measures scientific reproduction capability. The paper uses the benchmark at face value without questioning whether code structure matching equates to faithful replication.", 417 "source": "opus" 418 }, 419 "scaffold_confound_addressed": { 420 "applies": true, 421 "answer": true, 422 "justification": "The paper explicitly notes: 'DeepCode uses the same base model as both Cursor and Claude Code' (Claude Sonnet 4.5-thinking), isolating the scaffold/architecture effect from the model effect in the commercial agent comparison.", 423 "source": "opus" 424 } 425 }, 426 "data_leakage": { 427 "temporal_leakage_addressed": { 428 "applies": true, 429 "answer": false, 430 "justification": "No discussion of temporal leakage. PaperBench uses ICML 2024 papers. Models trained after 2024 have likely seen these papers and potentially related implementations in their training data.", 431 "source": "opus" 432 }, 433 "feature_leakage_addressed": { 434 "applies": true, 435 "answer": false, 436 "justification": "A source code blacklist prevents accessing original repositories during execution, which is a partial mitigation. However, there is no systematic discussion of whether the models' parametric knowledge contains implementation details from having seen the papers during training.", 437 "source": "opus" 438 }, 439 "non_independence_addressed": { 440 "applies": true, 441 "answer": false, 442 "justification": "No discussion of independence between model training data and PaperBench test papers. The 20 ICML 2024 papers are likely in the training data of the frontier models used.", 443 "source": "opus" 444 }, 445 "leakage_detection_method": { 446 "applies": true, 447 "answer": false, 448 "justification": "The source code blacklist is a prevention method (blocking repo access at runtime) but not a detection method. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used to detect training data contamination.", 449 "source": "opus" 450 } 451 } 452 } 453 }, 454 "claims": [ 455 { 456 "claim": "DeepCode achieves 73.5% replication score on PaperBench, a 70% relative improvement over the best LLM agent baseline (o1 IterativeAgent at 43.3%)", 457 "evidence": "Table 2 and Figure 4 show DeepCode at 73.6±5.3 vs o1 (IterativeAgent) at 43.3±1.1 on the full 20-paper benchmark", 458 "supported": "strong" 459 }, 460 { 461 "claim": "DeepCode surpasses human PhD expert performance on scientific paper reproduction", 462 "evidence": "On a 3-paper subset, DeepCode averages 75.9±4.5 vs human Best@3 score of 72.4; the 3.5pp gap falls within the ±4.5 standard error, making this claim statistically unsupported", 463 "supported": "weak" 464 }, 465 { 466 "claim": "DeepCode decisively outperforms commercial agents (Cursor, Claude Code, Codex) despite using the same base model", 467 "evidence": "Table 1 (5-paper subset): DeepCode 0.848 vs Cursor 0.584, Claude Code 0.587, Codex 0.400; all three use the same Claude Sonnet 4.5-thinking or GPT-5 backbone", 468 "supported": "moderate" 469 }, 470 { 471 "claim": "Principled information-flow management provides larger gains than scaling model size", 472 "evidence": "Figure 5 shows DeepCode's ranking across 5 backbones is stable but lacks a controlled experiment comparing the same model with vs. without DeepCode vs. larger models with simpler scaffolding", 473 "supported": "weak" 474 }, 475 { 476 "claim": "CodeMem prevents context saturation, raising scores from 0.33–0.43 (naive sliding window) to 0.70–0.92", 477 "evidence": "Figure 6b ablation on 5 PaperBench tasks comparing CodeMem vs. sliding-window context management using Claude 4.5 Sonnet", 478 "supported": "strong" 479 }, 480 { 481 "claim": "CodeRAG delivers up to 71% relative performance gain for cost-efficient models like Gemini-2.5-Flash", 482 "evidence": "Figure 6a shows CodeRAG improvements of +68.8%, +41.0%, +71.3% on 3 papers for Gemini-2.5-Flash; effects are negligible for frontier models", 483 "supported": "strong" 484 } 485 ], 486 "methodology_tags": [ 487 "benchmark-eval" 488 ], 489 "key_findings": "DeepCode is a multi-phase agentic framework for converting scientific papers to executable code repositories using four information operations: blueprint distillation (Phase 1), stateful code memory (CodeMem) plus retrieval-augmented generation (CodeRAG) (Phase 2), and closed-loop verification (Phase 3). On PaperBench (20 ICML 2024 papers), it achieves 73.5% replication score, substantially outperforming the best LLM agent (43.3%) and specialized competitor PaperCoder (51.1%). On a 3-paper subset, it scores 75.9±4.5 vs human expert best@3 at 72.4, though this difference is within statistical uncertainty. Ablations show CodeMem is the most critical component for cross-file consistency (restoring scores from 0.33–0.43 to 0.70–0.92), while CodeRAG primarily benefits smaller models that lack sufficient parametric knowledge.", 490 "red_flags": [ 491 { 492 "flag": "Human comparison statistically unsupported", 493 "detail": "DeepCode scores 75.9±4.5 vs human best@3 at 72.4 on a 3-paper subset. The 3.5pp gap is within the ±4.5 standard error; the headline 'surpassing human experts' claim is not statistically justified and the subset is too small for generalization." 494 }, 495 { 496 "flag": "Benchmark contamination unaddressed", 497 "detail": "PaperBench uses ICML 2024 papers publicly available before Claude 4.5 and GPT-5 training cutoffs. Models may have memorized paper content, code patterns, or related implementations. Training cutoffs are not stated and contamination risk is not discussed anywhere." 498 }, 499 { 500 "flag": "Commercial comparison on only 5 of 20 papers", 501 "detail": "The headline comparison against Cursor, Claude Code, and Codex is conducted on only 5 papers (not the full 20-paper benchmark), and the paper does not explain the selection criteria for this subset." 502 }, 503 { 504 "flag": "SimpleJudge (o3-mini) as unvalidated evaluator", 505 "detail": "All results are derived from o3-mini-powered SimpleJudge measuring static code properties. No validation is provided that SimpleJudge scores correlate with actual experiment reproduction success, and using an OpenAI model to evaluate may introduce bias favoring GPT-family systems." 506 }, 507 { 508 "flag": "Inconsistent numerical results", 509 "detail": "The abstract and Figure 1 report 75.9% for DeepCode's 3-paper human comparison, but Figure 4 and Table 2 show 76.7% for the same condition. The full benchmark score is 73.5 in the abstract/Section 4.2 but 73.6 in Table 2; these discrepancies suggest the results were updated without full consistency." 510 }, 511 { 512 "flag": "No statistical significance testing", 513 "detail": "Despite making 'decisive outperformance' claims across multiple baselines, no statistical significance tests are performed; standard errors across papers do not substitute for formal hypothesis testing in comparative claims." 514 } 515 ], 516 "cited_papers": [ 517 { 518 "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", 519 "relevance": "Primary benchmark providing the evaluation framework, grading rubric, LLM agent baselines, and human expert reference scores used throughout the paper" 520 }, 521 { 522 "title": "Paper2Code: Automating Code Generation from Scientific Papers in Machine Learning", 523 "relevance": "Direct scientific code agent competitor (PaperCoder); DeepCode claims a 22pp improvement over this system on the same benchmark" 524 }, 525 { 526 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 527 "relevance": "Related agentic coding framework introducing ACI concepts; represents the tool-augmented agent paradigm that DeepCode builds upon" 528 }, 529 { 530 "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery", 531 "relevance": "Related work on automated scientific research agents; DeepCode targets a complementary task (implementation of specified papers rather than autonomous ideation)" 532 }, 533 { 534 "title": "ChatDev: Communicative Agents for Software Development", 535 "relevance": "Multi-agent coding framework representing the team-simulation paradigm; one of the key prior works in agentic software engineering" 536 }, 537 { 538 "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework", 539 "relevance": "Multi-agent framework for software development; key prior work whose multi-agent design philosophy is compared against DeepCode's information-flow approach" 540 }, 541 { 542 "title": "CodeScientist: End-to-End Semi-Automated Scientific Discovery with Code-Based Experimentation", 543 "relevance": "Related scientific coding agent using iterative generate-execute-reflect cycle; represents an alternative approach to automated scientific code generation" 544 }, 545 { 546 "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery", 547 "relevance": "Recent Google DeepMind work using LLMs for evolutionary code generation; represents the frontier of automated code generation for scientific purposes" 548 } 549 ], 550 "engagement_factors": { 551 "practical_relevance": { 552 "score": 1, 553 "justification": "Paper-to-code reproduction is a niche use case; most developers won't apply this framework in their daily workflow despite available source code." 554 }, 555 "surprise_contrarian": { 556 "score": 1, 557 "justification": "The human-surpassing claim grabs attention but is on only 3 papers within one standard error, making it more hype than genuine surprise." 558 }, 559 "fear_safety": { 560 "score": 0, 561 "justification": "No safety, security, or risk angle whatsoever." 562 }, 563 "drama_conflict": { 564 "score": 2, 565 "justification": "Directly claims to 'decisively outperform' Cursor, Claude Code, and Codex while red flags reveal inconsistent numbers, tiny subsets, and unfair comparison methodologies." 566 }, 567 "demo_ability": { 568 "score": 1, 569 "justification": "GitHub repo exists but reproducing results requires PaperBench setup, sandboxed environments, and expensive frontier model API keys." 570 }, 571 "brand_recognition": { 572 "score": 1, 573 "justification": "University of Hong Kong is recognized but not a famous AI lab; comparisons against Cursor/Claude Code/Codex add indirect name recognition." 574 } 575 }, 576 "hn_data": { 577 "threads": [ 578 { 579 "hn_id": "42401619", 580 "title": "Forking Paths in Neural Text Generation", 581 "points": 2, 582 "comments": 0, 583 "url": "https://news.ycombinator.com/item?id=42401619", 584 "created_at": "2024-12-12T18:06:05Z" 585 }, 586 { 587 "hn_id": "34029704", 588 "title": "Roscoe: A Suite of Metrics for Scoring Step-by-Step Reasoning (Meta)", 589 "points": 1, 590 "comments": 0, 591 "url": "https://news.ycombinator.com/item?id=34029704", 592 "created_at": "2022-12-17T17:06:48Z" 593 } 594 ], 595 "top_points": 2, 596 "total_points": 3, 597 "total_comments": 0 598 } 599 }