scan.json (25651B)
1 { 2 "paper": { 3 "title": "AutoP2C: An LLM-Based Agent Framework for Code Repository Generation from Multimodal Content in Academic Papers", 4 "authors": [ 5 "Zijie Lin", 6 "Yiqing Shen", 7 "Qilin Cai", 8 "He Sun", 9 "Jinrui Zhou", 10 "Mingjun Xiao" 11 ], 12 "year": 2025, 13 "venue": "Preprint, Under Review", 14 "arxiv_id": "2504.20115" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The abstract states 'The code is available at https://github.com/shoushouyu/Automated-Paper-to-Code' — a working URL is provided." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "The Paper2Repo benchmark of eight papers is described but no dataset download or structured benchmark release is provided. The papers are identified by arXiv IDs, but there is no released benchmark artifact (e.g., a dataset repository or structured test suite). PaperBench Code-Dev is a third-party benchmark." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "Section 4.3 mentions 'Intel(R) Xeon(R) Platinum 8352V CPU and an NVIDIA A40 (48GB) GPU' but no requirements.txt, Dockerfile, or dependency specifications are mentioned in the paper. There is no listing of library versions or environment setup instructions." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README with commands, scripts, or 'Reproducing Results' section is described in the paper itself." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "Table 1 (Paper2Repo benchmark) reports only point estimates with no confidence intervals or error bars. 
Table 2 reports standard error for some systems (e.g., '49.2 ± 14.0' for AutoP2C on PaperBench), but this covers only one of the two benchmarks, and the ± 14.0 is extremely large relative to the claimed improvement." 44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper claims AutoP2C outperforms baselines but no statistical significance tests are used. Comparisons are based solely on point estimates in Tables 1 and 2." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "The paper provides baseline context for its improvements: e.g., '91.4% accuracy compared to 81.8% for o1 (9.6% improvement) and 75.3% for R1 (16.1% improvement)' in Section 4.4.1, and relative performance ratios (89.8% to 122.0% of original). COMPclass and COMPfunc are reported with multipliers (1.9x, 2.1x, etc.)." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The Paper2Repo benchmark contains only 8 papers with no justification for why 8 papers are sufficient. No power analysis or discussion of sample adequacy is provided." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "Table 1 results are single-run with no variance or standard deviation reported. Table 2 reports standard error for PaperBench (49.2 ± 14.0) but this is only for one benchmark. No mention of multiple runs or seeds for the main Paper2Repo results." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares against OpenAI o1 and DeepSeek-R1 on Paper2Repo (Table 1), and against BasicAgent, IterativeAgent, and PaperCoder on PaperBench Code-Dev (Table 2)." 
71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "OpenAI o1, DeepSeek-R1, and PaperCoder are all from 2024-2025, which are contemporary and competitive baselines for this task." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Tables 3 and 4 present ablation studies. Table 3 removes individual components (blueprint extraction, multimodal parsing, hierarchical decomposition, iterative feedback) and measures impact. Table 4 ablates modalities (image, table)." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper uses four metrics: absolute performance (accuracy), relative performance (ratio to original), COMPclass (class completeness), and COMPfunc (function completeness), as described in Section 4.1 and Table 1." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "No human evaluation is included. COMPclass and COMPfunc use LLM-as-a-Judge scoring (Section 4.1). The system generates code repositories whose quality could benefit from human assessment, but evaluation is entirely automated." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": false, 95 "justification": "The Paper2Repo benchmark of 8 papers was constructed by the authors. There is no separation of dev/test — all 8 papers appear to be used for both development and evaluation. The PaperBench Code-Dev is a separate benchmark but the paper does not discuss whether any tuning was done on it." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Table 1 provides per-paper breakdowns across all 8 papers for all four metrics. The ablation study (Tables 3-4) also breaks down by individual papers." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": false, 105 "justification": "The paper does not discuss failure cases or error analysis for AutoP2C. 
It notes baselines fail (marked with red crosses), but does not analyze where or why AutoP2C's generated code falls short (e.g., the cases where COMPfunc is low like 18.5% or 31.4%)." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": false, 110 "justification": "Every experiment shows AutoP2C outperforming baselines. No approaches that were tried and abandoned or configurations that failed are discussed. The ablation results show degradation when components are removed, but no negative results about the full system." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims AutoP2C 'can successfully generate executable code repositories for all eight papers, while OpenAI-o1 or DeepSeek-R1 can only produce runnable code for one paper.' This is supported by Table 1, which shows the baselines fail on 7/8 papers." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The ablation study (Tables 3-4) makes causal claims about component contributions (e.g., 'removal of any component degrades performance'). The controlled single-variable ablation design, removing one component at a time, is adequate for these causal claims." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The title claims 'Code Repository Generation from Multimodal Content in Academic Papers' broadly, but the evaluation is limited to 8 ML papers in specific subdomains (CV, NLP, graph). No discussion bounds the generalization to other fields, programming languages, or paper types. Section 5 briefly mentions extending to other languages as future work but does not bound current claims." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for its results. 
For example, the baselines (o1 and R1) are single-model systems while AutoP2C is a multi-agent pipeline with multiple LLM calls — the improvement could partly be due to the additional compute rather than the framework design. No such confounds are discussed." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "Section 4.3 mentions 'GPT-4o', 'o1-mini', 'o1', and 'o3-mini' without snapshot dates or API versions. These are marketing names that do not uniquely identify a model version." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper describes prompts in natural language (e.g., 'guiding prompt consisting of three key components' in Section 3.4) but does not provide the actual prompt text used in experiments. No appendix with full prompts is included." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Section 4.3 states 'we standardize the decoding temperature to 0 for all temperature-configurable LLMs.' The hardware is specified (Intel Xeon Platinum 8352V CPU, NVIDIA A40 48GB GPU). Token counts are reported in Table 1." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The four-stage pipeline is described in detail in Sections 3.2-3.6: repository blueprint extraction, multimodal content parsing, hierarchical task decomposition, and iterative feedback-driven implementation. The agent roles (coding agent, verification agent), workflow, retry/debugging logic, and inter-component dependencies are well-documented." 
155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 3.4 documents the multimodal content parsing pipeline: OCR with MinerU for PDF-to-markdown conversion, VLM for visual elements, LLM for equations and tables, followed by integration and filtering steps. The paper selection criteria for the benchmark are described in Section 4.2." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": false, 166 "justification": "There is no dedicated Limitations or Threats to Validity section. The Conclusion (Section 5) contains a single sentence about future work ('explore the extension of AutoP2C to support additional programming languages beyond Python') but no substantive limitations discussion." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "No threats to validity are discussed. There is no analysis of potential biases in the benchmark selection, limitations of LLM-as-a-Judge evaluation, or risks of the small sample size." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper does not explicitly state what the results do NOT show. The future work sentence in Section 5 implicitly acknowledges Python-only scope, but no explicit scope boundaries are stated regarding the types of papers, domains, or complexity levels the framework can or cannot handle." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "The generated code repositories, LLM judge scores, and raw experimental outputs are not available for independent verification. Only aggregated metrics in Tables 1-4 are reported." 
184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 4.2 describes how the Paper2Repo benchmark was constructed: papers selected from paperswithcode.com with criteria including publication date (after 2024), availability of GitHub repositories, spanning six ML tasks and three modal types." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants are involved in this study. The benchmark consists of academic papers and automated evaluation." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": false, 198 "justification": "While the system pipeline is documented, the evaluation data pipeline is not fully transparent. It is unclear how COMPclass and COMPfunc scores were computed in practice — which LLM was used as judge, what prompts were given, and whether there was any filtering or aggregation beyond what is described." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding source or acknowledgments section is present in the paper." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: University of Science and Technology of China and Johns Hopkins University. The evaluated systems (OpenAI o1, DeepSeek-R1) are from different organizations than the authors." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is itself a concern." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement is present in the paper." 
221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper uses GPT-4o, o1-mini, o1, and o3-mini but does not state their training data cutoff dates. The benchmark papers were published in 2024, and it is unclear whether any of these papers or their code repositories appeared in the training data of the models used." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "No discussion of whether the benchmark papers or their existing code repositories could have appeared in the training data of the models used. This is a significant concern since the models might have seen the original code repositories." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "The Paper2Repo benchmark papers have GitHub repositories that were publicly available before the models' likely training cutoffs. The paper does not address whether the models may have already seen these repositories during training, which could inflate structural completeness scores." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants are involved in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are involved in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved in this study." 
265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Table 1 reports input and output token counts for each paper processed by AutoP2C (e.g., 852K input / 120K output for the first paper). This provides a proxy for API cost, though no dollar amounts are given." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "The hardware is specified (Section 4.3: Intel Xeon Platinum 8352V, NVIDIA A40 48GB) and token counts are reported, but no total API spend, wall-clock time per experiment, or total computational budget is stated." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "AutoP2C can successfully generate executable code repositories for all eight papers, while OpenAI-o1 or DeepSeek-R1 can only produce runnable code for one paper.", 293 "evidence": "Table 1 shows AutoP2C achieves non-zero performance across all 8 papers while o1 and R1 fail on 7 papers (marked with red crosses). Section 4.4.1.", 294 "supported": "strong" 295 }, 296 { 297 "claim": "AutoP2C achieves an average relative performance of 99.5% compared to the original implementations across all eight papers.", 298 "evidence": "Table 1 shows relative performance ranging from 89.8% to 122.0%. Section 4.4.1 states the average is 99.5%.", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "AutoP2C achieves 49.2% replication score on PaperBench Code-Dev, surpassing PaperCoder (44.2%).", 303 "evidence": "Table 2 reports AutoP2C at 49.2 ± 14.0% vs PaperCoder at 44.2%. 
Section 4.4.3.", 304 "supported": "weak" 305 }, 306 { 307 "claim": "Each component of AutoP2C contributes to the overall performance; removing any degrades results.", 308 "evidence": "Table 3 shows ablation results across four papers for all four components. Removing blueprint extraction drops average performance from 83.5% to 51.4%. Section 4.5.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "AutoP2C achieves 1.9x and 2.1x improvement in class completeness over o1 and R1 respectively.", 313 "evidence": "Section 4.4.2 reports average COMPclass of 65.7% for AutoP2C vs 34.9% for o1 and 31.1% for R1, from Table 1.", 314 "supported": "moderate" 315 } 316 ], 317 "methodology_tags": [ 318 "benchmark-eval" 319 ], 320 "key_findings": "AutoP2C is a four-stage multi-agent framework for generating executable code repositories from ML research papers, processing multimodal content (text, diagrams, tables). On the authors' Paper2Repo benchmark of 8 papers, AutoP2C generates executable code for all 8 papers, whereas OpenAI o1 and DeepSeek-R1 succeed on only 1; AutoP2C's generated repositories achieve an average 99.5% relative performance to original implementations. On PaperBench Code-Dev, AutoP2C achieves 49.2% replication score, slightly above PaperCoder's 44.2%. Ablation studies confirm each pipeline stage and modality contributes to performance.", 321 "red_flags": [ 322 { 323 "flag": "Tiny benchmark size", 324 "detail": "The Paper2Repo benchmark contains only 8 papers, which is extremely small for making general claims about repository generation from academic papers. No justification for why 8 papers are sufficient is provided." 325 }, 326 { 327 "flag": "No uncertainty quantification on main results", 328 "detail": "Table 1 (the main evaluation) reports single-run point estimates with no error bars, confidence intervals, or variance across runs. Temperature is set to 0, but LLM outputs can still vary. 
The PaperBench result (49.2 ± 14.0) has a standard error so large that the one-standard-error interval (35.2 to 63.2) contains PaperCoder's 44.2." 329 }, 330 { 331 "flag": "Contamination risk", 332 "detail": "The 8 benchmark papers all have public GitHub repositories. The LLMs used (GPT-4o, o1-mini, o1, o3-mini) may have seen these repositories during training. The paper does not address this contamination risk, which could inflate both functional and structural completeness scores." 333 }, 334 { 335 "flag": "LLM-as-a-Judge without validation", 336 "detail": "COMPclass and COMPfunc are LLM-scored metrics (Section 4.1) but no human validation of these scores is provided. The judge LLM, its prompts, and any calibration are not disclosed, making these metrics difficult to reproduce or verify." 337 }, 338 { 339 "flag": "Unfair baseline comparison", 340 "detail": "AutoP2C uses multiple LLM calls across four stages (GPT-4o, o1-mini, o1, o3-mini) while baselines are single-model systems. The comparison conflates framework design benefits with additional compute expenditure. Token counts show AutoP2C uses 300K-1.2M input tokens per paper." 341 }, 342 { 343 "flag": "No limitations section", 344 "detail": "The paper has no limitations or threats-to-validity section. This is concerning given the small benchmark, LLM-scored metrics, potential contamination, and narrow domain coverage." 345 }, 346 { 347 "flag": "Missing prompt details", 348 "detail": "The actual prompts used for each stage are not provided, only natural language descriptions of what they do. This makes the framework impossible to fully reproduce even with the code release." 349 } 350 ], 351 "cited_papers": [ 352 { 353 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 354 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 355 "year": 2024, 356 "relevance": "Key multi-agent framework for code generation, directly relevant as a baseline approach for agentic software development." 
357 }, 358 { 359 "title": "Autogen: Enabling next-gen llm applications via multi-agent conversation", 360 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 361 "year": 2023, 362 "arxiv_id": "2308.08155", 363 "relevance": "Foundational multi-agent framework for LLM applications, relevant to the survey's coverage of agentic AI systems." 364 }, 365 { 366 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 367 "authors": ["John Yang", "Carlos Jimenez", "Alexander Wettig"], 368 "year": 2024, 369 "relevance": "Major agentic coding framework that uses agent-computer interfaces, directly relevant to automated software engineering evaluation." 370 }, 371 { 372 "title": "CodeAgent: Enhancing code generation with tool-integrated agent systems for real-world repo-level coding challenges", 373 "authors": ["Kechi Zhang", "Jia Li", "Ge Li"], 374 "year": 2024, 375 "arxiv_id": "2401.07339", 376 "relevance": "First framework specifically designed for repository-level code generation using agents with tools." 377 }, 378 { 379 "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation", 380 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 381 "year": 2023, 382 "relevance": "Rigorous evaluation methodology for LLM code generation, relevant to benchmarking methodology in the survey." 383 }, 384 { 385 "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", 386 "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn"], 387 "year": 2025, 388 "arxiv_id": "2504.01848", 389 "relevance": "Benchmark for evaluating AI's ability to replicate research, directly relevant to agentic AI capability evaluation." 
390 }, 391 { 392 "title": "Paper2Code: Automating Code Generation from Scientific Papers in Machine Learning", 393 "authors": ["Minju Seo", "Jinheon Baek", "Seongyun Lee", "Sung Ju Hwang"], 394 "year": 2025, 395 "arxiv_id": "2504.17192", 396 "relevance": "Directly competing multi-agent framework for the same paper-to-code task, serving as a baseline comparison." 397 }, 398 { 399 "title": "Judging llm-as-a-judge with mt-bench and chatbot arena", 400 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 401 "year": 2023, 402 "relevance": "Foundational work on LLM-as-a-Judge evaluation paradigm, used as basis for the COMPclass and COMPfunc metrics." 403 }, 404 { 405 "title": "Agentcoder: Multi-agent-based code generation with iterative testing and optimisation", 406 "authors": ["Dong Huang", "Jie M Zhang", "Michael Luck"], 407 "year": 2023, 408 "arxiv_id": "2312.13010", 409 "relevance": "Multi-agent code generation with iterative testing, relevant to the survey's coverage of agentic coding approaches." 410 }, 411 { 412 "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning", 413 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 414 "year": 2025, 415 "arxiv_id": "2501.12948", 416 "relevance": "Major reasoning LLM used as a baseline, relevant to LLM capability evaluation in the survey." 417 }, 418 { 419 "title": "CodeRefine: A Pipeline for Enhancing LLM-Generated Code Implementations of Research Papers", 420 "authors": ["Ekaterina Trofimova", "Emil Sataev", "Abhijit Singh Jowhari"], 421 "year": 2024, 422 "arxiv_id": "2408.13366", 423 "relevance": "Pipeline for improving LLM code generation from papers, directly related to the paper-to-code task." 
424 }, 425 { 426 "title": "Magis: Llm-based multi-agent framework for github issue resolution", 427 "authors": ["Wei Tao", "Yucheng Zhou", "Yanlin Wang"], 428 "year": 2024, 429 "relevance": "Multi-agent framework for GitHub issue resolution, relevant to agentic software engineering in the survey." 430 } 431 ] 432 }