scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18680B)
      1 {
      2   "paper": {
      3     "title": "Context-Augmented Code Generation Using Programming Knowledge Graphs",
      4     "authors": ["Iman Saberi", "Fatemeh Fard"],
      5     "year": 2025,
      6     "venue": "ICML 2025",
      7     "doi": null
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper. No mention of code availability."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets: PythonAlpaca (Hugging Face link provided) and the Tutorials dataset from Wang et al. (2024). HumanEval and MBPP are public benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 'a single A100 GPU' and Neo4j version 5.20.0, but no requirements.txt, Dockerfile, or comprehensive dependency list is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No README or step-by-step reproduction instructions are provided. The methodology is described but not as executable reproduction steps."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., '40%' pass@1) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims improvements (e.g., 'up to 20%') by comparing raw numbers across methods. No statistical significance tests are reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Tables 1-4 report absolute pass@1 percentages for all methods and baselines, providing baseline context (e.g., CodeLlama-7B goes from 33% NoRAG to 40% Block-PKG on HumanEval)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "HumanEval has 164 problems and MBPP has ~400. No justification for whether these sample sizes are sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Greedy decoding (temperature=0) is used, so results are deterministic per run. However, no variance across different random seeds or configurations is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are compared: NoRAG, BM25, VoyageEmb, Func-BM25, Func-PKG, Block-PKG, plus an 'Ideal Reranker' upper bound (Tables 1-4)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include Voyage-Code-2 (2024) and BM25, and the paper also compares against CodeRAG-Bench (Wang et al., 2024). Models used are recent (DeepSeek-Coder, Llama3.1, StarCoder2)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Appendix A.1 presents an ablation study on tree pruning (Table 7), and A.2 evaluates the effectiveness of individual re-ranker steps (Tables 8-10)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only pass@1 is used as the evaluation metric. No other metrics (e.g., pass@10, CodeBLEU) are reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated code quality is included. Evaluation is entirely automated via pass@1 test execution."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "HumanEval and MBPP are standard held-out benchmarks with predefined test cases used for evaluation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "RQ4 provides per-topic breakdowns across 10 categories on MBPP (Figures 4, 5, 10). Error analysis by type is in Table 5."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "RQ5 discusses error types introduced by RAG (Table 5). Section A.4 discusses retrieval challenges including string manipulation failures. Case studies are presented in Section A.9."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that DeepSeek-Coder shows less improvement on HumanEval, that RAG can degrade correct solutions, and that PKG underperforms on string manipulation and data structures topics."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 'up to 20% pass@1 accuracy gains' and '34% improvement over baselines on MBPP' are supported by Tables 1-2 (e.g., CodeLlama-7B on MBPP: 38% NoRAG to 58% Reranked = 20pp gain)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about PKG improving performance are supported by controlled ablation studies (tree pruning ablation in Table 7, re-ranker step evaluation in Tables 8-10) with single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Code Generation' broadly, but experiments are limited to Python only on two benchmarks (HumanEval, MBPP) with 7B-13B models. No explicit bounding of generalization."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the results. For example, improvements could partly come from the re-ranker's test execution filtering rather than PKG retrieval quality."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are identified by name and size (e.g., 'CodeLlama-7B', 'DeepSeek-Coder-7B', 'Llama3.1-8B') but no specific version identifiers or snapshot dates are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt templates with actual code are provided in Appendix sections A.6.1, A.7.1, and A.8.1 for CodeLlama, StarCoder2, and DeepSeek-Coder respectively."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Temperature=0, max_new_tokens=512, and greedy decoding are specified in Section 4 (Evaluation Metric)."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The PKG retrieval pipeline, tree pruning, and re-ranking mechanism are described in detail in Section 2 with mathematical formulations and figures."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4 documents preprocessing: 143,000 Q&A pairs → 115,000 Python functions extracted → PKG with 425,058 nodes. Tutorial dataset processing is also described."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section. Section A.4 discusses retrieval challenges but not broader study limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The paper does not address potential confounds or methodological limitations."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do not show or what settings are excluded. The paper does not bound its claims to Python or specific model sizes."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (per-problem results, generated code outputs) is made available for verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4 describes data sources: PythonAlpaca with 143,000 Q&A pairs and Tutorials dataset with 76,600 entries. Extraction counts are provided."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard public datasets and benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from dataset selection through block extraction, graph extraction, encoding, and Neo4j import is documented in Section 2.1 with counts at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Authors are identified as being from University of British Columbia Okanagan, with email addresses provided."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates are stated for any of the models used (CodeLlama, DeepSeek-Coder, StarCoder2, Llama3.1)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether HumanEval or MBPP appeared in the training data of the models used. This is a significant concern given these are widely-used benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval (2021) and MBPP (2021) were published before all models' training cutoffs. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section 5.1 reports retrieval time (~3 seconds per query), token budget analysis (Table 11), and Table 6 provides time and storage costs for PKG construction."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Table 6 reports total processing times (301 min for PKG, 241 for VoyageAI, 44 for BM25), storage usage (12,530 MB for PKG), and experiments used a single A100 GPU."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "PKG-based retrieval improves pass@1 accuracy by up to 20% compared to NoRAG on HumanEval and MBPP.",
    286       "evidence": "Tables 1-2 show improvements across models. E.g., CodeLlama-7B on MBPP: 38% (NoRAG) to 58% (Reranked) = 20pp gain.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "PKG with re-ranking achieves up to 34% improvement over baselines on MBPP.",
    291       "evidence": "Table 2 shows StarCoder2-7B: BM25 at 25%, Reranked at 62%, a 37pp difference. CodeLlama-7B: VoyageEmb at 32%, Reranked at 58%, a 26pp difference.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Block-wise retrieval (finer granularity) outperforms function-wise retrieval across most models.",
    296       "evidence": "Tables 1-2 show Block-PKG consistently outperforms Func-PKG (e.g., StarCoder2-7B on HumanEval: 59% vs 56%).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Tree pruning improves retrieval quality.",
    301       "evidence": "Table 7 (ablation study) shows Block-PKG with pruning outperforms without pruning across all models (1-11pp gains).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "RAG can introduce new error types even while reducing others.",
    306       "evidence": "Table 5 shows assertion errors decrease but name errors and type errors can increase when using Block-PKG.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper proposes Programming Knowledge Graphs (PKG) for RAG-based code generation, evaluated on HumanEval and MBPP with five models (7B-13B). PKG with re-ranking improves pass@1 by up to 20pp over NoRAG. Finer-grained block-wise retrieval outperforms function-wise retrieval. However, RAG can degrade performance on certain problem categories (string manipulation, data structures) and introduce new error types while reducing others.",
    312   "red_flags": [
    313     {
    314       "flag": "No contamination analysis",
    315       "detail": "HumanEval and MBPP were published in 2021, well before the training data cutoffs of all models used. The paper does not discuss whether these benchmarks appeared in training data, which could inflate baseline and method results alike."
    316     },
    317     {
    318       "flag": "Single metric evaluation",
    319       "detail": "Only pass@1 with greedy decoding is reported. No pass@k for k>1, no code quality metrics, no functional correctness beyond test cases."
    320     },
    321     {
    322       "flag": "No statistical testing",
    323       "detail": "All comparisons are based on raw percentage differences with no significance tests, confidence intervals, or variance measures. With small benchmark sizes (164 HumanEval problems), observed differences could be due to chance."
    324     },
    325     {
    326       "flag": "No limitations section",
    327       "detail": "The paper lacks any formal limitations or threats-to-validity discussion, despite making broad claims about code generation improvement."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating large language models trained on code",
    333       "authors": ["Mark Chen", "Jerry Tworek"],
    334       "year": 2021,
    335       "arxiv_id": "2107.03374",
    336       "relevance": "Introduces HumanEval benchmark and Codex, foundational for LLM code generation evaluation."
    337     },
    338     {
    339       "title": "Program synthesis with large language models",
    340       "authors": ["Jacob Austin"],
    341       "year": 2021,
    342       "arxiv_id": "2108.07732",
    343       "relevance": "Introduces MBPP benchmark used for evaluating code generation capabilities."
    344     },
    345     {
    346       "title": "CodeRAG-Bench: Can retrieval augment code generation?",
    347       "authors": ["Zora Zhiruo Wang"],
    348       "year": 2024,
    349       "arxiv_id": "2406.14497",
    350       "relevance": "Directly related benchmark studying RAG for code generation, key baseline and motivation for this work."
    351     },
    352     {
    353       "title": "Code Llama: Open foundation models for code",
    354       "authors": ["Baptiste Roziere"],
    355       "year": 2023,
    356       "arxiv_id": "2308.12950",
    357       "relevance": "One of the primary code LLMs evaluated in code generation benchmarks."
    358     },
    359     {
    360       "title": "DeepSeek-Coder-V2: Breaking the barrier of closed-source models in code intelligence",
    361       "authors": ["Qihao Zhu"],
    362       "year": 2024,
    363       "arxiv_id": "2406.11931",
    364       "relevance": "Code LLM evaluated in the study, relevant to understanding model capabilities for code generation."
    365     },
    366     {
    367       "title": "StarCoder 2 and The Stack v2: The next generation",
    368       "authors": ["Anton Lozhkov"],
    369       "year": 2024,
    370       "arxiv_id": "2402.19173",
    371       "relevance": "Code LLM used as baseline model in code generation benchmarks."
    372     },
    373     {
    374       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    375       "authors": ["Dong Huang"],
    376       "year": 2023,
    377       "arxiv_id": "2312.13010",
    378       "relevance": "Multi-agent approach to code generation, related agentic coding methodology."
    379     },
    380     {
    381       "title": "Retrieval-augmented generation for large language models: A survey",
    382       "authors": ["Yunfan Gao"],
    383       "year": 2023,
    384       "arxiv_id": "2312.10997",
    385       "relevance": "Comprehensive RAG survey relevant to understanding retrieval-augmented approaches for LLMs."
    386     },
    387     {
    388       "title": "DocPrompting: Generating code by retrieving the docs",
    389       "authors": ["Shuyan Zhou"],
    390       "year": 2022,
    391       "relevance": "Prior work on documentation-based RAG for code generation, directly compared in this paper."
    392     },
    393     {
    394       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    395       "authors": ["Fengji Zhang"],
    396       "year": 2023,
    397       "arxiv_id": "2303.12570",
    398       "relevance": "Repository-level code completion using iterative RAG, related retrieval-based code generation approach."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs