scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25242B)
      1 {
      2   "paper": {
      3     "title": "CodeGRAG: Bridging the Gap between Natural Language and Programming Language via Graphical Retrieval Augmented Generation",
      4     "authors": [
      5       "Kounianhua Du",
      6       "Jizheng Chen",
      7       "Renting Rui",
      8       "Huacan Chai",
      9       "Lingyue Fu",
     10       "Wei Xia",
     11       "Yasheng Wang",
     12       "Ruiming Tang",
     13       "Yong Yu",
     14       "Weinan Zhang"
     15     ],
     16     "year": 2024,
     17     "venue": "NeurIPS 2024 (preprint, under review)",
     18     "arxiv_id": "2405.02355"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper provides a code repository at https://anonymous.4open.science/r/Code-5970/ (stated in abstract and Appendix A.1). While this is an anonymous review link, it is a working URL with code available."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses publicly available datasets: HumanEval-X (GitHub link provided), CodeForce (paperswithcode link), and APPS (arXiv link). These are standard public benchmarks that were not modified."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Appendix A.1 provides finetuning details including 8-bit quantization, LoRA rank 8, alpha 16, dropout 0.05, AdamW optimizer, learning rate 0.001, weight decay 1e-5. The anonymous code repository presumably contains environment specifications. These details are sufficient for reconstruction."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While code is released and some hyperparameters are listed in Appendix A.1, the paper does not provide step-by-step reproduction instructions. The NeurIPS checklist item 4 itself notes 'we do not provide the API needed to reproduce the results.' There is no README or reproduction guide described in the paper."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No confidence intervals or error bars are reported anywhere in the paper. All results in Tables 1-5 are point estimates only. The paper's own NeurIPS checklist (item 7) answers '[No]' for error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are used. The paper claims improvements (e.g., CodeGRAG vs baselines in Tables 1-2) based solely on comparing raw numbers without any significance testing."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "While raw Pass@1 scores are reported for baselines and the proposed method, no formal effect sizes (Cohen's d, etc.) are reported. The improvements are presented as raw number differences without contextualizing magnitude beyond the raw scores."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for the sample sizes used. The paper reports using 11,913 C++ and 2,359 Python code snippets for retrieval, and 10,609/8,691 data points for training, but does not justify why these sizes are adequate."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance or standard deviation is reported across runs. The paper uses greedy decoding (Section 3.1), which makes results deterministic for a given model checkpoint, but there is no reporting of variance across training seeds for the finetuned models."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares against base models without retrieval (N/A rows in Tables 1-2), code block retrieval methods (Nashid et al., 2023; Lu et al., 2022), and standard supervised finetuning (SFT) as baselines."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The code retrieval baselines (Nashid et al., 2023; Lu et al., 2022; Su et al., 2024) are reasonably contemporary. The foundation models used include GPT-4omini and recent open-source models."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table 3 presents ablation on the two pretraining objectives (alignment and structure-preserving), and Table 4 studies the impact of individual graph components (edge type, node name, node type, topological structure). These constitute proper ablation studies."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper uses only Pass@1 as the evaluation metric across all experiments. No other metrics (e.g., Pass@10, CodeBLEU, compilation rate) are reported."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation is included. All evaluation is automated via Pass@1 on test suites. For a code generation system claiming to bridge NL-PL gaps, human evaluation of code quality, readability, or correctness beyond test cases would be relevant."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper evaluates on standard held-out test sets: HumanEval-X, CodeForce, and APPS benchmarks all have standard test splits. The training data for soft prompting uses separate datasets (10,609 CodeContest and 8,691 APPS training samples per Appendix A.1)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by programming language (C++ and Python) across Tables 1-5, and by model (GPT-3.5-Turbo, GPT-4omini, Gemma 7b, Llama2 13b, CodeLlama 7b). Table 4 breaks down by graph component."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No failure cases are shown or discussed. The paper does not provide qualitative examples of failures, error analysis, or discussion of where the approach breaks down."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The ablation study in Table 3 shows negative results: removing alignment causes Gemma 7b on CodeForce to drop from 19.13 to 7.88, and removing structure-preserving causes Llama2 13b on CodeForce to drop from 13.62 to 5.50. Table 4 also shows configurations that perform worse than the full system."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims that CodeGRAG 'significantly improves the code generation ability of LLMs and can even offer performance gain for cross-lingual code generation.' Tables 1-2 show improvements over baselines, and the multi-lingual rows in Table 1 demonstrate cross-lingual benefits. The word 'significantly' is used loosely (no statistical significance test), but the improvements are consistent across settings."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper makes causal claims through ablation studies (Table 3: removing objectives degrades performance, Table 4: removing components shows their contribution). These are controlled single-variable manipulations, which constitute adequate causal evidence for the claims about component contributions."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper's title claims to 'bridge the gap between Natural Language and Programming Language' broadly, but results are limited to C++ and Python on three specific benchmarks. The abstract and conclusions do not adequately bound generalization to these specific settings."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations for the results are discussed. For example, improvements from the meta-graph prompt could be due to simply providing more context tokens rather than the structural information specifically. The paper does not consider this or other confounds."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper uses 'GPT-3.5-Turbo' and 'GPT-4omini' without specifying API versions or snapshot dates. For open-source models, 'Gemma 7b', 'Llama2 13b', and 'CodeLlama 7b' are named but specific checkpoint versions are not given."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Figure 4 provides the actual prompt templates for both hard meta-graph prompting and soft prompting, including the system prompt text, retrieved knowledge insertion format, and problem prompt structure. The placeholders ([lang], [meta graph], [problem prompt]) are structural and the actual fill values come from the benchmark tasks and graph extraction."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix A.1 reports key hyperparameters: learning rate 0.001, weight decay 1e-5, 8-bit quantization, LoRA rank 8, alpha 16, dropout 0.05, AdamW optimizer. Section 3.1 states greedy decoding. However, temperature/sampling settings for API models are not explicitly stated (greedy decoding implies temperature 0)."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "CodeGRAG is a retrieval-augmented generation framework, not an agentic scaffolding system. It uses a single-pass retrieve-then-generate pipeline without tools, retry logic, or multi-step agent workflows."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 2.2 describes the graph extraction pipeline in detail: AST construction, data flow graph extraction, control flow graph extraction, and composition into the composed syntax graph. Appendix A.1 provides retrieval pool sizes (11,913 C++ snippets, 2,359 Python snippets) and training set sizes."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 5 is a dedicated 'Limitations' section. However, it is very brief (3 sentences) and discusses only dependency on the quality of the external knowledge base."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The limitations section (Section 5) mentions only a generic concern about 'dependency on the quality of the external knowledge base.' It does not discuss specific threats such as the small number of languages tested, the limited retrieval pool size, potential confounds from context length differences, or single-run evaluation."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the tested languages (C++, Python) or benchmarks (HumanEval-X, CodeForce, APPS). The title and abstract imply broad applicability without explicit scope limitations."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The benchmarks used (HumanEval-X, CodeForce, APPS) are publicly available. The code repository is released, which presumably includes the processed graph data. Raw experimental outputs (generated code) are not explicitly available."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 2.2 describes how the composed syntax graphs are extracted from code blocks. Appendix A.1 specifies the retrieval pool sizes and training data sizes. Section 3.1 identifies the source datasets with links."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants. The data sources are standard public benchmarks (HumanEval-X, CodeForce, APPS)."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The data pipeline is documented: code blocks are parsed into ASTs, then DFG and CFG are extracted, composed into syntax graphs, abstracted into meta-graphs (hard) or encoded by GNN (soft). Section 2.2 and Figure 2 describe this pipeline. Appendix A.1 provides dataset sizes."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "Section 7 ('Acknowledgments and Disclosure of Funding') contains only NeurIPS template boilerplate text instructing authors to fill in funding information. No actual funding sources are disclosed. This appears to be a submission oversight."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are listed: Shanghai Jiao Tong University and Huawei Noah's Ark Lab. This is transparent about the Huawei involvement, though the paper does not evaluate Huawei products specifically."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding is disclosed (the acknowledgments section is template boilerplate), so it is impossible to assess funder independence. Given the Huawei affiliation, there may be corporate funding that is not disclosed."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests statement is present in the paper. The acknowledgments section contains only NeurIPS template text with no actual disclosures."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper uses GPT-3.5-Turbo and GPT-4omini but does not state their training data cutoff dates. For Gemma 7b, Llama2 13b, and CodeLlama 7b, no training cutoff information is provided either."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No discussion of potential train/test overlap. HumanEval was published in 2021 and HumanEval-X in 2023; GPT-3.5-Turbo and GPT-4omini may have been trained on these benchmarks. APPS was published in 2021. No contamination analysis is provided."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "HumanEval (2021), APPS (2021), and HumanEval-X (2023) were all published before the likely training cutoffs of GPT-3.5-Turbo and GPT-4omini. The paper does not address the risk that these models may have seen the benchmark problems during training."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study. It is a benchmark evaluation of code generation models."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. NeurIPS checklist items 14-15 are answered [NA]."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No inference cost or latency is reported. The paper calls GPT-3.5-Turbo and GPT-4omini APIs and finetunes multiple models but provides no information about API costs, tokens consumed, or wall-clock time for inference."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No total computational budget is stated. The NeurIPS checklist item 8 claims '[Yes]' and points to Appendix A.1, but Appendix A.1 only describes hyperparameters and dataset sizes without reporting GPU hours, total API spend, or hardware used."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "CodeGRAG with meta-graph prompting improves Pass@1 over base GPT-3.5-Turbo on HumanEval-X C++ from 57.93% to 64.02% (multi-lingual setting).",
    297       "evidence": "Table 1 shows GPT-3.5-Turbo N/A baseline at 57.93 C++ and (Multi-Lingual) Meta-Graph at 64.02 C++.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The meta-graph representation provides more informative knowledge hints than raw code blocks for retrieval-augmented code generation.",
    302       "evidence": "Table 1 shows meta-graph matching or outperforming code block retrieval across models and languages (e.g., GPT-3.5-Turbo C++: 62.20 vs 60.37; multi-lingual: 64.02 vs 62.20).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Soft prompting with expert GNN signals outperforms standard supervised finetuning for code generation.",
    307       "evidence": "Table 2 shows soft prompting beating SFT across all three models on both CodeForce and APPS (e.g., Gemma 7b CodeForce: 19.13 vs 14.76; CodeLlama 7b APPS: 30.26 vs 26.15).",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Both alignment and structure-preserving contrastive learning objectives contribute to the effectiveness of the soft signal.",
    312       "evidence": "Table 3 ablation shows performance drops when either objective is removed (e.g., removing alignment drops Gemma 7b CodeForce from 19.13 to 7.88; removing structure-preserving drops Llama2 13b CodeForce from 13.62 to 5.50).",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "The graph extraction process has high compatibility, with extraction rates above 91% even for generated code.",
    317       "evidence": "Table 5 reports extraction rates of 91.46-96.95% for generated code across C++ and Python.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "CodeGRAG proposes using composed syntax graphs (combining control flow and data flow) as structured programming hints for retrieval-augmented code generation. The hard meta-graph prompting technique improves Pass@1 by 2-6 percentage points over base models and 0-2 points over raw code block retrieval on HumanEval-X. The soft prompting technique with GNN expert signals outperforms standard supervised finetuning by 1-5 percentage points on CodeForce and APPS benchmarks. Ablation studies confirm both alignment and structure-preserving objectives contribute to the method's effectiveness.",
    325   "red_flags": [
    326     {
    327       "flag": "No error bars or variance reporting",
    328       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or variance across runs. The paper's own NeurIPS checklist acknowledges this with a '[No]' answer for statistical significance. Given that improvements are often small (1-2 percentage points), it is unclear whether the gains are statistically meaningful."
    329     },
    330     {
    331       "flag": "No significance testing for comparative claims",
    332       "detail": "The paper claims the meta-graph is superior to raw code block retrieval, but improvements on Python HumanEval-X are often 0-1.2 percentage points (e.g., GPT-3.5-Turbo: 72.56 vs 72.56 for single-language). Without significance tests, these small differences may be noise."
    333     },
    334     {
    335       "flag": "Benchmark contamination risk unaddressed",
    336       "detail": "HumanEval and APPS were both published in 2021, well before the training of GPT-3.5-Turbo and GPT-4omini. The paper does not discuss whether these models have seen the benchmark problems, which could confound the evaluation of the retrieval augmentation."
    337     },
    338     {
    339       "flag": "Unfilled funding/conflict disclosure",
    340       "detail": "The Acknowledgments and Disclosure of Funding section (Section 7) contains only NeurIPS template boilerplate text with no actual funding disclosure. Authors include Huawei Noah's Ark Lab researchers but no competing interests are declared."
    341     },
    342     {
    343       "flag": "Single evaluation metric",
    344       "detail": "Only Pass@1 is used across all experiments. Other established code generation metrics (Pass@k for k>1, CodeBLEU, compilation rate, functional correctness beyond test suites) are not reported."
    345     },
    346     {
    347       "flag": "Minimal limitations discussion",
    348       "detail": "Section 5 (Limitations) is only three sentences long and discusses only 'dependency on the quality of the external knowledge base.' It does not address the limited language coverage (only C++ and Python), small retrieval pool, lack of statistical rigor, or benchmark contamination risks."
    349     }
    350   ],
    351   "cited_papers": [
    352     {
    353       "title": "GPT-4 Technical Report",
    354       "authors": ["Josh Achiam"],
    355       "year": 2023,
    356       "arxiv_id": "2303.08774",
    357       "relevance": "Foundational LLM used in code generation evaluations; relevant to understanding model capabilities."
    358     },
    359     {
    360       "title": "Code llama: Open foundation models for code",
    361       "authors": ["Baptiste Roziere"],
    362       "year": 2023,
    363       "arxiv_id": "2308.12950",
    364       "relevance": "Open-source code generation model used as a backbone model in this paper's soft prompting experiments."
    365     },
    366     {
    367       "title": "Retrieval-based prompt selection for code-related few-shot learning",
    368       "authors": ["Noor Nashid", "Mifta Sintaha", "Ali Mesbah"],
    369       "year": 2023,
    370       "relevance": "Key baseline for code retrieval augmented generation that CodeGRAG compares against."
    371     },
    372     {
    373       "title": "Reacc: A retrieval-augmented code completion framework",
    374       "authors": ["Shuai Lu"],
    375       "year": 2022,
    376       "arxiv_id": "2203.07722",
    377       "relevance": "Another code retrieval augmentation baseline used for comparison in this paper."
    378     },
    379     {
    380       "title": "Docprompting: Generating code by retrieving the docs",
    381       "authors": ["Shuyan Zhou"],
    382       "year": 2022,
    383       "relevance": "Alternative approach to retrieval-augmented code generation using documentation rather than code blocks."
    384     },
    385     {
    386       "title": "EvoR: Evolving Retrieval for Code Generation",
    387       "authors": ["Hongjin Su"],
    388       "year": 2024,
    389       "relevance": "Contemporary work on evolving retrieval strategies for code generation."
    390     },
    391     {
    392       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    393       "authors": ["Zhangyin Feng"],
    394       "year": 2020,
    395       "arxiv_id": "2002.08155",
    396       "relevance": "Foundational pre-trained model for code understanding; represents the code representation approach lineage."
    397     },
    398     {
    399       "title": "GraphCodeBERT: Pre-training code representations with data flow",
    400       "authors": ["Daya Guo"],
    401       "year": 2020,
    402       "arxiv_id": "2009.08366",
    403       "relevance": "Directly relevant prior work on incorporating data flow graphs into code pre-training."
    404     },
    405     {
    406       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    407       "authors": ["Yue Wang"],
    408       "year": 2021,
    409       "arxiv_id": "2109.00859",
    410       "relevance": "Pre-trained code model used as the retrieval encoder in CodeGRAG."
    411     },
    412     {
    413       "title": "Measuring coding challenge competence with APPS",
    414       "authors": ["Dan Hendrycks"],
    415       "year": 2021,
    416       "arxiv_id": "2105.09938",
    417       "relevance": "One of the main evaluation benchmarks used in this paper for code generation assessment."
    418     },
    419     {
    420       "title": "CodeGeeX: A pre-trained model for code generation with multilingual evaluations on HumanEval-X",
    421       "authors": ["Qinkai Zheng"],
    422       "year": 2023,
    423       "arxiv_id": "2303.17568",
    424       "relevance": "Provides the HumanEval-X multilingual benchmark used as the primary evaluation dataset in this paper."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs