ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20165B)


      1 {
      2   "paper": {
      3     "title": "Completion by Comprehension: Guiding Code Generation with Multi-Granularity Understanding",
      4     "authors": ["Xinkui Zhao", "Rongkai Liu", "Yifan Zhang", "Chen Zhi", "Lufei Zhang", "Guanjie Cheng", "Yueshen Xu", "Shuiguang Deng", "Jianwei Yin"],
      5     "year": 2025,
      6     "venue": "Manuscript submitted to ACM",
      7     "arxiv_id": "2512.04538"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The paper states 'We make the code publicly available at the link' but no actual URL is provided — just the word 'link' as a placeholder. No working repository URL found."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks CrossCodeEval and RepoEval, and reuses publicly available evaluation implementations from prior studies [8, 52]."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 'one Tesla A100 GPU, with 40 GB memory' but provides no requirements.txt, Dockerfile, or library version details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The experimental setup section describes settings but not how to run the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 1-3 and Figures 7-8 report only point estimates with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CoCo 'consistently outperforms' baselines but provides no statistical significance tests. Comparisons are based solely on comparing raw numbers."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context, e.g., '20.2% gains in EM' and provides absolute numbers in tables (e.g., from 24.91% to 29.93%)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for benchmark sizes. The RepoEval subset is stated as 1600 samples but no rationale is given for why this is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper states 'the random seed is set to 123' suggesting single-run experiments. No variance, standard deviation, or multi-run results are reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Three baselines are compared: RawRAG, RepoCoder, and RLCoder (Section 4.2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "RLCoder (2024) is described as the state-of-the-art method. RepoCoder (2023) is also recent. These are reasonable contemporary baselines for repository-level code completion."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Two ablation studies are conducted: RQ2 removes Code Comprehension and Structural-Aware Re-Ranker individually (Section 5.2), and RQ5 tests individual granularity levels (Section 5.5)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four metrics are used: Exact Match (EM), Edit Similarity (ES), Identifier Exact Match (ID.EM), and F1-score (Section 4.3)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated code quality is performed. All evaluation is automated via EM, ES, ID.EM, and F1 against ground truth."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses established benchmark test sets from CrossCodeEval and RepoEval, which have predefined splits."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by benchmark (CrossCodeEval Python, CrossCodeEval Java, RepoEval Line, RepoEval API) and by model (CodeLlama-7B, Yi-Coder-1.5B, DeepSeekCoder-1B) in Table 1."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The case study (Section 5.6) shows a success case. No failure cases or error analysis of where CoCo produces incorrect completions are discussed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that CoCo_all does not significantly outperform CoCo_pro (Section 5.5), and that the re-ranker provides only 'relatively modest' gains (Section 5.2). The Jaccard similarity limitation is acknowledged."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'up to 20.2% gains in EM' which is supported by Table 1 (Yi-Coder-1.5B on CrossCodeEval Python: 24.91 → 29.93, ~20.2% relative improvement). Claims of consistent outperformance are supported across Table 1."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims ('multi-granularity context improves completion') are supported by controlled ablation studies (RQ2, RQ5) that systematically remove components and measure impact."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Guiding Code Generation' generally but experiments are limited to Python and Java on two benchmarks with three relatively small models (1B-7B). The abstract claims the framework is 'model-agnostic' without testing on larger or proprietary models."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The threats to validity section (Section 6) discusses limitations of static analysis tools but does not consider alternative explanations for the observed improvements (e.g., whether gains come from simply providing more context tokens rather than structured comprehension)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are specified as 'DeepSeekCoder-1B', 'Codellama-7B', 'Yi-Coder-1.5B' without specific version identifiers or snapshot dates."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Figure 6 shows a prompt template with placeholders (e.g., 'file_info 1', 'pro_info 1', 'code example 1') but the actual fill values and complete prompt text are not provided."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Random seed (123) and max tokens (64) are stated. However, temperature, top-p, and other sampling parameters are not reported. The teleportation factor α for PageRank and top-k values for context selection are not specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "CoCo is a pipeline framework (static analysis + retrieval + generation), not an agentic scaffolding system with retry logic or feedback loops."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The methodology (Section 3) describes in detail how code is parsed at function, file, and project levels using AST parsers and Tree-sitter, how the graph-based context selection filters information, and how the re-ranker processes candidates."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 'Threats to Validity' provides substantive discussion of limitations across three paragraphs."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats section discusses specific issues: CFG construction missing complex execution paths, context inflation exceeding model windows, inability to handle non-entity code elements like comments and annotations."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what settings or claims are excluded. It does not bound its claims to the tested languages (Python, Java) or model sizes (1B-7B)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental outputs, generated code samples, or per-example results are made available for verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The benchmarks used (CrossCodeEval, RepoEval) are well-described with references. The paper describes using Python and Java subsets of CrossCodeEval and line/API subsets of RepoEval (1600 samples)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from code repository input through AST parsing, multi-granularity context extraction, graph-based selection, retrieval, re-ranking, and prompt construction is documented in Section 3 with algorithms and figures."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "Section 9 is titled 'Acknowledgments' but contains no content — no funding sources are mentioned anywhere in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All author affiliations are listed: Zhejiang University, State Key Laboratory of Mathematical Engineering and Advanced Computing, and Xidian University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The empty Acknowledgments section suggests non-disclosure rather than absence of funding."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses pre-trained code LLMs (DeepSeekCoder, CodeLlama, Yi-Coder) on benchmarks but does not state training data cutoff dates for any of these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the benchmark examples (CrossCodeEval, RepoEval) could have appeared in the training data of the models used."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "CrossCodeEval (2023) and RepoEval (2023) were published before or around the training of the models used, yet no contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 3 reports wall-clock inference times and overhead in seconds for each benchmark (e.g., 132.4s overhead on CrossCodeEval Python, ~1.9% increase)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (Tesla A100 40GB) but total compute budget, GPU hours, or total experiment time is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoCo achieves up to 20.2% gains in Exact Match over state-of-the-art baselines.",
    286       "evidence": "Table 1: Yi-Coder-1.5B on CrossCodeEval Python, RLCoder EM=24.91 vs CoCo EM=29.93, ~20.2% relative improvement.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "CoCo consistently outperforms all baselines across all backbone LLMs and benchmarks.",
    291       "evidence": "Table 1 shows CoCo outperforming RawRAG, RepoCoder, and RLCoder across all 4 benchmarks and 3 models on all 4 metrics. However, these are single-run results with no statistical tests.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "CoCo can be seamlessly integrated into existing methods as a plug-in, consistently enhancing performance.",
    296       "evidence": "Table 2 shows RawRAG+CoCo and RepoCoder+CoCo both improve over their base methods across all benchmarks.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The integration overhead of CoCo is negligible (below 5% latency increase).",
    301       "evidence": "Table 3 shows overhead ranging from 1.0% to 2.1% across benchmarks.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Project-level information contributes the most to code generation performance among the three granularity levels.",
    306       "evidence": "Section 5.5 and Figure 8 show CoCo_pro outperforming CoCo_func and CoCo_file on CrossCodeEval Python.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CoCo proposes a comprehension-driven approach to repository-level code completion that extracts multi-granularity context (function, file, project level) via static analysis and uses Personalized PageRank for context selection. On CrossCodeEval and RepoEval benchmarks with three small code LLMs (1B-7B parameters), CoCo achieves up to 20.2% relative EM improvement over RLCoder. The framework adds at most 2.1% latency overhead and can be integrated as a plug-in into existing RAG-based methods. Ablations show project-level context provides the largest gains, and the graph-based context selection is important for filtering noise from combined multi-granularity input.",
    312   "red_flags": [
    313     {
    314       "flag": "Single-run evaluation",
    315       "detail": "All experiments use a single random seed (123) with no variance reporting across multiple runs, making it impossible to assess result stability."
    316     },
    317     {
    318       "flag": "No statistical significance testing",
    319       "detail": "Claims of outperformance are based solely on comparing point estimates without any significance tests, despite some improvements being small (e.g., <2 percentage points)."
    320     },
    321     {
    322       "flag": "No contamination analysis",
    323       "detail": "Benchmarks from 2023 are used with models that may have been trained on data including those benchmarks, but no contamination analysis is performed."
    324     },
    325     {
    326       "flag": "Placeholder code release link",
    327       "detail": "The paper claims code is publicly available but the actual URL is missing — only the word 'link' appears as a placeholder, suggesting the artifact is not actually released."
    328     },
    329     {
    330       "flag": "Missing hyperparameters",
    331       "detail": "Key hyperparameters like temperature, top-p, PageRank teleportation factor α, and top-k values for context selection are not reported, hindering reproducibility."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming–The Rise of Code Intelligence",
    337       "authors": ["Daya Guo"],
    338       "year": 2024,
    339       "arxiv_id": "2401.14196",
    340       "relevance": "Major open-source code LLM used as backbone in code generation evaluations."
    341     },
    342     {
    343       "title": "Code llama: Open foundation models for code",
    344       "authors": ["Baptiste Roziere"],
    345       "year": 2023,
    346       "arxiv_id": "2308.12950",
    347       "relevance": "Open-source code LLM family widely used as baseline in code generation research."
    348     },
    349     {
    350       "title": "Crosscodeeval: A diverse and multilingual benchmark for cross-file code completion",
    351       "authors": ["Yangruibo Ding"],
    352       "year": 2023,
    353       "relevance": "Key benchmark for evaluating repository-level code completion systems."
    354     },
    355     {
    356       "title": "Repocoder: Repository-level code completion through iterative retrieval and generation",
    357       "authors": ["Fengji Zhang"],
    358       "year": 2023,
    359       "arxiv_id": "2303.12570",
    360       "relevance": "Influential RAG-based repository-level code completion method used as baseline."
    361     },
    362     {
    363       "title": "RLCoder: Reinforcement learning for repository-level code completion",
    364       "authors": ["Yanlin Wang"],
    365       "year": 2024,
    366       "arxiv_id": "2407.19487",
    367       "relevance": "State-of-the-art RL-based retriever for repository-level code completion, primary baseline."
    368     },
    369     {
    370       "title": "Graphcoder: Enhancing repository-level code completion via code context graph-based retrieval and language model",
    371       "authors": ["Wei Liu"],
    372       "year": 2024,
    373       "arxiv_id": "2406.07003",
    374       "relevance": "Graph-based retrieval framework for code completion, related structural approach."
    375     },
    376     {
    377       "title": "Dataflow-guided retrieval augmentation for repository-level code completion",
    378       "authors": ["Wei Cheng"],
    379       "year": 2024,
    380       "arxiv_id": "2405.19782",
    381       "relevance": "Dataflow-guided retrieval for repository code completion, closely related work."
    382     },
    383     {
    384       "title": "Swe-bench: Can language models resolve real-world github issues?",
    385       "authors": ["Carlos E Jimenez"],
    386       "year": 2023,
    387       "arxiv_id": "2310.06770",
    388       "relevance": "Major benchmark for evaluating LLM coding capabilities on real-world software engineering tasks."
    389     },
    390     {
    391       "title": "Productivity assessment of neural code completion",
    392       "authors": ["Albert Ziegler"],
    393       "year": 2022,
    394       "relevance": "Empirical study on productivity impact of code completion tools in practice."
    395     },
    396     {
    397       "title": "Retrieval-augmented generation for large language models: A survey",
    398       "authors": ["Yunfan Gao"],
    399       "year": 2023,
    400       "arxiv_id": "2312.10997",
    401       "relevance": "Comprehensive survey of RAG techniques applied to LLMs."
    402     }
    403   ]
    404 }

Impressum · Datenschutz