scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29269B)
      1 {
      2   "paper": {
      3     "title": "GraphCodeAgent: Dual Graph-Guided LLM Agent for Retrieval-Augmented Repo-Level Code Generation",
      4     "authors": [
      5       "Jia Li",
      6       "Xianjie Shi",
      7       "Kechi Zhang",
      8       "Ge Li",
      9       "Zhi Jin",
     10       "Lei Li",
     11       "Huangzhao Zhang",
     12       "Jia Li",
     13       "Fang Liu",
     14       "Yuwei Zhang",
     15       "Zhengwei Tao",
     16       "Yihong Dong",
     17       "Yuqi Zhu",
     18       "Chongyang Tao"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv",
     22     "arxiv_id": "2504.10046"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor", "data_leakage"],
     26   "methodology_tags": ["benchmark-eval"],
     27   "key_findings": "GraphCodeAgent proposes a dual graph-guided LLM agent combining a Requirement Graph (RG) and Structural-Semantic Code Graph (SSCG) for retrieval-augmented repo-level code generation. On DevEval, it achieves 58.14% Pass@1 with GPT-4o, a 43.81% relative improvement over Dense RACG. The advantage is most pronounced for non-standalone code with complex cross-file dependencies, where it nearly doubles the best baseline's performance. Ablation shows the SSCG traversal tool is the most critical component (12.17% relative drop when removed).",
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 9 'Data Availability' states: 'The code implementation and data are publicly available at figshare anonymous link' with URL https://figshare.com/s/4148a1c56d08804cd75a."
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper uses publicly available benchmarks DevEval and CoderEval, and the figshare link claims to include data. Both benchmarks are standard public datasets."
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No requirements.txt, Dockerfile, or dependency specifications are provided in the paper. The paper mentions tools (tree-sitter, Neo4j, stella_en_400M_v5, Black, DuckDuckGo API) but does not provide version-specific environment setup details."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No step-by-step reproduction instructions are provided in the paper. The figshare link is mentioned but the paper itself contains no instructions on how to replicate experiments."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "All results in Tables 4, 5, 6, and 7 are reported as point estimates (e.g., '58.14' Pass@1) with no confidence intervals or error bars."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper repeatedly claims GraphCodeAgent 'significantly outperforms' baselines but provides no statistical significance tests (no p-values, t-tests, or any formal tests). All claims are based on comparing point estimates."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper consistently reports relative improvements with baseline context, e.g., '43.81% relative improvement over the strongest baseline Dense RACG' (58.14 vs 40.43), '94.30% relative gain' for cross-file tasks (43.31 vs 22.29). Absolute values are provided in all tables."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No justification for sample sizes. DevEval has 1,825 examples and CoderEval's Python task count is not stated. No power analysis or discussion of whether these sizes are sufficient for the claims made."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "Section 6.2 states 'we execute each method two times and report the average experimental results' but no standard deviation, variance, or spread measure is reported across those runs. Temperature is set to 0, so runs should be near-deterministic, but this is not explicitly discussed."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Six baselines are compared: ScratchCG, Sparse RACG, Dense RACG, RepoCoder, GraphCoder, and CodeAgent, covering text-based, graph-based, and agent-based RACG approaches (Section 4.2)."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
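     87         "justification": "The compared baselines include RepoCoder (2023), GraphCoder (2024), and CodeAgent (2024), which are recent and represent the state of the art in RACG. CodexGraph (2024) is a contemporary related work, but it is not among the six baselines listed under baselines_included."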
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Table 5 presents a systematic ablation study that removes three tools one at a time (SSCGTraverse, WebSearch, CodeTesting) and reports the impact on Pass@1 with GPT-4o on DevEval."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Only Pass@1 is used as the evaluation metric. The paper defines Pass@k generally (Equation 1) but only reports k=1. No other metrics (e.g., CodeBLEU, edit similarity) are used."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No human evaluation of generated code quality. The PhD candidates verified RG construction (Section 3.2.2) but did not evaluate the system's code generation outputs. All evaluation is automated via test-case execution."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "DevEval and CoderEval are established benchmarks with designated test sets. DevEval contains 1,825 test examples from 117 repositories (Section 4.3)."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 6 provides detailed breakdowns by dependency type: Standalone (502), Non-standalone subdivided into Local-file (455), Cross-file (157), and Local&Cross-file (571)."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "Section 6.1 provides a case study showing a successful generation but no failure cases are analyzed. There is no error analysis or discussion of where the approach breaks down."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "All experiments show GraphCodeAgent outperforming baselines. The ablation (Table 5) shows all components contribute positively. No failed approaches or configurations are reported. WebSearch's minimal contribution (0.51% drop) is spun positively."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Abstract claims of '43.81% with GPT-4o and 39.15% with Gemini-1.5-Pro on DevEval, and 31.91% with GPT-4o and 8.25% with Gemini-1.5-Pro on CoderEval' all match Table 4 exactly."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The ablation study (Table 5) uses controlled single-variable manipulation (removing one tool at a time) to support causal claims about component contributions. The comparative evaluation holds the backbone LLM constant while varying the approach."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The title claims 'Repo-Level Code Generation' broadly, but evaluation is limited to Python benchmarks (DevEval and CoderEval). The paper does not bound its claims to Python or to these specific benchmark types. Section 8 concludes with unbounded claims about 'real-world coding challenges.'"
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "Section 6.2 discusses threats to validity but does not consider alternative explanations for the improvements. For example, GraphCodeAgent may simply provide more context tokens to the LLM than baselines — the paper claims equal retrieved code elements but not equal information volume (graph structure adds information). This is not discussed."
    145       },
    146       "proxy_outcome_distinction": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper measures Pass@1 (test-case execution correctness) and claims this measures code generation quality. The claims match the granularity of measurements without broader framing — they do not claim to measure developer productivity or software quality more broadly."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "GPT-4o is specified as 'GPT-4o-2024-08-06' (specific version), but Gemini is listed as 'Gemini-1.5-Pro-latest' which is not a pinned version (changes over time). QwQ-32B is given without a specific version date. Not all models have exact versions."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper describes tool functionality in natural language (Section 3.4) and mentions instructions 'shown in the link provided at Section 9' (figshare). The actual agent prompts, ReAct reasoning prompts, and tool descriptions sent to the LLM are not provided in the paper."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.5 reports: temperature=0, maximum generation length=500 tokens, similarity threshold ε=0.8, embedding model stella_en_400M_v5, RepoCoder iterations=2. Key hyperparameters are stated."
    167       },
    168       "scaffolding_described": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3.4 describes the agent scaffolding in detail: five tools (RGRetrieval, DualGraphMapping, SSCGTraverse, WebSearch, CodeTesting) with inputs/outputs (Table 2), ReAct reasoning strategy, workflow order, and stopping conditions. Figure 4 provides an overview diagram."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Sections 3.2.2 and 3.3.2 document how repositories are parsed with tree-sitter, how functions/classes/methods are extracted, how RG and SSCG are constructed, and how semantic similarity edges are computed. Section 4.5 describes prompt length truncation."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6.2 'Threats to Validity' discusses both internal validity (hyperparameter settings, fairness of comparison) and external validity (benchmark quality, baseline selection, LLM choice, metric selection)."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 6.2 discusses study-specific threats: the use of DeepSeek-V2.5 for RG construction with manual verification by PhD candidates, the adaptation of RepoCoder to repo-level generation, and the specific embedding model choice (stella_en_400M_v5)."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of which languages, repository types, or task complexities are excluded. The threats section discusses potential validity issues but does not state explicit scope boundaries."
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "While a figshare link is provided, the paper does not indicate that raw per-task results (individual pass/fail outcomes) are available. Only aggregate Pass@1 scores are reported in the paper."
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "DevEval is described as containing '1,825 test examples from 117 repositories' covering '10 domains such as Internet and Database' (Section 4.3). CoderEval is described as originating from 'open-source projects from various domains' with contextual dependencies."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "No human participants. Data sources are standard benchmarks (DevEval and CoderEval)."
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The pipeline from repository parsing (tree-sitter) through graph construction (RG and SSCG) to retrieval and code generation is documented across Sections 3.2-3.4. The evaluation pipeline (single generation, temperature 0, test execution) is described in Section 4.5."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "All author affiliations are clearly listed: Wuhan University, Peking University, University of Hong Kong, Tsinghua University, Beihang University, Institute of Software Chinese Academy of Sciences, Academy of Military Sciences. Authors are not affiliated with the evaluated model providers."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No funding is disclosed. Without knowing the funding source, independence cannot be assessed. The authors use commercial APIs (GPT-4o, Gemini) but are not affiliated with those companies."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No competing interests statement or financial disclosure is present in the paper."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No training data cutoff dates are stated for GPT-4o, Gemini-1.5-Pro, or QwQ-32B. The models' training data periods are not mentioned."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No discussion of whether DevEval or CoderEval examples could have appeared in the training data of the models used. DevEval and CoderEval are sourced from public repositories that may be in training data."
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "DevEval (2024) and CoderEval (2024) use code from public repositories. GPT-4o-2024-08-06 could have been trained on these repositories. No contamination analysis is performed or discussed."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study. The PhD candidates verifying RG construction are not study participants."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "demographics_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       },
    283       "blinding_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants."
    287       },
    288       "attrition_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The paper mentions 'The retrieval process of our approach typically takes only a few seconds on each task' but does not report API costs, token consumption, or total inference cost. The multi-turn LLM agent calls (average 2.3 SSCGTraverse invocations per task) would incur significant API cost which is not quantified."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No total computational budget is stated. No GPU hours, API spend, or hardware specifications for running the experiments are provided."
    304       }
    305     },
    306     "experimental_rigor": {
    307       "seed_sensitivity_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Temperature is set to 0, making generation near-deterministic, but no seed sensitivity analysis is reported. The paper does not discuss whether results are sensitive to any source of randomness."
    311       },
    312       "number_of_runs_stated": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Section 6.2 states: 'we execute each method two times and report the average experimental results.'"
    316       },
    317       "hyperparameter_search_budget": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The similarity threshold ε=0.8 for SSCG and RepoCoder iterations=2 appear chosen without reporting how many configurations were tried. No hyperparameter search budget is stated."
    321       },
    322       "best_config_selection_justified": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "ε=0.8 is used without justification for why this threshold was chosen. RepoCoder iterations=2 is justified by citing the original paper, but other hyperparameters are not justified."
    326       },
    327       "multiple_comparison_correction": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    331       },
    332       "self_comparison_bias_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The authors implement and evaluate their own system against their own implementations of baselines. No acknowledgment of author-evaluation bias (Lucic et al. 2018). They mention keeping the same retrieved code count for fairness but do not discuss the inherent bias of authors implementing their own baselines."
    336       },
    337       "compute_budget_vs_performance": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "GraphCodeAgent uses an iterative multi-tool agent (average 2.3 SSCG traversals + other tool calls per task) requiring substantially more LLM calls than non-agent baselines. This compute difference is not discussed or controlled for."
    341       },
    342       "benchmark_construct_validity": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "No discussion of whether Pass@1 on DevEval and CoderEval actually measures repo-level code generation capability in practice. The paper takes benchmark validity as given without questioning construct validity."
    346       },
    347       "scaffold_confound_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "GraphCodeAgent uses a different scaffold (RG+SSCG-guided agent with 5 tools) than CodeAgent (5 programming tools with different strategies). Performance differences are attributed to the approach without controlling for or discussing the scaffolding confound."
    351       }
    352     },
    353     "data_leakage": {
    354       "temporal_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of temporal leakage. DevEval and CoderEval are sourced from public repositories that may predate model training. The models could have seen the solution code during training."
    358       },
    359       "feature_leakage_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No substantive discussion of whether the evaluation setup leaks information. The WebSearch tool can surface external hints, but this risk is not analyzed as a leakage channel: the paper mentions blocking 'websites that may lead to data leakage' but does not explain how."
    363       },
    364       "non_independence_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No discussion of whether train and test examples share structural similarities or come from repositories in the models' training data."
    368       },
    369       "leakage_detection_method": {
    370         "applies": true,
    371         "answer": false,
    372         "justification": "No concrete leakage detection or prevention method is applied. The mention of blocking leakage-prone websites in WebSearch (Section 3.4.1) is not elaborated and does not constitute a detection method for benchmark contamination."
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "GraphCodeAgent achieves 43.81% relative improvement with GPT-4o on DevEval (Pass@1: 58.14% vs 40.43% for Dense RACG) and 39.15% relative improvement with Gemini-1.5-Pro (54.74% vs 39.34%)",
    379       "evidence": "Table 4 shows full results on DevEval across all baselines and both LLMs. Section 5.1 discusses the results.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "GraphCodeAgent achieves 31.91% relative improvement with GPT-4o on CoderEval (Pass@1: 53.91% vs 40.87% for Sparse RACG) and 8.25% with Gemini-1.5-Pro (45.65% vs 42.17%)",
    384       "evidence": "Table 4 shows CoderEval results. The improvement on Gemini-1.5-Pro is notably smaller (8.25%) compared to DevEval.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "SSCGTraverse is the most critical component, causing 12.17% relative performance drop when removed (Pass@1 from 58.14 to 51.83)",
    389       "evidence": "Table 5 ablation study on GPT-4o/DevEval. Only one model and one benchmark tested for ablation.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "GraphCodeAgent's advantage is particularly significant for non-standalone code with complex cross-file dependencies, achieving 94.30% relative gain on cross-file tasks",
    394       "evidence": "Table 6 breaks down DevEval by dependency type. Cross-file: 43.31 vs 22.29 for RepoCoder. Local&Cross-file: 45.18 vs 32.07 for GraphCoder.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "GraphCodeAgent generalizes to reasoning models, achieving 10.65% relative improvement over the best baseline on QwQ-32B",
    399       "evidence": "Table 7 shows QwQ-32B results on DevEval only. GraphCodeAgent: 54.14 vs RepoCoder: 48.93.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "The paper claims 'significantly outperforms' repeatedly but provides no statistical tests (no p-values, confidence intervals, or formal tests). All comparisons are based on raw point estimates. With temperature=0 and only 2 runs averaged, the reliability of reported differences is unknown."
    407     },
    408     {
    409       "flag": "Relative improvement framing inflates perceived gains",
    410       "detail": "The paper prominently reports relative improvements (e.g., '43.81%') rather than absolute differences. The absolute improvement on DevEval with GPT-4o is ~18 percentage points (58.14 vs 40.43). The 8.25% relative improvement with Gemini on CoderEval is only ~3.5pp absolute."
    411     },
    412     {
    413       "flag": "No contamination analysis despite using public benchmark code",
    414       "detail": "DevEval and CoderEval source code from public repositories. GPT-4o and Gemini may have seen this code during training. No temporal analysis, contamination checks, or decontamination is performed."
    415     },
    416     {
    417       "flag": "Ablation study limited to one model and one benchmark",
    418       "detail": "The ablation (Table 5) is only performed with GPT-4o on DevEval. It is unclear whether component contributions hold for Gemini-1.5-Pro or on CoderEval."
    419     },
    420     {
    421       "flag": "Uncontrolled compute budget across methods",
    422       "detail": "GraphCodeAgent uses multiple LLM calls per task (RGRetrieval + DualGraphMapping + average 2.3 SSCGTraverse + 0.4 WebSearch + 0.8 CodeTesting), while simpler baselines use a single generation call. The compute and cost difference is never quantified or discussed."
    423     },
    424     {
    425       "flag": "Single metric evaluation",
    426       "detail": "Only Pass@1 is reported. Code generation quality has multiple dimensions (readability, maintainability, efficiency, style consistency) that are not measured. The paper claims suitability for 'practical application in complex software development workflows' based solely on test-case pass rates."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Evaluating large language models trained on code",
    432       "authors": ["Mark Chen", "Jerry Tworek"],
    433       "year": 2021,
    434       "arxiv_id": "2107.03374",
    435       "relevance": "Introduces HumanEval benchmark for code generation evaluation, foundational benchmark in the field."
    436     },
    437     {
    438       "title": "CodeAgent: Enhancing code generation with tool-integrated agent systems for real-world repo-level coding challenges",
    439       "authors": ["Kechi Zhang", "Jia Li", "Ge Li"],
    440       "year": 2024,
    441       "arxiv_id": "2401.07339",
    442       "relevance": "Pioneer LLM-based agent framework for repo-level code generation, key baseline in this paper."
    443     },
    444     {
    445       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    446       "authors": ["Fengji Zhang", "Bei Chen"],
    447       "year": 2023,
    448       "arxiv_id": "2303.12570",
    449       "relevance": "Introduces iterative retrieval-generation pipeline for repo-level code completion, important RACG baseline."
    450     },
    451     {
    452       "title": "GraphCoder: Enhancing repository-level code completion via code context graph-based retrieval and language model",
    453       "authors": ["Wei Liu", "Ailun Yu"],
    454       "year": 2024,
    455       "arxiv_id": "2406.07003",
    456       "relevance": "Graph-based retrieval-augmented code completion framework using code context graphs."
    457     },
    458     {
    459       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    460       "authors": ["Carlos E Jimenez", "John Yang"],
    461       "year": 2023,
    462       "arxiv_id": "2310.06770",
    463       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks from GitHub issues."
    464     },
    465     {
    466       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    467       "authors": ["Jia Li", "Ge Li"],
    468       "year": 2024,
    469       "arxiv_id": "2405.19856",
    470       "relevance": "Primary evaluation benchmark in this paper for repo-level code generation with 1,825 examples from 117 repositories."
    471     },
    472     {
    473       "title": "Agentless: Demystifying LLM-based software engineering agents",
    474       "authors": ["Chunqiu Steven Xia", "Yinlin Deng"],
    475       "year": 2024,
    476       "arxiv_id": "2407.01489",
    477       "relevance": "Alternative approach to LLM-based software engineering that preprocesses repository structure without agentic scaffolding."
    478     },
    479     {
    480       "title": "ReAct: Synergizing reasoning and acting in language models",
    481       "authors": ["Shunyu Yao", "Jeffrey Zhao"],
    482       "year": 2023,
    483       "relevance": "Foundational reasoning-and-acting framework used as the agent reasoning strategy in GraphCodeAgent."
    484     },
    485     {
    486       "title": "CodexGraph: Bridging large language models and code repositories via code graph databases",
    487       "authors": ["Xiangyan Liu", "Bo Lan"],
    488       "year": 2024,
    489       "arxiv_id": "2408.03910",
    490       "relevance": "Uses static analysis to extract code graphs from repositories for LLM-based code tasks."
    491     },
    492     {
    493       "title": "Locagent: Graph-guided LLM agents for code localization",
    494       "authors": ["Zhaoling Chen", "Xiangru Tang"],
    495       "year": 2025,
    496       "arxiv_id": "2503.09089",
    497       "relevance": "Combines graph-based repository representation with agentic LLM for code localization, closely related approach."
    498     },
    499     {
    500       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    501       "authors": ["Daya Guo", "Dejian Yang"],
    502       "year": 2025,
    503       "arxiv_id": "2501.12948",
    504       "relevance": "Reasoning model relevant to understanding LLM capability improvements through RL training."
    505     },
    506     {
    507       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    508       "authors": ["Hao Yu", "Bo Shen"],
    509       "year": 2024,
    510       "relevance": "Second primary evaluation benchmark in this paper, assessing pragmatic code generation with contextual dependencies."
    511     }
    512   ]
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs