scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28083B)
      1 {
      2   "paper": {
      3     "title": "Towards Automated Smart Contract Generation: Evaluation, Benchmarking, and Retrieval-Augmented Repair",
      4     "authors": [
      5       "Zaoyu Chen",
      6       "Haoran Qin",
      7       "Nuo Chen",
      8       "Xiangyu Zhao",
      9       "Lei Xue",
     10       "Xiapu Luo",
     11       "Xiao-Ming Wu"
     12     ],
     13     "year": 2026,
     14     "venue": "FSE '26",
     15     "arxiv_id": "2503.01098",
     16     "doi": ""
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides a GitHub link: https://github.com/ZaoyuChen/SolBench. Section 8 (Data Availability) explicitly states 'The SolBench dataset and code are available at https://github.com/ZaoyuChen/SolBench.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The SolBench dataset (28,825 functions from 7,604 contracts) is released at the same GitHub repository. The paper states 'Dataset and code are available at https://github.com/ZaoyuChen/SolBench' in the abstract and Section 8."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using '4x NVIDIA A100 (80 GB) GPUs' and 'vLLM' for local model deployment, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions. No conda environment or dependency listing is provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper provides the GitHub link but does not include step-by-step reproduction instructions within the paper itself. There is no 'Reproducing Results' section or specific commands to run the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 5-10 report only point estimates (e.g., '88.87% P@1'). No confidence intervals, error bars, or +/- notation are provided for any of the main results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes many comparative claims (e.g., 'GPT-5-mini achieves the highest performance', 'RAR consistently outperforms') but no statistical significance tests (p-values, t-tests, etc.) are reported. Comparisons are made solely by comparing raw numbers."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports improvements with baseline context (e.g., 'RAR yields an average improvement of 13.83% in P@1', '50.20% cost reduction'). Tables 8 and 10 show absolute improvements and percentage increases over baselines, providing sufficient context to understand effect magnitudes."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The benchmark uses 28,825 functions, which is large but the size is not justified through any power analysis or formal reasoning about why this number is sufficient. The dataset size is a result of the filtering pipeline rather than a design choice."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviations, variance, or any spread measures are reported across experimental runs. The paper appears to report single-run results for each model/configuration. Pass@k is computed with n=1 for Pass@1, meaning a single generation per problem with no repeated trials."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper includes multiple baselines: 14 LLMs are compared against each other, 'No Repair' baselines are used for RAR evaluation, and four code repair methods (Self Edit, Self Debug, Self Repair, Self Refine) serve as baselines. Six retrieval methods are also compared."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The evaluated models include very recent ones: GPT-5-mini (Aug 2025), DeepSeek-R1 (Jan 2025), Qwen2.5-Coder (Sep 2024), and Claude-3.5-Haiku (Oct 2024). The model selection criteria explicitly include 'release dates covering both recent and earlier models' (Table 4)."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The RAR framework is ablated across multiple dimensions: different retrieval methods (Table 9), different code repair methods (Table 8), with and without retrieval ('No Retrieval' baselines), and varying context lengths. This effectively shows which components contribute to performance."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses Pass@1 (P@1) for functional correctness and Compilation@1 (C@1) for compilation rate as two distinct metrics. Additionally, BLEU and CrystalBLEU are included for comparison in Table 3, and Pass@k for k=1,2,3,5 in Table 7. Inference cost is also reported as a metric."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is performed. All evaluation is automated through differential fuzzing (Diffusc + Echidna). Given that the paper makes claims about practical utility for smart contract development, human evaluation of code quality could have been valuable."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "SolBench is used as both the evaluation benchmark and the development target. There is no separation into dev/test splits. The same 28,825 functions are used for all evaluations, and hyperparameter/method choices appear to be evaluated on the same data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 2 provides contract type statistics across 10 domains (Token, NFT, DeFi, Governance, etc.). Table 5 provides per-model breakdowns across different context lengths. The filtering analysis in Section 2.1.2 breaks down by contract domain."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 3.2.2 and Figure 4 provide error distribution analysis showing the top five error types (Undeclared Identifier, Member errors, etc.). The paper discusses why models fail (missing contextual information) and illustrates failure scenarios in Figures 1 and 5."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that reasoning-oriented models (DeepSeek-R1) show 'no advantage and even slight drops compared to Deepseek-V3' (Section 3.2.2, Finding 1). Self Refine is shown to be the weakest repair method. CodeLlama-7B underperforms smaller but newer models."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims that (1) BLEU correlates weakly with functional correctness (supported by Table 3, r<=0.41), (2) missing contextual information is the dominant failure mode (supported by Fig. 4), (3) RAR reduces cost by half while improving accuracy (supported by Fig. 6 and Tables 8-10). All claims are backed by experimental results."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims through ablation-style experiments: removing/adding retrieval, varying repair methods, and varying context lengths. The RAR framework evaluation (Table 10) uses controlled single-variable manipulation, comparing 'No Repair' baselines against RAR variants while holding other factors constant."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title says 'Automated Smart Contract Generation' broadly, but the work is limited to Solidity on Ethereum. Section 5.1 acknowledges this but the title and abstract overreach. The paper evaluates only function-level completion (not full contract generation), and results are specific to the SolBench benchmark, which filters out ~90% of functions."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 5 (Threats to Validity) discusses multiple alternative explanations: training data overlap could inflate performance (Section 5.2), prompt construction and inference settings could influence results (Section 5.2), and the benchmark focuses only on functional correctness, missing code readability and gas efficiency (Section 5.3)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Table 4 lists models with names and release dates but not specific API versions or snapshot dates. 'GPT-4o-mini', 'GPT-5-mini', 'Claude-3.5-Haiku', 'Doubao-Pro' are used without API version identifiers. Open-source models specify parameter counts but not exact checkpoint versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper describes the prompting approach (function comments and signature as input) but does not provide the actual prompt templates or system instructions used. No appendix with prompt text is included. The exact instructions given to models for code completion and code repair are not shown."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper states 'All models were sampled under their default inference settings' (Section 3.2.2) but does not report specific temperature, top-p, or max token settings for the main benchmarking experiments. For RAR experiments, 'maximum number of generated tokens' is 1024 (Section 4.3.1), but the main LLM inference settings are not detailed."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The RAR framework is described in detail in Section 4.1 with a workflow diagram (Fig. 5). The executor-retriever-LLM pipeline is clearly described: (1) LLM completes function, (2) executor checks correctness, (3) retriever extracts relevant snippets, (4) LLM repairs. Each repair method (Self Edit, Self Debug, Self Repair, Self Refine) is described in Section 4.2.1."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 2.1 documents the four-step data construction pipeline with counts at each stage: 514,506 unique Solidity files -> 2,609,128 functions extracted -> 342,975 after deduplication -> 694,953 pass verification -> 28,825 final functions after deduplication. Filtering criteria (on-chain storage, constructor dependencies, mint operations) are specified."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 'Threats to Validity' contains three substantive subsections: External Validity (Section 5.1), Internal Validity (Section 5.2), and Construct Validity (Section 5.3)."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The threats are specific to this study: (1) Solidity-specific tools limit generalization (5.1), (2) training data overlap with SolBench functions (5.2), (3) prompt construction and inference settings could affect results (5.2), (4) only functional correctness is measured, missing readability and gas efficiency (5.3)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5.1 explicitly states that 'our evaluation and the RAR framework are implemented in Solidity on Ethereum' and that tools 'would need to be replaced with appropriate execution or testing frameworks when porting to other languages.' Section 5.3 states the evaluation 'does not consider other important aspects such as code readability, maintainability, or gas efficiency.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The SolBench dataset is publicly available at https://github.com/ZaoyuChen/SolBench. The underlying data comes from the DISL dataset of Ethereum smart contracts which is also publicly available. This enables independent verification of the benchmark construction."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 2.1 describes the data collection in detail: sourced from the DISL dataset of 514,506 unique Solidity files from Ethereum mainnet (genesis to 2024). The four-step pipeline (collection, extraction, verification, deduplication) is documented with specific criteria at each stage."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved. The data comes from publicly deployed smart contracts on the Ethereum blockchain and models evaluated via API or local deployment."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 2.1.1 documents the full pipeline with counts: 514,506 unique Solidity files -> 2,609,128 functions extracted -> 342,975 unique after deduplication (86.85% duplication rate) -> 694,953 pass functional correctness verification -> 28,825 final after content-based deduplication. Each filtering step's rationale is explained."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section is visible in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: The Hong Kong Polytechnic University and Sun Yat-sen University. The authors are academic researchers and do not appear to be affiliated with any of the companies whose models are evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is itself a concern, as it is impossible to verify whether any financial relationships exist with the model providers being evaluated."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper. There is no declaration regarding patents, equity, or other financial interests."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state the training data cutoff dates for any of the 14 evaluated models. Release dates are given in Table 4 but these are not the same as training cutoff dates. This is critical because SolBench contains contracts from genesis to 2024, which likely overlaps with model training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 2.1.2 analyzes overlap: 'We further analyze potential data overlap by comparing SolBench against the Solidity subset of The Stack, the largest publicly available code corpus containing Solidity, and find that only 25.49% of functions exhibit high similarity (Jaccard similarity >= 0.9).' Section 5.2 also discusses this threat."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 5.2 directly addresses contamination: 'Some functions in SolBench may overlap with the training data of the evaluated LLMs, which could potentially inflate their performance.' The paper argues that high natural duplication in Solidity (86.85%) means overlap reflects real-world patterns. Section 2.1.2 quantifies overlap with The Stack (25.49% high similarity)."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study. All evaluation is automated benchmark evaluation."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved. The study uses publicly available smart contract code and LLM APIs."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Figure 6 plots P@1 vs. inference cost in USD. The paper reports specific costs: '$5,286' for full-context Claude-Opus-4.1, '$10.94' for RAR to achieve 90.81% P@1 vs. '$21.97' for the baseline, and API pricing for GPT-4o-mini ($0.15/$0.6 per million tokens). Cost-effectiveness is a major theme of the paper."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "While API costs are reported for some experiments, the total computational budget is not stated. Hardware is mentioned (4x NVIDIA A100 80GB for local models) but GPU hours, total API spend across all experiments, or total training/inference time are not quantified."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "BLEU and CrystalBLEU correlate weakly with functional correctness (Pass@1) for Solidity code completion, with Pearson r <= 0.41.",
    295       "evidence": "Table 3 shows Pearson correlation coefficients between BLEU/CrystalBLEU and P@1 across context lengths: BLEU r=0.26-0.41, CrystalBLEU r=0.24-0.40 (Section 3.2.1, RQ1).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Context length is the primary bottleneck for Solidity code completion, with performance improving from 38.88% to 88.87% P@1 as context increases from 0 to 4k tokens (GPT-4o-mini).",
    300       "evidence": "Table 5 shows consistent improvement across all 14 models with increasing context length. GPT-4o-mini improves from 38.88% (0 context) to 88.87% (4k context). Error analysis in Fig. 4 shows 'Undeclared Identifier' as the dominant error at 0 context (Section 3.2.2).",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "RAR reduces inference cost by approximately 50% while achieving equal or higher Pass@1 compared to baselines without RAR.",
    305       "evidence": "Figure 6 shows that RAR achieves 90.81% P@1 at $10.94, while the baseline needs $21.97 for 90.76% P@1 (Section 4.3.2). Table 10 shows RAR at 1k context matches no-RAR at 2k context (85.21% vs 85.93%).",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "GPT-5-mini achieves the best overall performance among evaluated models, with 97.19% P@1 at 32k context.",
    310       "evidence": "Table 6 shows GPT-5-mini at 97.19% P@1 (32k context). Table 5 shows average P@1 of 81.78%, highest among all 14 models (Section 3.2.2).",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "Reasoning-oriented models show no advantage over non-reasoning models for Solidity code completion.",
    315       "evidence": "Table 5 shows DeepSeek-R1-671B (avg P@1: 73.52%) slightly underperforms DeepSeek-V3-671B (avg P@1: 76.80%). DeepSeek-R1-Distill-Llama-70B (52.27%) significantly underperforms other models in its scale class (Section 3.2.2, Finding 1).",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "LCS is the most practical retrieval method, achieving the best results at minimal computational cost.",
    320       "evidence": "Table 9 shows LCS achieves the highest average improvement across repair methods at both 256-token (4.70% avg improvement) and 1k-token (2.47% avg improvement) context lengths, outperforming BM25, TF-IDF, Jaccard, UniXCoder, and CodeBERT (Section 4.3.1, Finding 2).",
    321       "supported": "strong"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "The paper introduces SolBench, a large-scale Solidity code completion benchmark with 28,825 functions evaluated via differential fuzzing, demonstrating that BLEU/CrystalBLEU correlate weakly with functional correctness (r<=0.41). Evaluating 14 LLMs, GPT-5-mini achieves 97.19% Pass@1 at 32k context, while missing intra-contract context is the dominant failure mode. The proposed RAR framework using LCS retrieval and executor feedback reduces inference cost by ~50% while matching or exceeding the accuracy of models with double the context window.",
    328   "red_flags": [
    329     {
    330       "flag": "No uncertainty quantification",
    331       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or standard deviations reported. For a benchmark of 28,825 functions, even single-run results may be stable, but the absence of any uncertainty measure makes it impossible to assess whether observed differences between methods are meaningful or within noise."
    332     },
    333     {
    334       "flag": "No statistical significance tests",
    335       "detail": "Numerous comparative claims are made (e.g., model A outperforms model B, RAR improves over baseline) based solely on comparing raw numbers without any statistical significance testing."
    336     },
    337     {
    338       "flag": "Default inference settings without specification",
    339       "detail": "The paper states models were used with 'default inference settings' without reporting what those settings are. Temperature, top-p, and other sampling parameters significantly affect output quality and can differ between models and API versions."
    340     },
    341     {
    342       "flag": "Potential contamination not fully addressed",
    343       "detail": "While the paper measures overlap with The Stack (25.49%), many evaluated models may have been trained on Etherscan data directly. Training cutoff dates are not stated for any model, making it impossible to fully assess contamination risk. The paper's argument that high natural duplication makes contamination acceptable is questionable."
    344     },
    345     {
    346       "flag": "Benchmark filters out ~90% of real functions",
    347       "detail": "The filtering process removes approximately 90% of functions across all domains (Table 2), keeping only stateless functions that can be verified without on-chain storage. This means SolBench evaluates a narrow subset of smart contract functionality, yet the paper's title and claims generalize to 'smart contract generation' broadly."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "Evaluating large language models trained on code",
    353       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    354       "year": 2021,
    355       "arxiv_id": "2107.03374",
    356       "relevance": "Foundational work introducing HumanEval and Pass@k metric for functional correctness evaluation of code LLMs."
    357     },
    358     {
    359       "title": "Teaching Large Language Models to Self-Debug",
    360       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Scharli", "Denny Zhou"],
    361       "year": 2023,
    362       "arxiv_id": "2304.05128",
    363       "relevance": "Core code repair baseline method used in the RAR framework evaluation."
    364     },
    365     {
    366       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    367       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    368       "year": 2023,
    369       "arxiv_id": "2303.17651",
    370       "relevance": "Self-reflection-based code repair method used as a baseline in the RAR framework."
    371     },
    372     {
    373       "title": "Self-Edit: Fault-Aware Code Editor for Code Generation",
    374       "authors": ["Kechi Zhang", "Zhuo Li", "Jia Li", "Ge Li", "Zhi Jin"],
    375       "year": 2023,
    376       "arxiv_id": "2305.04087",
    377       "relevance": "Execution-feedback-based code repair method that is the best-performing repair strategy in the RAR framework."
    378     },
    379     {
    380       "title": "Is Self-Repair a Silver Bullet for Code Generation?",
    381       "authors": ["Theo X. Olausson", "Jeevana Priya Inala", "Chenglong Wang"],
    382       "year": 2024,
    383       "arxiv_id": "2306.09896",
    384       "relevance": "Evaluates the limitations of self-repair for code generation, used as a baseline repair method."
    385     },
    386     {
    387       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming--The Rise of Code Intelligence",
    388       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    389       "year": 2024,
    390       "arxiv_id": "2401.14196",
    391       "relevance": "One of the key open-source code LLM families evaluated in the benchmark across multiple model sizes."
    392     },
    393     {
    394       "title": "Code llama: Open foundation models for code",
    395       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    396       "year": 2023,
    397       "arxiv_id": "2308.12950",
    398       "relevance": "Open-source code LLM evaluated in the benchmark, showing that older models underperform smaller but newer ones."
    399     },
    400     {
    401       "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
    402       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    403       "year": 2020,
    404       "relevance": "Foundational RAG paper that the RAR framework builds upon, differing by using intra-contract rather than external retrieval."
    405     },
    406     {
    407       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    408       "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang"],
    409       "year": 2023,
    410       "relevance": "Repository-level code completion using retrieval, directly related to the context-augmented code generation problem addressed by RAR."
    411     },
    412     {
    413       "title": "SolEval: Benchmarking Large Language Models for Repository-level Solidity Code Generation",
    414       "authors": ["Zhiyuan Peng", "Xin Yin", "Rui Qian"],
    415       "year": 2025,
    416       "arxiv_id": "2502.18793",
    417       "relevance": "Concurrent work on Solidity code generation benchmarking, providing a smaller-scale alternative to SolBench with 1,125 samples."
    418     },
    419     {
    420       "title": "Starcoder: may the source be with you!",
    421       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    422       "year": 2023,
    423       "arxiv_id": "2305.06161",
    424       "relevance": "Major open-source code LLM whose training data (The Stack) is used for contamination analysis."
    425     },
    426     {
    427       "title": "Retrieval-augmented generation for large language models: A survey",
    428       "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"],
    429       "year": 2023,
    430       "arxiv_id": "2312.10997",
    431       "relevance": "Survey of RAG techniques for LLMs, providing context for the retrieval-augmented approach used in RAR."
    432     }
    433   ]
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs