scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19684B)
      1 {
      2   "paper": {
      3     "title": "EVOR: Evolving Retrieval for Code Generation",
      4     "authors": ["Hongjin Su", "Shuyang Jiang", "Yuhang Lai", "Haoyuan Wu", "Boao Shi", "Che Liu", "Qian Liu", "Tao Yu"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2402.12317"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'Our model, code, and data are available at https://arks-codegen.github.io.' A URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states code and data are available at the project page. They also compile a new benchmark EVOR-BENCH with four datasets."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section listing library versions is provided in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are included in the paper itself. The project page is referenced, but the paper does not contain a README or a 'reproducing results' section."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 2-4 and figures are reported as point estimates (e.g., '37.9') with no confidence intervals, error bars, or ± notation."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims EVOR 'significantly' outperforms baselines but provides no statistical significance tests (no p-values, t-tests, etc.)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute percentage improvements with baseline context, e.g., 'EVOR outperforms DocPrompting by 18.6% on average using CodeLlama' and provides full baseline numbers in Table 2."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Dataset sizes are reported (142, 45, 107, 113 problems) but no justification is given for why these sizes are sufficient for the claims being made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance across runs, or spread measures are reported. Results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Five baselines are compared: Vanilla, MPSC, ExeDec, Reflexion, and DocPrompting (Section 3.1, Table 2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent methods: Reflexion (2024), DocPrompting (2023), MPSC (2023), ExeDec (2023). These are contemporary and relevant."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4.1 (Table 3) ablates query evolution vs. knowledge evolution vs. both. Section 4.2 (Table 4) ablates knowledge source types."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The paper uses only execution accuracy (pass@1) as the metric: 'By default, we use the execution accuracy (pass@1) as the metric throughout the paper.' Section 4.4 also uses pass@t but this is the same metric at different token budgets."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. All evaluation is automated via execution accuracy. Human evaluation could assess code quality beyond pass/fail."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "EVOR-BENCH is a newly compiled benchmark with manually written ground truth solutions. The benchmark problems are separate from any development data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-dataset breakdowns across all four datasets (Scipy-M, Tensorflow-M, Ring, Pony) rather than just averages."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No error analysis or qualitative discussion of failure cases is presented. The paper does not show where EVOR fails or discuss specific failure modes."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that web search 'only marginally improves the results' (Section 3.2) and achieves less than 1% improvement when used alone without query evolution (Section 4.2, Table 4)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims '2 to 4 times execution accuracy' which is supported by Table 2 (e.g., EVOR 35.3% vs Reflexion 13.9% with ChatGPT). Claims about flexibility and combination with other methods are supported in Section 4.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies in Sections 4.1 and 4.2, where individual components are systematically added/removed."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'Evolving Retrieval for Code Generation' is broad, but the evaluation is limited to 4 specific datasets (2 modified Python libraries, 2 long-tail languages) and 2 models. The paper does not explicitly bound its generalization claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the results are explicitly discussed. For example, whether the gains come primarily from additional LLM calls/tokens rather than from the retrieval evolution itself is addressed only indirectly, through the token-budget analysis in Section 4.4."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions are stated: 'gpt-3.5-turbo-1106' (footnote 5), 'CodeLlama-34b-Instruct-hf' (footnote 6), 'GPT-4-1106' and 'Claude-3-opus' for SWE-bench experiments."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompting approaches but does not provide full prompt text. Query evolution and code generation prompts are described in natural language without the actual text used."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Reported settings include a maximum of 30 iterations, a termination condition (same feedback for 3 consecutive iterations), a maximum context length of 4096, and INSTRUCTOR-xl as the retrieval model (Section 3.2). However, temperature/sampling settings are not explicitly stated."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Algorithm 1 provides a detailed formal description of the EVOR pipeline including query evolution, retrieval, generation, execution feedback, and knowledge base evolution steps."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2.4 describes dataset curation: modified Python libraries to simulate updates, adapted DS-1000 problems, selected Ring and Pony for long-tail languages, manual ground truth annotation. More details in Appendix A."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 'Limitations' notes that the iterative process leads to longer latency and increased energy consumption."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations section is generic, mentioning only latency and energy consumption concerns. No specific threats to validity for the experimental results are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show or what settings/populations are excluded from the claims."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The project page (https://arks-codegen.github.io) claims to make model, code, and data available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2.4 describes how datasets were compiled: modifying Scipy/Tensorflow libraries, adapting DS-1000 problems, selecting LeetCode problems for Ring/Pony, manual ground truth writing."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data is from standard benchmarks and programmatically constructed datasets."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Table 1 provides dataset statistics. The curation process is described in Section 2.4 with further details in Appendix A, including how problems were adapted and solutions annotated."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section mentioning grants or sponsors is visible in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: University of Hong Kong, Fudan University, Sea AI Lab."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "Training cutoff dates for gpt-3.5-turbo-1106 and CodeLlama are not explicitly stated, though the paper addresses contamination concerns through benchmark design."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 2.4 explicitly addresses this: 'We do not use a real library update version because it is potentially exposed to LLM training data.' The benchmark is designed to avoid overlap by modifying libraries and using long-tail languages excluded from StarCoder training."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The entire benchmark design (EVOR-BENCH) is motivated by avoiding contamination: modified libraries simulate unseen updates, and Ring/Pony are chosen because they 'have little public data and are excluded from the StarCoder training set' (Section 2.4)."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section 4.4 analyzes token consumption at different budgets (4k-24k tokens) and compares EVOR's efficiency against DocPrompting. Figure 3 shows pass@t at different token levels."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total API spend, GPU hours, or total computational budget is reported. The token analysis in Section 4.4 shows per-example token budgets but not total experiment cost."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "EVOR achieves 2-4x execution accuracy compared to existing methods like Reflexion and DocPrompting",
    286       "evidence": "Table 2: EVOR achieves 35.3% avg with ChatGPT vs Reflexion 13.9% (~2.5x) and DocPrompting 19.2% (~1.8x). With CodeLlama, EVOR 32.2% vs DocPrompting 16.0% (2x).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Synchronous evolution of both queries and knowledge is consistently better than evolving either alone",
    291       "evidence": "Table 3 (Section 4.1): EVOR (evolve both) achieves 35.3% vs evolve query only 28.4% vs evolve knowledge only 23.8% with ChatGPT. Similar pattern with CodeLlama.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "EVOR can be combined with existing methods (MPSC, ExeDec, Reflexion) for further improvement",
    296       "evidence": "Table 2: EVOR + Reflexion achieves 37.9% vs EVOR alone 35.3% with ChatGPT, up to 2.6% additional gain on average.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "EVOR is a more effective approach to using tokens, achieving superior results at all token budgets",
    301       "evidence": "Figure 3 (Section 4.4): EVOR achieves higher pass@t than DocPrompting at all token consumption levels from 4k to 24k for both models.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Diverse knowledge sources enhance RACG performance, with larger improvements under evolution",
    306       "evidence": "Table 4 (Section 4.2): CodeLlama with Exec+Code+Doc achieves 32.2% with evolution vs 20.4% without. Adding documentation to Exec+Code improves by 6.9% with evolution but only 4.5% without.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "EVOR, a retrieval-augmented code generation pipeline that synchronously evolves both queries and knowledge bases, achieves 2-4x execution accuracy over existing methods on a new benchmark (EVOR-BENCH) covering updated libraries and long-tail programming languages. Ablation studies show that evolving both queries and knowledge is consistently superior to evolving either alone. The approach is composable with existing methods like Reflexion and SWE-agent for additional gains. Diverse knowledge sources (documentation, code snippets, execution feedback) provide complementary benefits that are amplified by the evolution mechanism.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance or uncertainty quantification",
    315       "detail": "All results are single-run point estimates with no error bars, standard deviations, or confidence intervals. Given that LLM outputs are stochastic, results could vary across runs."
    316     },
    317     {
    318       "flag": "No significance tests despite repeated claims of 'significant' improvement",
    319       "detail": "The paper uses the word 'significantly' multiple times to describe improvements but provides no statistical significance tests."
    320     },
    321     {
    322       "flag": "Small dataset sizes for some benchmarks",
    323       "detail": "Tensorflow-M has only 45 problems. Performance differences of a few percentage points on such small datasets may not be meaningful."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    329       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    330       "year": 2023,
    331       "arxiv_id": "2310.06770",
    332       "relevance": "Key benchmark for evaluating LLM agents on real-world software engineering tasks, used in EVOR's repo-level evaluation."
    333     },
    334     {
    335       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    336       "authors": ["Noah Shinn"],
    337       "year": 2024,
    338       "relevance": "Iterative LLM self-improvement framework used as baseline; represents agent-based code generation approaches."
    339     },
    340     {
    341       "title": "Evaluating Large Language Models Trained on Code",
    342       "authors": ["Mark Chen"],
    343       "year": 2021,
    344       "arxiv_id": "2107.03374",
    345       "relevance": "Introduced HumanEval benchmark for code generation, foundational for LLM code evaluation."
    346     },
    347     {
    348       "title": "DocPrompting: Generating Code by Retrieving the Docs",
    349       "authors": ["Shuyan Zhou"],
    350       "year": 2022,
    351       "relevance": "Retrieval-augmented code generation using documentation; key baseline and predecessor to EVOR."
    352     },
    353     {
    354       "title": "Teaching Large Language Models to Self-Debug",
    355       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    356       "year": 2023,
    357       "arxiv_id": "2304.05128",
    358       "relevance": "Uses execution feedback to refine LLM code generation, related approach to EVOR's execution feedback component."
    359     },
    360     {
    361       "title": "StarCoder: May the Source Be with You!",
    362       "authors": ["Raymond Li"],
    363       "year": 2023,
    364       "arxiv_id": "2305.06161",
    365       "relevance": "Open-source code LLM; its training set exclusions informed EVOR-BENCH's choice of long-tail languages."
    366     },
    367     {
    368       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    369       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    370       "year": 2023,
    371       "arxiv_id": "2305.01210",
    372       "relevance": "Rigorous evaluation methodology for LLM code generation quality."
    373     },
    374     {
    375       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    376       "authors": ["Yuhang Lai"],
    377       "year": 2023,
    378       "relevance": "Source benchmark from which EVOR-BENCH's Scipy and Tensorflow problems were adapted."
    379     },
    380     {
    381       "title": "Active Retrieval Augmented Generation",
    382       "authors": ["Zhengbao Jiang"],
    383       "year": 2023,
    384       "relevance": "Active RAG approach that iteratively decides when to retrieve; related to EVOR's evolving retrieval paradigm."
    385     }
    386   ]
    387 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs