scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31361B)
      1 {
      2   "paper": {
      3     "title": "RepoGenReflex: Enhancing Repository-Level Code Completion with Verbal Reinforcement and Retrieval-Augmented Generation",
      4     "authors": [
      5       "Jicheng Wang",
      6       "Yifeng He",
      7       "Hao Chen"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2409.13122",
     12     "doi": "10.48550/arXiv.2409.13122"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval"
     21   ],
     22   "key_findings": "RepoGenReflex proposes an iterative RAG framework enhanced with verbal reinforcement learning for repository-level code completion, using a Reflector component to provide feedback between iterations. The paper finds that a general-purpose model (Meta-Llama-3-8B) outperforms a code-specialized model (CodeGen-Mono-6B) as the Reflector, and claims the full framework outperforms CodeT5+ 2B, CodeLlama-7b-hf, StarCoder, and CodeGemma on both RepoEval and their new RepoGenEval benchmark. However, the framework's Evaluator component uses ground-truth EM/ES scores during the iterative loop — oracle access that baselines do not receive — making the comparison fundamentally unfair.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The abstract states 'Our source code and benchmark will be publicly available' — future tense promise, not an actual release. No repository URL is provided anywhere in the paper."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The RepoGenEval benchmark is described but not released. The promise is future tense ('will be publicly available'). The underlying repos are public GitHub projects, but the curated 1600-sample benchmark is not available. RepoEval is a pre-existing public benchmark they did not create."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No environment specifications, requirements files, hardware descriptions, or dependency lists are provided anywhere in the paper."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No reproduction instructions are provided. There is no README, no scripts, and no step-by-step guide to replicate the experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All tables (Tables 2-5) report only point estimates for EM and ES. No confidence intervals, error bars, or uncertainty measures are provided."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims 'significant improvements' and 'superior performance' but provides no statistical significance tests (no p-values, t-tests, or any other test). All comparisons are based solely on comparing raw numbers."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Tables 2-5 report absolute EM and ES scores for all models and baselines, allowing direct computation of improvement magnitudes. For example, RepoGenReflex achieves 0.480 EM vs CodeGemma's 0.463 on RepoEval (Table 3), providing enough context to assess the magnitude of differences."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The benchmark uses 200 randomly selected lines per repository (1600 total) with no justification for why this sample size is adequate. No power analysis or discussion of statistical power."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or spread measures are reported anywhere."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper compares against four baselines: CodeT5+ 2B, CodeLlama-7b-hf, StarCoder, and CodeGemma (Section 4.3, Tables 3-4)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include CodeGemma (2024), StarCoder (2023), CodeLlama (2023), and CodeT5+ (2023). These are reasonably contemporary models for a 2024 paper."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 4.4 presents an ablation study with three configurations: Full Model, without Reflector+Experience, and without Evaluator (Table 5)."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Two metrics are used throughout: Exact Match (EM) and Edit Similarity (ES), as defined in Section 3.1."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation is included. All evaluation is automated via EM and ES metrics. Human evaluation could have assessed code quality, usefulness, or correctness in context."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The Evaluator component computes EM/ES against ground truth during the iterative loop (Algorithm 1, lines 7-16). The ground truth labels are used to guide the optimization process (stopping criteria and Reflector feedback), meaning test data leaks into the generation pipeline. There is no separation between evaluation data and optimization data."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Per-repository breakdowns are provided in all results tables (Tables 2-5), showing performance on each of the 8 repositories in both RepoEval and RepoGenEval."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No failure cases are discussed. The paper presents only aggregate metrics without any qualitative analysis of where the framework fails or produces poor completions."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "Every experiment shows improvement. The paper reports no configurations that failed, no approaches that were tried and abandoned, and no settings where the framework underperformed baselines."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract claims 'significant improvements' but no statistical significance tests are performed. The abstract also claims 'superior performance and effectiveness across standard code completion tasks' — the improvements over baselines are small (1-2 percentage points of EM) and could be entirely attributable to oracle access via the Evaluator, not the framework's design."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The ablation study (Section 4.4) makes causal claims about component contributions ('disabling Reflector and Experience led to a significant decline'). While the ablation design is structurally adequate, the Evaluator's oracle access to ground truth confounds all causal claims — improvements may be due to label leakage rather than component quality."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Repository-Level Code Completion' generally, but all experiments are on Python-only repositories. The abstract claims 'robustness and adaptability' without qualification. Results are on only 16 repos total (8 RepoEval + 8 RepoGenEval), all Python."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations are discussed for the observed improvements. The most obvious alternative — that the framework's advantage stems from oracle access to ground truth EM/ES during iteration rather than from the retrieval-generation design — is never acknowledged."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures EM and ES and claims these reflect code completion accuracy and quality. The metrics directly measure what is claimed (exact match and edit similarity to ground truth), so there is no significant proxy gap."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Specific model names with sizes are provided: CodeGen-Mono-6B, Meta-Llama-3-8B, CodeT5+ 2B, CodeLlama-7b-hf, StarCoder, and CodeGemma 7B. These are identifiable model checkpoints."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Figures 2 and 3 show schematic illustrations of the prompt construction and example feedback, but the actual prompt templates and fill values used in experiments are not provided. The reader cannot reconstruct the exact prompts sent to the models."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No LLM generation hyperparameters are reported (temperature, top-p, max tokens). Algorithm 1 specifies framework parameters (max_iter=10, no_improvement_threshold=3), but model inference settings are absent."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The iterative framework (Retriever, Actor, Evaluator, Reflector, Experience) is described in detail in Section 3 with a workflow diagram (Figure 1), component descriptions, mathematical formulations, and Algorithm 1."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.1 documents the RepoGenEval construction criteria: non-forked, open-source, 700-40000 stars, >90% Python, explicit unit tests, created no earlier than 2023. Line selection: 200 random lines per repo, unique, non-comment (1600 total samples)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "There is no limitations section, threats-to-validity section, or any substantive discussion of the work's limitations anywhere in the paper."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No threats to validity are discussed. The paper does not mention any specific concerns about its methodology, evaluation design, or generalizability."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No scope boundaries are stated. The paper does not explicitly discuss what the results do NOT show or what settings/populations are excluded from its claims."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw data (model outputs, per-sample results, benchmark files) are available for verification. Only aggregate metrics in tables are provided."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.1 describes the data collection for RepoGenEval: GitHub REST API and manual keyword searches for Python projects meeting specific criteria (stars, Python ratio, unit tests, creation date). Table 1 lists the 8 selected repositories with metadata."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data sources are public GitHub repositories and a pre-existing benchmark (RepoEval)."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "While repo selection criteria are documented, the pipeline from selected repos to final benchmark samples is sparse. How random line selection was implemented, how 'unique' was defined, how lines were extracted for completion tasks, and how ground truth was constructed are not documented."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source, grants, or sponsorship are mentioned anywhere in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly stated: all three authors are from University of California, Davis. They do not evaluate a commercial product from their own institution."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence cannot be verified. Without a funding disclosure statement, we cannot assess whether any funder had a stake in the results."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No training data cutoff dates are stated for any of the models used (CodeGen-Mono-6B, Meta-Llama-3-8B, CodeT5+ 2B, CodeLlama-7b-hf, StarCoder, CodeGemma)."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the test repositories or code lines appeared in any model's training data. The RepoEval benchmark repos are from GitHub and could be in training sets of models trained on GitHub data."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "RepoEval was published in 2023 and uses GitHub repos; models like Meta-Llama-3 and CodeGemma trained on internet data may have seen it. RepoGenEval uses GitHub repos created in 2023, which could also be in training sets. No contamination analysis is performed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. It is a benchmark evaluation of code completion models."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference cost or latency is reported. The iterative process runs up to 10 iterations per example, each involving Retriever, Actor, Evaluator, and Reflector steps — up to 40 component invocations per sample — but cost is never mentioned."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No computational budget is stated. No GPU hours, hardware specifications, or total experiment time are provided."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No multi-seed results are reported. All results appear to be from single runs without any sensitivity analysis."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged across multiple attempts."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search budget is reported. Framework parameters (max_iter=10, no_improvement_threshold=3) appear chosen without justification, and no search over alternatives is documented."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Section 4.2 transparently compares two Reflector model choices (CodeGen-Mono-6B vs Meta-Llama-3-8B) and reports results for both (Table 2), justifying the selection of Meta-Llama-3-8B based on superior performance."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite making many cross-model and cross-repository comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors evaluate their own framework against baselines without acknowledging the self-comparison bias. Baselines are used as-is (from their JSONL files) while RepoGenReflex uses the full iterative pipeline; Section 4.3 notes this asymmetry but does not discuss it as a source of bias."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "RepoGenReflex uses up to 10 iterations with multiple LLM calls per iteration, while baselines use single-pass inference. This massive compute difference (potentially 10-40x more LLM calls) is never discussed or controlled for."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether EM/ES on random single-line completions actually measures useful code completion ability. No analysis of construct validity for either RepoEval or RepoGenEval."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "RepoGenReflex wraps CodeGen-Mono-6B in an elaborate iterative scaffold (Retriever + Actor + Evaluator + Reflector + Experience) while baselines are standalone models. The comparison attributes differences to the framework but does not isolate the scaffold effect from the model effect. The baselines 'only utilized the JSONL file of our benchmark' while 'our framework leveraged all elements in the benchmark and the iterative loop' (Section 4.3)."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the benchmark repositories (some from 2023) could have been in model training data. RepoGenEval uses repos created no earlier than 2023, which may help for some models, but this is not discussed as a leakage mitigation strategy."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The Evaluator computes EM/ES against ground truth during the iterative loop and feeds these scores to the Reflector (Figure 2, Algorithm 1). This gives the framework oracle access to how close its output is to the answer — a direct form of label leakage during inference that baselines do not receive. This is never acknowledged."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether code in test repositories shares patterns, dependencies, or near-duplicates with other data used in the process."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference tests, n-gram overlap analysis, or temporal splits."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Meta-Llama-3-8B is a better Reflector than CodeGen-Mono-6B for the RepoGenReflex framework",
    374       "evidence": "Table 2 shows Meta-Llama-3-8B consistently achieves higher EM and ES scores across all 8 RepoGenEval repositories (e.g., Auto-GPT-ZH EM 0.325 vs 0.280, ES 0.483 vs 0.438).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "RepoGenReflex outperforms SOTA models (CodeT5+ 2B, CodeLlama-7b-hf, StarCoder, CodeGemma) on both RepoEval and RepoGenEval benchmarks",
    379       "evidence": "Tables 3-4 show RepoGenReflex achieving the highest EM and ES scores across repositories on both benchmarks. However, the comparison is fundamentally unfair: RepoGenReflex uses ground-truth EM/ES scores during its iterative loop (oracle access) while baselines perform single-pass inference.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Each component (Reflector, Experience, Evaluator) is critical for the framework's performance",
    384       "evidence": "Ablation study in Table 5 shows performance degrades when Reflector+Experience are removed (EM drops ~0.12) and when Evaluator is removed (EM drops ~0.04). However, the ablation data for 'Model A' shows EM values decreasing in perfect 0.005 arithmetic steps across repos (0.365, 0.360, 0.355, 0.350, 0.345, 0.340), which is inconsistent with real experimental results.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "RepoGenReflex achieves 'significant improvements' in code completion accuracy",
    389       "evidence": "The improvements over the best baseline (CodeGemma) are small: ~1.7 percentage points of EM on RepoEval (Table 3), ~1.4 percentage points on RepoGenEval (Table 4). No statistical significance tests are performed. The word 'significant' is used without statistical backing.",
    390       "supported": "weak"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Oracle access to ground truth during inference",
    396       "detail": "The Evaluator component computes EM/ES against the ground truth answer during the iterative optimization loop (Algorithm 1, lines 7-16) and feeds these scores to the Reflector (Figure 2). This gives the framework direct knowledge of how close its output is to the correct answer — information that baselines do not receive. This makes the comparison fundamentally unfair and means the framework cannot work in practice (where the ground truth is unknown). The entire performance advantage could be attributed to this oracle signal rather than the retrieval-generation design."
    397     },
    398     {
    399       "flag": "Suspiciously regular experimental results",
    400       "detail": "In Table 5 (ablation), Model A EM values decrease in perfect 0.005 arithmetic steps across the first 6 repositories: 0.365, 0.360, 0.355, 0.350, 0.345, 0.340. Model A ES shows the identical pattern: 0.665, 0.660, 0.655, 0.650, 0.645, 0.640. Real experimental results on different repositories do not produce perfect arithmetic progressions. In Table 3, all models show remarkably tight clustering across 8 different repos (e.g., RepoGenReflex EM ranges only 0.476-0.481). These patterns are inconsistent with genuine experimental measurement."
    401     },
    402     {
    403       "flag": "Unfair baseline comparison (scaffold confound)",
    404       "detail": "RepoGenReflex uses an iterative pipeline with up to 10 iterations of Retriever + Actor + Evaluator + Reflector calls per sample, while baselines perform single-pass inference. The paper acknowledges this asymmetry ('Unlike the SOTA models, which only utilized the JSONL file of our benchmark, our framework leveraged all elements') but does not control for the massive difference in compute and oracle access."
    405     },
    406     {
    407       "flag": "No error bars or statistical tests",
    408       "detail": "All results are point estimates from apparently single runs. Claims of 'significant improvements' and 'superior performance' are made without any statistical tests, confidence intervals, or variance measures across the entire paper."
    409     },
    410     {
    411       "flag": "No limitations discussion",
    412       "detail": "The paper contains no limitations section, no threats to validity, and no discussion of the framework's weaknesses. The most critical limitation — that the Evaluator requires ground truth and thus the framework cannot function in real deployment — is never acknowledged."
    413     },
    414     {
    415       "flag": "No cost analysis despite expensive iterative design",
    416       "detail": "The iterative process involves up to 10 iterations per sample, each requiring multiple component calls (Retriever, Actor, Evaluator, Reflector). This could mean up to 40 component invocations per completion versus a single model call for baselines. No cost, latency, or compute comparison is provided."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    422       "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang", "Jin Liu", "Daoguang Zan", "Yi Mao", "Jian-Guang Lou", "Weizhu Chen"],
    423       "year": 2023,
    424       "arxiv_id": "2303.12570",
    425       "relevance": "Direct predecessor for repository-level code completion using iterative RAG; RepoGenReflex extends this approach with verbal reinforcement learning."
    426     },
    427     {
    428       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    429       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    430       "year": 2024,
    431       "relevance": "Foundational work on verbal reinforcement learning (VRL) that RepoGenReflex adapts for code completion; shows iterative LLM self-refinement without weight updates."
    432     },
    433     {
    434       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    435       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    436       "year": 2023,
    437       "arxiv_id": "2310.06770",
    438       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks; relevant to the survey's scope of agentic AI evaluation."
    439     },
    440     {
    441       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    442       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal"],
    443       "year": 2020,
    444       "relevance": "Foundational RAG paper that RepoGenReflex builds upon; establishes the retrieval-augmented generation paradigm used across LLM applications."
    445     },
    446     {
    447       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    448       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi", "Lifu Tu", "Huan Wang", "Yingbo Zhou", "Silvio Savarese", "Caiming Xiong"],
    449       "year": 2022,
    450       "arxiv_id": "2203.13474",
    451       "relevance": "CodeGen-Mono-6B is the primary Actor model in the RepoGenReflex framework; key code generation model in the LLM programming space."
    452     },
    453     {
    454       "title": "RLCoder: Reinforcement Learning for Repository-Level Code Completion",
    455       "authors": ["Yanlin Wang", "Yanxian Wang", "Daya Guo", "Jiachi Chen", "Ruikai Zhang", "Yuchi Ma", "Zibin Zheng"],
    456       "year": 2024,
    457       "arxiv_id": "2407.19487",
    458       "relevance": "Concurrent work on reinforcement learning for repository-level code completion; directly relevant to the survey scope of LLM-based code generation."
    459     },
    460     {
    461       "title": "REPOFORMER: Selective Retrieval for Repository-Level Code Completion",
    462       "authors": ["Di Wu", "Wasi Uddin Ahmad", "Dejiao Zhang", "Murali Krishna Ramanathan", "Xiaofei Ma"],
    463       "year": 2024,
    464       "arxiv_id": "2403.10059",
    465       "relevance": "Addresses selective retrieval for repository-level code completion using RAG; directly comparable approach to RepoGenReflex."
    466     },
    467     {
    468       "title": "Competition-Level Code Generation with AlphaCode",
    469       "authors": ["Yujia Li", "David Choi", "Junyoung Chung", "Nate Kushman", "Julian Schrittwieser"],
    470       "year": 2022,
    471       "relevance": "Major code generation system from DeepMind; demonstrates large-scale code generation capabilities relevant to the LLM programming survey."
    472     },
    473     {
    474       "title": "Repository-Level Prompt Generation for Large Language Models of Code",
    475       "authors": ["Disha Shrivastava", "Hugo Larochelle", "Daniel Tarlow"],
    476       "year": 2023,
    477       "relevance": "Proposes prompt generation methods for repository-level code completion; directly relevant to the repository-level code completion research area."
    478     },
    479     {
    480       "title": "LongCoder: A Long-Range Pre-Trained Language Model for Code Completion",
    481       "authors": ["Daya Guo", "Canwen Xu", "Nan Duan", "Jian Yin", "Julian McAuley"],
    482       "year": 2023,
    483       "relevance": "Addresses the long-range context challenge in code completion; relevant to understanding LLM approaches to repository-level code understanding."
    484     },
    485     {
    486       "title": "Cognitive Architectures for Language Agents",
    487       "authors": ["Theodore R. Sumers", "Shunyu Yao", "Karthik Narasimhan", "Thomas L. Griffiths"],
    488       "year": 2023,
    489       "arxiv_id": "2309.02427",
    490       "relevance": "Foundational work on cognitive architectures for LLM-based agents; relevant to the survey's coverage of agentic AI frameworks."
    491     }
    492   ],
    493   "engagement_factors": {
    494     "practical_relevance": {
    495       "score": 1,
    496       "justification": "The concept of iterative refinement for code completion is practically relevant, but the framework requires ground truth during inference, making it unusable in practice, and no code is released."
    497     },
    498     "surprise_contrarian": {
    499       "score": 0,
    500       "justification": "Confirms the expected finding that iterative refinement with feedback improves model outputs; does not challenge any conventional wisdom."
    501     },
    502     "fear_safety": {
    503       "score": 0,
    504       "justification": "No safety or security implications; this is a code completion accuracy paper."
    505     },
    506     "drama_conflict": {
    507       "score": 0,
    508       "justification": "No controversy or conflict with existing work or claims."
    509     },
    510     "demo_ability": {
    511       "score": 0,
    512       "justification": "No code, demo, or benchmark released despite promises; nothing for anyone to try."
    513     },
    514     "brand_recognition": {
    515       "score": 0,
    516       "justification": "From UC Davis, not a major AI lab; does not evaluate or involve any high-profile product."
    517     }
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs