ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29410B)


      1 {
      2   "paper": {
      3     "title": "Optimizing Code Runtime Performance through Context-Aware Retrieval-Augmented Generation",
      4     "authors": [
      5       "Manish Acharya",
      6       "Yifan Zhang",
      7       "Kevin Leach",
      8       "Yu Huang"
      9     ],
     10     "year": 2025,
     11     "venue": "IEEE International Conference on Program Comprehension",
     12     "arxiv_id": "2501.16692",
     13     "doi": "10.1109/ICPC66645.2025.00028"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "AUTOPATCH combines CFG diff analysis with RAG to guide GPT-4o in optimizing C++ code, achieving a 7.3% execution time improvement over zero-shot GPT-4o on 116 executable programs from IBM Project CodeNet. Naive retrieval using source code embeddings without CFG knowledge actually hurts performance by 27.3%, suggesting CFG-based context selection is critical. Lexical similarity improvements are modest (4–14%), and the entire evaluation lacks statistical significance testing or variance reporting.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The Data Availability section provides a GitHub URL (https://github.com/manishacharya60/rag-optimization) and describes its contents including source code, preprocessing scripts, and implementation modules."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The study uses the publicly available IBM Project CodeNet dataset. The repository also includes preprocessed C++ code pairs with embeddings used for retrieval."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper specifies hardware (Intel Xeon Gold 6330N CPU) and mentions Clang, GPT-4o, and CodeBERT, but provides no software dependency specifications, library versions, requirements.txt, or environment setup details."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The Data Availability section states the repository includes 'step-by-step guidelines to replicate or extend the experiments' along with preprocessing scripts and implementation modules."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables III and IV report only point estimates (e.g., 7.3% improvement, 0.3815s average time). No confidence intervals, error bars, or uncertainty measures are provided anywhere."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims AUTOPATCH outperforms baselines but provides no statistical significance tests (no p-values, t-tests, or any hypothesis testing). Differences are compared by raw numbers only."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Percentage improvements are reported with baseline context: 7.3% execution time improvement (Table IV), +4.41% LO, +13.52% EDS, +9.91% TO (Table III), with absolute values provided for comparison."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The dataset is 1,200 code pairs (1,000 for vector DB, 200 for testing, refined to 116 executable programs) but no justification is given for these sizes. No power analysis or sample size rationale is discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or any spread measure is reported. All results are single-point estimates with no indication of result stability across runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Two baselines are included: Zero-Shot Generation (GPT-4o without context) and Naive Generation (retrieval using source code embeddings without CFG analysis). Section III.B describes both."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both baselines use GPT-4o, which is a current state-of-the-art model. However, no comparison with other dedicated code optimization tools or frameworks (e.g., PIE, other LLM optimization pipelines) is included."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The three-way comparison (Zero-Shot, Naive, Context) partially shows the effect of retrieval and CFG analysis, but this is not presented as a systematic ablation. Individual components (CFG diff, RAG retrieval, prompt structure) are not isolated and tested separately."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses two categories of metrics: lexical similarity (Line Overlap, Edit Distance Similarity, Token Overlap) and execution time, providing complementary perspectives."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of the generated optimized code is performed. All evaluation is automated (lexical metrics and execution time measurement)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table I describes an 80/20 split: 1,000 pairs for the vector database and 200 pairs for testing. The test set is separate from the retrieval corpus."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 2 provides execution time breakdowns across five optimization types (Code Refactoring, Memory Optimization, Performance Enhancement, Algorithmic Simplification, Loop Optimization). Table II describes the distribution."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The paper briefly mentions 'a slight underperformance in the performance enhancement category' but provides no qualitative error analysis, no examples of failures, and no discussion of where the approach breaks down."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "Naive Generation performs 27.3% worse than Zero-Shot (Table IV), which is a striking negative result, but it is not discussed or analyzed. The paper also does not report failed configurations or approaches that were tried and abandoned."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims '7.3% improvement in execution efficiency over GPT-4o across common generated executable code,' which is supported by Table IV showing Context Generation at 0.3815s vs Zero-Shot at 0.4115s."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper claims AUTOPATCH's CFG-based approach improves performance. The controlled comparison holds the prompt structure constant across conditions (Section III.B: 'All methods share the same prompt structure'), varying only the retrieval method. This is adequate for the causal claims made."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Optimizing Code Runtime Performance through Context-Aware Retrieval-Augmented Generation' is broader than what was tested. The study evaluates only C++ code from IBM CodeNet, but the framing suggests general code optimization. The threats section partially bounds this but the title and abstract do not."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section VI (Threats to Validity) discusses specific alternatives: the retrieval process configuration could influence results, hardware features or compiler behaviors could affect outcomes, and different configurations or prompting strategies might yield different improvements."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures execution time directly for runtime performance claims, and explicitly notes that lexical similarity 'does not guarantee logical correctness' (Section IV.A). The main claim matches the measurement granularity."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper uses 'GPT-4o' throughout without specifying a snapshot date, API version, or model version identifier. 'GPT-4o' is a marketing name whose behavior changes over time."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section II.C describes prompt components abstractly (CFG differences, optimization rationales, retrieved example) but the actual prompt text is never provided. The reader cannot reconstruct the exact prompts sent to GPT-4o."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported anywhere in the paper. No embedding hyperparameters for CodeBERT or retrieval similarity thresholds are specified."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "AUTOPATCH is a single-pass RAG pipeline (retrieve → prompt → generate), not an agentic scaffold with retry logic, tool use, or feedback loops."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section III.D describes preprocessing: C++ code converted to CFGs using Clang, headers standardized, dependencies resolved, unsupported attributes removed. The test set filtering from 200 to 116 is explained as excluding non-executable or anomalous code."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section VI 'Threats to Validity' provides a dedicated multi-paragraph discussion of limitations."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The threats section identifies specific concerns: C++ programs from CodeNet 'may not represent the full diversity of real-world codebases, languages, or hardware,' the RAG retrieval configuration could influence results, and only a few baselines were compared."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section VI explicitly states what results do not cover: 'these do not cover maintainability, scalability, or project-specific constraints' and notes findings are limited to C++ programs from IBM Project CodeNet."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The Data Availability section states the repository includes preprocessed datasets with original and optimized C++ code pairs, metadata, and code embeddings. The underlying IBM CodeNet dataset is also publicly available."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper describes using 1,200 C++ code pairs sampled from IBM Project CodeNet (Table I), split 80/20 for vector DB and testing, with the test set refined to 116 executable programs."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. The data source is IBM Project CodeNet, a standard public benchmark."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The paper describes sampling 1,200 pairs from CodeNet and splitting 80/20, but the filtering from 200 test pairs to 116 executable programs lacks detail — no counts per exclusion type, and 'anomalous code' is undefined. The sampling method from the larger CodeNet dataset is also not described."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Acknowledgments section discloses funding from The SyBBURE Searle Undergraduate Research Program and NSF Grant No. CCF-2211429."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors are listed as affiliated with Vanderbilt University. No conflicts exist with the evaluated products (GPT-4o is from OpenAI; CodeBERT from Microsoft)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "NSF and the SyBBURE Searle Undergraduate Research Program are independent academic funders with no financial stake in the experimental outcomes."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper uses GPT-4o but does not state its training data cutoff date. This is relevant because the IBM CodeNet dataset has been public since 2021."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "IBM Project CodeNet was published in 2021. GPT-4o likely trained on data including CodeNet solutions, but no analysis of potential train/test overlap is provided."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "CodeNet has been publicly available since 2021 and could be in GPT-4o's training data. The paper does not discuss contamination risk at all, despite this being a significant threat to the validity of the execution time comparisons."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No API costs, token counts, or per-example latency are reported despite the method requiring GPT-4o API calls for test case generation, explanation generation, and code optimization."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Hardware is listed (Intel Xeon Gold 6330N) but total compute time, GPU hours, API costs, or wall-clock time for the experiments are not stated."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Results appear to be from single runs. No mention of multiple random seeds or analysis of result sensitivity to randomness in GPT-4o generation or retrieval."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single generation or averaged across multiple attempts."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search budget is reported. The paper mentions 'Preliminary experiments retrieving two or three examples did not yield additional gains' but provides no data or systematic search description."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The choice to retrieve exactly one example is justified only by a passing mention that 'two or three examples did not yield additional gains.' No data is shown to support this claim, and no validation-based selection process is described."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors designed all three methods (Zero-Shot, Naive, Context) and evaluated them against their own implementations. No acknowledgment of author-evaluation bias or independent verification."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "AUTOPATCH requires additional compute for CFG generation (Clang), embedding computation (CodeBERT), vector retrieval, and prompt construction compared to zero-shot. These compute differences are not discussed or compared."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether CodeNet code optimization pairs actually represent real-world optimization tasks, or whether execution time on competitive programming problems generalizes to production code optimization."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "All methods use the same model (GPT-4o) with different prompts. This is not a cross-scaffold model comparison; the prompting strategy IS the variable being tested."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "IBM CodeNet was published in 2021. GPT-4o's training data extends well past this date, meaning it likely saw CodeNet solutions. This temporal leakage is not discussed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The RAG pipeline retrieves a highly similar code pair (including the optimized version) as context, which could provide strong hints. Whether this constitutes feature leakage relative to real-world usage is not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether vector DB examples and test examples might share structural similarities (e.g., same problem types, same solution patterns from competitive programming)."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No contamination checks, temporal splits, or decontamination pipelines are used."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "AUTOPATCH achieves a 7.3% improvement in execution efficiency over GPT-4o zero-shot generation.",
    370       "evidence": "Table IV shows average execution time of 0.3815s for Context Generation vs 0.4115s for Zero-Shot Generation, calculated over 116 executable test programs from IBM CodeNet.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Context-aware generation consistently outperforms baselines in lexical similarity metrics (Line Overlap, Edit Distance Similarity, Token Overlap).",
    375       "evidence": "Table III shows Context Generation improvements of +4.41% LO, +13.52% EDS, and +9.91% TO compared to the better of the two baselines. Absolute values remain modest (8.53% LO, 16.54% EDS, 59.91% TO).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Context-based generation generally achieves lower execution times across optimization types, except for a slight underperformance in performance enhancement.",
    380       "evidence": "Figure 2 shows execution times across five optimization categories. Context Generation is lowest in four of five categories but slightly worse for Performance Enhancement.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Retrieving a single structurally close example is sufficient; retrieving two or three examples did not yield additional gains.",
    385       "evidence": "Section II.C states 'Preliminary experiments retrieving two or three examples did not yield additional gains and often produced longer, less focused prompts.' No supporting data is shown.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Naive retrieval (source code embeddings without CFG knowledge) hurts performance relative to zero-shot.",
    390       "evidence": "Table IV shows Naive Generation at 0.5238s average execution time, which is 27.3% worse than Zero-Shot's 0.4115s. This striking negative result is reported but not analyzed.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No error bars or uncertainty quantification",
    397       "detail": "All results in Tables III and IV are single-point estimates. No statistical tests, confidence intervals, standard deviations, or multiple-run averages are reported. With LLM generation being stochastic, single-run results are unreliable."
    398     },
    399     {
    400       "flag": "Small effective sample size",
    401       "detail": "Only 116 executable programs remain after filtering from 200 test pairs (42% attrition). The 7.3% improvement on 116 programs without significance testing is not a robust finding."
    402     },
    403     {
    404       "flag": "Benchmark contamination risk",
    405       "detail": "IBM CodeNet was published in 2021 and is publicly available. GPT-4o was trained on data through at least 2023. The model may have memorized CodeNet solutions, making the optimization comparison unreliable."
    406     },
    407     {
    408       "flag": "Test cases generated by the evaluated model",
    409       "detail": "Section III.D states 'Execution testcases are generated by GPT-4o' — the same model family being evaluated generates the test inputs used to measure execution correctness, creating a potential bias toward code that GPT-4o can generate valid tests for."
    410     },
    411     {
    412       "flag": "Unexplained negative baseline result",
    413       "detail": "Naive Generation performs 27.3% worse than Zero-Shot (Table IV), meaning naive retrieval actively hurts. This significant finding is reported but never analyzed or explained, raising questions about the robustness of the retrieval approach."
    414     },
    415     {
    416       "flag": "No prompts provided",
    417       "detail": "Despite being a prompting-based approach where prompt design is central to the contribution, the actual prompt text is never shown. Only abstract descriptions of prompt components are given."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Learning performance-improving code edits",
    423       "authors": ["A. Shypula", "A. Madaan", "Y. Zeng", "U. Alon", "J. Gardner", "M. Hashemi", "G. Neubig", "P. Ranganathan", "O. Bastani", "A. Yazdanbakhsh"],
    424       "year": 2023,
    425       "arxiv_id": "2302.07867",
    426       "relevance": "Directly addresses AI-driven code performance optimization, the core topic of this paper."
    427     },
    428     {
    429       "title": "Search-based LLMs for code optimization",
    430       "authors": ["S. Gao", "C. Gao", "W. Gu", "M. Lyu"],
    431       "year": 2024,
    432       "relevance": "LLM-based code optimization approach presented at ICSE, a direct competitor/related method."
    433     },
    434     {
    435       "title": "How efficient is LLM-generated code? A rigorous & high-standard benchmark",
    436       "authors": ["R. Qiu", "W. W. Zeng", "H. Tong", "J. Ezick", "C. Lott"],
    437       "year": 2024,
    438       "arxiv_id": "2406.06647",
    439       "relevance": "Benchmarks LLM-generated code efficiency, directly relevant to evaluating AI code generation quality."
    440     },
    441     {
    442       "title": "CodeNet: A large-scale AI for code dataset for learning a diversity of coding tasks",
    443       "authors": ["R. Puri", "D. S. Kung", "G. Janssen"],
    444       "year": 2021,
    445       "arxiv_id": "2105.12655",
    446       "relevance": "The dataset used in this study; a major benchmark for AI code tasks."
    447     },
    448     {
    449       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    450       "authors": ["Z. Feng", "D. Guo", "D. Tang", "N. Duan"],
    451       "year": 2020,
    452       "arxiv_id": "2002.08155",
    453       "relevance": "Pre-trained code model used for embeddings in this paper; foundational for code understanding with LLMs."
    454     },
    455     {
    456       "title": "Frustrated with code quality issues? LLMs can help!",
    457       "authors": ["N. Wadhwa", "J. Pradhan", "A. Sonwane", "S. P. Sahu"],
    458       "year": 2023,
    459       "arxiv_id": "2309.12938",
    460       "relevance": "Evaluates LLMs for code quality improvement, related to automated code refinement."
    461     },
    462     {
    463       "title": "Using an LLM to help with code understanding",
    464       "authors": ["D. Nam", "A. Macvean", "V. Hellendoorn", "B. Vasilescu", "B. Myers"],
    465       "year": 2024,
    466       "relevance": "Studies LLM-assisted code comprehension, relevant to AI-augmented programming productivity."
    467     },
    468     {
    469       "title": "ARKS: Active retrieval in knowledge soup for code generation",
    470       "authors": ["H. Su", "S. Jiang", "Y. Lai", "H. Wu"],
    471       "year": 2024,
    472       "arxiv_id": "2402.12317",
    473       "relevance": "RAG framework for code generation, directly comparable retrieval-augmented approach."
    474     },
    475     {
    476       "title": "CodeRAG-Bench: Can retrieval augment code generation?",
    477       "authors": ["Z. Z. Wang", "A. Asai", "X. V. Yu", "F. F. Xu", "Y. Xie", "G. Neubig", "D. Fried"],
    478       "year": 2024,
    479       "arxiv_id": "2406.14497",
    480       "relevance": "Benchmark for evaluating RAG in code generation tasks, directly relevant to assessing retrieval-augmented code approaches."
    481     },
    482     {
    483       "title": "Enhancing code translation in language models with few-shot learning via retrieval-augmented generation",
    484       "authors": ["M. Bhattarai", "J. E. Santos", "S. Jones", "A. Biswas", "B. Alexandrov", "D. O'Malley"],
    485       "year": 2024,
    486       "arxiv_id": "2407.19619",
    487       "relevance": "Applies RAG to code translation with few-shot learning, related retrieval-augmented code transformation approach."
    488     },
    489     {
    490       "title": "Measuring the runtime performance of code produced with GitHub Copilot",
    491       "authors": ["D. Erhabor", "S. Udayashankar", "M. Nagappan", "S. Al-Kiswany"],
    492       "year": 2023,
    493       "arxiv_id": "2305.06439",
    494       "relevance": "Evaluates runtime performance of AI-generated code, directly relevant to the code efficiency evaluation methodology."
    495     },
    496     {
    497       "title": "Prompt-based code completion via multi-retrieval augmented generation",
    498       "authors": ["H. Tan", "Q. Luo", "L. Jiang", "Z. Zhan"],
    499       "year": 2024,
    500       "arxiv_id": "2405.07530",
    501       "relevance": "Multi-retrieval RAG approach for code completion, related retrieval-augmented prompting method."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 1,
    507       "justification": "The concept of automated code optimization is useful, but the approach requires CFG analysis tooling, is limited to C++, and the 7.3% improvement is modest."
    508     },
    509     "surprise_contrarian": {
    510       "score": 0,
    511       "justification": "Confirms the expected finding that adding relevant context to LLM prompts improves output quality."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No AI safety, security, or risk concerns raised."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy or conflict with established claims."
    520     },
    521     "demo_ability": {
    522       "score": 2,
    523       "justification": "Code and data are released on GitHub with reproduction instructions, though setup requires Clang and GPT-4o API access."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "Uses GPT-4o (a well-known product), but the authors are from Vanderbilt University, not a major AI lab."
    528     }
    529   }
    530 }

Impressum · Datenschutz