scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28994B)
      1 {
      2   "paper": {
      3     "title": "ReCode: Improving LLM-based Code Repair with Fine-Grained Retrieval-Augmented Generation",
      4     "authors": [
      5       "Yicong Zhao",
      6       "Shisong Chen",
      7       "Jiacheng Zhang",
      8       "Zhixu Li"
      9     ],
     10     "year": 2025,
     11     "venue": "International Conference on Information and Knowledge Management",
     12     "arxiv_id": "2509.02330",
     13     "doi": "10.1145/3746252.3761035"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "ReCode, a retrieval-augmented code repair framework using algorithm-aware retrieval and dual-view encoding, consistently outperforms best-of-N and self-repair baselines across 6 LLMs on both in-distribution (RACodeBench) and out-of-distribution competitive programming datasets. The method matches baseline repair accuracy at roughly half the inference budget (N=4 vs N=8), and on out-of-distribution tasks it reaches the same performance thresholds with roughly 3-4x fewer LLM calls. Encoding code and text separately (dual encoding) yields improvements of 1-2 percentage points in test pass rate over unified encoding.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The method and benchmark are described but no implementation is released."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "RACodeBench is described as a key contribution but no download link, repository URL, or data archive is provided. The OOD evaluation datasets (AtCoder, CodeChef, etc.) are public platforms, but the curated benchmark itself is not released."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications are provided — no requirements.txt, Dockerfile, library versions, or hardware details."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are included. The method is described conceptually but no runnable scripts, commands, or README with reproduction steps are provided."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Tables 1-3 and Figures 3-6 are reported as point estimates with no confidence intervals, error bars, or uncertainty quantification."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims ReCode 'consistently outperforms' and 'achieves superior performance' over baselines by comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports percentage improvements with baseline context, e.g., 'Best-of-N increases by 6.94% (from 23.34 to 24.96)' and 'our retrieval-augmented approach improves significantly by 14.42% (from 26.07 to 29.83)' in Section 4.2."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The size of RACodeBench is not explicitly stated in the paper, and no justification or power analysis is provided for why the benchmark size is sufficient to support the claims made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Two baselines are compared: best-of-N sampling and self-repair, both described in Section 4.1 and compared in Table 2 and Figures 3-4."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Best-of-N and self-repair are contemporary, commonly used inference-time strategies for code repair. The paper cites recent work (2023-2024) for these approaches."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 1 ablates unified vs. dual encoding. Figure 5 ablates relevant vs. random example selection. Section 4.3 explicitly frames this as an ablation study."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Two metrics are used: test pass rate (Eq. 4) and strict accuracy (Eq. 5), both reported in Table 2 and Figures 3-4."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is performed. All evaluation is fully automated via test case execution (pass/fail)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper explicitly states 'RACodeBench implements strict partitioning between the benchmark dataset and the retrieval knowledge base' (Section 3.4). OOD evaluation on 6 separate competitive programming datasets further ensures held-out evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figures 3 and 4 provide per-dataset breakdowns across 6 out-of-distribution competitive programming datasets (AtCoder, CodeChef, HackerRank, HackerEarth, GeeksforGeeks, Aizu)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Figure 7 shows a qualitative success case. No failure cases are discussed — there is no analysis of when or why ReCode fails, or what types of bugs it cannot handle."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.4 reports that 'ReCode performs slightly worse at the initial call (N=1) due to its initial algorithm type identification step,' acknowledging a disadvantage at low inference budgets."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 'higher repair accuracy with significantly reduced inference cost,' which is supported by Table 2 (accuracy improvements across all models) and Table 3 / Section 4.4 (3-4x inference cost reduction)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims about dual encoding and algorithm-aware retrieval improving performance are supported by controlled ablation studies: Table 1 (dual vs. unified encoding, same model) and Figure 5 (relevant vs. random examples, same budget)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Code Repair' broadly, but all evaluation is on competitive programming problems (Codeforces, AtCoder, CodeChef, etc.). No evaluation on real-world software bugs (e.g., from GitHub repositories). The paper frames RACodeBench as reflecting 'authentic software development scenarios' but it consists entirely of competitive programming submissions."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as whether the improvement comes from simply providing any additional context vs. the specific retrieval strategy, or whether Codeforces-trained knowledge transfers genuinely to other competitive programming platforms due to shared problem patterns."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures test pass rate on competitive programming problems but frames this as 'real-world code repair scenarios' and 'authentic software development scenarios' without acknowledging the gap between competitive programming bug fixes and actual software development bug repair."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Models are listed as 'GPT-4o-mini', 'Gemini-1.5-Flash', 'Gemma-2-9B', 'Gemma-2-27B', 'DeepSeek-V2-Chat', 'DeepSeek-Coder-V2-Instruct' without specific API versions, snapshot dates, or exact model identifiers."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No actual prompt text is provided. The prompting approach is described conceptually (Eq. 2: Concat exemplar with query) but the actual prompt templates and text are not shown."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Temperature=1.0 is reported for the best-of-N baseline, and N=8/32 for in-distribution/OOD settings. However, generation hyperparameters (temperature, top-p, max tokens) for ReCode's own inference are not reported, nor are retrieval hyperparameters (fusion weights, number of retrieved examples, similarity thresholds)."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. ReCode is a single-pass RAG pipeline (retrieve, then generate), not an iterative agent with tools or feedback loops."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The RACodeBench construction pipeline is described at a high level (Section 3.4) — 'semi-automated pipeline' with 'differential analysis,' 'compiler diagnostics,' and 'manual verification on a carefully selected representative subset.' However, specific filtering criteria, counts at each stage, and the exact number of examples in RACodeBench are not stated."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No dedicated limitations, threats to validity, or scope boundary section exists in the paper."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed anywhere in the paper."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit scope boundaries are stated. The paper does not acknowledge that results are limited to competitive programming problems, single-function bugs, or specific programming languages."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw data is available. Neither RACodeBench nor the knowledge base data is released for independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.4 describes data collection from Codeforces: leveraging problem statements and user submissions, organizing by algorithmic tags, curating erroneous-corrected pairs, and using differential analysis with compiler diagnostics."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data is sourced from public Codeforces submissions."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The pipeline is described conceptually (syntactic/semantic differencing, compiler diagnostics, manual verification) but lacks specifics: how many problems were considered initially, how many survived each filter, what criteria were used for stratified sampling, and the final dataset size."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Funding is disclosed in the Acknowledgments section: 'Key-Area Research and Development Program of Guangdong Province (2024B0101050005), Suzhou Key Laboratory of Artificial Intelligence and Social Governance Technologies (SZS2023007),' and two additional programs."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly stated: Fudan University, East China Normal University, and Renmin University of China. No evaluated product affiliation conflict exists."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding is from government research programs and academic institutions with no commercial stake in the evaluation outcomes."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is provided in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the 6 LLMs evaluated (GPT-4o-mini, Gemini-1.5-Flash, Gemma-2, DeepSeek). This is critical since Codeforces problems are widely available online."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper ensures separation between its retrieval knowledge base and test set, but does not discuss whether the evaluated LLMs may have seen Codeforces problems during pre-training. Codeforces is a major competitive programming platform whose problems and solutions are extensively indexed online."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Codeforces problems and solutions have been publicly available online for years. No discussion of whether GPT-4o-mini, DeepSeek, or other models were trained on this data. The paper's strict partitioning only addresses internal knowledge base vs. benchmark separation, not model pre-training contamination."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 4.4 and Table 3 report inference cost as number of LLM calls to reach performance thresholds. ReCode needs 4-6 calls vs. 11-27 for baselines. Figure 6 shows performance vs. number of calls."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total compute budget is stated — no GPU hours, API costs, wall-clock time, or hardware specifications are reported for the experiments."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never explicitly stated. It is unclear whether results are from single runs or averaged across multiple runs."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search budget is reported for the retrieval system (feature fusion, encoder selection, similarity thresholds) or generation parameters."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The choice of OASIS-code-1.3B as code encoder and bge-m3 as text encoder is not justified with comparisons to alternatives. No explanation of how the final configuration was selected."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement baselines (best-of-N, self-repair) themselves without acknowledging the bias of evaluating their own system against their own implementations of competitors."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Figure 6 shows performance (test pass rate and strict accuracy) as a function of number of LLM calls (N=1 through N=8). Table 3 shows LLM calls needed to reach specific performance thresholds."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether RACodeBench's competitive programming problems actually measure 'code repair' ability in the general sense claimed. The paper does not address whether fixing competitive programming bugs is a valid proxy for real-world code repair."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No agentic scaffolding is involved. All methods use the same underlying LLMs with different prompting/retrieval strategies."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. Codeforces problems from historical archives could appear in the training data of all evaluated models, which were trained on internet data that extensively includes competitive programming solutions."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the retrieved exemplars provide information that would not be available in a real deployment scenario, or whether the algorithm-type labels leak information about the solution approach."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of non-independence. The knowledge base and RACodeBench are both sourced from Codeforces, and while problem-level separation is ensured, structural similarities between problems (same platform, same algorithmic patterns, same user population) are not addressed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection method is applied. The paper ensures knowledge base/benchmark separation but uses no canary strings, membership inference, n-gram overlap, or decontamination pipeline for model training data."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "ReCode consistently outperforms best-of-N and self-repair baselines across all tested LLMs on RACodeBench in both test pass rate and strict accuracy.",
    370       "evidence": "Table 2 shows ReCode achieves highest scores across all 6 models: e.g., GPT-4o-mini ReCode 41.06% test pass rate vs. 34.79% self-repair and 31.09% best-of-N; strict accuracy 30.41% vs. 24.58% and 21.25%.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "ReCode reduces inference cost by 3-4x compared to baselines to reach the same performance thresholds.",
    375       "evidence": "Table 3 shows on AtCoder, ReCode needs 4 LLM calls to reach 35.0% test pass rate vs. 11 for best-of-N and 15 for self-repair. Figure 6 shows ReCode at N=4 matches baselines at N=8.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Dual encoding (separate code and text encoders) outperforms unified encoding for retrieval in code repair.",
    380       "evidence": "Table 1 shows dual encoding improves over unified encoding: Gemma-2-27B from 26.25% to 27.73%, GPT-4o-mini from 36.72% to 38.49% test pass rate.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Retrieval-augmented model scaling benefits are amplified compared to baseline scaling: 14.42% improvement for ReCode from 9B to 27B vs. 6.94% for best-of-N.",
    385       "evidence": "Section 4.2 compares Gemma-2-9B to 27B scaling: ReCode 26.07→29.83 (14.42% relative) vs. best-of-N 23.34→24.96 (6.94%) and self-repair 22.67→22.74 (0.3%).",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Relevant example retrieval significantly outperforms random example selection under limited inference budgets.",
    390       "evidence": "Figure 5 shows consistent performance gap between relevant and random example selection across N=1 to N=8 LLM calls.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No code or data released",
    397       "detail": "Despite RACodeBench being presented as a key contribution, no code, data, or benchmark is released. There are no repository URLs or download links, making the benchmark contribution unverifiable."
    398     },
    399     {
    400       "flag": "No statistical significance testing",
    401       "detail": "All claims of superiority are based on comparing raw numbers without any statistical tests. With no variance or error bars reported, it is impossible to assess whether differences are meaningful or within noise."
    402     },
    403     {
    404       "flag": "Benchmark size not stated",
    405       "detail": "The exact number of examples in RACodeBench is never explicitly stated in the paper, despite it being a primary contribution. This makes it impossible to assess whether the benchmark is large enough to support the claims."
    406     },
    407     {
    408       "flag": "Competitive programming framed as real-world code repair",
    409       "detail": "All evaluation is on competitive programming problems (Codeforces, AtCoder, CodeChef, etc.), but the paper repeatedly claims evaluation reflects 'authentic software development scenarios' and 'real-world code repair.' These domains differ substantially — competitive programming bugs are typically algorithmic logic errors in single-function programs, not the multi-file, integration, or API-related bugs common in real software."
    410     },
    411     {
    412       "flag": "No limitations section",
    413       "detail": "The paper contains no limitations, threats to validity, or scope boundary discussion whatsoever, despite making broad claims about 'real-world code repair.'"
    414     },
    415     {
    416       "flag": "Severe contamination risk unaddressed",
    417       "detail": "Codeforces problems and solutions are extensively indexed online and likely present in the training data of GPT-4o-mini, DeepSeek, and other evaluated models. The paper does not address this at all — the internal knowledge base/benchmark separation does not mitigate LLM training data contamination."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Evaluating large language models trained on code",
    423       "authors": ["Mark Chen", "Jerry Tworek"],
    424       "year": 2021,
    425       "arxiv_id": "2107.03374",
    426       "relevance": "Foundational Codex paper establishing LLM code generation capabilities and the HumanEval benchmark."
    427     },
    428     {
    429       "title": "Is self-repair a silver bullet for code generation?",
    430       "authors": ["Theo X Olausson", "Jeevana Priya Inala"],
    431       "year": 2023,
    432       "arxiv_id": "2306.09896",
    433       "relevance": "Directly relevant baseline studying LLM self-repair capabilities for code generation."
    434     },
    435     {
    436       "title": "Teaching large language models to self-debug",
    437       "authors": ["Xinyun Chen", "Maxwell Lin"],
    438       "year": 2023,
    439       "arxiv_id": "2304.05128",
    440       "relevance": "Self-debugging approach using execution feedback for iterative code repair."
    441     },
    442     {
    443       "title": "ThinkRepair: Self-directed automated program repair",
    444       "authors": ["Xin Yin", "Chao Ni"],
    445       "year": 2024,
    446       "relevance": "Automated program repair combining few-shot prompting with chain-of-thought reasoning and execution feedback."
    447     },
    448     {
    449       "title": "Self-edit: Fault-aware code editor for code generation",
    450       "authors": ["Kechi Zhang", "Zhuo Li"],
    451       "year": 2023,
    452       "arxiv_id": "2305.04087",
    453       "relevance": "Uses execution traces and error messages to guide LLM code repair."
    454     },
    455     {
    456       "title": "Reflexion: Language agents with verbal reinforcement learning",
    457       "authors": ["Noah Shinn", "Federico Cassano"],
    458       "year": 2023,
    459       "arxiv_id": "2303.11366",
    460       "relevance": "Memory-augmented reasoning loops for iterative self-improvement in code generation tasks."
    461     },
    462     {
    463       "title": "Deepseek-r1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    464       "authors": ["Daya Guo", "Dejian Yang"],
    465       "year": 2025,
    466       "arxiv_id": "2501.12948",
    467       "relevance": "State-of-the-art reasoning LLM relevant to code generation and repair capabilities."
    468     },
    469     {
    470       "title": "A survey of learning-based automated program repair",
    471       "authors": ["Quanjun Zhang", "Chunrong Fang"],
    472       "year": 2023,
    473       "relevance": "Comprehensive survey of learning-based APR methods establishing the field's research landscape."
    474     },
    475     {
    476       "title": "CodeRL: Mastering code generation through pretrained models and deep reinforcement learning",
    477       "authors": ["Hung Le", "Yue Wang"],
    478       "year": 2022,
    479       "relevance": "RL-based approach to code generation with iterative refinement via execution feedback."
    480     },
    481     {
    482       "title": "Measuring coding challenge competence with APPS",
    483       "authors": ["Dan Hendrycks", "Steven Basart"],
    484       "year": 2021,
    485       "arxiv_id": "2105.09938",
    486       "relevance": "Defines the test pass rate and strict accuracy metrics adopted in this paper for code evaluation."
    487     },
    488     {
    489       "title": "AuPair: Golden Example Pairs for Code Repair",
    490       "authors": ["Aditi Mavalankar", "Hassan Mansoor"],
    491       "year": 2025,
    492       "arxiv_id": "2502.18487",
    493       "relevance": "Directly related ICL approach using model-generated repair examples as in-context references."
    494     },
    495     {
    496       "title": "Debug like a human: A large language model debugger via verifying runtime execution step-by-step",
    497       "authors": ["Li Zhong", "Zilong Wang"],
    498       "year": 2024,
    499       "arxiv_id": "2402.16906",
    500       "relevance": "LLM debugging approach using program decomposition and runtime variable verification."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 1,
    506       "justification": "RAG-based code repair is a practical concept, but no code or benchmark is released, making it unusable by practitioners."
    507     },
    508     "surprise_contrarian": {
    509       "score": 0,
    510       "justification": "The finding that better retrieval improves RAG-based code repair is expected and confirms conventional wisdom."
    511     },
    512     "fear_safety": {
    513       "score": 0,
    514       "justification": "No safety or security concerns are raised by this work."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversy or conflict with established results or companies."
    519     },
    520     "demo_ability": {
    521       "score": 0,
    522       "justification": "No code, demo, or tool is released that someone could try."
    523     },
    524     "brand_recognition": {
    525       "score": 0,
    526       "justification": "Authors are from Chinese academic institutions (Fudan University, East China Normal University, and Renmin University of China), not major AI labs."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs