scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27816B)
      1 {
      2   "paper": {
      3     "title": "CODEMORPH: Mitigating Data Leakage in Large Language Model Assessment",
      4     "authors": [
      5       "Hongzhou Rao",
      6       "Yanjie Zhao",
      7       "Wenjie Zhu",
      8       "Ling Xiao",
      9       "Meizhen Wang",
     10       "Haoyu Wang"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2506.17627"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper states 'Our artifact is available at https://github.com/security-pride/CodeMorph' in the Artifact Availability section after the contributions list."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The benchmark was constructed from The Stack dataset, which is publicly available, but the specific 100 selected code samples and the 720 code completion tasks constructed by the authors are not described as being released. The GitHub link references artifacts but no explicit dataset release is mentioned."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions 'two NVIDIA A100 GPUs with 80GB of memory each, utilizing a Linux environment' (Section IV-B) but provides no requirements.txt, Dockerfile, or detailed dependency/library versions needed to reproduce the pipeline."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is provided but the paper does not describe how to run the experiments, what commands to execute, or how to replicate the benchmark construction process."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Results are reported as point estimates (e.g., '24.67% average reduction', '45% lower accuracy'). Figure 7 shows a distribution interval plot of similarity scores (box plots), but no confidence intervals or error bars are provided for the accuracy results in Figures 6, 8, or Tables III-IV."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper makes comparative claims (e.g., 'PESO results in an average accuracy reduction of 15%' vs random perturbation) but no statistical significance tests are reported. All comparisons are based on raw number differences."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports percentage improvements with baseline context throughout: 'accuracy decreased by an average of 24.67%, with Python showing the most significant reduction at 45%' (abstract), 'average reduction in similarity score of 7.01%' (Section V-B), and Table III shows accuracy for original, random, and PESO conditions."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses 20 code samples per language (100 total) and acknowledges 'Due to the high cost associated with LLM API usage in CODEMORPH, we limit the selection to 20 code samples per group' (Section IV-A). This is an explanation but not a justification that N=20 is sufficient for the claims made. No power analysis is provided."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No standard deviations, variance measures, or spread across runs are reported. The similarity score box plots in Figure 7 show distribution ranges, but accuracy results are single-run point estimates with no spread measures. Table IV shows per-sample scores but no aggregate variance statistics."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper includes baselines: original (unperturbed) code performance as the primary baseline, and random perturbation as a baseline for comparing PESO's optimization (Table III, Table IV). These are appropriate comparison points."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper does not compare CODEMORPH against any contemporary code perturbation tools or decontamination methods. Related work discusses EvoEval (Xia et al. 2024), LiveCodeBench (Jain et al. 2024), and Liu et al. (2024)'s HumanEval expansion, but none are used as experimental baselines. The only comparisons are against no-perturbation and random perturbation."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "RQ2 serves as an ablation study, comparing PESO-guided perturbation against random perturbation method selection, isolating the contribution of the PESO optimization component. Results are in Tables III and IV."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses multiple evaluation metrics: code completion accuracy, surface-level similarity (Levenshtein), semantic similarity (JPlag), and overall similarity score (weighted combination). These are reported in Tables III and IV and Figures 6-7."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Human evaluation is included: 'we opted for a human assessment to determine if the completed code performs the same functionality as the original code' (Section IV-A3) and 'we manually audited all perturbed code for semantic consistency with the original' (Section IV-B, Human Verification)."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The benchmark is constructed from The Stack dataset (StarCoder's training data), which is deliberately chosen to guarantee data leakage. The evaluation tests whether perturbation can mitigate this known contamination. The experimental design with confirmed contaminated data serves the paper's purpose, and the 720 completion tasks are a separate test set from the training data."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down per programming language (C/C++, Python, Java, Rust, Go) in Figures 6-7 and Tables III-IV. Per-task breakdowns (1-line, 3-line, 5-line completions) are also provided in Table III and Figure 6."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper discusses cases where perturbation did not reduce accuracy (Section V-A, 'code perturbation does not always reduce code completion accuracy'), with a specific example in Figure 8 showing cases where perturbed code completion succeeded but original failed. Section V-A also discusses correctness failures (4 errors in Java, 0-3 in other languages)."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports that single perturbation achieved 94.3% accuracy (almost no effect) in the preliminary study (Section II), that perturbation sometimes makes completion easier (Section V-A), and that random perturbation can sometimes outperform PESO (Section V-B: 'the random algorithm is not always less effective than PESO')."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims (24.67% average accuracy decrease, 45% maximum for Python, 7.01% lower similarity for PESO, 15% average accuracy drop for PESO vs random, 25% maximum) are all supported by the results in Section V, Tables III-IV, and Figures 6-7."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims about CODEMORPH 'reducing' accuracy through perturbation. The study design is adequate: it uses a controlled comparison of original vs. perturbed code on the same completion tasks, and the PESO ablation compares random vs. optimized method selection with the same code samples. The single-variable manipulation (perturbation applied or not) supports causal inference."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper bounds its claims to StarCoder and The Stack dataset. The Limitations section acknowledges 'we did not find other publicly available training datasets paired with corresponding LLMs for code, limiting this study to an evaluation of StarCoder' (Section VI). The title is appropriately scoped."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper discusses alternative explanations: perturbation may sometimes introduce hints that make completion easier (Section V-A), and the accuracy reduction could be partly due to increased task difficulty rather than purely to decontamination (Section V-A discusses both factors). The Limitations section addresses benchmark and model constraints."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper specifies 'StarCoder with 15.5B parameters' and mentions 'GPT-4o', 'GPT-3.5-turbo', 'DeepSeek-coder', and 'Claude 3.5-sonnet' but provides no specific API versions, snapshot dates, or model version identifiers. Section III-D says 'GPT-4o' while Section IV-B says 'GPT-4' (inconsistency). No version strings like 'gpt-4o-2024-05-13' are provided."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper describes the prompt structure in natural language (Section III-A2: role prompt, task prompt, code prompt, answer prompt) but does not provide the actual prompt text used. Only the structure and purpose of each component are described, not the specific wording."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section IV-B reports key hyperparameters: mu=0.5, nu=0.5 for similarity weighting, T=2 for Boltzmann selection temperature, iteration limit=15, ssthreshold=0.2 for early stopping, and max input token limit=5k for StarCoder."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The paper describes the full agentic pipeline in detail: the iterative perturbation-verification loop (Section III-A), the PESO optimization algorithm with Boltzmann selection (Section III-B), the multi-LLM voting verification system (Section III-A2), and the termination conditions (Section III-C). Algorithm 1 provides pseudocode."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section IV-A1 documents the data filtering pipeline: rule-based filtering (minimum 40 lines, 100 characters, compilability check) followed by manual selection for cross-file dependencies, multiple method definitions, or complex control flows, resulting in 100 source files across five languages."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section VI is titled 'LIMITATION' and contains two substantive subsections: 'Benchmark and Model' discussing the limited benchmark size and single-model evaluation, and 'Code Task' discussing the restriction to code completion tasks only."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The limitations are specific to this study: 'we did not find other publicly available training datasets paired with corresponding LLMs for code, limiting this study to an evaluation of StarCoder' and 'The Stack is a training dataset, lacking unit test cases, docstrings, and related information, making the construction of other code tasks challenging' (Section VI)."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper states specific scope boundaries: only StarCoder was evaluated, only code completion tasks were tested, the benchmark is relatively small due to API costs (Section VI). Future work directions indicate what was NOT tested (other datasets, other LLMs, other task types like code classification)."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The raw data (100 selected code samples, 720 completion task definitions, per-task results) is not made available for independent verification. While The Stack dataset is public, the specific samples selected and the benchmark construction details are not released as data."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section IV-A1 describes data collection in detail: starting from The Stack dataset, applying rule-based filtering (line count, character count, compilability), then manual selection for code with cross-file dependencies and complex structures, resulting in 100 samples across 5 languages."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants were recruited for a study. The human verification of perturbed code appears to be done by the authors themselves, not recruited participants. The data source is a standard public dataset (The Stack)."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The data pipeline is documented in Section IV-A: The Stack → rule-based filtering (line/character thresholds, compilability) → manual selection → 100 source files → code task construction (1/3/5 line completion) → 720 tasks total. Figure 5 shows the benchmark construction process."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding information or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly stated: all authors are from Huazhong University of Science and Technology, Wuhan, China, specifically the School of Cyber Science and Engineering."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding source is disclosed, so independence cannot be assessed. The paper does not state whether it was funded or unfunded."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The paper's entire premise is about contamination. StarCoder's training data (The Stack) is explicitly stated and publicly available. The paper deliberately uses known-contaminated data: 'The StarCoder LLM has publicly released its training dataset, The Stack dataset, which is well-documented and confirmed to be contaminated' (Section II)."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "The entire paper is about train/test overlap. The benchmark is intentionally constructed from StarCoder's training data (The Stack) to guarantee contamination, and the paper's method is designed to mitigate this overlap through code perturbation. Section I and II discuss this extensively."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Benchmark contamination is the central topic of the paper. The authors deliberately use contaminated data to test whether their perturbation approach can mitigate contamination effects. The entire methodology and evaluation are designed around this problem."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human subjects study was conducted. The human verification of code correctness was done as part of the methodology, not as a human subjects study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human subjects study was conducted. The paper involves automated code perturbation and evaluation."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in a research study. The human reviewers appear to be the authors themselves performing code verification."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in a research study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human subjects study was conducted."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human subjects study was conducted."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human subjects study was conducted."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Section III-D states 'the cost of a single perturbation is approximately $0.20 per instance.' This provides per-instance cost information for the perturbation pipeline."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "While the per-instance cost ($0.20) is stated and the hardware is mentioned (two NVIDIA A100 GPUs), the total computational budget (total API spend, total GPU hours, total wall-clock time for all experiments) is not reported."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "CODEMORPH reduces LLM accuracy on code completion tasks by an average of 24.67% across five programming languages, with Python showing the most significant reduction at 45%.",
    293       "evidence": "Figure 6 shows accuracy distributions across single-line, three-line, and five-line code completion tasks for five languages. The largest gap is in Python single-line completion (Section V-A).",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "PESO achieves an average reduction in similarity score of 7.01% compared to random perturbation, with a maximum reduction of 42.86%.",
    298       "evidence": "Table IV presents detailed similarity score comparisons (surface-level, semantic, and overall) for 20 Python and 20 Go code samples, comparing PESO vs. random perturbation (Section V-B).",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "PESO reduces code completion accuracy by an average of 15% more than random perturbation, with a maximum reduction of 25%.",
    303       "evidence": "Table III shows accuracy comparisons for Python and Go across three task types. PESO accuracy is consistently lower than random perturbation accuracy (Section V-B).",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "CODEMORPH generates perturbed code with high accuracy and low correction cost, with at most 4 incorrect samples per language.",
    308       "evidence": "Section V-A reports 'four errors in Java, and between 0 and 3 errors in the remaining languages' after manual review, with reapplication typically requiring no more than two attempts.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "Code similarity can effectively indicate decontamination, and reducing similarity is a practical strategy for achieving it.",
    313       "evidence": "Section V-B argues this based on the correlation between lower similarity scores (PESO) and lower accuracy in completion tasks (Table III and IV). However, this is shown on only 2 languages (Python, Go) with 20 samples each.",
    314       "supported": "weak"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval"
    319   ],
    320   "key_findings": "CODEMORPH applies 26 semantic-preserving code transformations using LLMs to perturb benchmark code, reducing StarCoder's code completion accuracy by an average of 24.67% across five programming languages. The PESO optimization algorithm, based on Boltzmann selection from genetic algorithms, selects more effective perturbation method combinations, achieving 7.01% lower similarity scores and 15% greater accuracy reduction compared to random perturbation selection. The approach supports C/C++, Python, Java, Rust, and Go while preserving cross-file dependencies and code compilability.",
    321   "red_flags": [
    322     {
    323       "flag": "Very small sample size",
    324       "detail": "Only 20 code samples per language (100 total) are used, justified by API cost constraints rather than statistical adequacy. The PESO ablation uses only 20 samples each in Python and Go. With N=20, observed differences could easily be due to sample selection effects."
    325     },
    326     {
    327       "flag": "Single model evaluation",
    328       "detail": "Only StarCoder (15.5B) is evaluated. The generalizability of CODEMORPH's decontamination effectiveness to other Code LLMs (e.g., GPT-4, CodeLlama, DeepSeek-Coder) is unknown. The authors acknowledge this limitation."
    329     },
    330     {
    331       "flag": "No statistical significance tests",
    332       "detail": "All comparative claims (CODEMORPH vs. original, PESO vs. random) are based on raw percentage differences without any statistical testing. Given N=20 per group, these differences may not be statistically significant."
    333     },
    334     {
    335       "flag": "No comparison with existing decontamination methods",
    336       "detail": "The paper discusses EvoEval, LiveCodeBench, and other decontamination approaches in related work but does not compare CODEMORPH against any of them experimentally. The only baseline is random perturbation."
    337     },
    338     {
    339       "flag": "Inconsistent model naming",
    340       "detail": "Section III-D refers to the perturbation LLM as 'GPT-4o' while Section IV-B calls it 'GPT-4'. These are different models with different capabilities and costs. It is unclear which was actually used."
    341     },
    342     {
    343       "flag": "Human evaluation details sparse",
    344       "detail": "The paper mentions human assessment for code completion correctness and manual auditing of perturbed code, but does not describe who performed the evaluation, whether inter-rater agreement was measured, or what guidelines were followed."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    350       "authors": ["M. Riddell", "A. Ni", "A. Cohan"],
    351       "year": 2024,
    352       "arxiv_id": "2403.04811",
    353       "relevance": "Directly relevant to benchmark contamination in code LLM evaluation, which CODEMORPH builds upon for its similarity scoring methodology."
    354     },
    355     {
    356       "title": "Concerned with data contamination? assessing countermeasures in code language model",
    357       "authors": ["J. Cao", "W. Zhang", "S.-C. Cheung"],
    358       "year": 2024,
    359       "arxiv_id": "2403.16898",
    360       "relevance": "Studies data perturbation as a countermeasure against data contamination in code LLMs, foundational to CODEMORPH's approach."
    361     },
    362     {
    363       "title": "Rethinking benchmark and contamination for language models with rephrased samples",
    364       "authors": ["S. Yang", "W.-L. Chiang", "L. Zheng", "J. E. Gonzalez", "I. Stoica"],
    365       "year": 2023,
    366       "arxiv_id": "2311.04850",
    367       "relevance": "Demonstrates data leakage in HumanEval and MBPP, motivating the need for decontamination methods like CODEMORPH."
    368     },
    369     {
    370       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    371       "authors": ["N. Jain", "K. Han", "A. Gu"],
    372       "year": 2024,
    373       "arxiv_id": "2403.07974",
    374       "relevance": "A continuously updated benchmark designed to avoid contamination, representing an alternative approach to the perturbation-based decontamination studied here."
    375     },
    376     {
    377       "title": "Top leaderboard ranking=top coding proficiency, always? evoeval: Evolving coding benchmarks via llm",
    378       "authors": ["C. S. Xia", "Y. Deng", "L. Zhang"],
    379       "year": 2024,
    380       "arxiv_id": "2403.19114",
    381       "relevance": "Uses LLMs to evolve coding benchmarks to mitigate contamination, a closely related approach to CODEMORPH's perturbation strategy."
    382     },
    383     {
    384       "title": "Evaluating large language models trained on code",
    385       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    386       "year": 2021,
    387       "arxiv_id": "2107.03374",
    388       "relevance": "Introduces HumanEval, one of the most widely used code generation benchmarks, which has known contamination issues discussed in this paper."
    389     },
    390     {
    391       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    392       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    393       "year": 2024,
    394       "relevance": "Expanded HumanEval test cases to increase evaluation rigor, a perturbation-based decontamination approach."
    395     },
    396     {
    397       "title": "Dynamic evaluation of large language models by meta probing agents",
    398       "authors": ["K. Zhu", "J. Wang", "Q. Zhao", "R. Xu", "X. Xie"],
    399       "year": 2024,
    400       "relevance": "Proposes dynamic evaluation using psychometrics-inspired reconstruction to assess LLMs, related to contamination mitigation."
    401     },
    402     {
    403       "title": "Benchmark data contamination of large language models: A survey",
    404       "authors": ["C. Xu", "S. Guan", "D. Greene", "M. Kechadi"],
    405       "year": 2024,
    406       "arxiv_id": "2406.04244",
    407       "relevance": "Survey of benchmark data contamination in LLMs, providing context for the contamination problem CODEMORPH addresses."
    408     },
    409     {
    410       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    411       "authors": ["T. Y. Zhuo", "M. C. Vu", "J. Chim"],
    412       "year": 2024,
    413       "arxiv_id": "2406.15877",
    414       "relevance": "Repository-level code generation benchmark that faces similar contamination concerns addressed by CODEMORPH."
    415     },
    416     {
    417       "title": "Recode: Robustness evaluation of code generation models",
    418       "authors": ["S. Wang", "Z. Li", "H. Qian"],
    419       "year": 2022,
    420       "relevance": "Develops semantics-preserving code perturbation methods for evaluating code generation model robustness, directly related to CODEMORPH's transformation approach."
    421     },
    422     {
    423       "title": "Large language models for software engineering: A systematic literature review",
    424       "authors": ["X. Hou", "Y. Zhao", "Y. Liu"],
    425       "year": 2023,
    426       "relevance": "Comprehensive survey of LLMs in software engineering, providing broader context for code LLM evaluation methodology."
    427     }
    428   ]
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs