scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27092B)
      1 {
      2   "paper": {
      3     "title": "Beyond Correctness: Benchmarking Multi-dimensional Code Generation for Large Language Models",
      4     "authors": [
      5       "Jiasheng Zheng",
      6       "Boxi Cao",
      7       "Zhengzhao Ma",
      8       "Ruotong Pan",
      9       "Hongyu Lin",
     10       "Yaojie Lu",
     11       "Xianpei Han",
     12       "Le Sun"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2407.11470"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states 'We release our benchmark and source code at https://github.com/jszheng21/RACE and leaderboard at https://huggingface.co/spaces/jszheng/RACE_leaderboard' in the abstract (footnote 1)."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The benchmark data is released via the GitHub repository. The paper also uses publicly available datasets (HumanEval+, MBPP+, ClassEval, LeetCode) and the benchmark itself is released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is mentioned in the paper. The paper does not specify library versions or dependencies needed to reproduce the experiments."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While source code is released, the paper itself does not contain step-by-step reproduction instructions or reference a README with commands to run. The experimental setup describes models and hyperparameters but not how to execute the evaluation pipeline."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 2, 3, 5, 6, and 7 report only point estimates (percentages and scalars). No confidence intervals, error bars, or ± notation are provided anywhere in the paper."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes numerous comparative claims (e.g., 'GPT-4o-mini outperforms it by at least 5 percentage points') based solely on comparing raw numbers. No statistical significance tests (t-tests, bootstrap tests, etc.) are used."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports absolute percentage differences and scores with baseline context throughout. For example, Table 3 reports deltas (e.g., '+18.9', '+75.0') showing the magnitude of contamination impact. Section 4.2 states specific scores like 'o1-mini achieves only a score of 60.3 in time complexity' with context across models."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper does not justify why the specific number of evaluation cases per factor was chosen (e.g., 923 for correctness, 492 for code length, 101 for efficiency). No power analysis or justification for these sizes is provided."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper uses greedy decoding (temperature 0, Section 4.1), producing deterministic single-run results. No variance, standard deviation, or multi-run spread is reported. While greedy decoding makes runs deterministic for a given model, there is no discussion of variance across problem instances or other sources of variability."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares 28 LLMs against each other across all dimensions. For the contamination experiment (Section 4.3), a clean vs. contaminated comparison using starcoderbase-7b as baseline is conducted."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The evaluated models include state-of-the-art models as of 2024: o1-mini-2024-09-12, Claude-3.5-Sonnet, GPT-4o, Qwen2.5-72B, DeepSeek-V2.5. These are contemporary and competitive."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper includes ablation-like experiments: Section 4.4 tests complex instruction following with varying numbers of requirements (2-5), and Section 4.5 examines preference bias by varying specific instructions (camel vs. snake case, different length constraints, for vs. while loops). The contamination experiment (Section 4.3) ablates data contamination levels."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses multiple evaluation metrics across dimensions: accuracy (Acc), instruction-following rate (IF rate), Maintainability Index (MI), Normalized Index for Time (NIT), Normalized Index for Space (NIS), and an overall RACE score."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper evaluates code quality dimensions like readability and maintainability entirely through automated metrics (AST analysis, MI formula, runtime monitoring). No human evaluation of the generated code quality or the benchmark itself is conducted, despite readability and maintainability being inherently subjective dimensions where human judgment would be relevant."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The evaluation uses established test sets (HumanEval+, MBPP+, ClassEval, LeetCode) that are not used for model development by the authors. The contamination experiment (Section 4.3) explicitly separates clean and contaminated training from evaluation."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive per-category breakdowns are provided: Tables 2, 5, 6, and 7 show results broken down by individual factors (naming convention, length, comments, MI, modularity, time/space efficiency) for all 28 models."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 4.4 discusses how models fail with complex instructions (performance drops to near 0 with 5 requirements). Section 4.5 discusses preference bias failures (e.g., 'CodeLlama and WizardCoder almost fail to comprehend and fulfill' camel-case requirements, with IF rates below 30%)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports several negative findings: models struggle significantly with complex multi-requirement instructions (Section 4.4), most models have strong preference biases that hinder instruction following (Section 4.5), and adding length requirements tends to decrease code accuracy (Section 4.2)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract's four claims are all supported: (1) RACE reveals multi-dimensional defects (Table 2, Section 4.2), (2) RACE resists data contamination (Table 3, Section 4.3), (3) LLMs struggle with complex instructions (Figure 5, Section 4.4), (4) LLMs exhibit coding style preferences (Figure 6, Section 4.5)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal claims such as 'we hypothesize that this improvement may be attributed to comments facilitating an implicit chain-of-thought' (Section 4.2) and 'data contamination significantly impairs the model's instruction-following ability' (Section 4.3). While the contamination experiment uses a controlled setup, the chain-of-thought hypothesis is speculative. The claim about preference bias ('primarily learn the inherent patterns of token prediction from examples, lacking a comprehensive understanding of code logic') is a causal explanation without adequate evidence."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper's title claims 'Multi-dimensional Code Generation for Large Language Models' broadly, but experiments are limited to Python only. The paper mentions in Section 4.5 that preference bias 'could be even more pronounced in programming languages like Perl, JavaScript, and PHP' but does not test them. The generalization from Python-only results to 'code generation' broadly is not adequately bounded."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its findings. For example, the claim that models have 'inherent preference bias' could alternatively be explained by dataset composition or fine-tuning distributions. The improvement from comments could have explanations beyond chain-of-thought. No threats-to-validity section or alternative explanation discussion is present."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Table 4 (Appendix B.1) lists specific model IDs: 'gpt-4o-2024-05-13', 'gpt-3.5-turbo-0125', 'o1-mini-2024-09-12', 'claude-3.5-sonnet', 'deepseek-coder-6.7b-instruct', 'CodeLlama-7b-Python-hf', etc. Most closed-source models include snapshot dates."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt templates are provided in Figures 9 and 10 (Appendix B.2) for all factors across all four dimensions. These include the complete text with placeholders like {problem} and {starter_code} where the fill values come from standard benchmark problems."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.1 states 'we use a greedy strategy and set the temperature to 0.' For the contamination experiment (Section 4.3), LoRA training details are provided: 'batch size of 32 and a learning rate of 1e-3', trained for 10 epochs."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The evaluation is direct prompt-to-completion generation with no tool use, retry logic, or multi-step workflows."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.1 describes data preprocessing: 'we exclude such information from the datasets' to prevent bias. Appendix B.2 states 'we modify the original prompt format by extracting the core task descriptions to serve as the final prompts' to prevent conflicts with readability requirements. Table 1 provides exact case counts per factor."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section in the paper. The conclusion (Section 5) mentions future work directions but does not substantively discuss limitations of the current study."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed anywhere in the paper. There is no discussion of potential issues such as the limitation to Python, the reliance on automated metrics for subjective quality dimensions, or the representativeness of the selected benchmark tasks."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to Python, to the specific models tested, or acknowledge that automated readability/maintainability metrics may not align with human judgment."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The benchmark data and source code are released at https://github.com/jszheng21/RACE. The leaderboard is available at https://huggingface.co/spaces/jszheng/RACE_leaderboard, suggesting raw evaluation data is accessible."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes the data collection in detail: correctness data from HumanEval+, MBPP+, ClassEval, and LeetCode; readability evaluated on HumanEval+; maintainability on ClassEval and LeetCode; efficiency on 101 self-constructed LeetCode-based cases. Table 1 provides exact counts per factor."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved. The data comes from standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented through Section 3 and Appendix B.2: selection of source datasets, extraction and modification of prompts, incorporation of customized requirements, and evaluation through AST analysis and runtime monitoring. Figure 2 illustrates the overall evaluation pipeline."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "Neither a funding disclosure nor an acknowledgments section is present in the paper. The authors are affiliated with the Chinese Academy of Sciences, which suggests institutional funding, but no explicit disclosure is made."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Chinese Information Processing Laboratory, State Key Laboratory of Computer Science, Institute of Software, Chinese Academy of Sciences, and University of Chinese Academy of Sciences."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. The authors do not appear to have a direct conflict with any of the evaluated models, but the absence of any funding disclosure means this criterion is not satisfied."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates 28 LLMs on benchmarks but does not state the training data cutoff dates for the models being evaluated. For the contamination experiment, it notes that starcoderbase-7b was 'carefully curated to exclude data from HumanEval and MBPP' and that 'ClassEval and LeetCode are not within the temporal coverage of its training data,' but no specific cutoff dates are given for the other 27 models."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 4.3 is entirely dedicated to discussing train/test overlap (data contamination). The paper explicitly investigates the impact of contamination on benchmark results and demonstrates that RACE is more robust to contamination than traditional benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 4.3 directly addresses benchmark contamination as a core contribution. The paper argues that traditional correctness-only benchmarks are susceptible to contamination while RACE provides more stable results under contamination. However, this is done through a controlled experiment with starcoderbase-7b rather than analyzing contamination risk for all 28 evaluated models."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper evaluates 28 LLMs across thousands of evaluation cases but does not report inference costs, API costs, tokens consumed, or wall-clock time for the evaluations. The o1-mini results used 30% sampling 'to optimize budget efficiency' (Table 2 footnote), implying significant cost, but actual costs are not stated."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, GPU hours, API spend, or hardware specifications are reported. The contamination experiment involved LoRA training for 10 epochs but does not mention the GPU type or training time."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Current correctness-centric benchmarks fail to capture the multifaceted requirements of code in real-world scenarios, while RACE provides comprehensive evaluation revealing defects across multiple dimensions.",
    295       "evidence": "Table 2 and Section 4.2 show that even the best model (o1-mini) scores only 60.3 on time efficiency, and most models score below 45% on modularity. Models like Qwen2.5-Coder-7B-Ins have competitive correctness but lag significantly on other dimensions compared to GPT-4o-mini.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "The RACE benchmark serves as an effective tool for resisting the risk of data contamination.",
    300       "evidence": "Table 3 (Section 4.3) shows that while traditional benchmarks reach 85%+ accuracy under severe contamination (8 epochs), the RACE overall score increases much more slowly (from ~20% to ~43%), and the instruction-following rate consistently remains below 10% compared to the clean model.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Even the most advanced code LLMs struggle with complex instructions involving multiple customization requirements, with performance deteriorating significantly as requirements increase.",
    305       "evidence": "Figure 5 (Section 4.4) shows that when requirements increase from 2 to 5, the Acc.IF rate approaches 0 for almost all models, including Claude-3.5-Sonnet and GPT-4o.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Most LLMs exhibit an inherent preference for specific coding styles, making it difficult to follow user instructions inconsistent with their preference.",
    310       "evidence": "Figure 6 (Section 4.5) shows that models like CodeLlama and WizardCoder have IF rates below 30% for camel-case naming despite high rates for snake-case. Most instruct-type LLMs see a ~15% IF rate drop under stricter length constraints.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "Comments serve as an implicit chain-of-thought mechanism, enhancing the accuracy of generated code.",
    315       "evidence": "Figure 8 (Appendix A) shows that requiring function-level or line-level comments improves correctness for some models (e.g., GPT-4o, DS-V2.5-236B). However, this is presented as a hypothesis without controlled testing.",
    316       "supported": "weak"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "The RACE benchmark evaluates code generation across four dimensions (readability, maintainability, correctness, efficiency) rather than correctness alone, revealing that even top LLMs like o1-mini and Claude-3.5-Sonnet have significant deficiencies in non-correctness dimensions. The benchmark demonstrates robustness against data contamination: while traditional benchmarks see accuracy inflate to 85%+ under contamination, RACE's multi-dimensional evaluation remains relatively stable. A notable finding is that LLMs exhibit strong preference biases toward specific coding styles (e.g., snake-case in Python) and performance degrades sharply when asked to satisfy multiple simultaneous requirements, with Acc.IF rates approaching zero at five concurrent requirements.",
    323   "red_flags": [
    324     {
    325       "flag": "No limitations section",
    326       "detail": "The paper lacks any limitations or threats-to-validity discussion. For a benchmark paper proposing a new evaluation framework, discussing the limitations of automated metrics for subjective dimensions like readability and maintainability is essential."
    327     },
    328     {
    329       "flag": "Python-only evaluation with broad claims",
    330       "detail": "All experiments are conducted solely in Python, but the paper makes general claims about 'code generation for LLMs' without bounding the findings to Python. The title and abstract do not indicate this restriction."
    331     },
    332     {
    333       "flag": "No statistical rigor in comparisons",
    334       "detail": "Despite making numerous comparative claims across 28 models, no significance tests, confidence intervals, or variance measures are reported. All results are single-run point estimates using greedy decoding."
    335     },
    336     {
    337       "flag": "Automated metrics for inherently subjective dimensions",
    338       "detail": "Readability and maintainability are measured entirely through automated proxies (AST analysis, MI formula) without any human validation that these automated metrics correlate with actual human judgment of code quality."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "Evaluating large language models trained on code",
    344       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    345       "year": 2021,
    346       "arxiv_id": "2107.03374",
    347       "relevance": "Introduced HumanEval, one of the primary correctness benchmarks used in RACE and a foundational code generation evaluation."
    348     },
    349     {
    350       "title": "Program synthesis with large language models",
    351       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    352       "year": 2021,
    353       "arxiv_id": "2108.07732",
    354       "relevance": "Introduced MBPP benchmark, another key correctness dataset used in RACE evaluation."
    355     },
    356     {
    357       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    358       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    359       "year": 2024,
    360       "relevance": "Created HumanEval+ and MBPP+ with enhanced test suites used in RACE for more rigorous correctness evaluation."
    361     },
    362     {
    363       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    364       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    365       "year": 2024,
    366       "arxiv_id": "2403.07974",
    367       "relevance": "Addresses contamination in code benchmarks through temporal splits, directly relevant to RACE's contamination resistance claims."
    368     },
    369     {
    370       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    371       "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"],
    372       "year": 2024,
    373       "relevance": "Directly studies data contamination in code generation benchmarks, providing context for RACE's contamination mitigation contribution."
    374     },
    375     {
    376       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    377       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    378       "year": 2023,
    379       "arxiv_id": "2308.01861",
    380       "relevance": "Provides class-level code generation benchmark used for maintainability evaluation in RACE."
    381     },
    382     {
    383       "title": "DeepSeek-Coder: When the large language model meets programming",
    384       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    385       "year": 2024,
    386       "arxiv_id": "2401.14196",
    387       "relevance": "Major open-source code LLM evaluated in RACE, representing state-of-the-art in code generation models."
    388     },
    389     {
    390       "title": "EffiBench: Benchmarking the efficiency of automatically generated code",
    391       "authors": ["Dong Huang", "Jie M Zhang", "Yuhao Qing", "Heming Cui"],
    392       "year": 2024,
    393       "arxiv_id": "2402.02037",
    394       "relevance": "Evaluates efficiency of generated code, directly related to the efficiency dimension in RACE."
    395     },
    396     {
    397       "title": "Large language models for software engineering: Survey and open problems",
    398       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    399       "year": 2023,
    400       "arxiv_id": "2310.03533",
    401       "relevance": "Comprehensive survey on LLMs for software engineering, providing context for why multi-dimensional code evaluation matters."
    402     },
    403     {
    404       "title": "Reflexion: Language agents with verbal reinforcement learning",
    405       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"],
    406       "year": 2024,
    407       "relevance": "Represents agentic approaches to code generation that benefit from multi-dimensional evaluation."
    408     },
    409     {
    410       "title": "Code Llama: Open foundation models for code",
    411       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    412       "year": 2023,
    413       "arxiv_id": "2308.12950",
    414       "relevance": "Major code LLM family evaluated in RACE, demonstrating strong preference biases in coding style."
    415     },
    416     {
    417       "title": "WizardCoder: Empowering code large language models with Evol-Instruct",
    418       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    419       "year": 2024,
    420       "relevance": "Code LLM using instruction tuning, evaluated in RACE and showing significant coding style preference biases."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs