scan.json (29192B)
1 { 2 "paper": { 3 "title": "PanGu-Coder2: Boosting Large Language Models for Code with Ranking Feedback", 4 "authors": [ 5 "Bo Shen", 6 "Jiaxin Zhang", 7 "Taihong Chen", 8 "Daoguang Zan", 9 "Bing Geng", 10 "An Fu", 11 "Muhan Zeng", 12 "Ailun Yu", 13 "Jichuan Ji", 14 "Jingyang Zhao", 15 "Yuenan Guo", 16 "Qianxiang Wang" 17 ], 18 "year": 2023, 19 "venue": "arXiv preprint", 20 "arxiv_id": "2307.14936", 21 "doi": "10.48550/arXiv.2307.14936" 22 }, 23 "scan_version": 2, 24 "active_modules": ["experimental_rigor", "data_leakage"], 25 "methodology_tags": ["benchmark-eval"], 26 "key_findings": "PanGu-Coder2, a 15B parameter Code LLM fine-tuned with the RRTF framework on StarCoder, achieves 62.20% pass@1 on HumanEval, outperforming all prior open-source Code LLMs including WizardCoder (59.80%). The model shows consistent improvements across HumanEval, CoderEval, and LeetCode, with a 28% absolute pass@1 improvement over the StarCoder base model. CTranslate2 int8 quantization halves memory usage and doubles inference speed with no performance loss.", 27 "checklist": { 28 "artifacts": { 29 "code_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "No repository URL or code archive is provided in the paper. The paper references StarCoder and RRHF code repositories but does not release the PanGu-Coder2 code, training scripts, or RRTF implementation." 33 }, 34 "data_released": { 35 "applies": true, 36 "answer": false, 37 "justification": "The curated 68K training dataset is not released. Section 3.3 describes starting from the public CodeAlpaca-20k dataset and evolving it, but the resulting corpus is not made available. The evaluation benchmarks (HumanEval, CoderEval, LeetCode) are public." 
38 }, 39 "environment_specified": { 40 "applies": true, 41 "answer": false, 42 "justification": "Table 1 lists model architecture hyperparameters (hidden size, attention heads, layers) but no environment specifications such as library versions, CUDA version, requirements.txt, or Dockerfile are provided." 43 }, 44 "reproduction_instructions": { 45 "applies": true, 46 "answer": false, 47 "justification": "No step-by-step reproduction instructions are included. The paper describes the RRTF framework at a conceptual level but does not provide runnable scripts or detailed reproduction guides." 48 } 49 }, 50 "statistical_methodology": { 51 "confidence_intervals_or_error_bars": { 52 "applies": true, 53 "answer": false, 54 "justification": "All results in Tables 2, 3, and 4 are reported as point estimates without confidence intervals or error bars. No uncertainty quantification is provided." 55 }, 56 "significance_tests": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper claims PanGu-Coder2 'outperforms' multiple baselines across all benchmarks but provides no statistical significance tests. All comparisons are based solely on comparing point estimates." 60 }, 61 "effect_sizes_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper reports effect sizes with baseline context: '28% absolute improvement in terms of pass@1 score (from 33.6% to 61.6%)' over StarCoder (Section 4.2.1), and '4.34%' improvement over WizardCoder. Both baseline and new values are given." 65 }, 66 "sample_size_justified": { 67 "applies": true, 68 "answer": false, 69 "justification": "No justification is given for why HumanEval (164 problems), CoderEval (230 functions), or LeetCode (300 problems) are adequate sample sizes for the claims being made. No power analysis is discussed." 70 }, 71 "variance_reported": { 72 "applies": true, 73 "answer": false, 74 "justification": "No variance or standard deviation is reported across training runs. 
Results appear to come from a single training run. The n=200 samples for pass@k estimation provide sampling diversity but no across-run variance is reported." 75 } 76 }, 77 "evaluation_design": { 78 "baselines_included": { 79 "applies": true, 80 "answer": true, 81 "justification": "Tables 2 and 3 compare against multiple baselines: CodeGen-mono 16B, CodeGeeX 13B, StarCoder 15B, CodeT5+ 16B, WizardCoder 15B, and closed-source models including GPT-3.5 and GPT-4." 82 }, 83 "baselines_contemporary": { 84 "applies": true, 85 "answer": true, 86 "justification": "WizardCoder (2023), StarCoder (2023), CodeT5+ (2023) are contemporary to the paper. The paper states WizardCoder was 'the state-of-the-art Code LLM prior to PanGu-Coder2.'" 87 }, 88 "ablation_study": { 89 "applies": true, 90 "answer": false, 91 "justification": "Figure 4 shows performance vs. dataset size (18k, 38k, 68k) and training steps, but there is no ablation of the RRTF framework components — e.g., removing ranking loss, removing teacher feedback, comparing against plain SFT with the same data. The contribution of RRTF vs. data quality is not isolated." 92 }, 93 "multiple_metrics": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are reported for pass@1, pass@10, and pass@100 (Table 2), as well as greedy decoding pass@1 on three different benchmarks (Table 3). Table 4 reports GPU memory, inference speed, and HumanEval performance for quantization." 97 }, 98 "human_evaluation": { 99 "applies": true, 100 "answer": false, 101 "justification": "No human evaluation of generated code is conducted. Figures 6-8 show case studies with manual inspection but these are qualitative examples, not systematic human evaluation." 102 }, 103 "held_out_test_set": { 104 "applies": true, 105 "answer": true, 106 "justification": "HumanEval, CoderEval, and LeetCode are separate test benchmarks not used during training. 
Section 3.3 explicitly states they checked for overlap between the 68K training data and HumanEval. LeetCode uses problems after July 2022, post-dating StarCoder's training data." 107 }, 108 "per_category_breakdown": { 109 "applies": true, 110 "answer": true, 111 "justification": "Table 3 breaks down LeetCode performance by difficulty (easy/medium/hard: 32/30/10). Figure 5 shows Venn diagrams of solved problems by model. Results across three distinct benchmarks provide task-level diversity." 112 }, 113 "failure_cases_discussed": { 114 "applies": true, 115 "answer": true, 116 "justification": "Figure 8 shows a case where all three models (PanGu-Coder2, WizardCoder, StarCoder) generate incorrect code. Figure 7 shows a case where WizardCoder outperforms PanGu-Coder2. Section 4.2.3 discusses strengths and weaknesses." 117 }, 118 "negative_results_reported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Figure 7 shows WizardCoder outperforming PanGu-Coder2 on certain problems. Table 4 shows GPTQ quantization significantly degrades HumanEval performance (62.20% → 51.22%). Figure 5 shows 18 problems none of the models can solve even with 200 samples." 122 } 123 }, 124 "claims_and_evidence": { 125 "abstract_claims_supported": { 126 "applies": true, 127 "answer": true, 128 "justification": "The abstract claims 62.20% pass@1 on HumanEval, confirmed in Table 3. The claim of outperforming 'all previous Code LLMs' is supported by Tables 2 and 3 across three benchmarks, with the caveat that GPT-4 (a general LLM, not a 'Code LLM') scores higher." 129 }, 130 "causal_claims_justified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper claims RRTF 'effectively and efficiently boost[s]' code generation. However, there is no ablation separating the effects of RRTF from the curated 68K dataset, the Evol-Instruct data augmentation, or teacher model knowledge distillation. The causal role of the ranking feedback mechanism is not isolated." 
134 }, 135 "generalization_bounded": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper's title, 'PanGu-Coder2: Boosting Large Language Models for Code with Ranking Feedback', is framed around code LLMs in general, but evaluation is exclusively on Python. The abstract claims RRTF 'can effectively and efficiently boost pre-trained large language models for code generation' without bounding this to Python or the specific model tested." 139 }, 140 "alternative_explanations_discussed": { 141 "applies": true, 142 "answer": false, 143 "justification": "No alternative explanations are discussed. The improvement could stem from the curated training data quality, teacher model knowledge transfer, the Evol-Instruct augmentation, or RRTF specifically — but these are never disentangled or discussed." 144 }, 145 "proxy_outcome_distinction": { 146 "applies": true, 147 "answer": true, 148 "justification": "The paper measures pass@k on specific benchmarks and frames results in those terms. It does not overclaim that pass@k on HumanEval represents general 'programming ability' or 'code quality.' Claims are tied to the specific metrics measured." 149 } 150 }, 151 "setup_transparency": { 152 "model_versions_specified": { 153 "applies": true, 154 "answer": false, 155 "justification": "The base model is identified as 'StarCoder 15B' without a specific checkpoint or version. The teacher models used in RRTF training are never identified by name or version — Section 3.4 refers only to generic 'teacher models.' GPT-3.5/GPT-4 comparison scores are cited from other papers without version specification." 156 }, 157 "prompts_provided": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 4.1.5 provides the exact evaluation prompts used for PanGu-Coder2, StarCoder, and WizardCoder. Figure 2 provides the Evol-Instruct prompt template. Figure 3 shows the training data format."
161 }, 162 "hyperparameters_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Table 1 lists model architecture hyperparameters. Section 3.5 states 'global batch size of 512 for 6 epochs.' Section 4.1.4 specifies temperature=0.2 for pass@1, temperature=1.2 for pass@10/100, top_p=0.95, max_new_tokens=512." 166 }, 167 "scaffolding_described": { 168 "applies": false, 169 "answer": false, 170 "justification": "No agentic scaffolding is used. The models directly generate code from prompts without multi-step reasoning, tool use, or agent frameworks." 171 }, 172 "data_preprocessing_documented": { 173 "applies": true, 174 "answer": false, 175 "justification": "Section 3.3 mentions 'we conducted data preprocessing on our initial corpus using several manually-defined rules and reduced the size of the corpus to 68K' but never specifies what those rules were or how many examples were removed at each step." 176 } 177 }, 178 "limitations_and_scope": { 179 "limitations_section_present": { 180 "applies": true, 181 "answer": false, 182 "justification": "There is no limitations section or threats-to-validity discussion in the paper. The conclusion mentions future work but does not discuss limitations of the current study." 183 }, 184 "threats_to_validity_specific": { 185 "applies": true, 186 "answer": false, 187 "justification": "No specific threats to validity are discussed anywhere in the paper." 188 }, 189 "scope_boundaries_stated": { 190 "applies": true, 191 "answer": false, 192 "justification": "No explicit scope boundaries are stated. The paper does not acknowledge that results are limited to Python, to the specific benchmarks tested, or to the 15B model scale." 193 } 194 }, 195 "data_integrity": { 196 "raw_data_available": { 197 "applies": true, 198 "answer": false, 199 "justification": "Neither the 68K training dataset nor per-problem evaluation results are made available. Only aggregate metrics are reported." 
200 }, 201 "data_collection_described": { 202 "applies": true, 203 "answer": false, 204 "justification": "Section 3.3 describes the high-level pipeline (CodeAlpaca-20k → Evol-Instruct → 100K → filtering → 68K) but omits critical details: what teacher models were sampled, what the filtering rules were, and what constituted the removed 32K samples." 205 }, 206 "recruitment_methods_described": { 207 "applies": false, 208 "answer": false, 209 "justification": "No human participants. Data sources are the CodeAlpaca-20k public dataset and standard code generation benchmarks." 210 }, 211 "data_pipeline_documented": { 212 "applies": true, 213 "answer": false, 214 "justification": "The pipeline from 100K to 68K involves unspecified 'manually-defined rules.' The ranking and scoring procedure (Section 3.4) is described at a high level but lacks specifics on how many samples fell into each category (compiled error, runtime error, partial pass, all pass)." 215 } 216 }, 217 "conflicts_of_interest": { 218 "funding_disclosed": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding or acknowledgments section is present in the paper." 222 }, 223 "affiliations_disclosed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Author affiliations are clearly listed: Huawei Cloud Co., Ltd., Chinese Academy of Science, and Peking University. The affiliation header identifies which authors belong to each institution." 227 }, 228 "funder_independent_of_outcome": { 229 "applies": true, 230 "answer": false, 231 "justification": "The majority of authors are from Huawei Cloud, which has a commercial interest in demonstrating strong code LLM performance. Huawei is not independent of the outcome — a successful PanGu-Coder2 promotes their commercial interests." 232 }, 233 "financial_interests_declared": { 234 "applies": true, 235 "answer": false, 236 "justification": "No competing interests or financial interests statement is included in the paper."
237 } 238 }, 239 "contamination": { 240 "training_cutoff_stated": { 241 "applies": true, 242 "answer": true, 243 "justification": "Section 4.1.2 states StarCoder 'only consists of code before June 2022.' This establishes the base model's training data cutoff." 244 }, 245 "train_test_overlap_discussed": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section 3.3: 'we devoted considerable efforts to surveying the potential overlap between the collected 68K dataset and the HumanEval benchmark. After conducting a meticulous survey, we confirm that there is no data leakage in our experiments.'" 249 }, 250 "benchmark_contamination_addressed": { 251 "applies": true, 252 "answer": false, 253 "justification": "HumanEval was published in July 2021, and StarCoder's training data includes code before June 2022 — solutions could have been posted to GitHub in that window. The paper only checks overlap with the 68K fine-tuning data, not with StarCoder's pre-training corpus. For LeetCode they create a temporal split, but HumanEval contamination through the base model is not addressed." 254 } 255 }, 256 "human_studies": { 257 "pre_registered": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "irb_or_ethics_approval": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "demographics_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "inclusion_exclusion_criteria": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "randomization_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "blinding_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 
286 }, 287 "attrition_reported": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants in this study." 291 } 292 }, 293 "cost_and_practicality": { 294 "inference_cost_reported": { 295 "applies": true, 296 "answer": true, 297 "justification": "Table 4 reports GPU memory consumption (32.36 GB for float16) and inference speed (75 ms/token) for PanGu-Coder2 and its quantized variants." 298 }, 299 "compute_budget_stated": { 300 "applies": true, 301 "answer": false, 302 "justification": "No total training compute budget is stated. The paper mentions batch size 512 for 6 epochs on 68K samples, but does not report GPU hours, hardware used for training, or total training time." 303 } 304 }, 305 "experimental_rigor": { 306 "seed_sensitivity_reported": { 307 "applies": true, 308 "answer": false, 309 "justification": "No mention of multiple random seeds. Results appear to come from a single training run with single-seed evaluation." 310 }, 311 "number_of_runs_stated": { 312 "applies": true, 313 "answer": false, 314 "justification": "The number of training runs is not stated. For inference, n=200 samples are generated per problem for pass@k, but it is unclear if this represents a single training checkpoint or multiple runs." 315 }, 316 "hyperparameter_search_budget": { 317 "applies": true, 318 "answer": false, 319 "justification": "No hyperparameter search budget is reported. The paper does not describe how training hyperparameters (batch size 512, 6 epochs) were selected." 320 }, 321 "best_config_selection_justified": { 322 "applies": true, 323 "answer": false, 324 "justification": "Table 2 states results are the 'optimal pass@1/10/100' across temperature settings, following Codex methodology. However, the training configuration selection (why 68K, why 6 epochs) is not justified through validation-set selection — Figure 4 shows training curves but doesn't indicate which checkpoint was selected or how." 
325 }, 326 "multiple_comparison_correction": { 327 "applies": false, 328 "answer": false, 329 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 330 }, 331 "self_comparison_bias_addressed": { 332 "applies": true, 333 "answer": false, 334 "justification": "Huawei Cloud authors evaluate their own PanGu-Coder2 model without acknowledging self-evaluation bias. Baseline scores for some models are taken from other papers, while PanGu-Coder2 is evaluated by the authors." 335 }, 336 "compute_budget_vs_performance": { 337 "applies": true, 338 "answer": false, 339 "justification": "PanGu-Coder2 receives additional RRTF training on 68K examples beyond StarCoder's pre-training, but no compute-matched comparison is provided. The additional training cost vs. baseline models like WizardCoder is not discussed." 340 }, 341 "benchmark_construct_validity": { 342 "applies": true, 343 "answer": false, 344 "justification": "No discussion of whether HumanEval, CoderEval, or LeetCode adequately measure real-world code generation ability. The benchmarks are used without questioning their construct validity." 345 }, 346 "scaffold_confound_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "Section 4.1.5 acknowledges 'performance of a Code LLM could be largely affected by the prompt used' and uses different prompts per model (each model's recommended prompt). However, this prompt variation is a confound that is not controlled for — performance differences could partially reflect prompt format advantages rather than model quality." 350 } 351 }, 352 "data_leakage": { 353 "temporal_leakage_addressed": { 354 "applies": true, 355 "answer": true, 356 "justification": "For LeetCode, the paper explicitly creates a temporal split: problems created after July 1, 2022, post-dating StarCoder's training data cutoff (before June 2022). 
Section 4.1.2 states this 'ensures that any data in this benchmark does not overlap with the training data of StarCoder.'" 357 }, 358 "feature_leakage_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether evaluation prompts or test harness details could leak answer information. The different prompt formats per model (Section 4.1.5) could provide varying amounts of contextual information." 362 }, 363 "non_independence_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "No discussion of whether training examples and benchmark problems share structural similarities (e.g., common coding patterns, similar problem types from the same sources)." 367 }, 368 "leakage_detection_method": { 369 "applies": true, 370 "answer": true, 371 "justification": "Section 3.3 states: 'we devoted considerable efforts to surveying the potential overlap between the collected 68K dataset and the HumanEval benchmark.' This is a concrete overlap detection step, though limited to the fine-tuning data and one benchmark." 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "PanGu-Coder2 achieves 62.20% pass@1 on HumanEval with greedy decoding, and 61.64% with temperature sampling (n=200)", 378 "evidence": "Tables 2 and 3 report these results. Section 4.1.4 describes the evaluation methodology following Codex/AlphaCode conventions.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "PanGu-Coder2 outperforms all previously published Code LLMs on HumanEval, CoderEval, and LeetCode", 383 "evidence": "Tables 2 and 3 show PanGu-Coder2 achieving the best open-source results across all three benchmarks. It also outperforms GPT-3.5 on HumanEval but not GPT-4.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "RRTF provides a 28% absolute improvement over StarCoder base model on HumanEval pass@1", 388 "evidence": "Table 2: StarCoder 33.60% → PanGu-Coder2 61.64% (sampling). 
Table 3: StarCoder 32.93% → PanGu-Coder2 62.20% (greedy). The improvement is real but conflated with data quality effects.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "RRTF is simpler and more efficient than PPO-based RL approaches for code generation", 393 "evidence": "Section 1 argues this conceptually, citing PPO complexity. No direct empirical comparison of training cost, convergence speed, or implementation complexity against PPO-based methods.", 394 "supported": "weak" 395 }, 396 { 397 "claim": "Larger dataset size leads to better and more stable training", 398 "evidence": "Figure 4 shows training curves for 18k, 38k, and 68k datasets. Performance improves and stabilizes with more data. However, only three data points with no controlled comparison.", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "CTranslate2 int8 quantization slightly improves HumanEval performance (62.20% → 64.63%)", 403 "evidence": "Table 4 reports this result with a single run. No error bars or multiple runs to confirm the improvement is not noise. The authors themselves call it 'incredible' and plan further study.", 404 "supported": "weak" 405 } 406 ], 407 "red_flags": [ 408 { 409 "flag": "Company evaluating its own product", 410 "detail": "Huawei Cloud authors evaluate PanGu-Coder2 without independent validation or acknowledgment of self-evaluation bias. All evaluation was conducted by the team that built the model." 411 }, 412 { 413 "flag": "No statistical tests on any comparison", 414 "detail": "All claims of 'outperforms' are based on comparing point estimates from what appears to be a single training run with no error bars. The 4.34% improvement over WizardCoder could easily be within noise." 415 }, 416 { 417 "flag": "Confounded attribution", 418 "detail": "The improvement is attributed to RRTF but could stem from the curated 68K dataset, teacher model knowledge, Evol-Instruct augmentation, or their combination. No ablation isolates the RRTF contribution." 
419 }, 420 { 421 "flag": "Unidentified teacher models", 422 "detail": "The teacher models used in RRTF training are never named. If teacher models include GPT-4 or similar powerful models, much of the improvement could be knowledge distillation rather than the RRTF framework." 423 }, 424 { 425 "flag": "Different prompts across models", 426 "detail": "Section 4.1.5 shows each model uses a different prompt format. PanGu-Coder2 uses docstring-first format while WizardCoder uses instruction format. Performance differences could partly reflect prompt format advantages." 427 }, 428 { 429 "flag": "No limitations section", 430 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries despite making broad claims about Code LLM boosting." 431 } 432 ], 433 "cited_papers": [ 434 { 435 "title": "Evaluating large language models trained on code", 436 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 437 "year": 2021, 438 "arxiv_id": "2107.03374", 439 "relevance": "Introduced Codex and HumanEval benchmark, the primary evaluation benchmark used in this paper and widely in code LLM research." 440 }, 441 { 442 "title": "StarCoder: may the source be with you!", 443 "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"], 444 "year": 2023, 445 "arxiv_id": "2305.06161", 446 "relevance": "Base model for PanGu-Coder2; 15B open-source code LLM that is the starting point for RRTF fine-tuning." 447 }, 448 { 449 "title": "WizardCoder: Empowering code large language models with evol-instruct", 450 "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"], 451 "year": 2023, 452 "arxiv_id": "2306.08568", 453 "relevance": "Prior state-of-the-art Code LLM on HumanEval; key baseline and source of Evol-Instruct technique used in PanGu-Coder2." 
454 }, 455 { 456 "title": "CodeRL: Mastering code generation through pretrained models and deep reinforcement learning", 457 "authors": ["Hung Le", "Yue Wang", "Akhilesh Deepak Gotmare"], 458 "year": 2022, 459 "arxiv_id": "2207.01780", 460 "relevance": "Pioneered RL-based code generation with actor-critic framework and unit test signals; key prior work on RL for code LLMs." 461 }, 462 { 463 "title": "RLTF: Reinforcement learning from unit test feedback", 464 "authors": ["Jiate Liu", "Yiqin Zhu", "Kaiwen Xiao"], 465 "year": 2023, 466 "relevance": "Online RL framework with multi-granularity unit test feedback for code generation; direct competitor to RRTF approach." 467 }, 468 { 469 "title": "Execution-based code generation using deep reinforcement learning", 470 "authors": ["Parshin Shojaee", "Aneesh Jain", "Sindhu Tipirneni"], 471 "year": 2023, 472 "arxiv_id": "2301.13816", 473 "relevance": "PPOCoder: PPO-based RL for code generation that showed limited improvements on MBPP benchmark." 474 }, 475 { 476 "title": "RRHF: Rank responses to align language models with human feedback without tears", 477 "authors": ["Zheng Yuan", "Hongyi Yuan", "Chuanqi Tan"], 478 "year": 2023, 479 "arxiv_id": "2304.05302", 480 "relevance": "Core inspiration for RRTF framework; proposed ranking-based alignment as simpler alternative to RLHF." 481 }, 482 { 483 "title": "Training language models to follow instructions with human feedback", 484 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 485 "year": 2022, 486 "relevance": "InstructGPT/RLHF foundational work that inspired the ranking-based feedback approach adapted in RRTF." 487 }, 488 { 489 "title": "GPT-4 technical report", 490 "authors": ["OpenAI"], 491 "year": 2023, 492 "arxiv_id": "2303.08774", 493 "doi": "10.48550/arXiv.2303.08774", 494 "relevance": "Key commercial comparison point; GPT-4 outperforms PanGu-Coder2 on HumanEval." 
495 }, 496 { 497 "title": "Textbooks are all you need", 498 "authors": ["Suriya Gunasekar", "Yi Zhang", "Jyoti Aneja"], 499 "year": 2023, 500 "arxiv_id": "2306.11644", 501 "relevance": "phi-1: data-efficient code LLM achieving strong results with only 1.3B parameters through data quality." 502 }, 503 { 504 "title": "CodeT5+: Open code large language models for code understanding and generation", 505 "authors": ["Yue Wang", "Hung Le", "Akhilesh Deepak Gotmare"], 506 "year": 2023, 507 "arxiv_id": "2305.07922", 508 "relevance": "Encoder-decoder Code LLM baseline with modular architecture for diverse code tasks." 509 }, 510 { 511 "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models", 512 "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"], 513 "year": 2023, 514 "arxiv_id": "2302.00288", 515 "relevance": "Context-aware code generation benchmark used to evaluate PanGu-Coder2 beyond HumanEval's simple function generation." 516 } 517 ] 518 }