scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27738B)
      1 {
      2   "paper": {
      3     "title": "Pragmatic Reasoning improves LLM Code Generation",
      4     "authors": [
      5       "Zhuchen Cao",
      6       "Sven Apel",
      7       "Adish Singla",
      8       "Vera Demberg"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2502.15835",
     13     "doi": "10.48550/arXiv.2502.15835"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "CodeRSA applies the Rational Speech Act pragmatic reasoning framework to code candidate reranking, achieving 59.53% accuracy on MBPP with Llama-3-8B-Instruct vs. Coder (~55%) and CoderReviewer (~57%) baselines. Semantic clustering of paraphrased instructions is critical — removing it causes a substantial accuracy drop. The approach is robust to the calibration parameter α across a wide band. Results are consistent across two models (Llama-3-8B, Qwen-2.5-7B) and two benchmarks (HumanEval, MBPP), with a sanity check on Llama-3-70B.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, code archive, or link to released source code is provided anywhere in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses publicly available benchmarks: HumanEval (Chen et al., 2021) and MBPP (Austin et al., 2021). No proprietary data was collected."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions 'NVIDIA Tesla A100 (PCIe 4.0, 80GB HBM2e, 300W)' in Section 8 but provides no requirements.txt, dependency specifications, or library versions."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described algorithmically but not in a reproducible recipe format."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Main results (Figures 3, 5, 6, 7) report single accuracy values across α sweeps with no confidence intervals or error bars. Appendix A.4 (Figure 8) shows standard deviation bands for the sensitivity analysis, but the primary reported results lack uncertainty quantification."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims CodeRSA 'outperforms' and 'surpasses' baselines based solely on comparing accuracy percentages. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported for any comparison."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results are presented as accuracy curves across α values (Figures 3, 5, 6, 7) and as raw accuracy numbers (e.g., 59.53% in Section 5.1; Table 2 in Appendix A.3). No explicit effect sizes (e.g., percentage point improvements with baseline context) are calculated or reported."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses standard benchmark sizes (HumanEval: 164, MBPP: 974) without justifying whether these are sufficient for the comparisons being made. No power analysis is performed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Main experimental results (Figures 3, 5, 6, 7) appear to be single-run numbers without variance or standard deviation. Only Appendix A.4's sensitivity analysis (Figure 8) reports standard deviation across 10 repetitions with random candidate subsampling."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares against Coder Reranking (Chen et al., 2021) and CoderReviewer Reranking (Zhang et al., 2023a), described as the state-of-the-art content-driven reranking method."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "CoderReviewer (Zhang et al., 2023a) is described as the current state-of-the-art for content-driven code reranking. The paper also references but excludes execution-driven methods (CodeT, AgentCoder) for methodological reasons."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All four main result figures (3, 5, 6, 7) include 'CodeRSA w/o Clustering' as an ablation, showing the contribution of the clustering component. The α calibration parameter sweep also functions as a form of ablation."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper uses only accuracy (proportion of instances where the selected candidate passes all test cases) as the evaluation metric. No additional metrics (e.g., code quality, runtime, rank position) are reported."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Evaluation is entirely automated via test-case pass/fail. The qualitative analysis in Section 5.2 is performed by the authors to illustrate RSA behavior, not a systematic human evaluation of outputs."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The calibration parameter α is swept across the full benchmark test sets (Figures 3, 5, 6, 7) with best values selected on the same data used for evaluation. No separate validation set is used for hyperparameter selection."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by model (Llama-3-8B-Instruct, Qwen-2.5-7B-Instruct), dataset (HumanEval, MBPP), and number of candidates (Figure 8). Appendix A.3 includes a 70B model breakdown."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The qualitative analysis (Section 5.2) shows only a successful case where CodeRSA corrects Coder's mistake. No examples of where CodeRSA fails or makes worse selections are discussed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The ablation shows CodeRSA without clustering performs substantially worse, sometimes below baselines. The paper also notes 'some fluctuations in performance' on HumanEval (Appendix A.2) and acknowledges that Schuster et al. (2024) obtained negative results with RSA on spreadsheets."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims CodeRSA 'consistently outperforms common baselines, surpasses the state-of-the-art approach in most cases.' The results show consistent improvement over Coder and improvement over CoderReviewer in most (not all) settings. The hedging ('most cases') matches the evidence."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims ('improves', 'enhancing') supported by controlled comparisons: all methods operate on the same candidate set per problem, with only the reranking method varying. The ablation study (removing clustering) isolates component contributions through controlled single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Pragmatic Reasoning improves LLM Code Generation' and abstract claim about 'enhancing code generation quality in LLMs' are broader than the evidence: two mid-sized open-source models on two Python benchmarks. The Limitations section acknowledges narrow scope but the title and abstract overreach."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The Discussion (Section 6) addresses uniform cost assumption and applicability conditions, but does not consider alternative explanations for the observed improvements. For example, could the clustering step alone (without RSA) explain improvements? Could the results be specific to these benchmark difficulties? No confound analysis is provided."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures accuracy (test-case pass rate) and discusses this as selecting the correct candidate from sampled solutions. Claims stay at the level of 'reranking quality' and 'candidate selection,' which match the measurement granularity. No inflated framing beyond what test-case passing demonstrates."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model versions are stated: 'Llama-3-8B-Instruct' and 'Qwen-2.5-7B-Instruct' (Section 4.1), plus 'Llama-3-70B-Instruct' for the sanity check (Appendix A.3). These are specific open-source releases with defined parameter counts."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figure 2 shows the prompts for calculating Coder score and generating additional instructions. Appendix A.6 provides the full prompt templates including a one-shot example for instruction generation and the Reviewer score prompt format."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Key hyperparameters are reported: temperature=1.0 for candidate sampling, temperature=0.7 for instruction generation, n=10 candidates, m=1 additional instruction per candidate, α calibration parameter swept and analyzed (Section 4.2)."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. CodeRSA is a reranking method applied post-generation, not an agent with tools, memory, or iterative loops."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The pipeline is documented: sample n=10 candidates per problem at temperature 1.0, generate m=1 instruction per candidate, perform pairwise semantic equivalence clustering, compute literal listener scores, aggregate by cluster, and apply pragmatic listener reranking (Sections 3-4)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8 'Limitations' provides substantive discussion of computational complexity, candidate pool size constraints, and scope of models/benchmarks."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats are discussed: computational cost (6 hours on A100 for 500 instances), restriction to 10 candidates which 'inevitably narrows the variety of solutions,' deliberate choice of mid-sized models, and plan to expand to DS-1000, Mistral, and newer Qwen releases."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly states: 'we deliberately focus on mid-sized models and well-established benchmarks as a proof-of-concept design choice' and 'We do not include specialized coder models in this study.' Specific exclusions (model families, benchmark types) are enumerated."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw data (generated candidates, instructions, scores, clustering outputs) is made available. Only aggregated accuracy numbers are reported."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The data generation procedure is described: n=10 candidates sampled at temperature 1.0, m=1 instruction generated per candidate at temperature 0.7, 3-shot prompt for semantic equivalence clustering (Sections 3-4)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data source is standard public benchmarks (HumanEval, MBPP)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The full pipeline from candidate generation to instruction generation to pairwise clustering to literal listener scoring to pragmatic speaker/listener reranking is documented step-by-step in Sections 3 and 4."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source, acknowledgments section, or grant information is provided anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: Max Planck Institute for Informatics, Saarland University, and Max Planck Institute for Software Systems. No products being evaluated are produced by these institutions."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Authors are at public academic institutions (Max Planck Institutes, Saarland University). These institutions have no financial stake in the outcome of CodeRSA's performance evaluation."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for Llama-3-8B-Instruct or Qwen-2.5-7B-Instruct. Both models were released in 2024, well after the benchmarks were published."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether HumanEval or MBPP problems appeared in the training data of either model. Both benchmarks were published in 2021 and are widely known."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "HumanEval (2021) and MBPP (2021) were published years before Llama-3 (2024) and Qwen-2.5 (2024) were trained. The contamination risk is high and completely unaddressed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study is a purely computational evaluation of reranking methods."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 8 states: 'on a single NVIDIA Tesla A100 (PCIe 4.0, 80GB HBM2e, 300W), performing complete CodeRSA inference on 500 instances takes nearly 6 hours.'"
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "The GPU hardware (NVIDIA Tesla A100, 80GB) and approximate runtime (6 hours for 500 instances) are stated in Section 8."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Main results (Figures 3, 5, 6, 7) are presented as single runs without seed variation. Appendix A.4 varies random candidate subsampling across 10 seeds, but this tests candidate selection sensitivity, not generation seed sensitivity."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The main experiments do not state how many runs were performed. Only Appendix A.4's sensitivity analysis explicitly states 'ten times with different random seeds.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The α parameter is swept across a range shown in figures, but no explicit search budget (number of configurations, compute spent) is reported. The sweep granularity and total configurations tried are not stated."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Best α=1.0 is selected based on test set performance (Figure 3, MBPP with Llama-3-8B). No separate validation set is used for hyperparameter selection — α is effectively tuned on the test data."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement all methods (Coder, CoderReviewer, CodeRSA) themselves. No discussion of author-evaluation bias or whether their baseline implementations match published results."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "CodeRSA requires generating additional instructions, clustering, and computing scores across all instruction-candidate pairs — substantially more compute than Coder or CoderReviewer. Performance is not compared at matched compute budgets."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Section 4.1 discusses why HumanEval and MBPP are suitable difficulty levels for evaluating reranking, but does not question whether test-case pass rate actually measures code generation quality or whether these benchmarks have known validity limitations."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. CodeRSA is a post-generation reranking method, not an agent scaffold."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. HumanEval and MBPP were published in 2021. Llama-3 and Qwen-2.5 were released in 2024 and could have seen benchmark solutions in their training data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. No analysis of whether the evaluation setup leaks information (e.g., function signatures in HumanEval providing strong hints)."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Not discussed. No analysis of whether training data contains problems structurally similar to HumanEval or MBPP problems."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or temporal splits."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "CodeRSA consistently outperforms the Coder Reranking baseline and surpasses the CoderReviewer Reranking baseline in most cases across models and benchmarks.",
    370       "evidence": "Figures 3, 5, 6, 7 show CodeRSA above baselines on MBPP and HumanEval with both Llama-3-8B-Instruct and Qwen-2.5-7B-Instruct. E.g., 59.53% vs baselines on MBPP/Llama-3-8B at α=1.0 (Section 5.1).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Semantic clustering is crucial for CodeRSA's performance — removing it causes a substantial accuracy drop.",
    375       "evidence": "All four main result figures (3, 5, 6, 7) include 'CodeRSA w/o Clustering' ablation showing performance drops below baselines in some cases.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "CodeRSA is robust to the calibration parameter α within a stable band of [0.90, 1.15].",
    380       "evidence": "Figure 3 shows stable performance within the shaded band. Appendix A.2 figures show similar stability for other model/dataset combinations.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "CodeRSA scales to larger models (Llama-3-70B-Instruct) without abnormal behavior.",
    385       "evidence": "Table 2 (Appendix A.3): CodeRSA achieves 51.2% accuracy vs CoderReviewer's 48.8% and Coder's 39.3% on 84 MBPP instances unsolved at pass@1 but solved at pass@10.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "CodeRSA corrects the length bias inherent in Coder Reranking by normalizing over alternative instruction clusters.",
    390       "evidence": "Qualitative example in Section 5.2 (Figure 4) shows Coder preferring a shorter, incomplete code_09 while CodeRSA correctly selects code_01 via pragmatic reasoning over clusters.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No statistical significance tests",
    397       "detail": "All comparative claims ('outperforms', 'surpasses') are based on comparing raw accuracy percentages without any statistical tests. With single-run results on modestly sized benchmarks, observed differences could be due to chance."
    398     },
    399     {
    400       "flag": "Hyperparameter tuned on test set",
    401       "detail": "The calibration parameter α is swept and selected based on test set performance (Figures 3, 5, 6, 7). No validation set is used. The 'best α=1.0' is effectively selected post-hoc on the evaluation data."
    402     },
    403     {
    404       "flag": "Benchmark contamination unaddressed",
    405       "detail": "HumanEval (2021) and MBPP (2021) are well-known benchmarks published years before Llama-3 (2024) and Qwen-2.5 (2024). The models likely encountered these problems during training, potentially inflating absolute performance numbers and complicating interpretation of reranking improvements."
    406     },
    407     {
    408       "flag": "No error bars on main results",
    409       "detail": "Main experimental results are presented as single-run accuracy values. Only the sensitivity analysis in Appendix A.4 reports standard deviation across runs. The stability of the main findings cannot be assessed."
    410     },
    411     {
    412       "flag": "Compute cost asymmetry not controlled",
    413       "detail": "CodeRSA requires substantially more compute than baselines (generating instructions, pairwise clustering, cross-evaluating all instruction-candidate pairs). Performance is not compared at matched compute budgets, making it unclear whether the improvement justifies the additional cost (nearly 6 hours for 500 instances on a single A100)."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["Mark Chen"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduces HumanEval benchmark and Coder Reranking baseline used in this paper's evaluation."
    423     },
    424     {
    425       "title": "Program synthesis with large language models",
    426       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    427       "year": 2021,
    428       "arxiv_id": "2108.07732",
    429       "relevance": "Introduces MBPP benchmark used as one of two primary evaluation datasets."
    430     },
    431     {
    432       "title": "Coder reviewer reranking for code generation",
    433       "authors": ["Tianyi Zhang", "Tao Yu", "Tatsunori B Hashimoto"],
    434       "year": 2023,
    435       "relevance": "State-of-the-art content-driven code reranking baseline that CodeRSA aims to surpass."
    436     },
    437     {
    438       "title": "CodeT: Code generation with generated tests",
    439       "authors": ["Bei Chen", "Fengji Zhang", "Anh Nguyen"],
    440       "year": 2022,
    441       "arxiv_id": "2207.10397",
    442       "relevance": "Execution-driven code reranking method representing an alternative approach to candidate selection."
    443     },
    444     {
    445       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    446       "authors": ["Dong Huang", "Jie M. Zhang", "Michael Luck"],
    447       "year": 2024,
    448       "arxiv_id": "2312.13010",
    449       "relevance": "Multi-agent code generation with iterative testing, an execution-driven reranking alternative."
    450     },
    451     {
    452       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    453       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    454       "year": 2024,
    455       "arxiv_id": "2407.21787",
    456       "relevance": "Explores scaling inference compute via repeated sampling for code generation — directly relevant to the sample-then-rerank paradigm."
    457     },
    458     {
    459       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    460       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    461       "year": 2024,
    462       "relevance": "Rigorous evaluation methodology for LLM code generation, directly relevant to benchmark validity."
    463     },
    464     {
    465       "title": "The llama 3 herd of models",
    466       "authors": ["Aaron Grattafiori"],
    467       "year": 2024,
    468       "arxiv_id": "2407.21783",
    469       "relevance": "Technical report for Llama-3, one of the primary models evaluated in this work."
    470     },
    471     {
    472       "title": "A performance study of LLM-generated code on LeetCode",
    473       "authors": ["Tristan Coignion", "Clément Quinton", "Romain Rouvoy"],
    474       "year": 2024,
    475       "relevance": "Empirical study of LLM code generation performance on competitive programming problems."
    476     },
    477     {
    478       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    479       "authors": ["Terry Yue Zhuo"],
    480       "year": 2024,
    481       "arxiv_id": "2406.15877",
    482       "relevance": "More challenging code generation benchmark referenced as too difficult for meaningful reranking comparison."
    483     },
    484     {
    485       "title": "Evaluating the code quality of AI-assisted code generation tools: An empirical study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    486       "authors": ["Burak Yetiştiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"],
    487       "year": 2023,
    488       "arxiv_id": "2304.10778",
    489       "relevance": "Evaluates code quality of commercial AI code generation tools, relevant to AI programming assessment."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs