scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27487B)
      1 {
      2   "paper": {
      3     "title": "CodeScore: Evaluating Code Generation by Learning Code Execution",
      4     "authors": [
      5       "Yihong Dong",
      6       "Jiazheng Ding",
      7       "Xue Jiang",
      8       "Ge Li",
      9       "Zhuo Li",
     10       "Zhi Jin"
     11     ],
     12     "year": 2023,
     13     "venue": "ACM (Conference acronym placeholder in paper)",
     14     "arxiv_id": "2301.09043"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper provides a GitHub link for the code (https://github.com/Dingjz/CodeScore) and a HuggingFace link for the model (https://huggingface.co/dz1/CodeScore), both listed in footnotes 2 and 3 on page 3."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub link for the constructed code evaluation datasets (https://github.com/YihongDong/CodeGenEvaluation) in footnote 4 on page 3. The datasets (APPS-Eval, MBPP-Eval, HE-Eval) are built from public benchmarks."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions training on 'a single GPU of Tesla A100-PCIe-40G' and using 'UniXcoder' as the base model with Adam optimizer, learning rate 0.001, epoch 5, but does not provide a requirements.txt, Dockerfile, or detailed environment/dependency specification listing library versions."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While code and data are released, the paper itself does not include step-by-step reproduction instructions. The implementation details in Section 4.1.4 describe hyperparameters and architecture but do not provide specific commands or a reproducibility guide."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The main results tables (Tables 3, 4, 5) report only point estimates for correlation coefficients (Kendall-Tau, Spearman, Pearson) and MAE without confidence intervals or error bars. The human evaluation in Table 6 reports ± values (e.g., 3.4 ± 0.3), but the main experimental results lack uncertainty quantification."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports p-values for the human evaluation: 'All p-values are substantially less than 0.005' (Section 4.2.4, page 15). However, significance tests are not reported for the main correlation comparison results."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports absolute improvements with baseline context, e.g., 'CodeScore achieved absolute improvements of 40.56%, 55.07%, and 58.87% on τ, rs, and rp, respectively' (Section 4.2.1). Correlation coefficients are reported for all methods allowing magnitude comparison."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No justification is provided for the dataset sizes or the choice of 100 samples for human evaluation. The paper notes 'Considering the workload of the evaluators, we choose a moderate sample size of 100' (footnote 10), which is a practical constraint, not a statistical justification."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper states 'we exhibit the average performance of UniCE running five times' (Section 4.1.4) and 'we also run other LLM-based metrics five times with their public source code and provide the average performance' (Section 5). The human evaluation reports standard deviations (e.g., '3.4 ± 0.3'). However, the main tables do not show per-run variance for the 5 runs, so this item is borderline; it is marked true on the basis of the ± values reported in Table 6."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper compares against multiple baselines across three categories: match-based CEMs (BLEU, Accuracy, CodeBLEU, CrystalBLEU), LLM-based EMs (BERTScore, COMET, CodeBERTScore), and execution-based CEMs (AvgPassRatio). These are detailed in Section 4.1.2."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The baselines include CodeBERTScore (2023, concurrent work), CodeBLEU (2020), BERTScore (2020), and CrystalBLEU (2022). Given the paper's 2023 submission date, these are reasonably contemporary. The paper explicitly notes CodeBERTScore as 'a concurrent work.'"
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "RQ2 evaluates the Exec component independently. RQ3 is an explicit ablation comparing the unified loss L_Uni against individual losses (L_Ref, L_NL, L_Ref+NL) across all three input formats (Tables 3 and 4). Table 5 ablates different base models (UniXcoder, CodeBert, CodeGraphBert)."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses four evaluation metrics: Kendall-Tau (τ), Spearman R (rs), Pearson R (rp), and Mean Absolute Error (MAE), all defined in Section 4.1.3."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "RQ4 (Section 4.2.4) conducts a human evaluation with 10 computer science PhD students evaluating 100 randomly selected samples from HE-Eval, rating the reasonableness of different metrics' scores on a 0-5 scale."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The datasets have explicit train/dev/test splits (Table 1). Results are reported on test sets. Additionally, HE-Eval is used in a zero-shot setting (trained on APPS-Eval, tested on HE-Eval) providing cross-dataset evaluation."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down by dataset (APPS-Eval, MBPP-Eval, HE-Eval), by input format (Ref-only, NL-only, Ref&NL), by base model (Table 5), and the Exec results are broken down by dataset with Accuracy/F1/Precision (Figure 4)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "RQ5 (Section 4.2.5) provides case studies showing where previous metrics fail and how CodeScore handles those cases. The Discussion section (Section 6) identifies limitations including restriction to function-level Python code."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports that HE-Eval performance is 'the poorest' for Exec due to zero-shot setting (Section 4.2.2). Table 5 shows CodeGraphBert outperforms UniXcoder on some metrics. The paper also notes CodeScore still has 'certain limitations' in the Discussion."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims 'up to 58.87% correlation improvement' which corresponds to the Pearson R improvement on APPS-Eval in Table 3 (Ref&NL format). The 'state-of-the-art' claim is supported by Tables 3 and 4 showing CodeScore outperforming all non-execution baselines. The three input format claim is demonstrated throughout."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims through ablation studies (RQ3): removing L_Uni and using individual losses degrades performance, supporting the claim that the unified training contributes to improvement. The ablation design (controlled single-variable manipulation of loss functions and base models) is adequate for these claims."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper title and abstract claim to evaluate 'code generation' broadly, but the Discussion section acknowledges 'CodeScore is more suitable for evaluating function-level code in Python.' The title does not bound this — it says 'Evaluating Code Generation' without qualification. The experiments are exclusively on Python function-level code."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The Threats to Validity section discusses hyperparameter sensitivity and dataset-specific generalizability but does not consider alternative explanations for why CodeScore outperforms baselines. For example, it doesn't discuss whether the improvement comes from learning execution patterns vs. better code representation, or whether dataset construction choices might favor CodeScore."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper specifies UniXcoder as the base model and lists the specific code generation models used to produce generated code: 'CodeGen 350M&16B, InCoder 1B&6B, and CodeX 13B&175B' and 'StarCoder 15.5B, CodeLlama 34B, and GPT-4' (Section 4.1.1). Model sizes are given, though specific versions/snapshots of GPT-4 are not stated."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "This paper does not use prompting in its core methodology. CodeScore is a fine-tuned model that takes structured input (code + reference + NL). The generated codes used in evaluation were produced by various LLMs, but the paper is evaluating the metric, not doing prompting. Appendix A shows a ChatGPT prompt for test case generation, which is a supplementary experiment."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 4.1.4 reports: learning rate 0.001, training epochs 5, Adam optimizer, input token length limit 1024, feedforward network architecture (3 linear layers with dimensions 3072, 1024, 2, Tanh activation). The hardware (Tesla A100-PCIe-40G) is also specified."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. CodeScore is a standard fine-tuned LLM with a feedforward head for regression/classification."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.1.1 describes the full dataset construction pipeline: starting from public benchmarks, generating codes with multiple LLMs, building test cases (type inference → input enumeration → reference code execution), labeling with PassRatio. Input formatting is described in Section 4.1.4."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 5 'Threats to Validity' and Section 6 'Discussion' both address limitations. The Discussion section identifies three specific limitations: data collection requirements, restriction to function-level Python, and additional computation cost."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 5 discusses specific threats: (1) the quality and representativeness of the three specific datasets used, (2) hyperparameter sensitivity with their mitigation (grid search on validation set), and (3) instability of deep learning models mitigated by 5-run averaging. Section 6 adds Python/function-level restriction."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The Discussion section explicitly states 'CodeScore is more suitable for evaluating function-level code in Python' and identifies that the approach requires 'collecting a certain amount of data, including sufficient test cases, generated codes, reference codes, and NL descriptions.'"
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The datasets are released via GitHub (https://github.com/YihongDong/CodeGenEvaluation, footnote 4). The base benchmarks (APPS, MBPP, HumanEval) are public. This allows independent verification of the evaluation data."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.1.1 describes the full data collection procedure: which base datasets were used, how generated codes were obtained (from specific LLMs), how test cases were extended (3-step process: infer input types, enumerate inputs, execute reference code), and how PassRatio labels were computed."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "For the human evaluation (RQ4), the paper states 'We invite ten computer science PhD students, each with over three years of experience in Python development' but does not describe how they were recruited (e.g., from which institution, how they were selected, whether this introduces selection bias)."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The data pipeline is documented: public benchmarks → code generation by LLMs → test case extension (3-step process) → PassRatio labeling → train/dev/test split. Statistics are provided in Tables 1 and 2 showing the counts at each stage."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The Acknowledgments section states: 'This research is supported by the National Natural Science Foundation of China under Grant No.62192733, 61832009, 62192731, 62192730, 62072007, the Key Program of Hubei under Grant JD2023008.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All authors are affiliated with 'Key Laboratory of High Confidence Software Technologies (Peking University), Ministry of Education; School of Computer Science, Peking University, Beijing, China' as stated in the header."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The funding comes from the National Natural Science Foundation of China and Hubei provincial grants, which are government research funding agencies with no commercial interest in CodeScore's performance outcomes."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper fine-tunes UniXcoder on the APPS-Eval training set and evaluates on HE-Eval (based on HumanEval). However, the pre-training data cutoff for UniXcoder is not discussed. More critically, the LLMs used to generate code (CodeGen, InCoder, CodeX, GPT-4) may have seen the benchmark problems during pre-training, and no training cutoffs are stated for these models."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "The paper does not discuss whether the LLMs used to generate the evaluation code (CodeX, GPT-4, etc.) may have seen the benchmark problems (APPS, MBPP, HumanEval) during their pre-training. If a model memorized solutions rather than generating them, the PassRatio labels could be inflated."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "HumanEval was published in 2021, APPS in 2021, and MBPP in 2021. Several LLMs used (GPT-4 2023, CodeLlama 2023, StarCoder 2023) were trained after these benchmarks were public. The paper does not address whether the generated code quality (and thus the evaluation data) is affected by benchmark contamination of the generating models."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The human evaluation (RQ4) involving 10 PhD students is not pre-registered. No mention of any pre-registration."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No IRB or ethics board approval is mentioned for the human evaluation study involving 10 PhD students."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "The paper reports that evaluators are 'ten computer science PhD students, each with over three years of experience in Python development' (Section 4.2.4). This provides expertise level, though geographic distribution and gender are not reported."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No explicit inclusion/exclusion criteria are stated for the evaluators beyond being CS PhD students with 3+ years of Python experience. It is unclear how these 10 were selected from a larger pool."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "This is not a between-subjects experimental study. Each evaluator rates code-metric score pairs; there are no treatment/control conditions requiring randomization of participants."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "The paper states evaluators rated anonymously: 'We randomly list the generated code with reference code and NL and the corresponding EM score on the questionnaire. Each group is evaluated anonymously by one evaluator' (Section 4.2.4). The metrics are presented without identifying which metric produced each score."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No information about whether all 10 evaluators completed their assigned evaluations or if there was any dropout."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "The paper reports execution time comparisons: CodeScore is 'three orders of magnitude faster than execution-based CEMs' and provides relative execution times for all methods (e.g., '33.7×' vs BLEU baseline in Table 3). Absolute times are given for BLEU as reference (26.0s, 0.87s, 1.96s for the three datasets)."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "The paper mentions training on 'a single GPU of Tesla A100-PCIe-40G' with 5 epochs, but does not quantify total GPU hours, training time, or total computational budget for the experiments."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "CodeScore achieves absolute improvements of up to 58.87% in correlation with functional correctness compared to other CEMs on APPS-Eval.",
    293       "evidence": "Table 3 shows Pearson R of 0.7210 for CodeScore (Ref&NL, L_Uni) vs. the best prior method at 0.1323 (CodeBERTScore), an absolute improvement of 58.87%. Similar large improvements in Kendall-Tau (40.56%) and Spearman (55.07%).",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "CodeScore, trained on APPS-Eval, generalizes to other datasets: to MBPP-Eval after fine-tuning and to HE-Eval in a zero-shot setting without training on it.",
    298       "evidence": "Table 4 shows CodeScore trained on APPS-Eval achieves the best correlation among non-execution methods on both MBPP-Eval (fine-tuned) and HE-Eval (zero-shot), with Spearman correlations of 0.6027 and 0.6597 respectively.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "CodeScore operates three orders of magnitude faster than execution-based CEMs.",
    303       "evidence": "Table 3 shows execution-based CEMs require 1.5k× to 20.7k× the time of BLEU, while CodeScore requires 33.7× to 44.2×. Section 4.2.1 summarizes the speed advantage.",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "The unified loss L_Uni improves performance across all input formats compared to individual losses.",
    308       "evidence": "Tables 3 and 4 show L_Uni consistently outperforms L_Ref, L_NL, and L_Ref+NL across all datasets and metrics (RQ3, Section 4.2.3).",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Human evaluation shows CodeScore improves at least 54.6% over previous representative EMs.",
    313       "evidence": "Table 6 shows CodeScore scores 3.4 vs. CodeBERTScore at 2.2 (the next best), with p-values substantially less than 0.005. However, only 100 samples were evaluated by 10 evaluators with each evaluator seeing only one group.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "The Exec component achieves over 90% accuracy, F1, and precision across all datasets.",
    318       "evidence": "Figure 4 shows Accuracy 0.94/0.953/0.944, F1 0.969/0.973/0.972, and Precision 0.938/0.948/0.943 for APPS-Eval/MBPP-Eval/HE-Eval respectively.",
    319       "supported": "strong"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "benchmark-eval"
    324   ],
    325   "key_findings": "CodeScore, an LLM-based code evaluation metric fine-tuned on code execution data, substantially outperforms match-based metrics (BLEU, CodeBLEU) and embedding-based metrics (BERTScore, CodeBERTScore) in correlation with functional correctness across three datasets (APPS-Eval, MBPP-Eval, HE-Eval). The unified training framework UniCE handles three input formats (reference-only, NL-only, reference+NL) and operates three orders of magnitude faster than execution-based evaluation while achieving comparable correlation. The approach transfers to MBPP-Eval after fine-tuning and to the unseen HE-Eval dataset in a zero-shot setting, and human evaluation with 10 PhD students confirms the metric's superior alignment with human judgment.",
    326   "red_flags": [
    327     {
    328       "flag": "Benchmark contamination not addressed",
    329       "detail": "The LLMs used to generate evaluation code (GPT-4, CodeLlama, StarCoder) were trained after HumanEval, APPS, and MBPP were published. If these models memorized benchmark solutions, the generated code distribution and PassRatio labels in the evaluation datasets could be systematically biased. The paper does not discuss this."
    330     },
    331     {
    332       "flag": "Small human evaluation with unclear recruitment",
    333       "detail": "The human evaluation uses only 10 PhD student evaluators, each seeing only 50 code-metric pairs ((100 samples / 10 evaluators) × 5 metrics). Recruitment methods are not described, no IRB approval is mentioned, and the sample size justification is explicitly about workload rather than statistical adequacy."
    334     },
    335     {
    336       "flag": "Variance across runs not shown in main tables",
    337       "detail": "Although the paper states results are averaged over 5 runs, the main results tables (3, 4, 5) report only point estimates without standard deviations across runs, making it impossible to assess result stability for the main claims."
    338     },
    339     {
    340       "flag": "Generalization claim broader than evidence",
    341       "detail": "The title claims 'Evaluating Code Generation' broadly, but experiments are restricted to function-level Python code from three specific benchmarks. The Discussion section acknowledges this limitation, but the title and abstract do not qualify it."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Evaluating Large Language Models Trained on Code",
    347       "authors": ["Mark Chen", "Jerry Tworek"],
    348       "year": 2021,
    349       "arxiv_id": "2107.03374",
    350       "relevance": "Introduces HumanEval benchmark and Codex, foundational for code generation evaluation."
    351     },
    352     {
    353       "title": "Measuring Coding Challenge Competence With APPS",
    354       "authors": ["Dan Hendrycks", "Steven Basart"],
    355       "year": 2021,
    356       "relevance": "Introduces the APPS benchmark used as one of three core evaluation datasets in this paper."
    357     },
    358     {
    359       "title": "Program Synthesis with Large Language Models",
    360       "authors": ["Jacob Austin", "Augustus Odena"],
    361       "year": 2021,
    362       "arxiv_id": "2108.07732",
    363       "relevance": "Introduces the MBPP benchmark, one of the three core datasets used in this work."
    364     },
    365     {
    366       "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code",
    367       "authors": ["Shuyan Zhou", "Uri Alon", "Sumit Agarwal", "Graham Neubig"],
    368       "year": 2023,
    369       "arxiv_id": "2302.05527",
    370       "relevance": "Concurrent work proposing a BERT-based code evaluation metric, directly compared as a baseline."
    371     },
    372     {
    373       "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis",
    374       "authors": ["Shuo Ren", "Daya Guo"],
    375       "year": 2020,
    376       "arxiv_id": "2009.10297",
    377       "relevance": "Proposes CodeBLEU metric combining n-gram, AST, and data-flow matching for code evaluation."
    378     },
    379     {
    380       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    381       "authors": ["Erik Nijkamp", "Bo Pang"],
    382       "year": 2023,
    383       "relevance": "Open code generation model used to produce generated code for the evaluation datasets."
    384     },
    385     {
    386       "title": "StarCoder: may the source be with you!",
    387       "authors": ["Raymond Li", "Loubna Ben Allal"],
    388       "year": 2023,
    389       "arxiv_id": "2305.06161",
    390       "relevance": "Large open code model used as one of the code generators for the HE-Eval dataset."
    391     },
    392     {
    393       "title": "Code Llama: Open Foundation Models for Code",
    394       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    395       "year": 2023,
    396       "arxiv_id": "2308.12950",
    397       "relevance": "Open code model used for generating code in the HE-Eval evaluation dataset."
    398     },
    399     {
    400       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    401       "authors": ["Yihong Dong", "Xue Jiang"],
    402       "year": 2024,
    403       "arxiv_id": "2402.15938",
    404       "relevance": "Addresses data contamination in LLM evaluation, directly relevant to benchmark contamination concerns."
    405     },
    406     {
    407       "title": "UniXcoder: Unified Cross-Modal Pre-training for Code Representation",
    408       "authors": ["Daya Guo", "Shuai Lu"],
    409       "year": 2022,
    410       "relevance": "The base pre-trained model used in CodeScore's UniCE framework."
    411     },
    412     {
    413       "title": "Fault-Aware Neural Code Rankers",
    414       "authors": ["Jeevana Priya Inala", "Chenglong Wang"],
    415       "year": 2022,
    416       "relevance": "Addresses code ranking by functional correctness, related to the code evaluation problem CodeScore solves."
    417     },
    418     {
    419       "title": "BERTScore: Evaluating Text Generation with BERT",
    420       "authors": ["Tianyi Zhang", "Varsha Kishore"],
    421       "year": 2020,
    422       "relevance": "Foundational embedding-based evaluation metric used as a baseline and architectural inspiration."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs