ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27099B)


      1 {
      2   "paper": {
      3     "title": "CODE-DITING: A Reasoning-Based Metric for Functional Alignment in Code Evaluation",
      4     "authors": [
      5       "Guang Yang",
      6       "Yu Zhou",
      7       "Xiang Chen",
      8       "Wei Zheng",
      9       "Xing Hu",
     10       "Xin Zhou",
     11       "David Lo",
     12       "Taolue Chen"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2505.19502"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides a GitHub link: https://github.com/Code-DiTing. Section I states 'experimental data and model weights are released at https://github.com/Code-DiTing.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper states model weights and experimental data are released at the GitHub repository. The three evaluation benchmarks (HumanEval-Judge, MBPP-Judge, BigCodeBench-Judge) and the training dataset CODEJUDGE-17K are described as released. Additionally, the underlying benchmarks (HumanEval-plus, MBPP-plus, BigCodeBench) are publicly available."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using a single RTX 4090 GPU with vLLM for inference and states a maximum context length of 8k tokens, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While code and data are released on GitHub, the paper itself does not include step-by-step reproduction instructions, specific commands to run, or a 'Reproducing Results' section. Reproduction would require consulting the repository."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No confidence intervals or error bars are reported for any results. All main results in Tables III and IV are point estimates (e.g., '0.883 Acc') with no uncertainty quantification."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims CODE-DITING 'outperforms' and 'surpasses' various models based solely on comparing numerical values in tables. No statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are performed to support these comparative claims."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "While raw performance numbers are reported (e.g., accuracy, F1, MCC), no formal effect sizes (Cohen's d, relative improvement with explicit baseline context) are provided. The paper states percentage differences in parameter counts ('1% of parameters') but does not contextualize performance differences with effect size measures."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The sizes of the evaluation datasets (640, 1512, 800 samples) and the preference leakage experiments (50 problems per dataset) are not justified. No power analysis or reasoning for why these specific sizes are sufficient is provided."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The majority vote strategy uses T=7 independent inferences, but no variance or standard deviation across these runs is reported. All results are single point estimates. For the main comparisons in Tables III and IV, no variance across runs is provided."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Extensive baselines are included: GPT-3.5-turbo, GPT-4o, DeepSeek-V3, DeepSeek-R1, Llama3, Qwen2.5, and DeepSeek-R1-distill models across multiple scales (1B/1.5B, 7B/8B, 671B). Four prompting methods (Vanilla, CoT, ICE SCORE, CodeJudge) are also compared."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baselines include contemporary models such as GPT-4o, DeepSeek-R1 (2025), DeepSeek-V3 (2024), and Qwen2.5-Coder (2024), which are recent and competitive at the time of submission."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "RQ2 (Section V-B) presents ablation studies on three components: data filtering (Figure 2), PiSSA initialization vs. standard LoRA (Figure 3), and inference strategy with varying k values (Figure 4). Each component's contribution is isolated."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three evaluation metrics are used: Accuracy (Acc), F1 Score, and Matthews Correlation Coefficient (MCC). All three are reported for all experiments in Tables III and IV."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper proposes CODE-DITING as a metric for evaluating code quality, which is inherently about replacing or supplementing human judgment. While three authors manually verified some labels in the data labeling phase (Section III-A), there is no human evaluation of CODE-DITING's outputs — e.g., no assessment of whether its reasoning explanations are actually helpful or correct."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The training data (CODEJUDGE-17K) is constructed from KodCode, OpenCoder, and CodeHarmony benchmarks, while evaluation is on separately curated HumanEval-Judge, MBPP-Judge, and BigCodeBench-Judge datasets derived from different source benchmarks. The evaluation sets are distinct from the training data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per dataset (HumanEval-Judge, MBPP-Judge, BigCodeBench-Judge) in Tables III and IV, and per model scale category (1.5B, 7B, 671B). Ablation studies also show per-dataset breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No failure cases or error analysis is presented. The paper does not show examples where CODE-DITING makes incorrect judgments, nor does it analyze patterns in its errors."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The ablation study reports that PiSSA initialization shows 'less pronounced' improvement on BigCodeBench-Judge compared to other datasets (Section V-B). Also, the empirical study (Section III-E) reports that reasoning models degrade with complex prompts, which is a negative result for prompt engineering."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims that CODE-DITING 1.5B outperforms models of the same parameter magnitude and achieves performance of 5x larger models, and that CODE-DITING 7B surpasses GPT-4o and DeepSeek-V3. These claims are supported by Table IV showing the relevant comparisons. The claim about robustness to preference leakage is supported by Tables V and VI."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The ablation studies (RQ2) make causal claims about each component's contribution. These are tested via controlled single-variable manipulation: removing data filtering, replacing PiSSA with standard LoRA initialization, and varying inference passes. This constitutes adequate causal design for component-level claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper evaluates only on Python code across three benchmarks but makes broad claims about 'code evaluation' generally. The title says 'Functional Alignment in Code Evaluation' without bounding to Python. The threats to validity section mentions 'future work could explore additional programming paradigms and domain-specific languages' but does not bound the current claims to Python."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for why CODE-DITING outperforms baselines. For instance, the improvement could stem from the training data being derived from the same distribution as the test data (both using similar benchmarks), or from the majority vote strategy inflating performance at 7x computational cost. These alternatives are not considered."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper uses 'GPT-3.5-turbo' and 'GPT-4o' without specifying API versions or snapshot dates (e.g., 'gpt-4o-2024-05-13'). DeepSeek models are specified by family and size (e.g., 'DeepSeek-R1-671B') but without version identifiers. Marketing names without snapshot dates do not count as specified versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The four prompting methods (Vanilla, CoT, ICE SCORE, CodeJudge) are described in natural language in Section III-B, but the actual prompt text used is not provided in the paper. The paper states 'The prompt can be found in our GitHub repository' (footnote 2), but the prompts themselves are not reproduced in the paper or an appendix."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section III-D reports temperature settings (0.6 for reasoning models, 0.0 for general models), maximum context length (8k tokens), and the inference strategy (T=7 majority votes). The training uses PiSSA with LoRA, and the base models are specified."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The paper does not use agentic scaffolding. CODE-DITING is a fine-tuned model that performs direct inference (repeated over several independent passes and aggregated by majority voting), not an agent with tools, retry logic, or feedback loops."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section III-A documents data sampling (which models generate code, deduplication, comment removal), labeling (automatic via test cases + manual verification by three authors), and Section IV-A documents the training data pipeline (source benchmark collection, code generation, reasoning distillation, multi-stage filtering, class balancing to 1:1 ratio)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VI 'Threats to Validity' provides a dedicated discussion of internal, external, and construct validity threats."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The threats to validity section discusses specific issues: potential implementation fidelity concerns for baseline methods, the limitation to three benchmarks that do not cover all programming paradigms, hardware constraints limiting model scale tested (single RTX 4090), and the specific metrics chosen. These are specific to this study."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "While the threats to validity section mentions that 'future work could explore additional programming paradigms and domain-specific languages,' it does not explicitly state what the current results do NOT show. There is no explicit bounding of claims to Python, to specific benchmark types, or to the specific code generation models tested."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states that experimental data and model weights are released at https://github.com/Code-DiTing. The evaluation datasets (HumanEval-Judge, MBPP-Judge, BigCodeBench-Judge) and training data (CODEJUDGE-17K) are described as publicly available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III-A describes how evaluation data was collected: selection of three benchmark datasets, sampling code from four different models (Qwen2.5-Coder 1.5B/7B, DeepSeekCoder 1.3B/6.7B), data cleaning, deduplication, comment removal, and labeling via test cases with manual verification. Section IV-A describes training data collection similarly."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants were recruited. The study uses publicly available benchmarks and LLM-generated code, not human subjects."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented in detail: Section IV-A describes source benchmark collection, code generation with specific models, labeling via test cases, static analysis filtering, comment removal, reasoning knowledge distillation using DeepSeek-R1-671B, accuracy filtering, logical coherence filtering using DeepSeek-V3, and class balancing to produce the final 17,000 samples."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Nanjing University of Aeronautics and Astronautics, Nantong University, Northwestern Polytechnical University, Zhejiang University, Singapore Management University, and Birkbeck, University of London. No apparent conflict with evaluated products."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of a funding disclosure means this criterion cannot be confirmed as satisfied."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates multiple pre-trained models (GPT-4o, DeepSeek-R1, etc.) on benchmarks derived from HumanEval, MBPP, and BigCodeBench but does not state the training data cutoff dates for any of these models."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "HumanEval and MBPP are widely known benchmarks that are likely in the training data of the evaluated models (GPT-4o, DeepSeek series, etc.). The paper does not discuss whether the evaluated models may have been trained on these benchmarks. The preference leakage experiment (RQ3) addresses a different kind of contamination (same-family bias) but not benchmark contamination."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "HumanEval and MBPP were both published in 2021. All evaluated models were trained after these benchmarks were publicly available. The paper does not discuss this contamination risk. While they use 'plus' versions with additional test cases, the original problems and solutions may still be in training data."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the study. The manual verification by three authors for data labeling does not constitute a human subjects study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section V-B reports inference latency: 0.15s per pass for the 1.5B model and 0.30s per pass for the 7B model on a single RTX 4090, scaling to approximately 1s and 2s respectively at k=7. This provides practical cost information."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "While inference hardware is specified (single RTX 4090), the total training compute budget is not stated. No information is provided about training time, GPU hours for fine-tuning, or total API costs for the distillation process using DeepSeek-R1-671B on 17,000+ samples."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "CODE-DITING 7B surpasses GPT-4o and DeepSeek-V3 (671B) despite using only 1% of their parameter volume.",
    295       "evidence": "Table IV shows CODE-DITING 7B achieving avg Acc 0.806, F1 0.782, MCC 0.565 vs GPT-4o CoT avg Acc 0.795, F1 0.762, MCC 0.497 and DS-V3 CodeJudge avg Acc 0.781, F1 0.745, MCC 0.498.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "CODE-DITING 1.5B outperforms all models of the same parameter magnitude and achieves performance equivalent to models with 5x parameters.",
    300       "evidence": "Table IV shows CODE-DITING 1.5B avg Acc 0.767, F1 0.736, MCC 0.476 vs DS-R1-distill 1.5B avg Acc 0.652, F1 0.604, MCC 0.241, and comparable to DS-R1-distill 7B avg Acc 0.737, F1 0.710, MCC 0.443.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Reasoning models demonstrate superior and more stable performance across different datasets compared to general models at comparable parameter scales.",
    305       "evidence": "Table III shows DeepSeek-R1 series consistently outperforming general models at each scale (Finding 3, Section III-E). E.g., DS-R1-distill 7B Vanilla avg Acc 0.737 vs Qwen2.5 7B best avg Acc 0.708.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "For reasoning models, simple prompts (Vanilla) work better than complex prompting strategies.",
    310       "evidence": "Table III shows DS-R1-distill 7B achieves best performance with Vanilla (avg Acc 0.737) vs CoT (0.687), ICE SCORE (0.712), CodeJudge (0.703). Finding 2, Section III-E.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "CODE-DITING does not suffer from significant preference leakage.",
    315       "evidence": "Tables V and VI show agreement rates of 93-98% and Cohen's Kappa 0.86-0.96 across code from GPT-4o and Claude-3.5 generators, and across paraphrased problem descriptions. Section V-C.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "Each component of CODE-DITING (data filtering, PiSSA, majority vote) contributes significantly to overall performance.",
    320       "evidence": "Figures 2, 3, and 4 show ablation results. Data filtering improves F1 consistently; PiSSA improves on HumanEval-Judge and MBPP-Judge but less on BigCodeBench-Judge; majority voting improves with increasing k up to k=7. Section V-B.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "CODE-DITING distills reasoning capabilities from DeepSeek-R1-671B into compact 1.5B and 7B models for LLM-as-judge code evaluation, using a curated 17K training dataset with reasoning paths. The empirical study finds that reasoning models outperform general models for code evaluation and prefer simpler prompts. CODE-DITING 7B achieves average accuracy 0.806, F1 0.782, MCC 0.565 across three benchmarks, outperforming GPT-4o and DeepSeek-V3 with only 1% of their parameters. Preference leakage experiments show high consistency (93-98% agreement) across different code generators and paraphrased problem descriptions.",
    328   "red_flags": [
    329     {
    330       "flag": "No statistical significance testing",
    331       "detail": "All comparative claims ('outperforms', 'surpasses') are based on comparing point estimates without any significance tests, confidence intervals, or variance measures. The margins between CODE-DITING 7B and GPT-4o are relatively small (e.g., 0.806 vs 0.795 avg accuracy) and could be within noise."
    332     },
    333     {
    334       "flag": "Benchmark contamination risk unaddressed",
    335       "detail": "HumanEval and MBPP are widely used benchmarks, both published in 2021. All evaluated LLMs were trained after these benchmarks were public. The paper does not discuss whether evaluated models (including CODE-DITING's base models) may have seen these problems during pre-training, which would inflate reported performance."
    336     },
    337     {
    338       "flag": "Small preference leakage sample size",
    339       "detail": "The preference leakage experiments (RQ3) use only 50 problems per dataset, which is a small sample for drawing conclusions about bias. No power analysis or confidence intervals are provided for the agreement rates."
    340     },
    341     {
    342       "flag": "Training data distribution overlap with test data",
    343       "detail": "Both training data (CODEJUDGE-17K from KodCode, OpenCoder, CodeHarmony) and evaluation benchmarks (HumanEval-Judge, MBPP-Judge, BigCodeBench-Judge) involve code generated by similar models (Qwen2.5-Coder, DeepSeekCoder) for algorithmic programming tasks. The paper does not discuss whether this distributional similarity could inflate results."
    344     },
    345     {
    346       "flag": "Unfair comparison via majority voting",
    347       "detail": "CODE-DITING uses 7 inference passes with majority voting while baselines use single inference. This 7x computational increase is not accounted for in the main comparison (Table IV). A fairer comparison would show baseline performance with the same majority voting strategy."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "Evaluating large language models trained on code",
    353       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    354       "year": 2021,
    355       "arxiv_id": "2107.03374",
    356       "relevance": "Introduced the HumanEval benchmark and Pass@k metric, foundational for code generation evaluation."
    357     },
    358     {
    359       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    360       "authors": ["D. Guo", "D. Yang", "H. Zhang"],
    361       "year": 2025,
    362       "arxiv_id": "2501.12948",
    363       "relevance": "Source of the reasoning model used for knowledge distillation in CODE-DITING and the strongest baseline."
    364     },
    365     {
    366       "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation",
    367       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    368       "year": 2023,
    369       "relevance": "Created HumanEval-plus and MBPP-plus with expanded test suites, used as basis for the evaluation benchmarks."
    370     },
    371     {
    372       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    373       "authors": ["T. Y. Zhuo", "M. C. Vu", "J. Chim"],
    374       "year": 2024,
    375       "arxiv_id": "2406.15877",
    376       "relevance": "Provides the BigCodeBench benchmark used for evaluation, testing real-world software development scenarios."
    377     },
    378     {
    379       "title": "ICE-Score: Instructing large language models to evaluate code",
    380       "authors": ["T. Y. Zhuo"],
    381       "year": 2024,
    382       "relevance": "Pioneered LLM-as-judge for code evaluation using GPT-3.5, serving as both baseline method and prompting strategy."
    383     },
    384     {
    385       "title": "CodeJudge: Evaluating code generation with large language models",
    386       "authors": ["W. Tong", "T. Zhang"],
    387       "year": 2024,
    388       "relevance": "Two-phase LLM-as-judge method for code evaluation, used as a baseline prompting strategy."
    389     },
    390     {
    391       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    392       "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng"],
    393       "year": 2023,
    394       "relevance": "Foundational work on LLM-as-judge methodology, establishing evaluation frameworks used in this space."
    395     },
    396     {
    397       "title": "Preference leakage: A contamination problem in LLM-as-a-Judge",
    398       "authors": ["D. Li", "R. Sun", "Y. Huang"],
    399       "year": 2025,
    400       "arxiv_id": "2502.01534",
    401       "relevance": "Identifies the preference leakage problem in LLM-as-judge frameworks that CODE-DITING specifically tests against."
    402     },
    403     {
    404       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    405       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    406       "year": 2022,
    407       "relevance": "Foundational prompting technique used as one of the baseline methods in the empirical study."
    408     },
    409     {
    410       "title": "Large language models for software engineering: A systematic literature review",
    411       "authors": ["X. Hou", "Y. Zhao", "Y. Liu"],
    412       "year": 2024,
    413       "relevance": "Comprehensive survey of LLMs in software engineering, providing context for the code generation evaluation landscape."
    414     },
    415     {
    416       "title": "From code to courtroom: LLMs as the new software judges",
    417       "authors": ["J. He", "J. Shi", "T. Y. Zhuo"],
    418       "year": 2025,
    419       "arxiv_id": "2503.02246",
    420       "relevance": "Recent work on LLM-based code evaluation methods, directly relevant to the LLM-as-judge evaluation paradigm."
    421     },
    422     {
    423       "title": "PiSSA: Principal singular values and singular vectors adaptation of large language models",
    424       "authors": ["F. Meng", "Z. Wang", "M. Zhang"],
    425       "year": 2024,
    426       "relevance": "Parameter-efficient fine-tuning technique used as a core component of CODE-DITING's training pipeline."
    427     }
    428   ]
    429 }

Impressum · Datenschutz