scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28061B)
      1 {
      2   "paper": {
      3     "title": "Rubric Is All You Need: Improving LLM-based Code Evaluation With Question-Specific Rubrics",
      4     "authors": [
      5       "Aditya Pathak",
      6       "Rachit Gandhi",
      7       "Vaibhav Uttam",
      8       "Arnav Ramamoorthy",
      9       "Pratyush Ghosh",
     10       "Aaryan Raj Jindal",
     11       "Shreyash Verma",
     12       "Aditya Mittal",
     13       "Aashna Ased",
     14       "Chirag Khatri",
     15       "Yashwanth Nakka",
     16       "Devansh",
     17       "Jagat Sesh Challa",
     18       "Dhruv Kumar"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv",
     22     "arxiv_id": "2503.23989",
     23     "doi": "10.48550/arXiv.2503.23989"
     24   },
     25   "scan_version": 2,
     26   "active_modules": ["experimental_rigor", "data_leakage"],
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "GitHub repository (https://github.com/BITS-Pilani-GRC/Rubric-Grader) is provided, and authors state 'We have made our dataset and code publicly available' (§1)."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Dataset released on HuggingFace (https://huggingface.co/datasets/BITS-Pilani-GRC/RubricEval) as stated in §1 and §3."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No mention of requirements.txt, Dockerfile, conda environment, or library versions anywhere in the paper."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions provided. Prompts are given in Appendix A but there is no guide on how to run the experiments end-to-end."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Tables 2 and 3 report only point estimates for all metrics (r, rs, τb, ln, ICC, κB). No confidence intervals or error bars are provided."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper claims 'question-specific rubrics significantly enhance logical assessment' and that EME-QS provides 'significantly better correlation scores versus the FPM technique' (§6.2.2), but no statistical significance tests (p-values, t-tests, bootstrap tests) are reported."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper reports metric differences with baseline context, e.g., 'lifts ICC3 from 0.560 → 0.819 and boosts Pearson r by +0.26 points (0.562 → 0.825)' (§6.2.2), providing enough context to assess the magnitude of improvement."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No justification for why 80 OOP and 150 DSA submissions were chosen. No power analysis. The OOP dataset samples 20 from each of 4 score ranges from 350 total (§3.1), but no rationale for this number."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No variance, standard deviation, or spread measures reported across experimental runs. Results appear to be single-run point estimates."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple baselines included: CodeBERTScore (no LLM), CodeJudge (no rubric), FPM and EME-QA (question-agnostic rubric). Presented in Tables 2 and 3."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Baselines include CodeBERTScore (2023), CodeJudge (2024), CodEv-inspired EME (2024), and FPM from Phung et al. (2023). These are recent and representative."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper systematically varies the key components: rubric type (QA vs QS), evaluation granularity (CRE whole-rubric vs PRE point-by-point), ensemble size, and model parameter size (§6.2.4). These serve as ablations of the proposed approach."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Eight complementary metrics reported: Pearson r, Spearman rs, Kendall τb, Leniency, ICC1, ICC2, ICC3, and Cohen's Kappa (§5.3)."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "All LLM grading is compared against human expert grading as ground truth. Two human graders evaluated submissions collaboratively (§3.1, §3.2). The entire evaluation framework tests LLM output against human judgment."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No explicit separation of development and test sets. Prompts and techniques may have been iterated on the same data used for final evaluation. No discussion of held-out splits."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Results broken down by dataset (OOP Table 2, DSA Table 3), by technique category (no LLM, no rubric, QA rubric, QS rubric), and per-method discussion for OOP. §6.2.3 discusses method-level evaluation."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "PRE's excessive strictness is discussed (leniency = -0.329, 'slashes average scores by 11.5/35 marks', §6.2.3). FPM's poor performance is analyzed (§6.2.2). The paper explains why PRE is harsher than CRE."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "PRE performs poorly on leniency (-0.329) and ICC scores (ICC1=0.201). QA and QS perform comparably on OOP, undermining the general claim. FPM performs worse than EME-QA. These negative findings are reported and discussed."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The abstract claims 'question-specific rubrics significantly enhance logical assessment of code in educational settings.' Table 3 (DSA) supports this, but Table 2 (OOP) shows QS and QA are comparable (EME-QS r=0.900 vs EME-QA r=0.904). The abstract does not acknowledge this split result."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The claim that QS rubrics improve grading is supported by a controlled comparison: the same LLMs, datasets, and evaluation framework are used with only the rubric type varied (QA vs QS in EME, §6.2). This is an adequate single-variable manipulation."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title 'Rubric Is All You Need' implies universal applicability. Results are limited to Java, two specific courses (OOP and DSA), one university (BITS Pilani), and primarily GPT-4o-mini. The limitations section (§8) acknowledges Java-only and limited courses, but the title and abstract frame results broadly."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No discussion of alternative explanations for why QS outperforms QA. Possible confounds include: rubric quality differences, the amount of information in QS vs QA prompts (QS rubrics may simply provide more context/tokens), or whether the DSA dataset's algorithmic diversity rather than rubric specificity drives the gap."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper measures correlation with human grading (Spearman, ICC, Cohen's Kappa) and frames this as 'evaluation quality.' The proxy and outcome are closely aligned — human expert grades are the natural gold standard for grading accuracy. The Leniency metric explicitly measures bias relative to human assessment."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Models specified as 'GPT-4o-mini' and 'Claude 3.7 Sonnet' (§5.1) and 'GPT-4o' (§4.4). These are marketing names without snapshot dates or API version identifiers. No model version strings like 'gpt-4o-mini-2024-07-18' are provided."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Full prompts for CRE (Appendix A.1), PRE (Appendix A.2), and EME (Appendix A.3) are provided, including complete prompt text with format specifications."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "Only 'Extended Thinking deactivated' is mentioned for Claude (§5.1). No temperature, top-p, max_tokens, or other sampling parameters are reported for any model."
    166       },
    167       "scaffolding_described": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The multi-agent architecture is described in detail: CRE's compiler-equipped agent for syntax checking (§4.2), PRE's per-criterion evaluation loop (§4.3), EME's approach identification + ensemble voting pipeline (§4.4). Workflow diagrams in Figures 9 and 10."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Data preprocessing documented: OOP selection process (350→80, stratified by score range, §3.1), DSA problem selection (9 topics, 3 difficulty levels, §3.2), scaling formulas (§5.2.1), binning procedure for Cohen's Kappa (§5.2.2)."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 8 'Limitations and Future Work' discusses specific constraints of the study."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Specific threats mentioned: limited to Java (§8), only OOP and DSA courses tested, mainly GPT-4o used, didn't compare open vs closed source LLMs, limited to single-file assignments, rubric granularity not studied."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 8 explicitly states what was NOT tested: other programming languages, other LLMs, open-source models, multi-file assignments, and varying rubric granularities."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Full dataset released on HuggingFace including student submissions, rubrics, model solutions, grades, and feedback."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "OOP data from Fall 2024 exam at BITS Pilani (§3.1). DSA data from Geeks for Geeks practice website (§3.2). Both describe collection source, time period, and structure."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "OOP: student submissions from an undergraduate OOP course exam, 80 selected from 350 via stratified sampling across 4 score ranges with 20 per range (§3.1). DSA: 6 submissions per problem selected by outcome category (3 correct, 1 wrong, 1 TLE, 1 compilation error, §3.2)."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Pipeline documented: OOP starts with 350 submissions → stratified into 4 score ranges → 20 per range → 80 total (§3.1). DSA: 25 problems × 6 submissions = 150 (§3.2). Grading by two human graders, scaling and binning procedures detailed (§5.2)."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgments (§10) state 'support from the New Faculty Seed Grant, Birla Institute of Technology and Science (BITS), Pilani (Grant Ref. N4/24/1004).'"
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All authors listed with BITS Pilani affiliation. No external product being evaluated — they test general-purpose LLMs (GPT-4o, Claude) rather than a product they developed."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "BITS Pilani is a university providing a faculty seed grant. The funder has no financial interest in whether QS or QA rubrics perform better."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests statement or financial interests declaration found in the paper."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No training cutoff dates stated for GPT-4o-mini, GPT-4o, or Claude 3.7 Sonnet. The LLMs grade student code, but knowledge of common algorithms and solutions from training data could bias grading."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "DSA problems come from Geeks for Geeks, one of the most popular competitive programming websites. These problems and editorial solutions are almost certainly in LLM training data. This overlap is not discussed."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "GFG problems and model solutions have been publicly available for years and are widely scraped for training data. The LLM grader may already 'know' the correct solutions independently of the rubric, confounding the evaluation. Not addressed."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants studied. Student code submissions are used as evaluation data, but students are not research subjects."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants studied. The paper uses existing student exam submissions as data, not conducting a study on humans."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants studied."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants studied."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants studied."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants studied."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants studied."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "PRE is described as 'resource-intensive due to multiple API calls per student solution' (§4.3) but no actual costs, token counts, or latency measurements are reported for any technique."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No total compute budget, GPU hours, or API spend reported. The paper proposes techniques with varying cost profiles but does not quantify them."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No mention of multiple random seeds or runs. LLM outputs are stochastic, and results appear to be from single runs without seed sensitivity analysis."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The exact number of experimental runs is never stated. It is unclear whether results come from single runs or are averaged over multiple runs."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No hyperparameter search budget reported. Prompt designs, penalty values (e.g., 0.5 syntax penalty), ensemble sizes, and binning parameters appear chosen without documented search."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Configuration choices like the 0.5 syntax penalty, 5-bin discretization, and ensemble composition are not justified through systematic comparison. §6.2.4 discusses ensemble size trends but provides no supporting data table."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Multiple techniques are compared across 8 metrics on 2 datasets with no correction for multiple comparisons."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors propose CRE, PRE, and EME and implement all baselines themselves. No acknowledgment of the bias of evaluating their own systems against their own implementations of competing methods."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "PRE requires one API call per rubric point while CRE uses one call for the whole rubric, yet no performance-per-compute comparison is provided. EME uses a three-model ensemble with higher cost but no cost-performance tradeoff analysis."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "Human grades from two consensus graders serve as ground truth, but inter-annotator agreement before consensus is never reported. The validity of the ground truth itself is not examined."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "EME uses a three-model ensemble (GPT-4o, Claude 3.7, GPT-4o-mini) while CRE uses single models. Comparing EME(QS) against CRE confounds the scaffold (ensembling) with the technique. Not discussed."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "DSA problems from Geeks for Geeks have been publicly available for years. Student solutions from Fall 2024 postdate most model training cutoffs. No discussion of temporal relationships between data and model training."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "The grading prompts include model solutions and rubrics by design, but the paper does not discuss whether the LLM's pre-existing knowledge of GFG problems (from training data) biases grading independently of the provided rubric."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "OOP: all 80 submissions answer the same single question (7 methods). DSA: 6 submissions per problem. Non-independence of submissions within the same problem is not discussed. Correlation metrics may be inflated by within-problem dependence."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No leakage detection or prevention method applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Question-specific rubrics significantly enhance logical assessment of code in educational settings compared to question-agnostic rubrics.",
    378       "evidence": "On DSA dataset (Table 3), EME-QS achieves ICC3=0.819 and r=0.825 vs EME-QA ICC3=0.560 and r=0.562. However, on OOP dataset (Table 2), EME-QS (r=0.900) and EME-QA (r=0.904) are comparable.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "CRE achieves near-human leniency while PRE is excessively strict.",
    383       "evidence": "Table 2 shows CRE(OpenAI) leniency=0.082 (near zero target) while PRE leniency=-0.329, meaning PRE penalizes 11.5/35 marks on average versus human graders (§6.2.3).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "LLM-based evaluation techniques vastly outperform CodeBERTScore.",
    388       "evidence": "Tables 2 and 3: CodeBERTScore achieves r=0.354/0.126 and κB=0.241/0.010, while even the worst LLM technique (CodeJudge) reaches r=0.717/0.423 and κB=0.433/0.406.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Providing a rubric (QA or QS) improves grading accuracy over no-rubric approaches.",
    393       "evidence": "Tables 2 and 3: CodeJudge (no rubric) achieves lower correlation and ICC scores than all rubric-based techniques on both datasets.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "EME effectiveness improves with model parameter size and plateaus at 3-4 ensemble models.",
    398       "evidence": "Stated in §6.2.4 as observations, but no supporting data table, figures, or specific numerical results are provided for either the model size or ensemble size experiments.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "methodology_tags": ["benchmark-eval"],
    403   "key_findings": "Question-specific rubrics substantially improve LLM-based code grading accuracy over question-agnostic rubrics on algorithmically diverse DSA problems (ICC3 0.819 vs 0.560), but show comparable performance on simpler, homogeneous OOP problems. Complete Rubric Evaluation (CRE) achieves near-human leniency (0.082) while Pointwise Rubric Evaluation (PRE) is excessively strict (-0.329), suggesting that whole-rubric prompts allow LLMs to award partial credit more like human graders. All LLM-based techniques markedly outperform non-LLM baselines like CodeBERTScore, and rubric-based approaches outperform rubric-free approaches like CodeJudge.",
    404   "red_flags": [
    405     {
    406       "flag": "No statistical significance testing",
    407       "detail": "All claims of one technique outperforming another are based on comparing raw metric values. The word 'significantly' is used in the abstract without any statistical significance test. With small datasets (N=80, N=150) and stochastic LLM outputs, observed differences could be noise."
    408     },
    409     {
    410       "flag": "No inter-annotator agreement for ground truth",
    411       "detail": "Human ground truth was created by two graders through a 'consensus-driven approach' (§3.1), but inter-annotator agreement (Cohen's Kappa or percentage agreement) between the two graders before consensus is never reported. The reliability of the gold standard is unverified."
    412     },
    413     {
    414       "flag": "GFG contamination risk unaddressed",
    415       "detail": "DSA problems come from Geeks for Geeks, one of the most widely scraped programming education websites. LLMs trained on web data almost certainly have seen these problems and their editorial solutions. The grading LLM may succeed not because of the rubric but because it already knows the correct solution from training data."
    416     },
    417     {
    418       "flag": "Single-run results with stochastic models",
    419       "detail": "No variance, multiple runs, or seed sensitivity reported despite using stochastic LLM APIs. LLM output can vary substantially between calls, making single-run results unreliable."
    420     },
    421     {
    422       "flag": "Unsupported claim about ensemble and model size",
    423       "detail": "Section 6.2.4 claims EME improves with model parameter size and plateaus at 3-4 ensemble models, but provides no data table, figure, or specific numbers to support these claims."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human Tutors",
    429       "authors": ["Tung Phung", "Victor-Alexandru Pădurean", "José Cambronero", "Sumit Gulwani", "Tobias Kohn", "Rupak Majumdar", "Adish Singla", "Gustavo Soares"],
    430       "year": 2023,
    431       "arxiv_id": "2306.17156",
    432       "relevance": "Establishes FPM baseline for LLM code evaluation using question-agnostic rubrics; key comparator in this paper."
    433     },
    434     {
    435       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    436       "authors": ["Weixi Tong", "Tianyi Zhang"],
    437       "year": 2024,
    438       "arxiv_id": "2410.02184",
    439       "relevance": "LLM-based code evaluation technique using slow-thinking decomposition; baseline comparator for rubric-free approaches."
    440     },
    441     {
    442       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    443       "authors": ["Terry Yue Zhuo"],
    444       "year": 2024,
    445       "arxiv_id": "2304.14317",
    446       "relevance": "LLM-based code evaluation without test cases, demonstrating limited correlation with human judgment."
    447     },
    448     {
    449       "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code",
    450       "authors": ["Shuyan Zhou", "Uri Alon", "Sumit Agarwal", "Graham Neubig"],
    451       "year": 2023,
    452       "arxiv_id": "2302.05527",
    453       "relevance": "Semantic code similarity metric using pretrained code models; non-LLM baseline for code evaluation."
    454     },
    455     {
    456       "title": "CodEv: An Automated Grading Framework Leveraging Large Language Models for Consistent and Constructive Feedback",
    457       "authors": ["En-Qi Tseng", "Pei-Cing Huang", "Chan Hsu", "Peng-Yi Wu", "Chan-Tung Ku", "Yihuang Kang"],
    458       "year": 2024,
    459       "doi": "10.1109/bigdata62323.2024.10825949",
    460       "relevance": "LLM ensemble framework for code grading with question-agnostic rubrics; inspires EME technique."
    461     },
    462     {
    463       "title": "GPT-4 Technical Report",
    464       "authors": ["OpenAI"],
    465       "year": 2023,
    466       "arxiv_id": "2303.08774",
    467       "relevance": "Technical report for GPT-4, the model family used in the evaluation experiments."
    468     },
    469     {
    470       "title": "Automating Autograding: Large Language Models as Test Suite Generators for Introductory Programming",
    471       "authors": ["Umar Alkafaween", "Ibrahim Albluwi", "Paul Denny"],
    472       "year": 2024,
    473       "doi": "10.1111/jcal.13100",
    474       "relevance": "LLMs for automated test generation in programming education; related application of LLMs to code assessment."
    475     },
    476     {
    477       "title": "Evaluating Language Models for Generating and Judging Programming Feedback",
    478       "authors": ["Charles Koutcheme", "Nicola Dainese", "Sami Sarsa", "Arto Hellas", "Juho Leinonen", "Syed Ashraf", "Paul Denny"],
    479       "year": 2025,
    480       "doi": "10.1145/3641554.3701791",
    481       "relevance": "Evaluates LLMs as judges of programming feedback quality, directly relevant to LLM-as-evaluator research."
    482     },
    483     {
    484       "title": "Grade Like a Human: Rethinking Automated Assessment with Large Language Models",
    485       "authors": ["Wenjing Xie", "Juxin Niu", "Chun Jason Xue", "Nan Guan"],
    486       "year": 2024,
    487       "arxiv_id": "2405.19694",
    488       "relevance": "Multi-agent system for rubric-based grading of short answers; related approach to context-aware rubric creation."
    489     },
    490     {
    491       "title": "Large Language Models (GPT) for automating feedback on programming assignments",
    492       "authors": ["Maciej Pankiewicz", "Ryan S. Baker"],
    493       "year": 2023,
    494       "arxiv_id": "2307.00150",
    495       "relevance": "Early work on LLM-generated feedback for programming assignments."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs