scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29796B)
      1 {
      2   "paper": {
      3     "title": "Rubric Is All You Need: Improving LLM-based Code Evaluation With Question-Specific Rubrics",
      4     "authors": [
      5       "Aditya Pathak",
      6       "Rachit Gandhi",
      7       "Vaibhav Uttam",
      8       "Arnav Ramamoorthy",
      9       "Pratyush Ghosh",
     10       "Aaryan Raj Jindal",
     11       "Shreyash Verma",
     12       "Aditya Mittal",
     13       "Aashna Ased",
     14       "Chirag Khatri",
     15       "Yashwanth Nakka",
     16       "Devansh",
     17       "Jagat Sesh Challa",
     18       "Dhruv Kumar"
     19     ],
     20     "year": 2025,
     21     "venue": "International Computing Education Research Workshop",
     22     "arxiv_id": "2503.23989",
     23     "doi": "10.1145/3702652.3744220"
     24   },
     25   "scan_version": 2,
     26   "active_modules": ["experimental_rigor", "data_leakage"],
     27   "methodology_tags": ["benchmark-eval"],
     28   "key_findings": "Question-specific rubrics substantially outperform question-agnostic rubrics for LLM-based code grading, especially on algorithmically diverse DSA problems (ICC3 jumps from 0.560 to 0.819). The Complete Rubric Evaluation (CRE) technique achieves near-human leniency (0.082), while Pointwise Rubric Evaluation (PRE) is significantly harsher (leniency -0.329) due to single-criterion prompts forcing zero-or-full marking. LLM-based techniques vastly outperform similarity-based approaches like CodeBERTScore. The study introduces two datasets (80 OOP, 150 DSA submissions) and a new Leniency metric.",
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "GitHub repository URL provided: https://github.com/BITS-Pilani-GRC/Rubric-Grader (Section 1, footnote 2)."
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Dataset released on HuggingFace: https://huggingface.co/datasets/BITS-Pilani-GRC/RubricEval (Section 3, footnote 1/3)."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No requirements.txt, Dockerfile, or environment specification mentioned. The paper only names the models used (GPT-4o-mini, Claude 3.7 Sonnet) without dependency or version details for the evaluation code."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself contains no 'Reproducing Results' section or specific commands to run."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Tables 2 and 3 report only point estimates for all metrics (Pearson r, Spearman rs, ICC, Cohen's Kappa, Leniency). No confidence intervals or error bars are provided."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper makes comparative claims (e.g., 'question-specific rubrics significantly enhance logical assessment') but no statistical significance tests (t-tests, bootstrap, etc.) are applied to any comparison. All conclusions are drawn from visual inspection of metric values."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper reports correlation values and absolute metric changes with context, e.g., 'lifts ICC3 from 0.560 → 0.819 and boosts Pearson r by +0.26 points (0.562 → 0.825)' in Section 6.2.2. All metrics in Tables 2 and 3 provide magnitude information."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No justification for why 80 OOP submissions and 150 DSA submissions were chosen. No power analysis. The selection of 20 per score range for OOP is described as a sampling procedure but the overall N is not justified."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No variance or standard deviation reported across experimental runs. LLM outputs are non-deterministic but all results appear to be from single runs. No mention of repeated evaluations or spread measures."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple baselines compared: CodeBERTScore (no LLM), CodeJudge (no rubric), Five Point Marking (question-agnostic), EME-QA (question-agnostic). See Tables 2 and 3."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Baselines are recent: CodeBERTScore (2023), CodeJudge (2024), CodEv (2024), Phung et al. FPM (2023). All from the last 2 years relative to the paper."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper effectively ablates components: QA vs QS rubrics within EME, CRE vs PRE (whole-rubric vs pointwise evaluation), varying ensemble sizes (Section 6.2.4), and model comparisons (CRE OpenAI vs CRE Claude in Table 2)."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Seven complementary metrics used across three dimensions: correlations (Pearson, Spearman, Kendall-Tau), agreement (ICC1/2/3, Cohen's Kappa), and bias (Leniency). Described in Section 5.3."
     99       },
    100       "human_evaluation": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Ground truth consists of human grades by TAs and final-year students using consensus-driven approach. The entire study evaluates automated grading against human expert assessment."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No separation of development and test data. All 80 OOP and 150 DSA submissions are used for evaluation. Prompts (Appendix A) were presumably developed and iterated on the same data used for final evaluation, but no dev/test split is documented."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Results broken down by dataset (OOP Table 2, DSA Table 3), by rubric type (QA vs QS), by technique (CRE, PRE, EME, FPM, CodeJudge, CodeBERTScore), and by model (OpenAI vs Claude for CRE)."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 6.2.3 discusses PRE's excessive strictness (leniency -0.329) and explains why single-criterion prompts force harsh grading. Section 6.2.2 discusses where QA rubrics fail on diverse DSA problems. Section 7.1 discusses when each technique may be inappropriate."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "PRE's poor leniency (-0.329) is a negative result. CRE with Claude underperforms CRE with OpenAI on OOP (Pearson 0.840 vs 0.912). FPM's poor Kappa (0.346 OOP, 0.072 DSA) is reported without downplaying."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Abstract claims that question-specific rubrics 'significantly enhance logical assessment' are supported by Tables 2 and 3, particularly the large DSA improvements (EME-QS ICC3 0.819 vs EME-QA 0.560). The claim about the new Leniency metric is supported by its formal definition and application."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper claims question-specific rubrics improve evaluation quality. The study design compares the same technique (EME) with QA vs QS rubrics on the same data, isolating the rubric type as the manipulated variable. This controlled comparison is adequate for the causal claim within the tested setting."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The title 'Rubric Is All You Need' implies universal applicability. The abstract claims 'question-specific rubrics significantly enhance logical assessment of code in educational settings' broadly. But all experiments use Java only (acknowledged in Section 8), two courses, single-file assignments, and primarily GPT-4o-mini. The Limitations section lists these boundaries but the title and abstract are unbounded."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "No discussion of alternative explanations for the results. For example: the QS rubric improvement could be partly due to the rubric being more detailed (more information provided to the LLM) rather than question-specificity per se. The paper does not consider confounds or alternative interpretations."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper directly measures what it claims: agreement between LLM and human grading. The metrics (correlation, ICC, Kappa, Leniency) are appropriate for measuring grading quality. The gap between measurement and framing is small — they claim better LLM-based code evaluation and measure LLM-human grading agreement."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper states 'OpenAI's GPT-4o-mini' and 'Anthropic AI's Claude 3.7 Sonnet' (Section 5.1) without snapshot dates or API version identifiers. These are marketing names that do not pin a specific model version."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Full prompt text for CRE, PRE, and EME techniques provided in Appendix A, including exact system messages and output format specifications."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "No temperature, top-p, max tokens, or other sampling parameters reported for any of the LLM calls. Section 5.1 mentions only model names. These settings significantly affect LLM output and grading consistency."
    168       },
    169       "scaffolding_described": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "CRE's architecture (LLM grader + compiler agent + recursive mark calculator) is described in Section 4.2 and Figure 9. PRE's per-criterion approach in Section 4.3. EME's ensemble workflow with approach identification and voting in Section 4.4 and Figure 10."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 5.2 documents scaling (linear rescaling from QA 4-point to QS 35-point scale) and binning (5 bins at 20th/40th/60th/80th percentiles for Cohen's Kappa). Section 3 describes data selection criteria for both datasets."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 8 'Limitations and Future Work' presents multiple specific limitations."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 8 lists specific threats: single LLM (GPT-4o), Java-only evaluation, limited to two intermediary courses with single-file assignments, unexplored rubric granularity effects, no open-source vs closed-source comparison."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 8 explicitly states what was NOT tested: other programming languages, other LLMs, multi-file assignments, other rubric granularities, open-source LLMs. These are specific exclusions, not generic disclaimers."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Full dataset available on HuggingFace (https://huggingface.co/datasets/BITS-Pilani-GRC/RubricEval) including student code, rubrics, grades, and feedback."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 3.1: OOP data from Fall 2024 exam at BITS Pilani, 350 total submissions, 80 selected across 4 score ranges. Section 3.2: DSA data from GFG website, 25 problems across 9 topics and 3 difficulty levels, 6 submissions per problem."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "OOP submissions: from undergraduate sophomores in a programming exam, 20 randomly selected from each of 4 score ranges. DSA submissions: 6 per problem (3 correct, 1 wrong, 1 TLE, 1 compilation error) from GFG. Human graders described as 'final-year students with extensive experience in programming courses.'"
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Pipeline documented: OOP (350 submissions → 80 selected by score stratification → TA grading with consensus → LLM evaluation). DSA (25 GFG problems → 6 submissions each → rubric design → collaborative human grading → LLM evaluation). Scaling and binning steps in Section 5.2."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Section 10: 'This research was carried out in part with support from the New Faculty Seed Grant, Birla Institute of Technology and Science (BITS), Pilani (Grant Ref. N4/24/1004).'"
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "All authors list BITS Pilani as affiliation in the paper header. No evaluated product is affiliated with the authors' institution."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The BITS Pilani seed grant is an institutional research fund with no financial stake in whether question-specific rubrics outperform question-agnostic ones."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests or financial interests statement present in the paper."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No training data cutoff dates stated for GPT-4o-mini or Claude 3.7 Sonnet. This is relevant because DSA problems from GFG are widely available online and could be in the models' training data."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No discussion of whether the LLMs might have encountered these problems, solutions, or rubrics during training. GFG is a popular website whose content is likely in LLM training corpora."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "DSA problems are sourced from Geeks for Geeks, a widely-scraped website. The LLMs almost certainly encountered these problems and editorial solutions during training, which could bias their grading. This is not discussed."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the research design. Student code is used as evaluation data, and human graders produce ground truth, but neither is an experimental subject."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the experimental design. The study evaluates automated grading approaches against human-produced ground truth."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants. Student submissions and human graders are data sources, not study subjects."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study. Submission selection criteria are documented under data_integrity."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in an experimental study."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in an experimental study."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants in the study."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "PRE is described as 'resource-intensive due to multiple API calls per student solution' (Section 4.3) but no actual costs, token counts, or per-example latencies are reported. Section 7.1 mentions cost trade-offs qualitatively without quantification."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No total API spend, token consumption, or computational budget reported despite using commercial LLM APIs (GPT-4o-mini, Claude 3.7 Sonnet) across hundreds of evaluations."
    305       }
    306     },
    307     "experimental_rigor": {
    308       "seed_sensitivity_reported": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No mention of random seeds or sensitivity analysis across runs. LLM outputs are non-deterministic but results appear to be from single runs."
    312       },
    313       "number_of_runs_stated": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The number of evaluation runs per technique is not stated. It is unclear whether results are from a single LLM call per submission or averaged across multiple calls."
    317       },
    318       "hyperparameter_search_budget": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No hyperparameter search budget reported. The prompts (Appendix A) were presumably developed iteratively, but no search process or number of iterations is documented."
    322       },
    323       "best_config_selection_justified": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No documentation of how the final prompts or technique configurations were selected. The prompts in Appendix A appear to be the result of development/tuning but the selection process is not described."
    327       },
    328       "multiple_comparison_correction": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Many pairwise comparisons across techniques and metrics without any multiple comparison correction. Seven techniques compared on seven metrics across two datasets."
    332       },
    333       "self_comparison_bias_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The authors propose CRE, PRE, and EME, implement their own versions of baselines (CodeJudge), and compare without acknowledging author-evaluation bias. No independent evaluation."
    337       },
    338       "compute_budget_vs_performance": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "PRE is noted as more expensive than CRE qualitatively, but no performance-vs-cost curves or quantified cost comparisons are provided."
    342       },
    343       "benchmark_construct_validity": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "Human grading by 'final-year students with extensive programming experience' is used as ground truth without questioning its validity. Inter-rater reliability between the two human graders is not reported. Whether undergrad graders constitute an appropriate gold standard is not discussed."
    347       },
    348       "scaffold_confound_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Cross-technique comparisons (CRE vs CodeJudge vs FPM) confound scaffold architecture with rubric type. While EME-QA vs EME-QS controls for scaffold, the overall comparison table mixes scaffold and rubric effects without acknowledging this confound."
    352       }
    353     },
    354     "data_leakage": {
    355       "temporal_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "DSA problems are from Geeks for Geeks, a website whose content predates the training of GPT-4o-mini and Claude 3.7 Sonnet. The LLMs likely encountered these problems and editorial solutions during training. This temporal overlap is not discussed."
    359       },
    360       "feature_leakage_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "The LLM grader receives the model solution alongside student code. Whether this constitutes feature leakage (the answer is in the input) vs. legitimate reference information is not discussed. The impact of providing vs. withholding the model solution is not tested."
    364       },
    365       "non_independence_addressed": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No discussion of whether the structural similarity between GFG problems and LLM training data affects results. The OOP dataset (from a specific exam) is less likely contaminated, but no independence analysis is performed for either dataset."
    369       },
    370       "leakage_detection_method": {
    371         "applies": true,
    372         "answer": false,
    373         "justification": "No leakage detection or prevention method applied. No canary strings, membership inference, or decontamination analysis."
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "Question-specific rubrics substantially outperform question-agnostic rubrics for LLM-based code evaluation, especially on algorithmically diverse problems.",
    380       "evidence": "Table 3 (DSA): EME-QS achieves Pearson 0.825, ICC3 0.819, Kappa 0.646 vs EME-QA at 0.562, 0.560, 0.156. Table 2 (OOP): comparable performance between EME-QS and EME-QA.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "CRE achieves near-human leniency while PRE is significantly harsher.",
    385       "evidence": "Table 2: CRE (OpenAI) leniency = 0.082 vs PRE leniency = -0.329. PRE's single-criterion prompts force zero-or-full marking, slashing average scores by 11.5/35 marks (Section 6.2.3).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLM-based techniques outperform similarity-based approaches (CodeBERTScore) significantly.",
    390       "evidence": "Tables 2 and 3: CodeBERTScore achieves Pearson 0.354/0.126, Kappa 0.241/0.010 on OOP/DSA, while all LLM-based techniques achieve substantially higher values.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Providing a rubric improves LLM grading performance compared to no-rubric approaches.",
    395       "evidence": "Tables 2 and 3: CodeJudge (no rubric) achieves Pearson 0.717/0.423 vs EME-QA 0.904/0.562 and EME-QS 0.900/0.825 on OOP/DSA.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "EME performance improves with larger model parameter size and ensemble size (up to 3-4 models).",
    400       "evidence": "Section 6.2.4 describes these trends qualitatively but provides no quantitative data or tables for ensemble size or model size experiments.",
    401       "supported": "weak"
    402     },
    403     {
    404       "claim": "For homogeneous implementation-oriented questions (OOP), question-agnostic rubrics achieve comparable results to question-specific rubrics.",
    405       "evidence": "Table 2: EME-QA (Pearson 0.904, ICC3 0.904, Kappa 0.512) vs EME-QS (0.900, 0.900, 0.545) on OOP dataset — very similar performance.",
    406       "supported": "strong"
    407     }
    408   ],
    409   "red_flags": [
    410     {
    411       "flag": "No statistical significance testing",
    412       "detail": "All comparative claims ('significantly enhance', 'outperform') are made by visually comparing metric values in tables. No significance tests are applied to any comparison, despite sample sizes of 80 and 150."
    413     },
    414     {
    415       "flag": "No variance or reproducibility assessment",
    416       "detail": "LLM outputs are non-deterministic, but all results appear to be from single runs. No standard deviations, confidence intervals, or multi-run analysis. Results could shift substantially on a second run."
    417     },
    418     {
    419       "flag": "Possible training data contamination",
    420       "detail": "DSA problems are from Geeks for Geeks, a widely-scraped programming website. GPT-4o-mini and Claude 3.7 Sonnet have almost certainly seen these problems and editorial solutions during training, potentially inflating grading accuracy. The OOP dataset (Fall 2024 BITS Pilani exam) is less likely contaminated."
    421     },
    422     {
    423       "flag": "Human ground truth validity unverified",
    424       "detail": "Ground truth grades produced by 'final-year students with extensive programming experience' — not expert instructors. Inter-rater reliability between the two human graders is not reported. The quality of the ground truth is assumed rather than validated."
    425     },
    426     {
    427       "flag": "No dev/test split for prompt development",
    428       "detail": "Prompts were presumably iterated and refined, but all evaluation is on the same data used for development. No held-out test set to guard against overfitting the prompts to the evaluation data."
    429     },
    430     {
    431       "flag": "Missing quantitative ensemble analysis",
    432       "detail": "Section 6.2.4 claims performance varies with model size and ensemble size but provides no quantitative data, tables, or figures to support these claims — only qualitative descriptions."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human Tutors",
    438       "authors": ["Tung Phung", "Victor-Alexandru Pădurean", "José Cambronero", "Sumit Gulwani", "Tobias Kohn", "Rupak Majumdar", "Adish Singla", "Gustavo Soares"],
    439       "year": 2023,
    440       "arxiv_id": "2306.17156",
    441       "relevance": "Benchmark comparison of LLMs vs human tutors for programming education; it uses the question-agnostic FPM technique that this paper improves upon."
    442     },
    443     {
    444       "title": "CodEv: An Automated Grading Framework Leveraging Large Language Models for Consistent and Constructive Feedback",
    445       "authors": ["En-Qi Tseng", "Pei-Cing Huang", "Chan Hsu", "Peng-Yi Wu", "Chan-Tung Ku", "Yihuang Kang"],
    446       "year": 2024,
    447       "doi": "10.1109/bigdata62323.2024.10825949",
    448       "relevance": "LLM ensemble-based code evaluation framework using question-agnostic rubrics, directly compared against in this work."
    449     },
    450     {
    451       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    452       "authors": ["Weixi Tong", "Tianyi Zhang"],
    453       "year": 2024,
    454       "arxiv_id": "2410.02184",
    455       "relevance": "LLM-based code evaluation using slow-thinking decomposition, used as a no-rubric baseline."
    456     },
    457     {
    458       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    459       "authors": ["Terry Yue Zhuo"],
    460       "year": 2024,
    461       "arxiv_id": "2304.14317",
    462       "relevance": "Early work on using LLMs for code evaluation without test cases, showing limited correlation with human judgment."
    463     },
    464     {
    465       "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code",
    466       "authors": ["Shuyan Zhou", "Uri Alon", "Sumit Agarwal", "Graham Neubig"],
    467       "year": 2023,
    468       "arxiv_id": "2302.05527",
    469       "relevance": "Semantic code evaluation metric using BERT embeddings, used as a non-LLM baseline showing limitations of similarity-based approaches."
    470     },
    471     {
    472       "title": "GPT-4 Technical Report",
    473       "authors": ["OpenAI"],
    474       "year": 2023,
    475       "arxiv_id": "2303.08774",
    476       "relevance": "Technical report for GPT-4, foundational model family used in the evaluation (GPT-4o-mini)."
    477     },
    478     {
    479       "title": "Automating Autograding: Large Language Models as Test Suite Generators for Introductory Programming",
    480       "authors": ["Umar Alkafaween", "Ibrahim Albluwi", "Paul Denny"],
    481       "year": 2024,
    482       "doi": "10.1111/jcal.13100",
    483       "relevance": "LLM-based approach to automated grading through test suite generation for programming courses."
    484     },
    485     {
    486       "title": "Evaluating Language Models for Generating and Judging Programming Feedback",
    487       "authors": ["Charles Koutcheme", "Nicola Dainese", "Sami Sarsa", "Arto Hellas", "Juho Leinonen", "Syed Ashraf", "Paul Denny"],
    488       "year": 2025,
    489       "doi": "10.1145/3641554.3701791",
    490       "relevance": "Evaluation of LLMs for generating and judging programming feedback quality, directly relevant to LLM-based code assessment."
    491     },
    492     {
    493       "title": "Grade Like a Human: Rethinking Automated Assessment with Large Language Models",
    494       "authors": ["Wenjing Xie", "Juxin Niu", "Chun Jason Xue", "Nan Guan"],
    495       "year": 2024,
    496       "arxiv_id": "2405.19694",
    497       "relevance": "Multi-agent LLM grading system that creates context-aware rubrics for short-answer questions, closest prior work on rubric-based LLM assessment."
    498     },
    499     {
    500       "title": "Large Language Models (GPT) for automating feedback on programming assignments",
    501       "authors": ["Maciej Pankiewicz", "Ryan S. Baker"],
    502       "year": 2023,
    503       "arxiv_id": "2307.00150",
    504       "relevance": "Early exploration of GPT models for automated programming feedback generation."
    505     }
    506   ]
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs