scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20679B)
      1 {
      2   "paper": {
      3     "title": "Automated Knowledge Component Generation for Interpretable Knowledge Tracing in Coding Problems",
      4     "authors": ["Zhangqi Duan", "Nigel Fernandez", "Arun Balajiee Lekshmi Narayanan", "Mohammad Hassany", "Rafaella Sampaio de Alencar", "Peter Brusilovsky", "Bita Akram", "Andrew Lan"],
      5     "year": 2025,
      6     "venue": "Preprint (under review)",
      7     "arxiv_id": "2502.18632"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The paper states 'We will make our code publicly available' (Section 1.1 footnote), which is a promise of future release, not an actual release."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses two publicly available datasets: CodeWorkout (DataShop, pslcdatashop.web.cmu.edu) and FalconCode, both with citations and download links."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using HuggingFace PEFT library, Llama 3 8B, NVIDIA L40S 48GB GPU, and LoRA hyperparameters, but no requirements.txt, Dockerfile, or detailed environment specification is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described but not in a form a researcher could directly execute."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Standard deviations are reported (e.g., '0.816±0.012' in Table 2) but no confidence intervals are provided. The ± notation represents standard deviation across 5 splits, not confidence intervals."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 2 notes statistical significance with p < 0.05, indicated by bullet markers (•) for KCGen-KT results compared to baselines."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Results are reported with baseline context enabling effect size interpretation, e.g., AUC improvement from 0.788 (TIKTOC*) to 0.816 (KCGen-KT) on CodeWorkout, with absolute differences interpretable from Table 2."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The datasets contain 246 students (CodeWorkout) and 3,267 students (FalconCode) but no justification for why these sizes are adequate for the claims, nor any power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations across 5 repeated random subsampling splits are reported in Table 2 (e.g., '0.816±0.012')."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are included: Random, Majority, Code-DKT, TIKTOC*, and KCGen-KT with human-written KCs (Table 2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "TIKTOC (2025) and Code-DKT (2022) are recent and relevant baselines for knowledge tracing in programming."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Tables 4 and 5 present detailed ablation studies examining the impact of correct solutions, in-context examples, AST vs code input, number of solutions, and KC abstraction levels."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four metrics are used: AUC, F1 Score, Accuracy for correctness prediction, and CodeBLEU for code prediction (Table 2)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 6 presents a human evaluation with two annotators assessing KC interpretability (98.6% labeled interpretable), precision (93.2%), and recall (96% equal-or-greater coverage) with inter-annotator agreement reported."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.3 states 'repeated random subsampling with five splits, partitioning the data into training, validation, and test sets each time.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per dataset (CodeWorkout and FalconCode) in Table 2, and per KC abstraction level in Table 3."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.2.1 shows a case study where predicted code has a runtime error due to low mastery on specific KCs. Section 5.2.2 discusses learning curves that fit poorly due to noisy data."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The ablation study reports that incorporating incorrect solutions hurts performance, and using AST representations decreases performance (Section 5.1.3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims KCGen-KT outperforms existing KT methods and human-written KCs, which is supported by Table 2 with statistical significance. The claim about better learning curves is supported by R² comparison (0.21 vs 0.18)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims like 'leveraging the semantic information in KC descriptions improves KT performance' are supported by controlled ablation studies (Tables 4-5) that isolate individual components."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper tests on two specific datasets (Java and Python) and discusses limitations in Section 7, including plans to extend to other domains. The title specifies 'coding problems' rather than making broader claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not substantively discuss alternative explanations for why LLM-generated KCs outperform human KCs. For example, it does not consider whether the improvement could be due to differences in KC granularity rather than quality."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT-4o' for KC generation and 'Llama 3' with 8B parameters for KT, but no specific version identifiers or snapshot dates are provided (e.g., no gpt-4o-2024-xx-xx or Llama-3-8B-Instruct version)."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompts are provided in Appendix B (Tables 7-10) for KC generation, cluster summarization, KC correctness labeling, and in-context example conversion."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.3 reports LoRA parameters (α=256, rank=128, dropout=0.05), learning rates for each component, batch size (32), optimizer choices (AdamW, RMSprop), and 8-bit quantization."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The three-step KC generation pipeline is described in detail (Section 3.1): initial KC generation with few-shot prompting, clustering on semantic similarity, and cluster labeling with GPT-4o."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 describes filtering to first submissions only (10,834 from CodeWorkout, 28,617 from FalconCode), and Section 3.1.1 describes the clustering of student submissions using CodeBERT embeddings for diverse sampling."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. Some limitations are briefly mentioned in Section 7 (Conclusions and Future Work) but without substantive discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The future work section mentions extending to other domains but does not address specific validity concerns."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do not show. It does not discuss limitations of testing on only introductory programming courses or the two specific datasets."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The underlying datasets (CodeWorkout and FalconCode) are publicly available through their respective repositories, allowing independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes both datasets: CodeWorkout from an introductory Java course (246 students, 50 problems), FalconCode from an introductory Python course (3,267 students, 157 problems), with citation to original data sources."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "The paper uses existing public datasets of student submissions, not recruiting participants. The human evaluation annotators are described as 'volunteers contacted through a research partner' (Appendix A)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from raw student submissions through CodeBERT embedding, clustering, KC generation, KC clustering, and final tagging is documented step by step in Section 3.1, with specific counts at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: University of Massachusetts Amherst, University of Pittsburgh, and North Carolina State University. No authors are affiliated with OpenAI or Meta (whose products are used)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so funder independence cannot be assessed; the paper may be unfunded academic work."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "GPT-4o is used for KC generation and Llama 3 for KT, but neither model's training data cutoff is stated. The student code datasets could potentially be in GPT-4o's training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether GPT-4o may have seen the CodeWorkout or FalconCode datasets during training, which could affect KC generation quality."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "CodeWorkout (2019/2021) and FalconCode (2023) are publicly available datasets that could be in GPT-4o's training data. This contamination risk is not addressed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The human evaluation in Section 6 involves two human annotators but no pre-registration is mentioned."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics approval is mentioned for the human evaluation study involving annotators."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The annotators are described only as having 'experience in teaching college-level programming in Java.' No further demographics are reported."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion/exclusion criteria for annotator selection are stated beyond Java teaching experience."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is not an experimental study with treatment/control groups for the human evaluation; both annotators evaluated the same materials."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "For the recall task, annotators compare two KC sets but it is not stated whether they were blinded to which set was LLM-generated vs. human-written."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "With only two annotators, attrition is less relevant, but no mention is made of whether both completed all annotation tasks."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs for GPT-4o calls or inference costs for the KC generation pipeline are reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 4.3 states training takes 80 minutes per epoch on CodeWorkout and 300 minutes per epoch on FalconCode on an NVIDIA L40S 48GB GPU, converging within 12 epochs."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "KCGen-KT with LLM-generated KCs outperforms existing KT methods and human-written KCs on future student response prediction",
    286       "evidence": "Table 2 shows statistically significant improvements (p<0.05) on both datasets across all metrics. On CodeWorkout: AUC 0.816 vs 0.788 (TIKTOC*) and 0.797 (human KCs). On FalconCode: AUC 0.771 vs 0.728 (TIKTOC*) and 0.752 (human KCs).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "LLM-generated KCs result in better fit than human-written KCs under cognitive models (power law of practice)",
    291       "evidence": "Section 5.2.2 reports weighted R² of 0.21 for LLM-generated KCs vs 0.18 for human-written KCs under PFA models on CodeWorkout.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "LLM-generated KCs are comparable to or more interpretable than human-written KCs",
    296       "evidence": "Section 6 reports 98.6% of LLM-generated KCs labeled interpretable vs 94.6% for baseline, precision 93.2% vs 92.5%, and recall showing equal-or-greater coverage in 96% of cases. Cohen's Kappa of 0.594 for inter-annotator agreement.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Including correct student submissions is crucial for KC generation quality",
    301       "evidence": "Table 4 ablation: removing correct solutions drops AUC from 0.812 to 0.789 and CodeBLEU from 0.569 to 0.529.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Overly abstract KCs hurt KT performance",
    306       "evidence": "Table 3 shows consistent performance decrease at highest abstraction levels (e.g., 10 clusters: AUC 0.794 vs 50 clusters: AUC 0.816 on CodeWorkout).",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper presents KCGen-KT, an LLM-based pipeline for automatically generating knowledge components (KCs) for programming problems and using them for knowledge tracing. LLM-generated KCs outperform human-written KCs on student response prediction across two datasets (CodeWorkout/Java and FalconCode/Python) with statistical significance. Ablation studies show that including diverse correct student solutions and maintaining medium-level KC granularity are important for performance. A human evaluation with two annotators confirms that LLM-generated KCs are interpretable and achieve comparable precision/recall to human-authored KCs.",
    312   "red_flags": [
    313     {
    314       "flag": "Tiny human evaluation sample",
    315       "detail": "The human evaluation uses only 2 annotators, which is insufficient for robust inter-rater reliability assessment. Cohen's Kappa of 0.594 (moderate agreement) on KC interpretability suggests the task is subjective, yet no additional annotators were recruited."
    316     },
    317     {
    318       "flag": "No limitations section",
    319       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Key threats such as dataset contamination risk for GPT-4o and generalizability beyond introductory programming courses are not discussed."
    320     },
    321     {
    322       "flag": "Contamination risk unaddressed",
    323       "detail": "GPT-4o is used for KC generation on publicly available datasets (CodeWorkout 2019, FalconCode 2023). If GPT-4o was trained on these datasets, the quality of generated KCs may be inflated. This is not discussed."
    324     },
    325     {
    326       "flag": "Code not released",
    327       "detail": "Despite promising code release, no repository link is provided, limiting reproducibility."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    333       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    334       "year": 2020,
    335       "relevance": "Pre-trained code model used for embedding student submissions in the KC generation pipeline."
    336     },
    337     {
    338       "title": "Code-DKT: A Code-based Knowledge Tracing Model for Programming Tasks",
    339       "authors": ["Yang Shi", "Min Chi", "Tiffany Barnes", "Thomas Price"],
    340       "year": 2022,
    341       "relevance": "Key baseline for knowledge tracing in programming that leverages student code content."
    342     },
    343     {
    344       "title": "Open-ended Knowledge Tracing for Computer Science Education",
    345       "authors": ["Naiming Liu", "Zichao Wang", "Richard Baraniuk", "Andrew Lan"],
    346       "year": 2022,
    347       "relevance": "Prior LLM-based knowledge tracing method for programming that predicts student code submissions."
    348     },
    349     {
    350       "title": "Test Case-Informed Knowledge Tracing for Open-ended Coding Tasks",
    351       "authors": ["Zhangqi Duan", "Nigel Fernandez", "Alexander Hicks", "Andrew Lan"],
    352       "year": 2025,
    353       "relevance": "Main baseline (TIKTOC) using LLM backbone for multi-task knowledge tracing in programming."
    354     },
    355     {
    356       "title": "The Llama 3 Herd of Models",
    357       "authors": ["AI @ Meta Llama Team"],
    358       "year": 2024,
    359       "arxiv_id": "2407.21783",
    360       "relevance": "Open-source LLM used as the backbone for the KCGen-KT knowledge tracing framework."
    361     },
    362     {
    363       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    364       "authors": ["Edward J Hu"],
    365       "year": 2022,
    366       "relevance": "Parameter-efficient fine-tuning method used for adapting Llama 3 in the KT framework."
    367     },
    368     {
    369       "title": "Automated generation and tagging of knowledge components from multiple-choice questions",
    370       "authors": ["Steven Moore", "Robin Schmucker", "Tom Mitchell", "John Stamper"],
    371       "year": 2024,
    372       "relevance": "Prior work on LLM-based KC generation in the science domain; a related approach applied to a different problem type (multiple-choice questions)."
    373     },
    374     {
    375       "title": "Harnessing code domain insights: Enhancing programming Knowledge Tracing with Large Language Models",
    376       "authors": ["Xinjie Sun", "Qi Liu", "Kai Zhang"],
    377       "year": 2025,
    378       "relevance": "Concurrent work using LLMs to construct Q-matrices for knowledge tracing in programming."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs