ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25773B)


      1 {
      2   "paper": {
      3     "title": "Pedagogical Alignment of Large Language Models (LLM) for Personalized Learning: A Survey, Trends and Challenges",
      4     "authors": [
      5       "Mahefa Abel Razafinirina",
      6       "William Germain Dimbisoa",
      7       "Thomas Mahatody"
      8     ],
      9     "year": 2024,
     10     "venue": "Journal of Intelligent Learning Systems and Applications",
     11     "doi": "10.4236/jilsa.2024.164023"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["survey_methodology"],
     15   "methodology_tags": ["meta-analysis"],
     16   "key_findings": "This narrative survey covers LLMs for education across knowledge editing, content generation, personalized learning, and prompt optimization. It identifies two architectures (unified LLM and Mixture-of-Experts) for LLM-based education and discusses the Pedagogical Chain-of-Thought (PedCoT) framework. The paper catalogs challenges including factual accuracy, bias, computational cost, and lack of critical thinking support, but provides no systematic methodology, no quality assessment of reviewed papers, and no structured evidence synthesis.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No source code, analysis scripts, or repository URLs are provided. The paper is a pure narrative survey with no released artifacts."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No dataset of reviewed papers, search results, or extracted data is released. The survey could have released its paper corpus or structured extraction but did not."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No environment or tooling specifications are provided. A survey could specify tools used for literature search and analysis, but none are mentioned."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No instructions for reproducing the literature search or analysis are provided. There is no description of search queries, databases used, or inclusion criteria that would allow replication."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "Survey paper with no experiments or statistical analyses. No quantitative results are generated."
     45       },
     46       "significance_tests": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "Survey paper with no comparative experiments requiring significance testing."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "Survey paper with no experiments producing effect sizes."
     55       },
     56       "sample_size_justified": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "Survey paper with no experimental sample. The number of reviewed papers is not explicitly justified either, but the statistical criterion itself is structurally inapplicable to a survey."
     60       },
     61       "variance_reported": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "Survey paper with no experimental runs to report variance across."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The survey does not compare itself against prior surveys on LLMs in education. No systematic comparison with existing reviews (e.g., [1] Kasneci et al. 2023 or [72] Wang et al. 2024) is provided to show what this survey adds."
     72       },
     73       "baselines_contemporary": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No experimental baselines are applicable to a survey paper."
     77       },
     78       "ablation_study": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No system components to ablate in a survey paper."
     82       },
     83       "multiple_metrics": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No experimental metrics are applicable to a narrative survey."
     87       },
     88       "human_evaluation": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No system outputs to evaluate; this is a narrative survey."
     92       },
     93       "held_out_test_set": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No test sets are applicable to a survey paper."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper organizes findings by categories: LLM models (Section 2), knowledge editing (Section 3), content generation (Section 4), datasets (Section 5), pedagogical alignment including math/writing/programming/reasoning (Section 6), personalized learning (Sections 7-8), and prompt optimization (Section 9). Table 1 also breaks down challenges by category."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper discusses multiple failure modes: LLMs struggling with complex mathematical reasoning (Section 6.1.1), hallucinations in KBQA (Section 6.1.5), overcorrection in writing feedback (Section 6.1.2), limitations in complex algorithm generation (Section 6.1.3), and accuracy issues in error detection for feedback (Section 4.4)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports negative findings from the literature: current LLMs including GPT-4 cannot adequately perform knowledge-intensive writing tasks (Section 5.1), integrated scoring has minimal impact on feedback quality (Section 4.4), and ChatGPT shows 'significant limitations in the accuracy of error detection' for concurrent programming (Section 4.4)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract's descriptive claims ('we explore KME,' 'we discuss MoE and unified approaches,' 'we discuss challenges') are supported by corresponding sections in the paper. The abstract uses hedged language ('could transform') rather than strong empirical claims."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes numerous causal claims such as 'LLMs is set to revolutionize the educational landscape' (Section 1), 'KME ensures LLMs in education access the latest information' (Section 3), and 'PedCoT significantly improves the detection and correction of mathematical reasoning mistakes' (Section 6.3). These causal claims are presented as established facts rather than being supported by rigorous evidence synthesis from the reviewed literature."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper makes sweeping generalizations about LLMs in education without bounding to specific models, educational contexts, student populations, or cultural settings. Claims like 'LLMs can revolutionize online education by understanding a wide range of student questions, similar to human teachers' (Section 6.2) are unbounded. The title itself claims coverage of 'personalized learning' broadly."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss alternative explanations for the observed trends in the literature. For example, the reported benefits of LLMs in education could be due to publication bias, novelty effects, or carefully selected tasks. No such alternatives are considered."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper frequently equates proxy measures with actual educational outcomes. For example, quiz generation quality is discussed as a proxy for learning effectiveness, but this gap is never acknowledged. Similarly, 'personalized learning' is treated as inherently beneficial without distinguishing between personalization as measured and actual learning outcomes."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "Survey paper that does not use any models directly."
    146       },
    147       "prompts_provided": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "Survey paper that does not use prompting."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "Survey paper with no experiments requiring hyperparameter reporting."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "Survey paper with no agentic scaffolding."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No paper selection pipeline is documented. There is no description of which databases were searched, what search queries were used, how many papers were initially found, or what criteria were used to filter them. The survey reads as an ad-hoc collection of papers with no reproducible methodology."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "Section 10 ('Challenges, Trends & Future Directions') discusses challenges of LLMs in education, not limitations of the survey itself. There is no dedicated section discussing the survey's own methodological limitations, such as search completeness, selection bias, or scope constraints."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No specific threats to the validity of this survey are discussed. The paper does not acknowledge that its ad-hoc selection methodology could bias its conclusions, nor does it discuss potential gaps in coverage."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not explicitly state what is excluded from scope. It does not clarify which educational levels, languages, model families, or geographic contexts are in or out of scope. The broad title and abstract suggest comprehensive coverage without acknowledging boundaries."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No raw data (list of all papers considered, search results, inclusion/exclusion decisions) is available for independent verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The paper provides no description of how the reviewed papers were collected. No databases, search terms, date ranges, or search strategies are mentioned. It is impossible to determine how the 160 references were identified."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The 'sample' for this survey is the set of reviewed papers, and no description of how they were found or selected is provided. There is no information about the search process that led to the inclusion of these specific papers."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No data pipeline from initial search to final analysis is documented. The paper jumps directly to discussing categorized papers without explaining any intermediate steps in the review process."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding information is disclosed anywhere in the paper. The authors declare no conflicts of interest but do not mention whether the research was funded or unfunded."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly stated: all three authors are from the School of Computer Science, University of Fianarantsoa, Madagascar."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding source is disclosed, making it impossible to assess funder independence. The absence of funding disclosure prevents evaluation of this criterion."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "The paper includes a conflicts of interest statement: 'The authors declare no conflicts of interest regarding the publication of this paper.' This is an explicit declaration."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this survey paper."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this survey paper."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this survey paper."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this survey paper."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this survey paper."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this survey paper."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this survey paper."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey paper with no method of its own to cost."
    288       },
    289       "compute_budget_stated": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Survey paper with no computational experiments."
    293       }
    294     },
    295     "survey_methodology": {
    296       "prisma_or_structured_protocol": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No PRISMA flow diagram, no structured review protocol, no reproducible search strategy. The paper does not reference any systematic review methodology. Papers appear to have been collected ad-hoc with no documented protocol."
    300       },
    301       "quality_assessment_of_sources": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The survey does not assess the quality of any of its source papers. All reviewed papers are treated equally regardless of methodological rigor, sample size, or evidence strength. Claims from weak studies are presented alongside claims from strong studies without distinction."
    305       },
    306       "publication_bias_discussed": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Publication bias is never discussed. The survey does not consider that published papers on LLMs in education skew heavily positive, potentially giving an overly optimistic picture of the field."
    310       }
    311     }
    312   },
    313   "engagement_factors": {
    314     "practical_relevance": {
    315       "score": 1,
    316       "justification": "Catalogs techniques (MoE, PedCoT, RAG) and architectures for LLM-based education but provides no directly usable tool or implementation guidance."
    317     },
    318     "surprise_contrarian": {
    319       "score": 0,
    320       "justification": "Confirms conventional wisdom that LLMs show educational promise but face challenges with accuracy, bias, and computational cost."
    321     },
    322     "fear_safety": {
    323       "score": 0,
    324       "justification": "No novel AI risk or security concerns raised; mentions generic bias and hallucination issues already widely known."
    325     },
    326     "drama_conflict": {
    327       "score": 0,
    328       "justification": "No controversy, no challenges to other researchers, and no provocative claims."
    329     },
    330     "demo_ability": {
    331       "score": 0,
    332       "justification": "No code, demo, or tool released."
    333     },
    334     "brand_recognition": {
    335       "score": 0,
    336       "justification": "Authors from University of Fianarantsoa; published in a low-profile journal (JILSA via SCIRP)."
    337     }
    338   },
    339   "claims": [
    340     {
    341       "claim": "LLMs can transform educational experiences through personalized learning by adapting to unique learning styles and needs of individual students.",
    342       "evidence": "Supported by references to multiple works in Section 1 and Section 7, citing [10]-[14] for adaptive learning and [15][16] for feedback capabilities. However, no meta-analytic synthesis or quantitative evidence aggregation is provided.",
    343       "supported": "weak"
    344     },
    345     {
    346       "claim": "Knowledge Editing Techniques (KME) ensure LLMs maintain current and accurate knowledge for educational applications.",
    347       "evidence": "Section 3 discusses KME with references to [22] and [48], describing the concept and challenges. However, the paper provides no evidence of KME being successfully deployed in an educational setting—only theoretical promise.",
    348       "supported": "weak"
    349     },
    350     {
    351       "claim": "The Mixture-of-Experts (MoE) framework addresses limitations of single-purpose LLMs by using specialized models for different subjects.",
    352       "evidence": "Section 6.2.2 describes the MoE concept with references to [109]-[112]. The described benefits are conceptual; no empirical evidence of MoE outperforming unified approaches in educational settings is presented.",
    353       "supported": "weak"
    354     },
    355     {
    356       "claim": "The Pedagogical Chain-of-Thought (PedCoT) framework significantly improves detection and correction of mathematical reasoning mistakes by LLMs.",
    357       "evidence": "Section 6.3 cites [113] for PedCoT combined with Bloom's Cognitive Model. The claim relies on a single referenced study without independent replication or meta-analytic evidence.",
    358       "supported": "moderate"
    359     },
    360     {
    361       "claim": "Current LLMs, including GPT-4, are unable to adequately perform knowledge-intensive writing tasks, particularly integrating new information and following precise edits.",
    362       "evidence": "Section 5.1 cites the KIWI dataset study [71] showing GPT-4 limitations in revising long-form answers with expert instructions.",
    363       "supported": "moderate"
    364     },
    365     {
    366       "claim": "Quantization techniques can achieve significant compression ratios while maintaining acceptable accuracy in LLM inference tasks.",
    367       "evidence": "Section 2.2.2 cites [11] and [12] for quantization results but provides no specific compression ratios or accuracy numbers from these studies.",
    368       "supported": "weak"
    369     }
    370   ],
    371   "red_flags": [
    372     {
    373       "flag": "No systematic review methodology",
    374       "detail": "The survey provides no description of search databases, queries, date ranges, inclusion/exclusion criteria, or paper selection process. The 160 references appear to be collected ad-hoc, making the review non-reproducible and potentially subject to selection bias."
    375     },
    376     {
    377       "flag": "No quality assessment of source papers",
    378       "detail": "All reviewed papers are treated with equal weight regardless of methodological quality. Claims from workshop papers, preprints, and rigorously peer-reviewed studies are mixed without distinction. This obscures the signal-to-noise ratio of the sources, so weak evidence reads as equally authoritative as strong evidence."
    379     },
    380     {
    381       "flag": "Overclaiming relative to evidence",
    382       "detail": "The paper uses strong language ('revolutionize,' 'transform,' 'significantly enhance') that far exceeds the evidence synthesized. Most claims are supported only by citing individual papers without systematic evidence aggregation or critical analysis of the quality of those papers."
    383     },
    384     {
    385       "flag": "Many references tangentially related",
    386       "detail": "Several references (e.g., [3] on hate speech detection, [4] on hate speech in low-resource languages, [6] on spatial NLI, [10] on biomedical NLI) have only a tangential connection to the paper's stated topic of pedagogical alignment of LLMs for personalized learning."
    387     },
    388     {
    389       "flag": "No publication bias consideration",
    390       "detail": "The survey does not acknowledge that the literature it reviews is heavily biased toward positive results. Papers reporting that LLMs failed in educational settings are underrepresented, potentially giving an overly optimistic picture."
    391     },
    392     {
    393       "flag": "Published in a predatory-adjacent venue",
    394       "detail": "SCIRP (Scientific Research Publishing) journals have been flagged by multiple sources for questionable publishing practices, including being listed on Beall's List. This raises concerns about the rigor of peer review."
    395     }
    396   ],
    397   "cited_papers": [
    398     {
    399       "title": "ChatGPT for Good? On Opportunities and Challenges of Large Language Models for Education",
    400       "authors": ["E. Kasneci", "K. Sessler", "S. Küchemann", "M. Bannert", "D. Dementieva", "F. Fischer"],
    401       "year": 2023,
    402       "doi": "10.1016/j.lindif.2023.102274",
    403       "relevance": "Major survey on LLM opportunities and challenges in education, including bias, factual inaccuracies, and interpretability concerns."
    404     },
    405     {
    406       "title": "A Comprehensive Overview of Large Language Models",
    407       "authors": ["H. Naveed", "A.U. Khan", "S. Qiu", "M. Saqib", "S. Anwar"],
    408       "year": 2024,
    409       "arxiv_id": "2307.06435",
    410       "relevance": "Broad overview of LLM architectures and capabilities, relevant to understanding foundational models used in agentic AI systems."
    411     },
    412     {
    413       "title": "Knowledge Editing for Large Language Models: A Survey",
    414       "authors": ["S. Wang", "Y.C. Zhu", "H.C. Liu", "Z.Y. Zheng", "C. Chen", "J.D. Li"],
    415       "year": 2023,
    416       "arxiv_id": "2310.16218",
    417       "relevance": "Survey of knowledge editing techniques for LLMs, relevant to maintaining model accuracy and safety in deployed systems."
    418     },
    419     {
    420       "title": "Large Language Models for Education: A Survey and Outlook",
    421       "authors": ["S. Wang", "T.L. Xu", "H. Li", "C.L. Zhang", "J. Liang", "J.L. Tang", "P.S. Yu", "Q.S. Wen"],
    422       "year": 2024,
    423       "arxiv_id": "2403.18105",
    424       "relevance": "Comprehensive survey on LLMs in education covering capabilities, limitations, and evaluation methodologies."
    425     },
    426     {
    427       "title": "Adapting Large Language Models for Education: Foundational Capabilities, Potentials, and Challenges",
    428       "authors": ["Q.Y. Li", "L.Y. Fu"],
    429       "year": 2024,
    430       "arxiv_id": "2401.08664",
    431       "relevance": "Key reference on LLM foundational capabilities for education including math, writing, programming, and reasoning."
    432     },
    433     {
    434       "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions",
    435       "authors": ["L. Huang"],
    436       "year": 2023,
    437       "arxiv_id": "2311.05232",
    438       "relevance": "Survey on LLM hallucination — directly relevant to AI safety and reliability concerns in deployed systems."
    439     },
    440     {
    441       "title": "Attention Is All You Need",
    442       "authors": ["A. Vaswani", "N. Shazeer", "N. Parmar", "J. Uszkoreit", "L. Jones", "A.N. Gomez", "Ł. Kaiser", "I. Polosukhin"],
    443       "year": 2017,
    444       "arxiv_id": "1706.03762",
    445       "relevance": "Foundational transformer architecture paper underlying all modern LLMs used in agentic AI systems."
    446     },
    447     {
    448       "title": "LLMs Can Find Mathematical Reasoning Mistakes by Pedagogical Chain-of-thought",
    449       "authors": ["Z. Jiang", "H. Peng", "S. Feng", "F. Li", "D. Li"],
    450       "year": 2024,
    451       "doi": "10.24963/ijcai.2024/381",
    452       "relevance": "Introduces PedCoT framework for LLM reasoning in educational contexts, relevant to LLM capability evaluation."
    453     },
    454     {
    455       "title": "ERAGent: Enhancing Retrieval-Augmented Language Models with Improved Accuracy, Efficiency, and Personalization",
    456       "authors": ["Y.X. Shi", "X. Zi", "Z.J. Shi", "H.M. Zhang", "Q. Wu", "M. Xu"],
    457       "year": 2024,
    458       "arxiv_id": "2405.06683",
    459       "relevance": "RAG-based agent framework with personalization, relevant to agentic AI workflow design and evaluation."
    460     },
    461     {
    462       "title": "Navigate through Enigmatic Labyrinth: A Survey of Chain of Thought Reasoning",
    463       "authors": ["Z. Chu", "J.C. Chen"],
    464       "year": 2024,
    465       "arxiv_id": "2309.15402",
    466       "relevance": "Comprehensive survey of chain-of-thought reasoning techniques applicable to LLM capability assessment."
    467     }
    468   ]
    469 }

Impressum · Datenschutz