scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28680B)
      1 {
      2   "paper": {
      3     "title": "AI-Tutoring in Software Engineering Education: Experiences with Large Language Models in Programming Assessments",
      4     "authors": [
      5       "Eduard Frankford",
      6       "Clemens Sauerwein",
      7       "Patrick Bassner",
      8       "Stephan Krusche",
      9       "Ruth Breu"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2404.02548",
     14     "doi": ""
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper provides Artemis integration files on Figshare (https://figshare.com/s/636a9c5ff8f2c8315f26) as described in Section 8 Data Availability. However, a comprehensive repository was not released 'due to challenges associated with its anonymization.' The Artemis platform itself is open source on GitHub (https://github.com/ls1intum/Artemis). The Figshare archive with integration files counts as a code release."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Section 8 describes the dataset available on Figshare including: anonymized data analysis spreadsheet with extracted database data, survey responses, code analysis, and student submissions to the version control system. All personal identifiers were removed."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No environment or dependency specifications are provided. There is no requirements.txt, Dockerfile, or detailed environment setup section. The paper mentions the Artemis platform and GPT-3.5-Turbo API but does not specify software versions, library dependencies, or setup instructions beyond pointing to the open-source Artemis project."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions are provided. Section 8 describes the released data and Artemis integration files but does not include a README with commands, a 'Reproducing Results' section, or scripts to replicate the study. A researcher would need to reverse-engineer the integration from the released files."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper reports Likert scale averages (e.g., 'Somewhat Agree (1.29)', 'Neutral (-0.29)') without confidence intervals or error bars. The stacked bar chart in Figure 5 shows response distributions but no uncertainty measures."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper makes comparative observations (e.g., different user personas, feedback quality categories) but does not use any statistical significance tests. The authors acknowledge the sample is too small for quantitative analysis (Section 6.3) but still report numerical averages without testing."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No effect sizes are reported. The paper presents raw percentages (e.g., '66.6% were useful') and Likert averages without formal effect size measures like Cohen's d or relative comparisons with baseline context."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 3.2 acknowledges the modest sample size (23 students, 12 active) and justifies the qualitative approach: 'While this may seem like a modest sample size, it's important to note that the qualitative nature of this analysis allowed for a more in-depth understanding of individual experiences.' Section 6.3 further notes it is 'too small to conduct a statistically significant quantitative analysis.'"
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No variance or standard deviation is reported. Likert responses are summarized as averages without spread measures. The stacked bar chart shows distributions visually but no numeric variance is provided."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No baseline comparison is included. The study does not compare the AI-Tutor against a control group (no AI-Tutor), against human tutors, or against any alternative system. Students could optionally use the AI-Tutor, but there is no systematic comparison of outcomes with vs. without it."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No baselines are included at all, so there are no contemporary baselines to assess. The paper only reports on the AI-Tutor's integration without comparison to any alternative."
     76       },
     77       "ablation_study": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "The system has essentially one component (GPT-3.5-Turbo with a single prompt template). There are no modular components to ablate."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses multiple evaluation approaches: (1) interaction pattern analysis with temporal coding, (2) TAM survey with 6 Likert items, (3) open-ended qualitative survey questions, and (4) manual feedback quality categorization (useful vs. not useful, with subcategories). These constitute multiple metrics for different research questions."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The study includes a survey of students evaluating their experience with the AI-Tutor's outputs (TAM questions and open-ended feedback). Additionally, the researchers manually evaluated 75 feedback responses for quality (Section 4.3: '55(66.6%) were useful and 20(26.6%) were categorized as not useful')."
     91       },
     92       "held_out_test_set": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "This is a case study with qualitative analysis, not a machine learning evaluation. There is no concept of a held-out test set."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper provides breakdowns: interaction patterns are categorized into two user personas (Iterative Ivy, Hybrid Harry). Among 20 not-useful responses, breakdown is given (3 revealed solutions, 4 hallucinations, 13 too general). TAM responses are reported per question. Figure 4 shows per-student interaction timelines."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 4.3 discusses failure cases in detail: 20 out of 75 feedback responses were not useful, categorized as solution-revealing (3), hallucinations (4), and too general (13). Specific examples of hallucination types are described (e.g., LLM stating a function looks well implemented when no implementation existed)."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports several negative findings: neutral-to-negative TAM responses on most questions, 26.6% feedback being not useful, students' concerns about learning inhibition, generic feedback quality, hallucination issues, and API dependency problems. Section 4.2 honestly reports the mixed/neutral survey results."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims are modest: 'identified different user types based on their interaction patterns' (supported in Section 4.1), 'advantages, such as timely feedback and scalability' (supported in Section 4.3), 'challenges like generic responses and students' concerns about a learning progress inhibition' (supported in Sections 4.2-4.3). No overclaiming is present."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The paper makes implicit causal claims, e.g., 'This immediate feedback helped students to quickly correct their errors' (Section 4.3), 'the AI-Tutor, in its capacity, guides through specific instructions' implying tutoring causes improvement. There is no control group or causal identification strategy; the observational design cannot establish that the AI-Tutor caused any learning improvements."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper overgeneralizes in several places. Section 6.3 claims findings from GPT-3.5-Turbo 'can largely be extrapolated to other similar models' without evidence. Section 4.3 suggests applicability to MOOCs based on a single exercise with 23 students. The title 'AI-Tutoring in Software Engineering Education' is broader than the actual scope (one C programming exercise at one university)."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The threats to validity section (Section 6) discusses generic methodological concerns but does not consider specific alternative explanations for findings. For example, students preferring the AI-Tutor might simply value novelty; the 'Iterative Ivy' pattern could reflect students who are generally more cautious, not AI-Tutor effectiveness. No such alternatives are discussed."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper states 'GPT-3.5-Turbo' throughout but never specifies a snapshot date or API version (e.g., 'gpt-3.5-turbo-0613'). Model behavior changes across versions, and the specific version used is not identified."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The full prompt template is provided in Listing 1 (Section 3.1). The placeholders are clearly explained: 'language' is English or German, 'description' is the Pascal's triangle task description, 'current' is the student's code, and 'solution' is the instructor's solution. The actual prompt structure with fill values is sufficiently described for reconstruction."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Temperature is reported as 0.7 (Section 3.1), but other hyperparameters such as top-p, max tokens, and frequency/presence penalties are not mentioned. The paper only states temperature, which is insufficient for full reproducibility."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The system architecture is described in detail in Section 3.1 with a sequence diagram (Figure 1). The workflow is: student presses button -> server retrieves student code, exercise description, and sample solution -> API call to OpenAI -> response displayed in popup. No retry logic, memory, or multi-turn interaction is used (by design). The scaffolding is simple and fully described."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3.3 documents data analysis: combination of two datasets (AI Feedback and student submissions), exclusion of students who did not solve the exercise or engage with the AI-Tutor (reducing from 23 to 12 active participants), and the three-phase qualitative analysis approach (temporal coding, thematic coding, theme development)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 6 'Limitations' is a dedicated section with four subsections: Construct Validity (6.1), Reliability (6.2), External Validity (6.3), and Internal Validity (6.4). This is a substantive multi-page discussion."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Several specific threats are discussed: the non-deterministic nature of GPT-3.5-Turbo affecting reliability (Section 6.2), integration into only one APAS (Artemis) limiting generalizability (Section 6.3), sample size of 23 being too small for quantitative analysis (Section 6.3), and use of a single LLM (Section 6.3). These are specific to this study rather than generic boilerplate."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "While the limitations section discusses threats to validity, it does not explicitly state what the results do NOT show. The paper does not clearly bound the scope to say, for example, 'these results do not show that AI-Tutors improve learning outcomes' or 'we do not claim effectiveness beyond introductory C programming.' Instead, claims like findings being 'extrapolated to other similar models' (Section 6.3) expand rather than bound the scope."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 8 describes the data availability on Figshare including anonymized raw data: database extracts with code/feedback/user/timestamps, survey responses, student code submissions, and code analysis. This allows independent verification of the qualitative findings."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3 describes data collection in detail: the APAS integration stores code, feedback, user, file, and timestamp on each AI feedback request (Section 3.1). Student submissions with test results and timestamps are stored in the version control system. The survey was administered in a 15-minute session in the following tutorial (Section 3.2)."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 3.2 describes recruitment: participants were students from the 'Introduction to Programming' tutorial at the University of Innsbruck, part of the Bachelor in Computer Science curriculum. 23 students actively participated. They were introduced to the AI-Tutor functionality before the exercise."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.3 documents the pipeline: two datasets were combined (AI Feedback data and student submissions), students who didn't engage were excluded (from 23 to 12 active), and the three-stage qualitative analysis was applied (temporal coding, thematic coding, theme development). The filtering rationale and stages are explained."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 9 Acknowledgments states: 'The CodeAbility Austria project has been funded by the Austrian Federal Ministry of Education, Science and Research (BMBWF).'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly stated: Eduard Frankford, Clemens Sauerwein, and Ruth Breu at University of Innsbruck; Patrick Bassner and Stephan Krusche at Technical University of Munich. The Artemis platform being evaluated was developed at TU Munich (co-authors Bassner and Krusche are affiliated there), which represents a potential conflict that is implicitly visible through affiliations."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The funder is the Austrian Federal Ministry of Education, Science and Research (BMBWF), a government agency with no apparent financial stake in whether GPT-3.5-Turbo works as an AI-Tutor or whether Artemis is effective."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "There is no competing interests or financial interests statement in the paper. Stephan Krusche is the creator/lead of the Artemis platform being evaluated, which represents a potential interest, but no declaration is made."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It uses GPT-3.5-Turbo as a tutoring tool and evaluates the student experience, not the model's knowledge against a test set."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Not applicable. The study evaluates the AI-Tutor's tutoring effectiveness with students, not model performance on a benchmark. There is no train/test overlap concern."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not applicable. The study is not a benchmark evaluation of model capabilities. Pascal's Triangle is the exercise students solve, not a benchmark for measuring LLM performance."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No mention of pre-registration found in the paper. No link to OSF, AsPredicted, or any other pre-registration repository is provided."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No mention of IRB or ethics board approval found in the paper. The study collected data from student participants (survey responses, interaction data) without stating ethics review."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Section 3.2 states the survey asked about demographics including 'their highest degree, their current semester and their programming experience.' The students are characterized as first-year Bachelor of Computer Science students at the University of Innsbruck in the 'Introduction to Programming' course using C."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 3.2 specifies the population: students from the 'Introduction to Programming' tutorial at the University of Innsbruck. Section 3.3 describes exclusion: 'students who did not solve the exercise and did not engage with the AI-Tutor were excluded from the qualitative analysis' (reducing to 12 participants)."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "This is an observational/exploratory case study, not an experimental study with treatment and control conditions. Students chose voluntarily whether to use the AI-Tutor. No randomization was applicable."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "Not applicable. This is an observational study, not an experiment with conditions. Students knew they were using the AI-Tutor; blinding is not feasible in this design."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Attrition is reported: 23 students participated, but only 12 'worked intensively with both systems' and were included in the qualitative analysis (Section 3.3, 4.1). The paper details various categories of non-participation: 4 made no submissions and no AI requests, 1 submitted without AI, 1 used AI without submitting, 2 submitted without AI, etc."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No API costs, cost per feedback request, or total expenditure is reported despite using a commercial API (OpenAI GPT-3.5-Turbo). The paper discusses resource efficiency abstractly as a design consideration (Section 3.1) but provides no actual cost data."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget, API spend, or token consumption is reported. The paper mentions 75 feedback requests were made but does not quantify the tokens consumed or costs incurred."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Two distinct user personas emerged: 'Iterative Ivy' (continuous AI feedback before submission) and 'Hybrid Harry' (alternating between AI feedback and APAS submissions).",
    293       "evidence": "Section 4.1 with Figure 4 showing temporal interaction patterns for 12 active students. Detailed descriptions of each persona's behavior in Sections 4.1.1 and 4.1.2.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "Student satisfaction with the AI-Tutor was overall neutral, with mixed positive and negative responses that 'neutralize one another.'",
    298       "evidence": "Section 4.2 with Figure 5 showing stacked bar distributions of Likert responses. Average sentiment scores ranging from -0.43 to 1.29, with only 'easy to use' receiving a non-neutral average.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "66.6% (55 out of 75) of AI-Tutor feedback responses were useful, while 26.6% (20) were not useful.",
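    303       "evidence": "Section 4.3 states this directly with subcategorization: 3 revealed solutions, 4 were hallucinations, 13 were too general. Manual review of each feedback instance. Note that the reported percentages are internally inconsistent: 55 of 75 is 73.3%, not 66.6%, and 66.6% plus 26.6% does not sum to 100%.",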
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "GPT-3.5-Turbo can be successfully used as a language model behind an AI-Tutor.",
    308       "evidence": "Section 5 Discussion. Based on the system functioning and receiving mixed-neutral survey responses. The claim is partially undermined by the 26.6% not-useful rate and neutral TAM results.",
    309       "supported": "weak"
    310     },
    311     {
    312       "claim": "The AI-Tutor was able to give feedback on logical and semantic issues, such as incorrect loop boundary conditions.",
    313       "evidence": "Section 4.3 describes this observation qualitatively: 'if students had defined wrong boundary conditions to terminate a loop, then the AI-Tutor recognized this and proposed to the student to change this condition.'",
    314       "supported": "weak"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "case-study",
    319     "qualitative"
    320   ],
    321   "key_findings": "An exploratory case study integrating GPT-3.5-Turbo as an AI-Tutor in the Artemis APAS with 23 students (12 active) found two interaction personas: students who used AI feedback iteratively before submitting, and those who alternated between AI feedback and system submissions. Student satisfaction was overall neutral on TAM measures, with 66.6% of AI feedback rated useful by manual review, while 26.6% was too generic, hallucinated, or revealed solutions. The study highlights both the potential for scalable real-time feedback and challenges including generic responses, hallucinations, and student concerns about over-reliance inhibiting learning progress.",
    322   "red_flags": [
    323     {
    324       "flag": "No control group",
    325       "detail": "The study has no baseline comparison. Students who used the AI-Tutor are not compared against students who did not, making it impossible to determine whether the AI-Tutor had any effect on learning outcomes or task completion."
    326     },
    327     {
    328       "flag": "Tiny sample with quantitative reporting",
    329       "detail": "With only 23 students (12 active), the paper reports numerical averages of Likert responses (e.g., 1.29, -0.29) without uncertainty measures. These numbers are meaningless at this sample size without confidence intervals, yet they are presented as findings."
    330     },
    331     {
    332       "flag": "Overgeneralization from single exercise",
    333       "detail": "The study uses a single programming exercise (Pascal's Triangle in C) at one university, yet makes claims about AI-Tutoring in 'Software Engineering Education' broadly and suggests applicability to MOOCs and other LLMs."
    334     },
    335     {
    336       "flag": "Conflict of interest not acknowledged",
    337       "detail": "Co-authors Bassner and Krusche are from TU Munich and are involved in developing the Artemis platform being evaluated. Krusche is the original creator of Artemis. No competing interests statement is present."
    338     },
    339     {
    340       "flag": "No ethics approval reported",
    341       "detail": "The study collected interaction data and survey responses from students (a vulnerable population in an instructor-student power dynamic) without mentioning IRB or ethics board approval."
    342     },
    343     {
    344       "flag": "Selection bias in qualitative analysis",
    345       "detail": "The qualitative analysis excludes 11 of 23 students, focusing only on the 12 who actively used both the AI-Tutor and APAS. This selection bias means the analysis only captures experiences of students who chose to engage, not the broader population."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "How ChatGPT Will Change Software Engineering Education",
    351       "authors": ["Marian Daun", "Jennifer Brings"],
    352       "year": 2023,
    353       "relevance": "Directly addresses the impact of ChatGPT on software engineering education, relevant to understanding how LLMs are changing programming pedagogy."
    354     },
    355     {
    356       "title": "ChatGPT for good? On opportunities and challenges of large language models for education",
    357       "authors": ["Enkelejda Kasneci", "Kathrin Sessler", "Stefan Küchemann"],
    358       "year": 2023,
    359       "doi": "10.1016/j.lindif.2023.102274",
    360       "relevance": "Discusses opportunities and challenges of LLMs in education, relevant to understanding the broader context of AI-assisted learning."
    361     },
    362     {
    363       "title": "An analysis of the automatic bug fixing performance of chatgpt",
    364       "authors": ["Dominik Sobania", "Martin Briesch", "Carol Hanna", "Justyna Petke"],
    365       "year": 2023,
    366       "arxiv_id": "2301.08653",
    367       "relevance": "Evaluates ChatGPT's bug-fixing capability on QuixBugs benchmark, relevant to assessing LLM capability in code repair."
    368     },
    369     {
    370       "title": "Is ChatGPT the Ultimate Programming Assistant–How far is it?",
    371       "authors": ["Haoye Tian", "Weiqi Lu", "Tsz On Li"],
    372       "year": 2023,
    373       "arxiv_id": "2304.11938",
    374       "relevance": "Evaluates ChatGPT as a programming assistant for code generation, repair, and summarization, directly relevant to LLM programming capability assessment."
    375     },
    376     {
    377       "title": "ChatGPT, Can You Generate Solutions for my Coding Exercises? An Evaluation on its Effectiveness in an undergraduate Java Programming Course",
    378       "authors": ["Eng Lieh Ouh", "Benjamin Kok Siew Gan", "Kyong Jin Shim", "Swavek Wlodkowski"],
    379       "year": 2023,
    380       "arxiv_id": "2305.13680",
    381       "relevance": "Empirical analysis of ChatGPT's potential as a programming assistant in an educational context."
    382     },
    383     {
    384       "title": "Jailbreaking chatgpt via prompt engineering: An empirical study",
    385       "authors": ["Yi Liu", "Gelei Deng", "Zhengzi Xu"],
    386       "year": 2023,
    387       "arxiv_id": "2305.13860",
    388       "relevance": "Studies prompt engineering attacks on ChatGPT, relevant to AI safety and understanding vulnerabilities in LLM-based educational tools."
    389     },
    390     {
    391       "title": "Learning gain differences between ChatGPT and human tutor generated algebra hints",
    392       "authors": ["Zachary A Pardos", "Shreya Bhandari"],
    393       "year": 2023,
    394       "arxiv_id": "2302.06871",
    395       "relevance": "Compares ChatGPT-generated vs human tutor hints for learning gain, directly relevant to evaluating LLM tutoring effectiveness."
    396     },
    397     {
    398       "title": "Chatgpt and software testing education: Promises & perils",
    399       "authors": ["Sajed Jalil", "Suzzana Rafi", "Thomas D LaToza", "Kevin Moran", "Wing Lam"],
    400       "year": 2023,
    401       "relevance": "Evaluates ChatGPT's performance in software testing education, relevant to understanding LLM capability in SE domains."
    402     },
    403     {
    404       "title": "What is the impact of ChatGPT on education? A rapid review of the literature",
    405       "authors": ["Chung Kwan Lo"],
    406       "year": 2023,
    407       "relevance": "Literature review of ChatGPT's educational impact, relevant as a survey of LLM applications in education."
    408     }
    409   ]
    410 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs