scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24136B)
      1 {
      2   "paper": {
      3     "title": "Using Assignment Incentives to Reduce Student Procrastination and Encourage Code Review Interactions",
      4     "authors": ["Kevin Wang", "Ramon Lawrence"],
      5     "year": 2023,
      6     "venue": "2023 International Conference on Computational Science and Computational Intelligence (CSCI)",
      7     "arxiv_id": "2311.15125",
      8     "doi": "10.1109/CSCI62032.2023.00270"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The authors provide a GitHub link to their assignments and instructions: 'github.com/rlawrenc/cosc_404/tree/404_2023_Jan/labs' (footnote 1, Section II.D). This is the course material, not analysis code, but it is the primary artifact of the intervention."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No dataset is released. The queue system data, survey responses, and grade data are not made publicly available."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications are provided. The paper does not describe any software dependencies, analysis tools, or computational environment used for the data analysis."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided for replicating the analysis. The paper describes the intervention design but not how to reproduce the quantitative analysis."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No confidence intervals or error bars are reported for any of the main results. The paper reports point estimates only (e.g., '45% of assignments completed early', '78% overall exam average')."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper states 'The performance on the final exam for students that had at least one early marked assignment was statistically significant' (Section V.D) but does not report which test was used, the test statistic, or a p-value. This is insufficient — claiming statistical significance without reporting the test or p-value does not satisfy this criterion."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports exam averages with baseline context: 'the final exam average was 78% for all students. Students that had no bonus marked assignment had an average of 71% versus 81% for students with at least 1 bonus marked assignment' (Section V.D). This provides the magnitude of the effect (10 percentage point difference) with baseline context."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No justification is provided for the sample size (107 students, 67 consenting, 44 survey respondents). No power analysis is discussed. The 41% survey response rate is noted but not discussed as a limitation."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No standard deviations, variance, or spread measures are reported for any of the quantitative results (exam scores, help session times, etc.). Only means and percentages are given."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper mentions 'significant student acceptance and behavior change compared to previous course offerings' (Section V.A) but provides no quantitative baseline data from previous offerings. No formal comparison against a control group or historical baseline is presented."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No quantitative baselines are provided, so contemporariness cannot be assessed. The paper references prior course offerings qualitatively but does not present data from them."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The intervention combines bonus marks with synchronous code review, but no ablation separates their individual effects. It is unclear whether the behavior change is due to bonus marks, code review, or their combination."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper reports multiple metrics: early submission rates (45%), survey agreement that the bonus marks motivate earlier completion (92% agree/strongly agree), help session question distribution (Figure 1), questions by day (Figure 2), general questions vs marked assignments (Figure 3), and final exam averages."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The study includes a student survey (N=44) evaluating opinions on the incentive system, and interviews with instructional staff were conducted. These constitute human evaluation of the intervention."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is not a machine learning study with train/test splits. The concept of a held-out test set does not apply to this pedagogical intervention study."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper provides breakdowns by day of early submission (70% Monday, 7% Tuesday, 14% Wednesday, etc.), by question type (Figure 1), by number of marked assignments vs general questions (Figure 3), and by exam performance grouped by bonus assignment count (Figure 4)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section VI discusses practical challenges: 'Assignment incentivization takes help session time that may be required for general assignment help questions' and 'If the burden of grading is too high, wait times become high for all students and the approach would not be practical.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper honestly reports that the correlation between early submission and exam scores may be due to confounding: 'the lower average for students with fewer bonus marked assignments could be attributed to the original ability level and intrinsic motivation of the student' (Section V.D). The paper also notes 'there is no time savings' for grading (Section V.C)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims '45% of assignments completed early and 30% up to 4 days before the deadline' — the results section reports 234/523 (45%) early submissions, 70% of which were on Monday (4 days before the deadline); 70% of 45% is roughly 31% of all submissions, consistent with the 30% figure. The abstract claim of 'no increase in marking time' is supported in Section V.C. All abstract claims are substantiated."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes causal claims such as 'assignment incentivization practice of awarding bonus marks for early submission effectively curbs procrastination' (Section VI), but the study design is observational (single course, no control group, no randomization). The paper acknowledges this partially for exam performance but not for the procrastination claim itself. Confounders like course content differences, student cohort differences, and instructor effects are not controlled for."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper explicitly bounds generalizations: 'The research was conducted in an upper-level course with well-defined assignments that were verified for correctness using unit tests. The results may be generalizable to other courses as long as assignment correctness can be verified quickly' (Section VI). It specifies conditions under which the technique may or may not transfer."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper explicitly discusses the selection bias alternative explanation: 'the lower average for students with fewer bonus marked assignments could be attributed to the original ability level and intrinsic motivation of the student, as more capable and motivated students with higher test scores are also more likely to complete assignments early' (Section V.D)."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "This paper does not use any AI/ML models. It is a pedagogical intervention study and does not involve LLMs or other pre-trained models."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "No prompting is used in this study. It is a pedagogical intervention study, not an AI/LLM study."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No models or algorithms with hyperparameters are used. This is a pedagogical intervention study."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. This is a pedagogical intervention study, not an AI agent study."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper describes data preprocessing: 'Times less than 30 seconds are eliminated as these often occur where no help was given' (Section IV). It also describes which students were included: 'Help session data collected was analyzed for students that used the HelpMe queue system (N=83)' and '67 students (62%) consented to analysis of their grades.'"
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. Some limitations are discussed within the Discussion section (Section VI), but there is no substantive dedicated subsection."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The paper discusses specific threats: the confound between student ability and early submission ('could be attributed to the original ability level and intrinsic motivation'), scalability concerns ('If the burden of grading is too high, wait times become high'), and practical deployment constraints. These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The paper states explicit scope boundaries: 'The research was conducted in an upper-level course with well-defined assignments that were verified for correctness using unit tests' and notes 'A more detailed study would be required to determine if the student learning was positively impacted' (Section V.D)."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "Raw data (queue system logs, survey responses, exam grades) is not made available for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section IV describes data collection: 'Data on student interactions and assignment completion was collected from an online queue system, which provides details regarding instructor-student interactions including wait and help session times, types of questions, time of day when questions were asked, and user information. The second set of quantitative data came from the exam and overall grades of consenting participants.'"
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Participants were students enrolled in the course — this is clearly stated: 'The course had 99 undergraduate students and 8 graduate students' (Section II.D). The survey was provided in the last two weeks of class. The participation pathway is clear (course enrollment), though selection bias in survey response (41% rate) is not discussed."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
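    192         "justification": "The paper documents the analysis subsets drawn from the 107 registered students: 67 consented to grade analysis (62%), 83 used the queue system, and 44 responded to the survey (41% response rate). These are separate subsets of the class rather than successive filtering stages (83 exceeds 67). Filtering criteria for help session times (times under 30 seconds are excluded) are specified."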
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding acknowledgment or disclosure is present in the paper."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: both authors are from the Department of Computer Science, University of British Columbia. One author (Ramon Lawrence) appears to be the course instructor, which is relevant as the evaluator of their own intervention."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding information is disclosed, so independence of funding cannot be assessed. Absence of disclosure is treated as NO."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This paper does not evaluate any pre-trained model on a benchmark. It is a pedagogical intervention study."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This paper does not evaluate any pre-trained model on a benchmark. It is a pedagogical intervention study."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "This paper does not evaluate any pre-trained model on a benchmark. It is a pedagogical intervention study."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No mention of pre-registration. No link to OSF, AsPredicted, or any registry."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "The paper states: 'This research uses both survey and quantitative data collected and processed in accordance with a university approved ethics study' (Section IV)."
    244       },
    245       "demographics_reported": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Some demographics are reported: '99 undergraduate students and 8 graduate students', 'All students are third year or above', 'the majority of students in the course were computer science majors' (Section II.D). Gender, geographic distribution, and programming experience are not reported."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "Inclusion criteria are implicit but clear: all students enrolled in the upper-level database systems course. For grade analysis, '67 students (62%) consented'. For help sessions, 'students that used the HelpMe queue system (N=83)'. Filtering criteria are specified."
    254       },
    255       "randomization_described": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "There is no randomization. All students in the course received the same incentive treatment. There is no control group or random assignment to conditions. This is an observational study of a whole-class intervention."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Blinding is not feasible in this study design. Students necessarily know they are receiving bonus marks for early submission. This is a classroom intervention, not an experiment where blinding would be applicable."
    264       },
    265       "attrition_reported": {
    266         "applies": true,
    267         "answer": true,
    268         "justification": "Attrition information is provided: 107 registered students, 67 consented to grade analysis, 83 used the queue system, 44 responded to the survey (41% response rate). The different N values across analyses are reported."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "This is a pedagogical intervention study, not an AI/ML system. Inference cost does not apply."
    276       },
    277       "compute_budget_stated": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a pedagogical intervention study, not a computational experiment. Compute budget does not apply."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "45% of assignments (234 of 523) were submitted early through help sessions",
    287       "evidence": "Section V.A: '234 of the total 523 assignment submissions (45%) were done in help sessions.' Breakdown by day: 70% Monday, 7% Tuesday, 14% Wednesday, 7% Thursday, 2% Friday.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "92% of surveyed students agree or strongly agree they are motivated to complete assignments earlier",
    292       "evidence": "Table I: 82% strongly agree + 10% agree = 92% for the question about bonus marks motivating earlier completion. Survey N=44 (41% response rate).",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Students with at least one bonus marked assignment had higher final exam scores (81% vs 71%)",
    297       "evidence": "Section V.D: 'Overall, the final exam average was 78% for all students. Students that had no bonus marked assignment had an average of 71% versus 81% for students with at least 1 bonus marked assignment.'",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "The incentive system shifts help session demand from right before deadlines to Monday bonus deadlines",
    302       "evidence": "Section V.B and Figure 2: 'the number of help sessions is small on the Friday (F) due date, and instead significant peaks are visible on the Monday (M), when the 10% bonus mark is provided.'",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Synchronous grading during office hours results in more detailed feedback with no increase in marking time",
    307       "evidence": "Section V.C: Traditional grading takes about 5 minutes; virtual office hours grading takes 3-5 minutes but can extend to 10+ minutes. 'Overall, there is no time savings, but two TAs were able to support a class of over a hundred students while providing personalized, real-time feedback.'",
    308       "supported": "moderate"
    309     }
    310   ],
    311   "methodology_tags": ["observational", "qualitative"],
    312   "key_findings": "An assignment incentive system offering 5-10% bonus marks for early submission combined with synchronous code review shifted student submission behavior, with 45% of assignments completed early and 70% of those on the Monday bonus deadline (4 days before the due date). Survey results (N=44, 41% response rate) showed 92% of respondents agreed the incentives motivated earlier completion. Students with early submissions had higher exam averages (81% vs 71%), though the authors acknowledge this correlation may be driven by pre-existing student ability differences rather than the intervention itself.",
    313   "red_flags": [
    314     {
    315       "flag": "No control group",
    316       "detail": "The study applies the intervention to all students in one course offering with no control group. There is no quantitative comparison to a prior semester or untreated group, making it impossible to attribute behavior changes to the intervention versus other factors (different cohort, different semester, instructor effects)."
    317     },
    318     {
    319       "flag": "Low survey response rate with likely self-selection bias",
    320       "detail": "Only 44 of 107 students (41%) responded to the survey. Students who benefited from or engaged with the incentive system are likely overrepresented among respondents. The 92% agreement figure may not represent the full class."
    321     },
    322     {
    323       "flag": "Statistical significance claimed without reporting the test",
    324       "detail": "Section V.D claims 'The performance on the final exam for students that had at least one early marked assignment was statistically significant' without naming the test, reporting the test statistic, or providing a p-value."
    325     },
    326     {
    327       "flag": "Confounding between ability and treatment",
    328       "detail": "The paper acknowledges that higher-performing students may self-select into early submission. Without randomization or controls for prior ability, the exam performance difference (81% vs 71%) cannot be attributed to the incentive system."
    329     },
    330     {
    331       "flag": "Instructor evaluating own intervention",
    332       "detail": "The authors designed and implemented the incentive system in their own course and then evaluated it. While not inherently disqualifying, this creates potential bias in how results are interpreted and reported."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Does the Early Bird Catch the Worm? Earliness of Students' Work and its Relationship with Course Outcomes",
    338       "authors": ["J. Leinonen", "F. E. V. Castro", "A. Hellas"],
    339       "year": 2021,
    340       "doi": "10.1145/3430665.3456383",
    341       "relevance": "Empirical study of relationship between early assignment start times and course performance in CS education, relevant to understanding AI-assisted completion incentives."
    342     },
    343     {
    344       "title": "Context-Aware and Data-Driven Feedback Generation for Programming Assignments",
    345       "authors": ["D. Song", "W. Lee", "H. Oh"],
    346       "year": 2021,
    347       "doi": "10.1145/3468264.3468598",
    348       "relevance": "AI-driven automated feedback for programming assignments, relevant to understanding automated code review and AI-assisted programming education."
    349     },
    350     {
    351       "title": "Applying Gamification to Motivate Students to Write High-Quality Code in Programming Assignments",
    352       "authors": ["R. Kasahara", "K. Sakamoto", "H. Washizaki", "Y. Fukazawa"],
    353       "year": 2019,
    354       "doi": "10.1145/3304221.3319792",
    355       "relevance": "Gamification approach to improving code quality in assignments, relevant to understanding incentive structures in programming education."
    356     },
    357     {
    358       "title": "Promoting Early Engagement with Programming Assignments Using Scheduled Automated Feedback",
    359       "authors": ["P. Denny", "J. Whalley", "J. Leinonen"],
    360       "year": 2021,
    361       "doi": "10.1145/3441636.3442309",
    362       "relevance": "Automated feedback system for encouraging early engagement with programming, relevant to understanding AI-assisted code review approaches."
    363     },
    364     {
    365       "title": "HelpMe: Student Help Seeking using Office Hours and Email",
    366       "authors": ["K. Wang", "R. Lawrence"],
    367       "year": 2024,
    368       "doi": "10.1145/3626252.3630867",
    369       "relevance": "The online queue system used in this study for tracking student-instructor interactions, relevant to infrastructure for studying AI-assisted programming education."
    370     }
    371   ]
    372 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs