scan.json (18683B)
1 { 2 "paper": { 3 "title": "A Comparative Study of AI and Human Evaluation for Student Website Projects", 4 "authors": ["Lidia Feklistova", "Artur Kašnikov"], 5 "year": 2025, 6 "venue": "5th International Conference on AI Research (ICAIR 2025)" 7 }, 8 "checklist": { 9 "artifacts": { 10 "code_released": { 11 "applies": true, 12 "answer": false, 13 "justification": "No source code or repository link is provided in the paper." 14 }, 15 "data_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "The evaluation data (scores from AI tools and HIs) is not released or made available." 19 }, 20 "environment_specified": { 21 "applies": true, 22 "answer": false, 23 "justification": "No environment or dependency specifications are provided. The paper mentions IBM SPSS Statistics 30 but no reproducible environment setup." 24 }, 25 "reproduction_instructions": { 26 "applies": true, 27 "answer": false, 28 "justification": "No step-by-step reproduction instructions are provided." 29 } 30 }, 31 "statistical_methodology": { 32 "confidence_intervals_or_error_bars": { 33 "applies": true, 34 "answer": false, 35 "justification": "No confidence intervals or error bars are reported. Tables show means, SDs, min-max, and medians but no CIs." 36 }, 37 "significance_tests": { 38 "applies": true, 39 "answer": true, 40 "justification": "Wilcoxon signed-rank tests are used with z-values and p-values reported in Table 2. Spearman correlations with significance levels are reported in Table 3." 41 }, 42 "effect_sizes_reported": { 43 "applies": true, 44 "answer": false, 45 "justification": "No standardized effect sizes for the score differences (e.g., Cohen's d, or r derived from the Wilcoxon z) are reported. Only raw mean differences are shown, alongside the Spearman correlation coefficients in Table 3." 46 }, 47 "sample_size_justified": { 48 "applies": true, 49 "answer": false, 50 "justification": "The sample size of 9 projects is not justified. No power analysis is discussed.
The limitations section acknowledges 'a small sample size' but provides no justification for why 9 was used." 51 }, 52 "variance_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Standard deviations are reported in Table 2 for all evaluations." 56 } 57 }, 58 "evaluation_design": { 59 "baselines_included": { 60 "applies": true, 61 "answer": true, 62 "justification": "Human instructor evaluations serve as the baseline against which AI tool evaluations are compared." 63 }, 64 "baselines_contemporary": { 65 "applies": true, 66 "answer": true, 67 "justification": "The AI tools tested are contemporary models: ChatGPT-4o, ChatGPT-o1, ChatGPT-4.5, Gemini 2.0 Flash/Pro, Claude 3.7 Sonnet, Mistral." 68 }, 69 "ablation_study": { 70 "applies": false, 71 "answer": false, 72 "justification": "The study compares AI tools vs. human evaluators; there is no multi-component system to ablate." 73 }, 74 "multiple_metrics": { 75 "applies": true, 76 "answer": true, 77 "justification": "Multiple evaluation criteria are used: colour, contrast, typography, grouping, usability, readability, maintainability. Both Wilcoxon and Spearman tests are applied." 78 }, 79 "human_evaluation": { 80 "applies": true, 81 "answer": true, 82 "justification": "Human instructor evaluation is central to the study. Two HIs independently evaluated all 9 projects using the same rubric (Section 2.2)." 83 }, 84 "held_out_test_set": { 85 "applies": false, 86 "answer": false, 87 "justification": "This is not a machine learning study with train/test splits." 88 }, 89 "per_category_breakdown": { 90 "applies": true, 91 "answer": true, 92 "justification": "Results are broken down per criterion (colour, contrast, typography, grouping, usability, readability, maintainability) in Tables 2 and 3." 
93 }, 94 "failure_cases_discussed": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper discusses where AI tools diverge from human evaluators, e.g., ChatGPT-4.5 consistently overrating, Claude 3.7 Sonnet underrating typography, and weak correlations for many criteria." 98 }, 99 "negative_results_reported": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper reports that many correlations were weak and not statistically significant (Table 3), and that DeepSeek and Copilot could not evaluate visual content." 103 } 104 }, 105 "claims_and_evidence": { 106 "abstract_claims_supported": { 107 "applies": true, 108 "answer": true, 109 "justification": "Abstract claims about no significant differences in many criteria (supported by Table 2) and low correlation in many cases (supported by Table 3) are consistent with the results." 110 }, 111 "causal_claims_justified": { 112 "applies": true, 113 "answer": false, 114 "justification": "The paper makes causal-like claims such as 'ChatGPT-4.5 might have missed some usability nuances... because, according to its self-description, it evaluates usability indirectly' (Section 4). These causal attributions are speculative and not supported by the study design." 115 }, 116 "generalization_bounded": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper limits claims to the specific context: 9 student website projects, specific AI tools as of April 2025, and acknowledges generalization limitations in Section 5." 120 }, 121 "alternative_explanations_discussed": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper discusses the probabilistic nature of LLMs as an alternative explanation for inconsistencies, training data biases for aesthetic differences, and references prior work on aesthetic judgment divergence (Section 4)." 
125 } 126 }, 127 "setup_transparency": { 128 "model_versions_specified": { 129 "applies": true, 130 "answer": false, 131 "justification": "Models are listed by marketing names (ChatGPT-4o, ChatGPT-o1, ChatGPT-4.5, Gemini 2.0 Flash, Gemini 2.0 Pro, Claude 3.7 Sonnet, Mistral) without snapshot dates or API versions." 132 }, 133 "prompts_provided": { 134 "applies": true, 135 "answer": true, 136 "justification": "Full evaluation prompts are provided in Section 2.2, both for UI/UX design evaluation and code quality evaluation." 137 }, 138 "hyperparameters_reported": { 139 "applies": true, 140 "answer": false, 141 "justification": "No hyperparameters (temperature, top-p, etc.) are reported for any of the AI tools used." 142 }, 143 "scaffolding_described": { 144 "applies": false, 145 "answer": false, 146 "justification": "No agentic scaffolding is used. AI tools are queried directly with prompts." 147 }, 148 "data_preprocessing_documented": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 2.2 describes that screenshots were submitted for UI/UX and full source code for code quality. Section 2.3 documents the statistical analysis pipeline. The consent process filtered 21 to 18 students (9 projects)." 152 } 153 }, 154 "limitations_and_scope": { 155 "limitations_section_present": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 5 (Conclusions) contains a paragraph discussing limitations: 'The current study is limited by a small sample size and the subjectivity of HIs' evaluations.'" 159 }, 160 "threats_to_validity_specific": { 161 "applies": true, 162 "answer": true, 163 "justification": "Specific threats are mentioned: small sample size, subjectivity of HI evaluations, AI tools' inability to access live website prototypes or fully assess complex visuals." 164 }, 165 "scope_boundaries_stated": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper does not explicitly state what the results do NOT show. 
Limitations are mentioned but specific untested conditions or populations are not enumerated." 169 } 170 }, 171 "data_integrity": { 172 "raw_data_available": { 173 "applies": true, 174 "answer": false, 175 "justification": "Raw evaluation scores are not made available for independent verification." 176 }, 177 "data_collection_described": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 2.2 describes data collection: AI tools evaluated on the same day (April 7, 2025), using the same prompts, with screenshots for UI/UX and source code for code quality." 181 }, 182 "recruitment_methods_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 2.2 describes participants: 21 students in the course, 18 consented, demographics reported (gender, degree level, department, prior experience)." 186 }, 187 "data_pipeline_documented": { 188 "applies": true, 189 "answer": true, 190 "justification": "The pipeline from student consent (21→18 students, 9 projects) through AI tool selection, evaluation, and statistical analysis is documented across Sections 2.1-2.3." 191 } 192 }, 193 "conflicts_of_interest": { 194 "funding_disclosed": { 195 "applies": true, 196 "answer": false, 197 "justification": "No funding information or acknowledgments section mentioning grants or sponsors is present." 198 }, 199 "affiliations_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "Authors are affiliated with the Institute of Computer Science, University of Tartu, Estonia. They are not affiliated with any of the AI tool providers." 203 }, 204 "funder_independent_of_outcome": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is treated as non-disclosure." 
208 }, 209 "financial_interests_declared": { 210 "applies": true, 211 "answer": false, 212 "justification": "No competing interests or financial interests statement is included in the paper." 213 } 214 }, 215 "contamination": { 216 "training_cutoff_stated": { 217 "applies": false, 218 "answer": false, 219 "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It tests AI tools' ability to evaluate student projects, not model knowledge." 220 }, 221 "train_test_overlap_discussed": { 222 "applies": false, 223 "answer": false, 224 "justification": "Not applicable. The study evaluates AI tools' rating behavior, not their performance on a knowledge benchmark." 225 }, 226 "benchmark_contamination_addressed": { 227 "applies": false, 228 "answer": false, 229 "justification": "Not applicable. Student website projects are novel assignments, not public benchmarks." 230 } 231 }, 232 "human_studies": { 233 "pre_registered": { 234 "applies": true, 235 "answer": false, 236 "justification": "No mention of pre-registration." 237 }, 238 "irb_or_ethics_approval": { 239 "applies": true, 240 "answer": false, 241 "justification": "The ethics statement mentions informed consent was obtained but does not mention IRB or ethics board approval." 242 }, 243 "demographics_reported": { 244 "applies": true, 245 "answer": true, 246 "justification": "Section 2.2 reports gender (50/50), degree level (28% master's, rest bachelor's), department (83% CS), and prior web development experience." 247 }, 248 "inclusion_exclusion_criteria": { 249 "applies": true, 250 "answer": true, 251 "justification": "Inclusion criteria: students enrolled in the course who provided consent (21 enrolled, 18 consented). Section 2.1-2.2." 252 }, 253 "randomization_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This is not an experimental study with treatment/control conditions requiring randomization. All projects were evaluated by all tools." 
257 }, 258 "blinding_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "This is not an experimental study where blinding would be applicable. AI tools and HIs independently evaluated the same projects." 262 }, 263 "attrition_reported": { 264 "applies": true, 265 "answer": true, 266 "justification": "Attrition is reported: 21 students participated, only 18 consented (Section 2.2). DeepSeek and Copilot were excluded from further analysis (Section 3.1)." 267 } 268 }, 269 "cost_and_practicality": { 270 "inference_cost_reported": { 271 "applies": true, 272 "answer": false, 273 "justification": "No API costs, token counts, or inference time are reported for any of the 7 AI tools used." 274 }, 275 "compute_budget_stated": { 276 "applies": true, 277 "answer": false, 278 "justification": "No computational budget is stated." 279 } 280 } 281 }, 282 "claims": [ 283 { 284 "claim": "Wilcoxon signed-rank test revealed no statistically significant differences in many evaluation criteria between AI tools and HIs.", 285 "evidence": "Table 2 shows most p-values > 0.05 across criteria and tools, with significant differences only in specific criteria for specific tools (Section 3.2).", 286 "supported": "strong" 287 }, 288 { 289 "claim": "Spearman correlation analysis revealed low consistency in how AI tools and HI evaluated specific aspects of the projects.", 290 "evidence": "Table 3 shows many weak, non-significant correlations. Strong significant correlations were confined to a few tool-criterion pairs, most notably ChatGPT-4o for colour (.902), grouping (.833), and usability (.819) (Section 3.3).", 291 "supported": "strong" 292 }, 293 { 294 "claim": "ChatGPT-4.5 and ChatGPT-4o delivered particularly promising outcomes.", 295 "evidence": "ChatGPT-4o showed strong correlations with HIs for 3 criteria; ChatGPT-4.5 showed strong correlation for typography (.709) and usability (.804).
However, ChatGPT-4.5 also showed the largest number of statistically significant differences from HIs (Tables 2 and 3).", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "AI tools should be treated as supportive assistants rather than autonomous evaluators.", 300 "evidence": "Based on weak correlations in many criteria (Table 3) and some significant scoring differences (Table 2). This is a reasonable interpretation but is a normative claim rather than strictly empirical.", 301 "supported": "moderate" 302 } 303 ], 304 "methodology_tags": ["observational", "case-study"], 305 "key_findings": "The study compared 7 AI tools against human instructors on evaluating 9 student website projects across UI/UX design and code quality criteria. While Wilcoxon signed-rank tests showed few statistically significant differences in overall scoring, Spearman correlations revealed that agreement on individual project rankings was often weak and inconsistent. ChatGPT-4o showed the strongest alignment with human evaluators. The authors conclude AI tools are better suited as assistive tools than as standalone evaluators for subjective assessment tasks.", 306 "red_flags": [ 307 { 308 "flag": "Very small sample size", 309 "detail": "Only 9 student website projects were evaluated. With N=9, statistical power is extremely limited and results are unreliable for generalizable conclusions. The non-significant Wilcoxon results may simply reflect insufficient power rather than true agreement." 310 }, 311 { 312 "flag": "Single human baseline without inter-rater reliability", 313 "detail": "Two HIs evaluated by consensus, producing a single set of scores. No inter-rater reliability (e.g., Cohen's kappa) between the two HIs is reported, so the 'ground truth' baseline is not validated." 314 }, 315 { 316 "flag": "AI tool selection by personal interest", 317 "detail": "Section 2.2 states 'the authors selected nine different AI tools based on their personal interests.' No systematic selection criteria are provided."
318 }, 319 { 320 "flag": "No hyperparameters reported", 321 "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 7 AI tools, making results non-reproducible since these settings significantly affect output." 322 }, 323 { 324 "flag": "AI self-description used as evidence of capability", 325 "detail": "RQ1 is answered by asking AI tools to describe their own capabilities (Table 1). Self-description is unreliable evidence of actual capability, as LLMs routinely overstate or misrepresent their abilities." 326 } 327 ], 328 "cited_papers": [ 329 { 330 "title": "AICodeReview: Advancing code quality with AI-enhanced reviews", 331 "authors": ["Y. Almeida", "D. Albuquerque", "E.D. Filho"], 332 "year": 2024, 333 "doi": "10.1016/j.softx.2024.101677", 334 "relevance": "Directly relevant to AI-assisted code quality assessment." 335 }, 336 { 337 "title": "JorGPT: Instructor-Aided Grading of Programming Assignments with Large Language Models (LLMs)", 338 "authors": ["J. Cisneros-González", "N. Gordo-Herrera", "I. Barcia-Santos", "J. Sánchez-Soriano"], 339 "year": 2025, 340 "doi": "10.3390/fi17060265", 341 "relevance": "Studies correlation between ChatGPT-4o and human grading of programming tasks." 342 }, 343 { 344 "title": "Large Language Model-Powered Automated Assessment: A Systematic Review", 345 "authors": ["E. Emirtekin"], 346 "year": 2025, 347 "doi": "10.3390/app15105683", 348 "relevance": "Systematic review of LLM-powered automated assessment, directly relevant to survey scope." 349 }, 350 { 351 "title": "Applying Large Language Model to User Experience Testing", 352 "authors": ["N-L. Hsueh", "H-J. Lin", "L-C. Lai"], 353 "year": 2024, 354 "doi": "10.3390/electronics13234633", 355 "relevance": "Studies LLM application in UX evaluation, related to AI-assisted software quality assessment." 356 }, 357 { 358 "title": "AI-based Online Code Quality Assessment System", 359 "authors": ["S. Yi", "Y. Yu", "J. 
Wu"], 360 "year": 2024, 361 "doi": "10.1109/CBASE64041.2024.10824380", 362 "relevance": "AI-based code quality assessment system, relevant to automated code evaluation." 363 }, 364 { 365 "title": "Artificial Intelligence in System and Software Engineering for Auto Code Generation", 366 "authors": ["A.S. Ghai", "V. Rawat", "V.K. Gupta", "K.P. Ghai"], 367 "year": 2024, 368 "doi": "10.1109/ICEECT61758.2024.10738945", 369 "relevance": "AI in software engineering for code generation." 370 } 371 ] 372 }