scan.json (30168B)
1 { 2 "paper": { 3 "title": "A GPT-based Code Review System for Programming Language Learning", 4 "authors": ["Lee Dong-Kyu"], 5 "year": 2024, 6 "venue": "arXiv.org", 7 "arxiv_id": "2407.04722", 8 "doi": "10.48550/arXiv.2407.04722" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "case-study", "qualitative"], 13 "key_findings": "The paper presents a GPT-4-based code review system for programming education that catches error types (hard coding at 21.3%, unnecessary code at 17.59%) missed by traditional online judge systems. The improved system reduces response time by 12-58% and API costs by up to 8.53% compared to the initial version through prompt optimization and a code validation module. A survey of 6 software education experts rated code review quality positively (mostly 4-5 on 5-point Likert), but the system was never tested with its target audience of primary and secondary school students.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper provides a deployed URL (https://www.codetutor119judge.com/) but no source code repository. A live web app is not source code release." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The 93-item test dataset collected from Company C's Online Judge System is described but not publicly released. No download link or repository is provided." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions Next.js, Monaco Editor, and Azure Static Web App but provides no dependency versions, requirements.txt, or environment specification sufficient to recreate the setup." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions are provided. 
The paper describes the system architecture but does not provide steps to replicate the experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results (failure rates in Table 3, response times in Table 4, costs in Table 5, survey in Table 6) are reported as point estimates with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims the improved system is better across all metrics (correctness, response time, cost) but performs no statistical significance tests. All comparisons are raw number differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Response time reductions are reported with baseline context (e.g., 12-58% reduction with both initial and improved values in Table 4). Cost reduction reported as 8.53% with per-call costs for both systems. Error type failure rates show absolute percentages." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is given for the sample sizes: 108 test codes for correctness evaluation, 92 error codes for response time, 6 evaluators for quality survey. No power analysis or acknowledgment that these may be too small." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Response times appear to be single measurements per exercise, not averaged across runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The improved system is compared against two baselines: the existing online judge system (for error detection, RQ1) and the initial version of the system (for response time and cost, RQ2-3)." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The baseline is a traditional online judge system using test-case matching. No comparison to other LLM-based code review or feedback systems (e.g., CodeAid, which is cited in the paper) is included." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "Multiple changes were made between the initial and improved system (code validation module added, maxTokens optimized, temperature/topP adjusted, prompt restructured). No ablation isolates the contribution of individual changes." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Four distinct evaluation dimensions are used: error type detection rates (RQ1), response time (RQ2), API cost (RQ3), and expert-rated quality on 5 criteria (RQ4)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Six software education experts evaluated code review quality using a 5-point Likert scale across 5 criteria (precision, usefulness, specificity, supportive tone, learning effect). Results in Table 6." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The prompts were developed and iteratively improved using data from the same online judge system. It is unclear whether the 108 evaluation test codes overlap with the 93 development test data. No explicit dev/test separation is described." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Error detection is broken down by 4 error types (unnecessary code, requirement not met, hard coding, computation error) in Table 3. Survey results are broken down by 5 quality criteria in Table 6." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No failure cases or error analysis is discussed. 
The paper reports aggregate success rates but does not examine specific cases where the system failed or produced poor reviews." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "Every experiment shows improvement across all metrics. No negative results, failed approaches, or configurations that hurt performance are reported." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims about identifying error types, shortening response times, lowering API costs, and maintaining quality are each addressed by RQ1-4 in the results sections with corresponding tables and figures." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper implicitly claims that prompt optimization and the new code validation module caused the improvements. However, multiple changes were made simultaneously (prompt restructuring, maxTokens optimization, temperature/topP adjustment, validation module) without controlled single-variable manipulation." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims applicability to 'Programming Language Learning' broadly. The abstract says 'suitability for teaching programming to primary and secondary school students.' However, the system was only tested with adult experts, only on Python, and only on exercises from one online judge system." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are considered for the results. For instance, response time improvements could be due to API-side changes, and error detection improvements could reflect GPT-4's existing knowledge of the exercises rather than the prompt design." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures expert Likert ratings of code review quality and claims the system is suitable for student programming education. The gap between expert ratings (proxy) and actual student learning outcomes (claimed outcome) is never acknowledged." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper says 'GPT-4' throughout without specifying a version (e.g., gpt-4-0613) or snapshot date. Model behavior changes across versions." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Figures 3 and 5 show the module structure with role-setting, RNP, and RCG prompt components, but these are templates with placeholders ('Exercise', 'Submitted Code', 'Solution'). The actual prompt text appears to be in Korean in figures, and the paper describes prompt content in natural language rather than providing verbatim reproducible prompts." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper states 'the maxTokens value was optimized' and 'the values of the temperature and topP parameters were adjusted' but never reports the actual values used for any of these parameters." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "The system is a sequential pipeline (code validation → correctness check → code review) using single GPT-4 API calls, not an agentic scaffold with tool use, loops, or autonomous decision-making." 
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section III.A documents the data collection pipeline: raw data from Company C's Online Judge System starting 2021, static-based filtering to remove repeated submissions and comments, resulting in 93 test data for 27 questions." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. Section V ('Conclusion and Further Study') contains one sentence about future work ('further improve usability, introduce a membership management system, and verify the effectiveness with primary and secondary school students') but no substantive discussion of limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper. Issues such as the tiny evaluator sample, lack of student participants, prompt-data overlap, and GPT-4 version sensitivity are not addressed." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do not show. It mentions future work (testing with students) but does not bound its claims by stating what was not tested or what populations/settings are excluded." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Neither the test dataset nor the survey responses are made publicly available. No supplementary data files or download links are provided." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section III.A describes data collection: exercises, student-submitted codes, and instructor answer codes from Company C's Online Judge System starting in 2021, with static-based filtering to remove repeated submissions and delete comments." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "The paper states participants were 'software education experts' with 'at least two years of programming education experience' but does not describe how they were recruited (convenience sampling, institutional affiliation, solicitation method)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The initial data pipeline (raw collection → static filtering → 93 test data) is described in Section III.A. However, the evaluation uses 108 test codes and 92 error codes with no explanation of how these differ from the 93 development test data or how they were derived." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The author's affiliation is listed: Department of Electrical Engineering and Computer Science, University of Hanyang, Seoul, Republic of Korea." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The system uses Company C's Online Judge System data, but the relationship between the author and Company C is not clarified." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses GPT-4 to evaluate code correctness and generate reviews but never states GPT-4's training data cutoff date. The online judge exercises could be in GPT-4's training data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the programming exercises from Company C's Online Judge System could appear in GPT-4's training data. This is a significant concern since GPT-4 is used as the code correctness checker." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The online judge exercises have been available since 2021 and could have been in GPT-4's training data. The paper does not address this contamination risk despite using GPT-4 to check code correctness against these same exercises." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": true, 246 "answer": false, 247 "justification": "No pre-registration is mentioned. The study involves human participants (3 usability testers, 6 survey evaluators) but no pre-registration link is provided." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": true, 251 "answer": false, 252 "justification": "No IRB or ethics board approval is mentioned despite involving human participants in usability testing and survey evaluation." 253 }, 254 "demographics_reported": { 255 "applies": true, 256 "answer": false, 257 "justification": "Participants are described as 'software education experts' with 'at least two years of programming education experience.' 
No other demographics (gender, age, geographic distribution, specific programming expertise) are reported." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": true, 261 "answer": true, 262 "justification": "An inclusion criterion is stated: participants were 'current instructors with more than 2 years of programming education experience.' The survey also specifies participant roles (3 prior participants, 2 additional instructors, 1 coding content developer)." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "This is not an experimental study with treatment/control conditions. It is a usability evaluation and survey, so randomization is not applicable." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "This is a usability evaluation and survey of a single system, not a comparative experimental study where blinding would be applicable." 273 }, 274 "attrition_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "No attrition information is reported. All 6 survey participants appear to have completed the evaluation, but this is not explicitly stated." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "RQ3 specifically measures API call costs. Table 5 reports input/output tokens and per-call costs in USD for both the initial and improved systems, showing up to 8.53% cost reduction." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Per-call costs are reported but no total computational budget (total API spend across all experiments, total number of API calls, Azure hosting costs) is stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No multiple runs or seed variation is reported. 
GPT-4 outputs can vary across calls (especially with non-zero temperature), but results appear to be from single runs." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state how many times each experiment was run. Response times and code correctness results appear to be single measurements." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The paper states temperature, topP, and maxTokens were 'optimized' and 'adjusted' but reports neither the final values nor the search budget (how many configurations were tried)." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper reports five iterations of prompt improvement but does not describe the selection criterion for the final configuration or whether selection was done on held-out data." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The author evaluates their own system without acknowledging self-evaluation bias. The expert evaluators may have known the system's goals, and no independent evaluation was conducted." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "The system is a single GPT-4 API-based tool. Compute budget is the subject of RQ3 (cost reduction), not a confound between compared methods." 
327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the 108 test codes from one online judge system are representative of the errors primary/secondary students would actually make, or whether the 5-point Likert expert survey measures what makes a code review effective for young learners." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The system itself is the product being evaluated, not a model comparison confounded by scaffolding differences." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The online judge exercises have been available since 2021. GPT-4's training data likely includes similar programming exercises. This temporal overlap is not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The code correctness module provides GPT-4 with the instructor's solution code alongside the student submission. This gives the model the answer key, but the paper does not discuss how this affects the evaluation of GPT-4's code review ability." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The 108 test codes come from the same 27 exercises, meaning multiple test cases per exercise share the same problem structure. Independence of test samples is not discussed." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or contamination analysis is performed." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "The improved system identifies error types (hard coding, unnecessary code, requirement not met, computation error) more effectively than the existing online judge system.", 365 "evidence": "Table 3 and Figure 11 show failure rates by error type: hard coding 21.3%, unnecessary code 17.59%, requirement not met 11.11%, computation error 6.48%. The existing judge system passed these codes; the improved system caught them.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "The improved system reduces response time for code review comments by 12-58% compared to the initial system.", 370 "evidence": "Table 4 and Figure 12 show response time comparisons across 27 exercises between the initial and improved systems, with consistent reductions.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "The improved system reduces API call costs by up to 8.53% compared to the initial system.", 375 "evidence": "Table 5 and Figure 13 show average token counts and per-call costs in USD for both systems, with the improved system using fewer tokens through prompt optimization.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The system maintains high-quality code reviews despite reduced response times and API costs.", 380 "evidence": "Table 6 shows Likert-scale survey results from 6 experts across 5 criteria (precision, usefulness, specificity, supportive tone, learning effect), with mostly positive ratings.", 381 "supported": "weak" 382 }, 383 { 384 "claim": "The system is suitable for teaching programming to primary and secondary school students.", 385 "evidence": "Expert survey feedback states suitability, but no testing with actual students was conducted. 
Only adult software education experts evaluated the system.", 386 "supported": "weak" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No testing with target audience", 392 "detail": "The system is designed for primary and secondary school students but was evaluated exclusively by adult software education experts. The paper's central claim of suitability for young learners has no direct evidence from the target population." 393 }, 394 { 395 "flag": "Tiny evaluator sample", 396 "detail": "Only 3 experts participated in the usability test and 6 in the quality survey. This is too small to draw reliable conclusions about system quality, and no statistical significance can be established." 397 }, 398 { 399 "flag": "GPT-4 used as code correctness checker without validation", 400 "detail": "Using GPT-4 to determine if code is correct (instead of actually compiling and running it) is unreliable. The paper uses GPT-4 to 'function as the compiler' (Figure 5) without discussing the error rate of this approach or comparing it to actual compilation." 401 }, 402 { 403 "flag": "Potential dev/test overlap", 404 "detail": "The same online judge system exercises were used for both prompt development (93 test data) and evaluation (108 test codes). No clear separation between development and test data is documented." 405 }, 406 { 407 "flag": "No statistical tests on any claims", 408 "detail": "All claims of improvement (error detection, response time, cost) are based on comparing raw numbers without significance tests, despite variability in GPT-4 outputs being well-known." 409 }, 410 { 411 "flag": "Hyperparameter values withheld", 412 "detail": "Temperature, topP, and maxTokens were 'optimized' but the actual values are never reported, making the experiments unreproducible." 413 }, 414 { 415 "flag": "Contamination risk unaddressed", 416 "detail": "GPT-4 may have seen similar or identical programming exercises in its training data. 
The paper uses GPT-4 as the code correctness checker, so contamination directly affects the core evaluation. The paper never examines this risk for its own evaluation, even though its related work (Section II.D on data leakage) acknowledges the problem." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "CodeAid: Evaluating a Classroom Deployment of an LLM-based Programming Assistant that Balances Student and Educator Needs", 422 "authors": ["Majeed Kazemitabaar", "Runlong Yang", "Xiaoning Wang", "Austin Zachary Henley", "Paul Denny", "Michelle Craig", "Tovi Grossman"], 423 "year": 2024, 424 "doi": "10.1145/3613904.3642773", 425 "relevance": "Directly relevant as an LLM-based programming assistant deployment study that balances learning with AI assistance, addressing the same anti-cheating concern." 426 }, 427 { 428 "title": "Natural Language Generation and Understanding of Big Code for AI-Assisted Programming: A Review", 429 "authors": ["Man-Fai Wong", "Shangxin Guo", "Ching-Nam Hang", "Siu-Wai Ho", "Chee-Wei Tan"], 430 "year": 2023, 431 "doi": "10.3390/e25060888", 432 "relevance": "Survey of LLM capabilities for AI-assisted programming, covering code generation and understanding relevant to automated code review." 433 }, 434 { 435 "title": "Can Large Language Models Provide Feedback to Students? A Case Study on ChatGPT", 436 "authors": ["Wei Dai", "Flora Jin", "Tongguang Li", "Yi-Shan Tsai"], 437 "year": 2023, 438 "relevance": "Evaluates ChatGPT's ability to generate educational feedback on student assignments, directly relevant to LLM-based feedback quality." 439 }, 440 { 441 "title": "Automated feedback generation for introductory programming assignments", 442 "authors": ["Rishabh Singh", "Sumit Gulwani", "Armando Solar-Lezama"], 443 "year": 2013, 444 "doi": "10.1145/2491956.2462195", 445 "relevance": "Foundational work on automated programming feedback that proposed minimal-correction feedback, an alternative to LLM-based approaches." 
446 }, 447 { 448 "title": "LLaMA-Reviewer: Advancing Code Review Automation with Large Language Models through Parameter-Efficient Fine-Tuning", 449 "authors": ["Junyi Lu", "Lei Yu", "Xiaojia Li", "Li Yang", "Chun Zuo"], 450 "year": 2023, 451 "relevance": "Proposes an LLM-based code review automation pipeline with systematic prompt templates, directly relevant to automated code review methodology." 452 }, 453 { 454 "title": "A Critical Review of Large Language Model on Software Engineering: An Example from ChatGPT and Automated Program Repair", 455 "authors": ["Quanjun Zhang"], 456 "year": 2023, 457 "relevance": "Evaluates ChatGPT's automated program repair capability on a contamination-controlled dataset, relevant to LLM code fixing and data leakage concerns." 458 }, 459 { 460 "title": "From 'Ban It Till We Understand It' to 'Resistance is Futile': How University Programming Instructors Plan to Adapt as More Students Use AI Code Generation and Explanation Tools such as ChatGPT and GitHub Copilot", 461 "authors": ["Sam Lau", "Philip J. Guo"], 462 "year": 2023, 463 "relevance": "Studies instructor attitudes toward LLM tools in programming education, including concerns about AI-assist cheating." 464 }, 465 { 466 "title": "Designing LLM-based Code Reviewing Learning Environment for Programming Education", 467 "authors": ["Seong-yune Choi", "Donghee Lee", "Jungho Kim", "Youngkwang Jang", "Hyungshin Kim"], 468 "year": 2023, 469 "doi": "10.32431/kace.2023.26.5.001", 470 "relevance": "Proposes LLM-based code review for programming education with verification criteria used as reference for the quality survey in this paper." 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "A deployed web tool for programming education with a working URL, but limited to one online judge's exercises and Korean-language primary/secondary students." 
477 }, 478 "surprise_contrarian": { 479 "score": 0, 480 "justification": "Confirms expected results that GPT-4 can generate code reviews and reduce instructor workload; no surprising or contrarian findings." 481 }, 482 "fear_safety": { 483 "score": 0, 484 "justification": "No AI safety or security concerns raised; the paper addresses AI-assist cheating as a design constraint, not as a demonstrated risk." 485 }, 486 "drama_conflict": { 487 "score": 0, 488 "justification": "No controversy, no challenge to existing work, and no debate-worthy findings." 489 }, 490 "demo_ability": { 491 "score": 2, 492 "justification": "A live web application at codetutor119judge.com is mentioned, though it targets Korean-language exercises and no source code is available." 493 }, 494 "brand_recognition": { 495 "score": 1, 496 "justification": "Uses GPT-4 (recognized brand) but the author and institution are not widely known in the AI/SE research community." 497 } 498 } 499 }