scan.json (29906B)
1 { 2 "paper": { 3 "title": "Repairing Bugs in Python Assignments Using Large Language Models", 4 "authors": [ 5 "Jialu Zhang", 6 "José Cambronero", 7 "Sumit Gulwani", 8 "Vu Le", 9 "Ruzica Piskac", 10 "Gustavo Soares", 11 "Gust Verbruggen" 12 ], 13 "year": 2022, 14 "venue": "arXiv.org", 15 "arxiv_id": "2209.14876", 16 "doi": "10.48550/arXiv.2209.14876" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No repository URL, code archive, or link to MMAPR's source code is provided anywhere in the paper. The paper mentions '600 lines of Python code' but does not release it." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The benchmark is derived from a third-party dataset of student programs from an Indian university (referenced as [28]), but no download link is provided for either the original dataset or their derived benchmark of 286 program pairs." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions 'a mix of Python and open-source software libraries' and 'a Windows VM with an Intel i7 CPU and 32 GB of RAM' but provides no requirements.txt, library versions, or dependency specifications." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A researcher would need to re-implement the system from the paper's description." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Tables report standard deviations of token edit distances across programs within each assignment, but no confidence intervals or error bars are reported for the repair rates themselves. No uncertainty quantification over experimental runs." 
46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims MMAPR outperforms the baseline (86.71% vs 67.13%) but provides no statistical significance tests (no p-values, no t-tests, no bootstrap tests) to support the comparison." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports repair rates with baseline context (86.71% vs 67.13%, 96.50% with few-shots) and token edit distances (31.40 vs 42.50), providing enough context to assess the magnitude of improvements." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "Section VI notes 'The size of the dataset is on par with the state-of-the-art automated program repair techniques' but provides no formal justification or power analysis for 286 programs being sufficient." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Standard deviations in Tables I-IV are across programs within a single run, not across multiple independent experimental runs. With temperature=0.8, results are stochastic, but no multi-run variance is reported." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper compares MMAPR against a composed baseline of BIFI (syntax repair) + Refactory (semantic repair), described in Section V as combining state-of-the-art tools for each repair type." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "BIFI (2021) and Refactory (2019) were recent state-of-the-art tools at the time of this 2022 paper. The paper explicitly describes them as 'state-of-the-art' for their respective repair types." 
78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section V-B presents a thorough ablation study covering program chunking (Table III), iterative querying (Table IV), few-shot learning (Table I), and multimodal prompts (Figure 7), each tested independently." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Two metrics are used throughout: repair rate (percentage of programs fixed) and mean token edit distance (TED) measuring patch size. Both are reported in Tables I and II." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "All evaluation is automated via syntax oracles (Python parser) and semantic oracles (test suites). No human evaluation of repair quality, educational value, or student comprehension is performed." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "For the few-shot learning setting, the paper uses other students' programs as shots but does not explicitly describe a held-out test set or separation protocol to prevent leakage between few-shot examples and evaluation programs." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Tables I, II, III, and IV all provide per-assignment breakdowns (15 problem IDs) showing repair rates and TED for each assignment, not just aggregate numbers." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "The paper discusses where BIFI fails (Section V-A, V-B4) but does not analyze specific cases where MMAPR itself fails. The 13.29% of programs MMAPR could not repair (without few-shots) are not examined." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that Codex Edit model 'did not perform as well' (Section V), that removing program chunking raises mean syntax-phase TED from 5.46 to 9.38, i.e. chunking reduces TED by 41.79% (Table III), and that iterative querying slightly increases TED while improving repair rate (Table IV)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims MMAPR 'can fix more programs and produce smaller patches on average.' Table I confirms 86.71% vs 67.13% repair rate and 31.40 vs 42.50 mean TED, directly supporting both claims." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims like 'by performing iterative querying the repair rate rises from 82.87% to 86.71%' are supported by controlled ablation studies (Section V-B) that isolate each component's contribution through single-variable manipulation." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The title scopes to 'Python Assignments.' Section VI explicitly states: 'We only evaluated MMAPR on Python programs' and 'We carried out our evaluation on one particular set of 286 student programs,' acknowledging the bounded scope." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "Section VI discusses generic threats (test-based validation, single dataset, Python only) but does not consider alternative explanations for the observed improvements, such as whether Codex's training data overlap with the benchmark could explain high repair rates." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper measures repair rate (test suite pass) and token edit distance, and explicitly acknowledges in Section VI that 'Validating program correctness through tests is not as strong as formal verification,' distinguishing the proxy from the ideal outcome." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper says 'OpenAI's Codex' and 'the completion model' (Section V) but does not specify which Codex model version (e.g., code-davinci-002 or code-cushman-001). No API version or snapshot date is given." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Figures 4, 5, and 6 provide concrete prompt examples showing the actual structure and content used for both syntax and semantic phases, including code, error messages, problem descriptions, and test cases. The prompt construction is fully specified in Section IV." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section V states temperature = 0.8, K = 10 candidates per prompt, top 10 candidates selected by average token log probabilities, and maximum 2 syntax iterations." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section IV describes the full MMAPR pipeline in detail: program chunking (Algorithm 1), syntax phase with iterative querying, semantic phase with multimodal prompts and few-shot selection, and candidate selection by edit distance. Figure 3 provides an architecture diagram." 
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section V describes the benchmark derivation: started with 18 assignments, removed 3 for specific reasons, selected students with eventually correct programs, collected the latest version with syntactic mistakes, resulting in 286 program pairs." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section VI 'Threats to Validity' is a dedicated section discussing three specific limitations of the study." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section VI discusses threats specific to this study: (1) test-based validation is weaker than formal verification, (2) evaluation on one particular set of 286 programs from one university, (3) only evaluated on Python programs." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section VI explicitly states: 'We only evaluated MMAPR on Python programs' and 'We carried out our evaluation on one particular set of 286 student programs,' clearly bounding what was not tested." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "Neither the original student programs nor the derived benchmark of 286 program pairs are made available. Only aggregated results are shown in tables." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section V describes the data source: 'introductory Python assignments collected by third-party authors in a large Indian university,' referencing the Python version of dataset [28], with 18 assignments containing problem descriptions, test suites, and student authoring histories." 
196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants are recruited in this study. The student programs come from a pre-existing educational dataset collected by third-party authors." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "Section V documents the filtering pipeline: 18 assignments → removed 3 (file/PDF requirements) → selected students with eventually correct programs → collected latest version with syntax mistakes → 286 program pairs. Each stage's criteria are described." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding section or grant acknowledgments are included. The paper notes the first author was a Microsoft intern but does not disclose formal funding sources." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: Jialu Zhang and Ruzica Piskac at Yale University; José Cambronero, Sumit Gulwani, Vu Le, Gustavo Soares, and Gust Verbruggen at Microsoft." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "Five of seven authors are Microsoft employees. Microsoft has a significant investment in OpenAI (whose Codex is the core component). Microsoft has a financial interest in demonstrating LLM utility for developer tools." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interest statement is included in the paper, despite the clear Microsoft–OpenAI relationship." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper does not state Codex's training data cutoff date. 
Codex was trained on GitHub code, and the student assignments could overlap with training data. No cutoff is mentioned." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether Codex's training data includes similar or identical programming assignments. The student programs are from an Indian university and may have been posted online." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "The benchmark is derived from university assignments that may have appeared online before Codex's training cutoff. No contamination analysis is performed, despite Codex being trained on millions of GitHub repositories." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. The paper evaluates an automated repair tool on pre-existing student program data." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. Student programs are from a pre-existing dataset collected by third parties." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study. The paper analyzes student programs, not the students themselves." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 
282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No API costs, tokens consumed, or wall-clock time per repair are reported. The paper limits syntax iterations to 2 and candidate count to 10 but does not quantify actual cost." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "The paper mentions 'a Windows VM with an Intel i7 CPU and 32 GB of RAM' for running experiments but does not state total API spend, total compute time, or number of Codex API calls made." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "With temperature=0.8, Codex outputs are stochastic, but no multi-seed or multi-run sensitivity analysis is reported. Results appear to be from a single execution." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The paper does not state how many independent runs of the full pipeline produced the reported results. No mention of averaging across runs." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper states 'We set the temperature to 0.8 based on preliminary experiments' (Section V) but does not report how many configurations were tried or the search budget." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "Temperature 0.8 and K=10 were selected based on 'preliminary experiments' but no details on how many alternatives were tested or what validation data was used for selection." 316 }, 317 "multiple_comparison_correction": { 318 "applies": false, 319 "answer": false, 320 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 
321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors implemented the BIFI+Refactory baseline pipeline themselves and compared it against their own MMAPR system. No acknowledgment that their implementation of the baseline may underperform the original authors' setup." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "BIFI generates 50 candidates while MMAPR uses multiple prompts each generating 10 candidates. The compute budgets differ but are not compared or controlled for." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": true, 335 "justification": "Section VI acknowledges that 'Validating program correctness through tests is not as strong as formal verification,' explicitly questioning whether the benchmark (test-suite pass) measures true correctness." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": false, 340 "justification": "MMAPR's multi-stage scaffold (chunking, iterative querying, multimodal prompts) differs fundamentally from BIFI+Refactory's pipeline. Improvements could come from the scaffold rather than Codex, but this confound is not addressed." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "The student programs are from an existing dataset that may predate Codex's training data. Codex could have seen identical or very similar programming exercises. No temporal analysis is provided." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the rich prompt information (problem descriptions, test cases) provides an unfair advantage compared to real-world deployment where such information might not be available." 
353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether Codex's training data includes the same or similar introductory programming exercises from online educational platforms." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection or prevention method is applied. No canary strings, n-gram overlap analysis, or decontamination steps." 363 } 364 } 365 }, 366 "scan_version": 3, 367 "active_modules": [ 368 "experimental_rigor", 369 "data_leakage" 370 ], 371 "claims": [ 372 { 373 "claim": "MMAPR without few-shot learning repairs 86.71% of programs, compared to 67.13% for the BIFI+Refactory baseline.", 374 "evidence": "Table I shows per-assignment and overall repair rates. MMAPR repairs 248/286 programs vs 192/286 for baseline (Section V-A).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "MMAPR with few-shot learning achieves a 96.50% repair rate.", 379 "evidence": "Table I shows adding test-case-based few-shot selection raises the repair rate from 86.71% to 96.50% across all 15 assignments.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "MMAPR produces smaller patches (mean TED 31.40) compared to the baseline (mean TED 42.50).", 384 "evidence": "Table I reports mean token edit distances. 
MMAPR's patches are 26.38% smaller on average (Section V-A).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "MMAPR achieves 100% syntax repair rate compared to 80.07% for BIFI.", 389 "evidence": "Table II shows MMAPR resolves syntax errors in all 286 programs with a mean TED of 5.46, versus BIFI's 80.07% rate and 25.07 mean TED.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Program chunking reduces average token edit distance by 41.79% in the syntax phase.", 394 "evidence": "Table III compares MMAPR with and without program chunking: overall mean TED drops from 9.38 to 5.46 (Section V-B1).", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Iterative querying (splitting syntax and semantic phases) raises repair rate from 82.87% to 86.71%.", 399 "evidence": "Table IV compares single-round vs iterative approaches across all assignments (Section V-B2).", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Ensembling multimodal prompts outperforms any single prompt structure.", 404 "evidence": "Figure 7 shows individual prompt structures achieve varying fix rates, while ensembling achieves the highest rate (Section V-B3).", 405 "supported": "moderate" 406 } 407 ], 408 "methodology_tags": [ 409 "benchmark-eval" 410 ], 411 "key_findings": "MMAPR, a unified system using OpenAI Codex for both syntax and semantic repair of introductory Python assignments, repairs 86.71% of 286 real student programs (96.50% with few-shot learning), outperforming a BIFI+Refactory baseline at 67.13%. The system produces patches with smaller edit distances (31.40 vs 42.50 mean TED), preserving students' original program structure. 
Key design decisions — program chunking, iterative querying, multimodal prompt ensembling, and test-case-based few-shot selection — each contribute measurably to performance, as shown in ablation studies.", 412 "red_flags": [ 413 { 414 "flag": "Undisclosed conflict of interest", 415 "detail": "Five of seven authors are Microsoft employees, and the paper evaluates OpenAI's Codex (Microsoft invested $10B+ in OpenAI). No competing interests statement is included, and the financial relationship is not acknowledged." 416 }, 417 { 418 "flag": "No variance across stochastic runs", 419 "detail": "Codex is queried with temperature=0.8, making outputs stochastic. All results appear to come from a single execution. Without multi-run variance, it is impossible to know if the observed improvements are stable or a lucky draw." 420 }, 421 { 422 "flag": "No significance tests for comparative claims", 423 "detail": "The paper claims MMAPR outperforms the baseline on repair rate (86.71% vs 67.13%) and edit distance (31.40 vs 42.50) but provides no statistical tests to establish whether these differences are significant." 424 }, 425 { 426 "flag": "No contamination analysis", 427 "detail": "Codex was trained on millions of GitHub repositories which likely include introductory programming exercises very similar to the benchmark. High repair rates could partially reflect memorization rather than generalized repair capability." 428 }, 429 { 430 "flag": "Self-evaluation bias", 431 "detail": "The authors built both MMAPR and the BIFI+Refactory baseline pipeline themselves. Their baseline implementation may not match the performance that the original BIFI and Refactory authors would achieve." 432 } 433 ], 434 "cited_papers": [ 435 { 436 "title": "Evaluating large language models trained on code", 437 "authors": ["Mark Chen"], 438 "year": 2021, 439 "relevance": "The Codex paper — the LLM used as MMAPR's core component, foundational to LLM-based code generation research." 
440 }, 441 { 442 "title": "Break-it-fix-it: Unsupervised learning for program repair", 443 "authors": ["Michihiro Yasunaga", "Percy Liang"], 444 "year": 2021, 445 "relevance": "BIFI is the state-of-the-art syntax repair baseline used for comparison; demonstrates neural program repair approach." 446 }, 447 { 448 "title": "GenProg: A generic method for automatic software repair", 449 "authors": ["Claire Le Goues", "ThanhVu Nguyen", "Stephanie Forrest", "Westley Weimer"], 450 "year": 2012, 451 "relevance": "Foundational automated program repair work using genetic programming, one of the first generate-and-validate APR systems." 452 }, 453 { 454 "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning", 455 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 456 "year": 2022, 457 "arxiv_id": "2207.08281", 458 "relevance": "Concurrent work framing program repair as zero-shot LLM code generation, directly relevant to LLM-based APR." 459 }, 460 { 461 "title": "Repair is nearly generation: Multilingual program repair with LLMs", 462 "authors": ["Harshit Joshi", "José Cambronero", "Sumit Gulwani", "Vu Le", "Ivan Radicek", "Gust Verbruggen"], 463 "year": 2022, 464 "arxiv_id": "2208.11640", 465 "relevance": "Closely related work from overlapping authors exploring LLM-based multilingual program repair." 466 }, 467 { 468 "title": "Refactoring based program repair applied to programming assignments", 469 "authors": ["Yang Hu", "Umair Z. Ahmed", "Sergey Mechtaev", "Ben Leong", "Abhik Roychoudhury"], 470 "year": 2019, 471 "relevance": "Refactory is the semantic repair baseline; state-of-the-art symbolic repair for student assignments." 
472 }, 473 { 474 "title": "sk_p: A neural program corrector for MOOCs", 475 "authors": ["Yewen Pu", "Karthik Narasimhan", "Armando Solar-Lezama", "Regina Barzilay"], 476 "year": 2016, 477 "relevance": "Early neural approach to student program repair for MOOCs, relevant to understanding the evolution of neural APR in education." 478 }, 479 { 480 "title": "Language models are few-shot learners", 481 "authors": ["Tom B. Brown"], 482 "year": 2020, 483 "relevance": "GPT-3 paper establishing few-shot learning with LLMs, foundational to the prompt-based approach used in MMAPR." 484 }, 485 { 486 "title": "A conversational paradigm for program synthesis", 487 "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"], 488 "year": 2022, 489 "arxiv_id": "2203.13474", 490 "relevance": "CodeGen model for program synthesis, mentioned as an alternative LLMC that could be used in place of Codex." 491 }, 492 { 493 "title": "Automated clustering and program repair for introductory programming assignments", 494 "authors": ["Sumit Gulwani", "Ivan Radicek", "Florian Zuleger"], 495 "year": 2018, 496 "relevance": "Symbolic APR for education using clustering of student submissions, a key prior approach MMAPR aims to improve upon." 497 }, 498 { 499 "title": "Competition-level code generation with AlphaCode", 500 "authors": ["Yujia Li"], 501 "year": 2022, 502 "relevance": "Demonstrates LLM code generation at competition level, relevant to understanding LLM capability in programming tasks." 503 }, 504 { 505 "title": "The robots are coming: Exploring the implications of OpenAI Codex on introductory programming", 506 "authors": ["James Finnie-Ansley", "Paul Denny", "Brett A. Becker", "Andrew Luxton-Reilly", "James Prather"], 507 "year": 2022, 508 "relevance": "Explores impact of Codex on programming education, directly relevant to AI in education domain." 
509 }, 510 { 511 "title": "Synchromesh: Reliable code generation from pre-trained language models", 512 "authors": ["Gabriel Poesia", "Oleksandr Polozov", "Vu Le"], 513 "year": 2022, 514 "arxiv_id": "2201.11227", 515 "relevance": "Constrained code generation from LLMs addressing spurious edit problems similar to those MMAPR's chunking addresses." 516 } 517 ], 518 "engagement_factors": { 519 "practical_relevance": { 520 "score": 2, 521 "justification": "Directly applicable to educational settings for automated student feedback, though Codex API is now deprecated." 522 }, 523 "surprise_contrarian": { 524 "score": 1, 525 "justification": "LLMs fixing code is expected; the unified syntax+semantic approach and 96.5% repair rate are notable but not contrarian." 526 }, 527 "fear_safety": { 528 "score": 0, 529 "justification": "No AI safety or security concerns raised; the application is educational support." 530 }, 531 "drama_conflict": { 532 "score": 0, 533 "justification": "No controversy or provocative claims; straightforward system evaluation paper." 534 }, 535 "demo_ability": { 536 "score": 0, 537 "justification": "No code, demo, or tool released; the system cannot be tried by others." 538 }, 539 "brand_recognition": { 540 "score": 2, 541 "justification": "Microsoft authors using OpenAI's Codex; both are well-known brands in the AI/programming tools space." 542 } 543 } 544 }