scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22143B)
      1 {
      2   "paper": {
      3     "title": "Automated Program Repair of Uncompilable Student Code",
      4     "authors": ["Griffin Pitts", "Aum Pandya", "Darsh Rank", "Tirth Bhatt", "Muntasir Hoq", "Bita Akram"],
      5     "year": 2026,
      6     "venue": "SIGCSE TS 2026",
      7     "arxiv_id": "2510.06187",
      8     "doi": "10.1145/3770761.3777323"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. No mention of code release."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper uses a publicly available dataset from the CodeWorkout platform, citing Edwards and Murali (2017). The dataset is described as 'publicly available' in Section 2.1."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, dependency lists, or library versions are provided. The paper does not describe the software environment used for experiments."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No reproduction instructions, scripts, or step-by-step guides are provided. The methodology section describes the approach at a high level but does not include enough detail to replicate the experiment."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Only point estimates are reported (e.g., '98.5%', '11.4' average edit distance). No confidence intervals or error bars are provided for any result."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Chi-square tests of independence are used for categorical outcomes (compilation, SP, LP) and ANOVA for continuous measures (edit distance). Specific test statistics and p-values are reported throughout Section 3, e.g., chi-squared(2, N=600) = 3.21, p = .201."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Raw differences with baseline context are provided throughout: e.g., GPT-5 edit distance 11.4 vs Claude 3.5 at 24.4, SP rates of 96.4% vs 88.5%, LP rates of 86.5% vs 67.8%. The absolute counts and percentages allow the reader to assess magnitude of differences."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The sample of 100 uncompilable submissions from only 2 randomly selected problems is not justified. No power analysis or rationale for why 100 submissions from 2 problems (out of 50) is sufficient."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No standard deviations, variance, or spread measures are reported for edit distance or any other continuous metric. Only point estimates (means) are given."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Three models are compared against each other (GPT-5, Claude 3.5 Haiku, Gemini 2.5 Flash) under two prompting conditions, forming natural baselines. However, no non-LLM baseline (e.g., rule-based repair) is included."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The three models tested (GPT-5, Claude 3.5 Haiku, Gemini 2.5 Flash) are contemporary frontier models at the time of publication."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No ablation study is conducted. The high-context vs low-context prompting comparison serves as a partial ablation of the prompt design, but no systematic component removal is performed."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Four metrics are used: compilation success rate, normalized Levenshtein edit distance, Structural Preservation (SP), and Logical Preservation (LP)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Four experts independently annotated all 600 repaired outputs for Structural Preservation and Logical Preservation, with iterative calibration and inter-rater agreement measured via Cohen's Kappa (kappa > 0.80 achieved)."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "There is no train/test split since this is not a learning-based approach — the LLMs are used via prompting. However, the evaluation set of 100 submissions is the only test set used, with no separate validation set for prompt tuning. The paper does not discuss whether the few-shot examples in the high-context condition overlap with the evaluation set."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by model (GPT-5, Claude 3.5 Haiku, Gemini 2.5 Flash) and by prompting condition (low-context vs high-context) across all four metrics in Section 3."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4 discusses that 'models occasionally strayed from the intended pedagogical scope by making stylistic or structural edits that went beyond minimal correction,' identifying a key failure mode."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that prompting condition had no significant effect on any outcome (compilation, edit distance, SP, LP), which contradicts their hypothesis. They state this was 'contrary to our hypothesis that providing additional context might encourage models to over-correct.'"
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims that 'all models produced compilable repairs' (supported by 95.5-98.5% rates) and that 'they differed in how well they preserve students' control flow and code structure' (supported by significant chi-square tests for SP and LP). Claims are appropriately hedged."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper's main causal claims are about model differences and prompting condition effects. These are tested with controlled experimental conditions (3 models x 2 prompting conditions applied to the same 100 submissions), which is an adequate design for these within-subject comparisons."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The abstract says this work 'enables richer and more comprehensive analyses of learners' coding processes' generally, but the study uses only 100 submissions from 2 problems in Java from a single CS1 course. The title 'Automated Program Repair of Uncompilable Student Code' is broader than what the 2-problem, single-language evaluation supports."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations are discussed for the observed differences. For instance, the paper does not consider whether model differences in edit distance could be due to different tokenization or whether the 2 selected problems might favor certain models. The Discussion section focuses on future work rather than explaining the results."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to 'GPT-5', 'Claude 3.5 Haiku', and 'Gemini 2.5 Flash' without specifying API versions, snapshot dates, or exact model identifiers. Marketing names without version/snapshot information do not qualify."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Prompts are described in natural language only: 'instructions to perform syntax-only repair with minimal edits that preserved control flow, identifiers, and formatting' for low-context, and additional elements listed for high-context. The actual prompt text is not provided."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the three LLM API calls."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The LLMs are called directly with prompts, with no tool use, retry logic, or multi-step workflows described."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 2.1 describes the dataset (57,670 submissions, 368 students, CS1 Spring 2019), the filtering to 9,906 uncompilable submissions, random selection of 2 problems from 50, and random sampling of 100 uncompilable submissions from those problems."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section (Section 4) focuses on future work rather than analyzing limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the small number of problems (2 out of 50), the single programming language, or the single course context."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No explicit scope boundaries are stated. The paper does not identify what the results do NOT show, such as limitations to Java, CS1 contexts, or the specific problem types tested."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "Neither the 100 sampled submissions nor the 600 repaired outputs are released. The original CodeWorkout dataset is public, but the specific sample and repairs are not available for verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 2.1 describes the data source (CodeWorkout platform), time period (Spring 2019), course context (CS1 at a U.S. university), and composition (57,670 submissions from 368 students, with 9,906 uncompilable)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "The data source is a standard educational platform dataset. No human participants were recruited for the study; the four expert annotators are not experimental subjects. The student data was collected from a course, not through a recruitment process."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline is documented: full dataset (57,670) → incorrect submissions (38,883) → uncompilable (9,906) → random selection of 2 problems → random sample of 100 submissions → 6 experimental conditions → 600 repaired outputs → human evaluation."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section states: 'This research was supported by NSF under Grant #2418658.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All authors are listed with their affiliation at North Carolina State University. None of the authors appear to be affiliated with the companies whose models are evaluated (OpenAI, Anthropic, Google)."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funder is NSF, a government agency with no financial interest in the outcome of LLM evaluations. NSF is independent of OpenAI, Anthropic, and Google."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or financial interests declaration is included in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No training data cutoff dates are stated for any of the three models (GPT-5, Claude 3.5 Haiku, Gemini 2.5 Flash). The CodeWorkout dataset is from Spring 2019, and any of these models could have been trained on it."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether the CodeWorkout dataset or similar student code might appear in the training data of the models. The CodeWorkout platform was published in 2017 and the Spring 2019 dataset used here is publicly available, making contamination plausible."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The CodeWorkout platform was published in 2017 and the dataset used here was collected in Spring 2019, both well before the training cutoffs of all three models. No contamination risk analysis is provided despite the dataset being publicly available for years."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The expert annotators are raters, not experimental subjects. The study does not involve human participants in the experimental sense. The student data is archival."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The study uses anonymized archival data from a public dataset. No human participants were recruited or experimented upon."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved as experimental subjects. The student data is archival and anonymized."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are recruited. The study uses an existing dataset and expert annotators who are not subjects."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are assigned to experimental conditions. The randomization in the study pertains to problem and submission selection, not participant assignment."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants serve as experimental subjects. The expert annotators are raters, not subjects in an experiment."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved as experimental subjects. The study uses archival data."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No API costs, token counts, or latency measurements are reported for the 600 LLM calls across three models and two conditions."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No total computational budget, API spend, or hardware details are reported."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "All three LLMs achieved high compilation success rates (95.5-98.5%) with no statistically significant differences among models.",
    287       "evidence": "Section 3: GPT-5 98.5% (197/200), Claude 3.5 96% (192/200), Gemini 2.5 95.5% (191/200). Chi-squared(2, N=600) = 3.21, p = .201.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Edit distance differed significantly by model, with GPT-5 producing the smallest edits.",
    292       "evidence": "Section 3: F(2,594) = 16.22, p < 0.001. GPT-5 mean 11.4, Gemini 2.5 mean 13.8, Claude 3.5 mean 24.4.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Prompting condition (high vs low context) had no significant effect on any evaluation metric.",
    297       "evidence": "Section 3: Compilability chi-squared(1, N=600) = 0.21, p = 0.649; edit distance F(1,594) = 0.004, p = 0.95; SP chi-squared(1, N=579) = 0.02, p = 0.88; LP chi-squared(1, N=549) = 0.00, p = 0.97.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "GPT-5 had the highest Logical Preservation rate (86.5%) while Claude 3.5 had the lowest (67.8%).",
    302       "evidence": "Section 3: GPT-5 166/192 (86.5%), Gemini 2.5 156/186 (83.9%), Claude 3.5 116/171 (67.8%). Chi-squared(2, N=549) = 22.36, p < 0.001.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "This work enables richer and more comprehensive analyses of learners' coding processes.",
    307       "evidence": "Abstract claim. No empirical demonstration of improved student modeling is provided — the paper only shows that repairs are compilable and structure-preserving.",
    308       "supported": "weak"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "LLMs (GPT-5, Claude 3.5 Haiku, Gemini 2.5 Flash) can reliably repair uncompilable student code to a compilable state (95.5-98.5% success), but they differ in how well they preserve the student's original code structure and logic. GPT-5 produced the smallest edits and highest logic preservation (86.5%), while Claude 3.5 Haiku had the largest edits and lowest logic preservation (67.8%). Providing additional context (compiler messages, problem statements, few-shot examples) had no significant effect on any metric, suggesting minimal prompting is sufficient for syntax-only repair of short code snippets.",
    313   "red_flags": [
    314     {
    315       "flag": "Very narrow evaluation scope",
    316       "detail": "Only 100 submissions from 2 randomly selected problems (out of 50) in a single Java CS1 course. The generalizability of findings to other problems, languages, or course levels is unknown, yet the paper frames contributions broadly."
    317     },
    318     {
    319       "flag": "No contamination analysis for publicly available dataset",
    320       "detail": "The CodeWorkout platform was published in 2017 and the dataset (Spring 2019 submissions) is publicly available. All three models may have seen this data or similar student code during training, which could inflate compilation success and repair quality. This is not discussed."
    321     },
    322     {
    323       "flag": "Missing model versions and hyperparameters",
    324       "detail": "No API versions, snapshot dates, temperature, or sampling parameters are reported for any of the three models. Results are not reproducible without this information."
    325     },
    326     {
    327       "flag": "No variance or uncertainty measures",
    328       "detail": "Only point estimates are reported for all metrics. No confidence intervals, standard deviations, or bootstrap intervals are provided, despite the sample being drawn from only 2 problems."
    329     },
    330     {
    331       "flag": "Denominator discrepancies unexplained",
    332       "detail": "Total repaired outputs should be 600 (100 x 6 conditions), but SP is evaluated on N=579 and LP on N=549. The paper does not explain the 21 outputs missing from the SP analysis or the 51 missing from the LP analysis. These may be outputs that did not compile, but this is not stated."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "DeepFix: Fixing common C language errors by deep learning",
    338       "authors": ["Rahul Gupta", "Soham Pal", "Aditya Kanade", "Shirish Shevade"],
    339       "year": 2017,
    340       "relevance": "Pioneering work on neural approaches to automated program repair, directly relevant to LLM-based code repair evaluation."
    341     },
    342     {
    343       "title": "Automated program repair using generative models for code infilling",
    344       "authors": ["Charles Koutcheme", "Sami Sarsa", "Juho Leinonen", "Arto Hellas", "Paul Denny"],
    345       "year": 2023,
    346       "relevance": "Explores LLMs for code infilling applied to student program repair and feedback, directly related to the survey scope of LLM programming capabilities."
    347     },
    348     {
    349       "title": "A Survey of LLM-Based Applications in Programming Education: Balancing Automation and Human Oversight",
    350       "authors": ["Griffin Pitts", "Anurata Prabha Hridi", "Arun-Balajiee Lekshmi-Narayanan"],
    351       "year": 2025,
    352       "arxiv_id": "2510.03719",
    353       "relevance": "Survey of LLM applications in programming education, directly relevant to the survey's scope on LLM-based software engineering tools."
    354     },
    355     {
    356       "title": "Finding misleading identifiers in novice code using LLMs",
    357       "authors": ["Anna Řechtáčková", "Alexandra Maximova", "Griffin Pitts"],
    358       "year": 2025,
    359       "relevance": "Uses LLMs to analyze student code quality, relevant to LLM capabilities in code understanding and education."
    360     },
    361     {
    362       "title": "Automated program repair for introductory programming assignments",
    363       "authors": ["Han Wan", "Hongzhen Luo", "Mengying Li", "Xiaoyan Luo"],
    364       "year": 2024,
    365       "relevance": "Contextual edit-based automated repair for student submissions, relevant to LLM and AI approaches to code repair."
    366     },
    367     {
    368       "title": "Grading uncompilable programs",
    369       "authors": ["Rohit Takhar", "Varun Aggarwal"],
    370       "year": 2019,
    371       "relevance": "Rule-based approach to handling uncompilable student code, provides baseline context for LLM-based repair approaches."
    372     }
    373   ]
    374 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs