scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20438B)
      1 {
      2   "paper": {
      3     "title": "Automated Program Repair Based on REST API Specifications Using Large Language Models",
      4     "authors": ["Katsuki Yamagishi", "Norihiro Yoshida", "Erina Makihara", "Katsuro Inoue"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.25148",
      8     "doi": "10.48550/arXiv.2510.25148"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "A Zenodo archive is provided (https://zenodo.org/records/16556822), referenced in the introduction as a footnote containing the dcFix implementation and misuse examples."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper says of the misuse examples that the authors 'published them on the website', alongside the dcFix implementation at the Zenodo archive link."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, dependency files, or library versions are mentioned anywhere in the paper."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step instructions for reproducing the experiments are provided. The paper describes the method conceptually but gives no commands or procedure to replicate results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results are reported as raw counts (e.g., 13/19, 6/8) with no confidence intervals or error bars despite the small sample sizes."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper claims dcFix achieves higher fix rates than the baseline but provides no statistical significance tests. Comparisons are based solely on raw counts."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Only raw fix counts are reported (e.g., 13/19 vs 1/19). No effect sizes, odds ratios, or percentage improvements with baseline context are provided."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The total dataset is 31 misuse cases (21 for Philips Hue, 10 for SwitchBot). No justification for this sample size is given, and no discussion of whether it is sufficient for the claims made."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The LLM is run 5 times per prompt and a case is considered successful 'if at least one attempt produced a correct result.' No variance across the 5 runs is reported — only the binary success/fail outcome."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "A baseline approach is included: prompts to the LLM that omit unsatisfied specifications or deviation points. Results are compared in Table III."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The only baseline is 'prompts without specification information.' No comparison against any existing automated repair tool, LLM-based or otherwise, is included. The related work discusses tools like MUDetect and InferFix but none are compared against."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "dcFix has two key components: (1) deviation point detection and (2) prompt augmentation with unsatisfied specifications. The baseline omits both, but there is no ablation testing each component individually (e.g., including only deviation points without unsatisfied specifications, or vice versa)."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "Only one metric is reported: fix rate (number of successfully repaired cases). No other metrics such as code quality, time to fix, or partial correctness are reported."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of the generated fixes is described. Correctness appears to be determined by the authors but no formal evaluation protocol is documented."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "There is no separation of development and test data. The entire dataset of 31 cases is used for evaluation, and presumably the prompts were developed using the same or similar examples."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Tables II and III provide per-category breakdowns by API (Philips Hue vs SwitchBot) and by misuse type (Endpoint, Request Headers, Request Body)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The Summary section mentions that 'there were cases where the LLM failed to generate correct fixes because it had not been trained on the latest REST API specifications.' However, this is a brief mention rather than detailed error analysis."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that dcFix failed to detect some deviations (2/21 for Philips Hue, 2/10 for SwitchBot) and failed to fix some detected deviations (6/19 for Philips Hue, 2/8 for SwitchBot). These are honestly reported in Tables II and III."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims dcFix 'accurately detects misuse and outperforms the baseline approach.' Tables II and III support these claims with 27/31 detection rate and higher fix rates than baseline (19/27 vs 2/27)."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper implicitly claims that including deviation points and unsatisfied specifications in prompts causes better repair outcomes. However, the comparison is only between the full prompt and the no-specification prompt, with no control for confounds (e.g., prompt length, additional context in general)."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title says 'Automated Program Repair Based on REST API Specifications' broadly, but evaluation is limited to only two APIs (SwitchBot and Philips Hue) with 31 total cases. No discussion of whether results generalize to other REST APIs or programming languages beyond Python."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations are discussed. The improvement could be partly due to the additional context in the prompt rather than the specific deviation point / specification information. This is not considered."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper never specifies which LLM was used. It refers only to 'an LLM' or 'the LLM' throughout without naming the model, version, or provider."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Listing 2 shows the prompt template used for repair instructions. While it contains a placeholder '${target program}', the actual template text (the system instruction portion) is provided in full."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for the LLM. The paper states the LLM was run 5 times per prompt but provides no sampling configuration."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The two-step pipeline (S1: deviation point detection via static analysis, S2: LLM-based repair) is described in Section III with Figure 1 providing an overview. The static analysis steps are detailed in Section III-A."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section IV-A describes how misuse cases were collected: SwitchBot cases from closed issues on GitHub, Philips Hue cases from commits involving the /clip/v2 endpoint. Selection criteria are stated, though briefly."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The only limitation mentioned is a brief remark in the Summary about the LLM not being trained on the latest specifications."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity are discussed. The brief mention of LLM knowledge staleness is a single sentence in the summary, not a substantive limitations discussion."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what settings are excluded, or what claims the authors are NOT making."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The Zenodo archive (referenced in footnote 1) is stated to contain the misuse examples alongside the implementation, allowing independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section IV-A describes data collection: SwitchBot cases from closed GitHub issues where API specifications were modified in commits; Philips Hue cases from repositories with /clip/v2 endpoint changes."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were involved. The dataset consists of code examples from public GitHub repositories, which is a standard data source not involving recruitment."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper describes the sources but not the full pipeline. It states 'We manually extracted programs containing code fragments that were inconsistent with the specifications' but does not document how many candidates were considered, how many were filtered, or the specific criteria for inclusion/exclusion beyond 'violated API specifications.'"
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding information or acknowledgments section is present in the paper."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All four authors are listed with their affiliation at Ritsumeikan University with email addresses."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence of funding cannot be assessed. The absence of disclosure does not establish that the work is unfunded."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper uses an LLM to generate fixes but never states which LLM is used, let alone its training data cutoff. The Summary section acknowledges 'the LLM failed to generate correct fixes because it had not been trained on the latest REST API specifications,' implying awareness of the issue but providing no concrete cutoff date."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether the LLM may have seen the test code or API specifications during training. The misuse cases come from public GitHub repositories which are likely in LLM training data."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The misuse examples come from public GitHub issues and commits which are very likely in LLM training corpora. This contamination risk is not discussed at all."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants were involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants were involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants were involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants were involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference cost, API cost, or latency is reported despite the method calling an LLM 5 times per misuse case."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget, hardware specifications, or total API spend is reported."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "dcFix accurately detects deviation points from REST API specifications, identifying 19/21 for Philips Hue and 8/10 for SwitchBot.",
    287       "evidence": "Table II shows detection results: 12/14 endpoints, 7/7 request bodies for Philips Hue; 5/5 endpoints, 2/4 request headers, 1/1 request body for SwitchBot (Section IV-B).",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "dcFix achieves higher fix rates compared to the baseline approach for both APIs.",
    292       "evidence": "Table III shows dcFix fixed 13/19 vs baseline 1/19 for Philips Hue, and 6/8 vs 1/8 for SwitchBot. However, no statistical tests were applied and the sample sizes are very small.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Including deviation point and unsatisfied specification information in LLM prompts leads to better repair outcomes than omitting them.",
    297       "evidence": "Table III comparison between dcFix and baseline. The difference is substantial in raw numbers (19/27 vs 2/27 total) but lacks statistical testing, and no confounds are controlled.",
    298       "supported": "moderate"
    299     }
    300   ],
    301   "methodology_tags": ["benchmark-eval", "case-study"],
    302   "key_findings": "dcFix is a method for detecting and automatically repairing REST API misuses in client programs by combining static analysis of deviation points with LLM-based repair. Evaluated on 31 misuse cases from SwitchBot and Philips Hue APIs, dcFix detected 27 out of 31 deviations and repaired 19 of the 27 detected cases, compared to only 2/27 for the baseline approach that omits specification information from prompts. The study demonstrates that enriching LLM prompts with specific violation context improves repair effectiveness, though the evaluation is limited to two APIs and a small dataset.",
    303   "red_flags": [
    304     {
    305       "flag": "Unidentified LLM",
    306       "detail": "The paper never names which LLM was used. This is a critical omission — different models have vastly different capabilities, and the results cannot be replicated or interpreted without knowing the model."
    307     },
    308     {
    309       "flag": "Very small sample size",
    310       "detail": "Only 31 misuse cases total across two APIs. Some subcategories have as few as 1 case (SwitchBot Request Body). The results are too sparse to draw general conclusions."
    311     },
    312     {
    313       "flag": "No statistical analysis",
    314       "detail": "Despite claiming dcFix 'outperforms' the baseline, no statistical tests are applied. With such small samples, the observed differences could be due to chance."
    315     },
    316     {
    317       "flag": "Weak baseline",
    318       "detail": "The only baseline is 'no specification information in the prompt.' No comparison against existing APR tools mentioned in the related work (MUDetect, InferFix, etc.)."
    319     },
    320     {
    321       "flag": "Pass@5 metric without reporting per-run results",
    322       "detail": "The LLM is run 5 times and success is counted if at least one run succeeds. This inflates apparent success rates. Per-run success rates and variance are not reported."
    323     },
    324     {
    325       "flag": "No limitations section",
    326       "detail": "The paper has no limitations or threats-to-validity section despite significant methodological weaknesses (small sample, two APIs only, unknown LLM, no statistical tests)."
    327     },
    328     {
    329       "flag": "Data contamination risk",
    330       "detail": "Misuse cases are from public GitHub repositories likely present in LLM training data. The LLM may already know the correct code, making the baseline comparison unreliable."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Evaluating automatic program repair capabilities to repair API misuses",
    336       "authors": ["M. Kechagia", "S. Mechtaev", "F. Sarro", "M. Harman"],
    337       "year": 2022,
    338       "doi": "10.1109/TSE.2021.3095706",
    339       "relevance": "Evaluates APR tools on API misuse repair, directly relevant to LLM-based program repair quality assessment."
    340     },
    341     {
    342       "title": "Automated program repair in the era of large pre-trained language models",
    343       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    344       "year": 2023,
    345       "relevance": "Demonstrates LLM outperformance over traditional APR tools, foundational work for LLM-based repair evaluations."
    346     },
    347     {
    348       "title": "InferFix: End-to-end program repair with LLMs",
    349       "authors": ["M. Jin", "S. Shahriar", "M. Tufano", "X. Shi", "S. Lu", "N. Sundaresan", "A. Svyatkovskiy"],
    350       "year": 2023,
    351       "relevance": "Combines static analysis with LLMs for program repair, directly comparable methodology to dcFix."
    352     },
    353     {
    354       "title": "RESTler: Stateful REST API fuzzing",
    355       "authors": ["V. Atlidakis", "P. Godefroid", "M. Polishchuk"],
    356       "year": 2019,
    357       "relevance": "REST API testing tool generating API call sequences from specifications, relevant to API quality and testing research."
    358     },
    359     {
    360       "title": "Boosting API misuse detection via integrating API constraints from multiple sources",
    361       "authors": ["C. Li", "J. Zhang", "Y. Tang", "Z. Li", "T. Sun"],
    362       "year": 2024,
    363       "relevance": "API misuse detection using multi-source constraints, relevant to automated code quality assessment."
    364     },
    365     {
    366       "title": "Generating REST API specifications through static analysis",
    367       "authors": ["R. Huang", "M. Motwani", "I. Martinez", "A. Orso"],
    368       "year": 2024,
    369       "relevance": "Static analysis for REST API specification generation, relevant to API conformance checking research."
    370     }
    371   ]
    372 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs