scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31713B)
      1 {
      2   "paper": {
      3     "title": "Search-based Automated Program Repair of CPS Controllers Modeled in Simulink-Stateflow",
      4     "authors": [
      5       "Aitor Arrieta",
      6       "Pablo Valle",
      7       "Shaukat Ali"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2404.04688",
     12     "doi": "10.48550/arXiv.2404.04688"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor"],
     16   "methodology_tags": ["benchmark-eval", "case-study"],
     17   "key_findings": "FLOWREPAIR, a search-based APR tool for Stateflow models in Simulink, found plausible patches for 8/9 and valid patches for 6/9 buggy CPS controller models across three case study systems. Its combined global-local search strategy outperformed a (1+1)EA baseline on 3 models where the baseline found zero patches, with statistical significance confirmed for one model (pacemaker_fault1). The tool introduces three CPS-specific repair objectives (failure active time, trigger time, severity) and 15 mutation operators for Stateflow models, with experiments totaling 90 hours (9 faulty models × 5 repetitions × 2 algorithms, at a 1-hour time budget per run).",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides both a GitHub repository (https://github.com/aitorarrietamarcos/StateflowRepairTool) and a Zenodo replication package (https://zenodo.org/records/10936238). Section 8 and the Replication Package section explicitly list both URLs."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The replication package on Zenodo includes the faulty Stateflow models used in evaluation. The paper states 'We provide both a replication package and a live repository' and the dataset extends Ayesh et al. [25] with 7 new faulty models."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 4.7 specifies MATLAB 2022b, Windows 10, 16GB RAM, AMD Ryzen 7 5800HS processor with 8 cores and 16 threads. A Zenodo replication package is also provided which would contain additional setup details."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "A dedicated replication package is provided on Zenodo (https://zenodo.org/records/10936238) alongside a live GitHub repository. The paper directs readers to 'See the replication package for details' for system-specific information."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Figure 4 shows standard deviation across the 5 runs for each buggy model over time. Table 1 reports mean, minimum, and maximum values across the 5 runs, providing spread measures for all results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Section 5.2 applies the Wilcoxon signed-rank test to compare FLOWREPAIR vs the baseline algorithm. They confirm statistical significance for pacemaker_fault1 and report no statistical significance for other models, following Arcuri and Briand's guide [35]."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 1 and Section 5.2 report absolute numbers with baseline context: e.g., FLOWREPAIR generated 4.2 plausible patches per run on average for pacemaker_fault1 vs 0.0 for the baseline. Results for all 9 models provide both FLOWREPAIR and baseline counts for comparison."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 4.6 justifies 5 repetitions citing 'long execution time of the algorithms' and 'significant manual effort for validating each of the plausible patches (686 patches).' Section 4.2 justifies 9 models as 'the largest evaluation related to APR in the field of CPSs' and notes the scarcity of real Stateflow fault datasets."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Table 1 reports mean, min, and max across 5 runs for both plausible and valid patches. Figure 4 shows standard deviation across runs. Section 5.1 notes specific examples of variance, e.g., 'fridge_3 between 2600 and 3300 seconds shows no standard deviation since in all runs the number of plausible patches was the same.'"
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 4.3 describes a (1+1)EA baseline algorithm, which is 'in line with the algorithms used by Abdessalem et al. [20] and another CPS misconfiguration repair approach [19].' The baseline uses the same mutation operators and component selection strategy as FLOWREPAIR but without local search."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baseline is inspired by Abdessalem et al. [20] (2020) and Valle et al. [19] (2023), which are recent and representative CPS APR algorithms. Section 4.3 notes 'The algorithm is also stronger than the commonly employed Random Search baseline in other search-based studies.'"
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "FLOWREPAIR has multiple novel components (global+local search, three repair objectives, 15 mutation operators) but no systematic ablation study is conducted. The baseline comparison partially ablates the local search component, but the novel repair objectives and individual mutation operator contributions are never evaluated in isolation."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 4.5 defines two evaluation metrics: (1) the number of plausible patches and (2) the number of valid patches. Both are reported for all models in Table 1."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 4.6 states 'we had to manually validate 686 plausible patches to check its semantic equivalence with the patch proposed by the developer.' All plausible patches were manually inspected to determine if they were semantically valid."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Section 4.4 mentions 'a set of preliminary experiments' were used to tune parameters (time budget=1 hour, local tries=30), but there is no separation of models into tuning and evaluation sets. It is unclear whether the same 9 models were used for both parameter tuning and final evaluation."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 1 and Figure 4 provide detailed per-model results for all 9 faulty models across 3 case study systems. Results are broken down individually rather than only reported as aggregates."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.1 discusses in detail why FLOWREPAIR failed on pacemaker_fault2: the buggy state contained 14 other variable assignments making the search space extremely large. They also analyze that 'the way the model is developed restricts efficient APR.'"
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports FLOWREPAIR's failure on pacemaker_fault2 (0 plausible patches). Section 5.2 reports that for the 5 (of 9) models where both methods found patches, 'the differences in all these cases were not statistically significant.' For fridge_3, the baseline slightly outperformed FLOWREPAIR."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims FLOWREPAIR can fix bugs 'including models with multiple faults' (supported by fridge_2 results in Table 1) and 'surpasses or performs similarly to a baseline' (supported by Section 5.2, where FLOWREPAIR found plausible patches for 8/9 models vs 5/9 for the baseline). All abstract claims are hedged appropriately ('Our experiments suggest')."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims about FLOWREPAIR outperforming the baseline are supported by controlled experimentation: same time budget, same mutation operators, same component selection — only the local search and archive management differ. Section 5.2 applies statistical tests. Language is appropriately hedged ('This might mean that...')."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The title and abstract bound claims to 'CPS Controllers Modeled in Simulink-Stateflow.' Section 6 (External Validity) explicitly states 'We applied FLOWREPAIR to only nine faulty models in three case studies, which may not be large enough to generalize our findings.' The abstract says 'paving the way towards' (not 'solving') APR of CPSs."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 5.2 discusses that for simpler bugs, 'the fix could be provided by a pure random search version of FLOWREPAIR,' offering an alternative explanation for baseline-comparable performance. Section 5.1 analyzes how model design complexity (14 variable assignments) explains the pacemaker_fault2 failure. Section 6 discusses parameter configuration threats."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures plausible patches and valid patches, and claims repairability. These measures directly assess the claimed capability — no proxy gap exists. The distinction between plausible (passes test suite) and valid (semantically equivalent to developer fix) is clearly stated in Section 2.1 and 4.5."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4.7 specifies 'MATLAB 2022b version.' No pre-trained ML models are used — the tool is a search-based algorithm — so MATLAB version is the relevant software version. The OS (Windows 10) and hardware specs are also provided."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The paper does not use prompting. FLOWREPAIR is a search-based algorithm using mutation operators on Stateflow models, not an LLM-based approach."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.4 reports: time budget = 1 hour, number of local tries = 30, local mutation operator reuse probability = 50%, SBFL metric = Tarantula. Section 4.6 states 5 repetitions per experiment. The algorithm generated 150-1060 patches per run depending on the case study."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. FLOWREPAIR is a search-based algorithm with clearly defined mutation operators and search routines, not an LLM agent."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.2 describes the data: 2 models from Ayesh et al. [25], 7 from student projects, with details on each case study (pacemaker, fridge with single/multiple faults, automated door). Test case generation by an independent developer using regression oracles is described. Section 3.1 documents the SBFL instrumentation process."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 'Threats to Validity' is a dedicated section discussing external, internal, conclusion, and construct validity threats in structured subsections."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 discusses threats specific to this study: 'We applied FLOWREPAIR to only nine faulty models in three case studies' (external), 'The configuration of various parameters of our algorithms is a concern' with mitigation via preliminary experiments (internal), and 5 repetitions with Wilcoxon tests to address randomness (conclusion)."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 6 explicitly states 'may not be large enough to generalize our findings.' Section 7 distinguishes FLOWREPAIR from CPS misconfiguration repair and neural network repair approaches. Section 5.1 identifies a specific class of bugs FLOWREPAIR cannot fix (complex state assignments). Section 8 lists specific future work items addressing current scope limitations."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "A replication package is available on Zenodo (https://zenodo.org/records/10936238) and a live GitHub repository is provided. The faulty Simulink models and experimental artifacts are included for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.2 describes the data sources: 2 pacemaker models from Ayesh et al. [25], 7 models from student projects (3 fridge variants including multi-fault, 2 door models). Test case generation procedure is described: 'for each faulty model, we asked an independent developer to generate a failing and a passing test case' using regression oracles."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The paper states faulty models were 'extracted from students' projects where they had to develop a Stateflow controller' but does not describe which students, what course, how projects were selected, or criteria for including specific faults. The 'independent developer' who generated test cases is not characterized. Selection criteria for the 7 new faults are not specified."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline is documented: faulty models → instrumentation for SBFL (Section 3.1) → test suite execution → suspiciousness ranking → repair algorithm (Section 3.2) → plausible patch collection → manual validation (Section 4.6). The paper accounts for all 9 models and reports results for each."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The Acknowledgments section discloses: Software and Systems Engineering research group of Mondragon Unibertsitatea (IT1519-22), supported by the Department of Education, Universities and Research of the Basque Country. Shaukat Ali is supported by the Co-evolver project (#286898), funded by the Research Council of Norway."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: Aitor Arrieta and Pablo Valle at Mondragon University, and Shaukat Ali at Simula Research Laboratory / Oslo Metropolitan University. No evaluated product is affiliated with these institutions."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funding comes from the Basque Country Department of Education and the Research Council of Norway — public research funding agencies with no financial interest in FLOWREPAIR's outcomes."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper. The absence of a declaration is not the same as absence of conflict."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "FLOWREPAIR is a search-based algorithm, not a pre-trained ML model. There is no training data or training cutoff to consider."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No pre-trained model is evaluated on a benchmark. The tool uses mutation operators and search algorithms, not learned representations."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No pre-trained model is involved. Benchmark contamination is structurally inapplicable to a search-based approach that does not learn from training data."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The evaluation involves automated execution of a repair tool on faulty Simulink models."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study evaluates an automated repair tool on software artifacts."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in the study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 4.4 states a time budget of 1 hour per run. Section 4.6 reports total experiment time: 90 hours (5 repetitions × 9 models × 2 algorithms). Section 4.4 notes 150-1060 patches generated per run depending on the case study."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 4.7 specifies the hardware: Windows 10, 16GB RAM, AMD Ryzen 7 5800HS (8 cores, 16 threads). Total compute: 90 hours. Section 4.6 reports 686 plausible patches required manual validation."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 4.6 runs each algorithm 5 times to account for stochastic variation. Table 1 reports mean, min, and max across runs, and Figure 4 shows standard deviation, effectively demonstrating seed sensitivity."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 4.6 explicitly states: 'we run the algorithm 5 times, both, for FLOWREPAIR and the baseline algorithm.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 4.4 states 'Based on a set of preliminary experiments, we decided on the following values' for time budget and local tries, but does not describe how many configurations were tried, what ranges were explored, or total compute spent on tuning."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper reports results with a single configuration (time budget=1 hour, local tries=30) selected via undescribed 'preliminary experiments.' No validation set or selection criteria are documented."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Wilcoxon tests are applied across multiple model comparisons (up to 9 models) but no multiple comparison correction (Bonferroni, Holm, etc.) is applied or discussed."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The baseline is the authors' own implementation of a simplified version of their algorithm. They do not acknowledge the systematic bias documented by Lucic et al. (2018) where authors' baseline implementations tend to underperform."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Figure 4 plots the number of plausible patches over execution time (0-3600 seconds) for both FLOWREPAIR and baseline, showing performance as a function of compute budget. Both algorithms use the same 1-hour time budget for fair comparison."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether 9 faulty models from student projects and one prior dataset adequately represent real-world CPS Stateflow bugs. No analysis of benchmark representativeness or comparison with industrial fault distributions is provided."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. FLOWREPAIR is a standalone search-based tool, not an LLM-based approach with scaffolding."
    341       }
    342     }
    343   },
    344   "claims": [
    345     {
    346       "claim": "FLOWREPAIR can find plausible patches for 8 out of 9 buggy Stateflow models and valid patches for 6 out of 9.",
    347       "evidence": "Table 1 shows mean plausible patches > 0 for all models except pacemaker_fault2. Valid patches were found in 6 models: fridge_1 (8.8), fridge_2a (13.4), fridge_2b (7.4), fridge_2 (0.4), pacemaker_fault1 (0.4), door_2 (2.0). Section 5.1 provides detailed analysis.",
    348       "supported": "strong"
    349     },
    350     {
    351       "claim": "FLOWREPAIR surpasses or performs similarly to a (1+1)EA baseline algorithm for CPS repair.",
    352       "evidence": "Section 5.2 and Table 1 show FLOWREPAIR found plausible patches in 3 models (fridge_1, fridge_2, pacemaker_fault1) where the baseline found none. Wilcoxon signed-rank test confirmed statistical significance for pacemaker_fault1 (4.2 vs 0.0 plausible patches). For remaining models, differences were not statistically significant.",
    353       "supported": "moderate"
    354     },
    355     {
    356       "claim": "The combined global-local search strategy is more efficient than pure global search for complex bugs.",
    357       "evidence": "Section 5.2 observes that 'for many of the generated plausible patches, FLOWREPAIR made significant use of the local search routine' for the 3 models where the baseline failed. For simpler bugs, 'the fix could be provided by a pure random search version.' No controlled ablation isolating the local search contribution.",
    358       "supported": "weak"
    359     },
    360     {
    361       "claim": "The novel repair objectives (failure active time, trigger time, severity) are suitable for guiding CPS repair.",
    362       "evidence": "Section 3.3 provides conceptual motivation with examples (Figure 3). The objectives are used in all experiments, but their individual contributions are never evaluated. No ablation comparing repair performance with/without each objective.",
    363       "supported": "weak"
    364     },
    365     {
    366       "claim": "This is the first paper targeting APR of Simulink models (the de-facto CPS modeling tool).",
    367       "evidence": "Section 7 reviews related work and argues Singh and Saha [69] focus only on parameter repair (not general program repair), and AutoRepair [29] targets neural networks, not Stateflow models. The claim appears well-supported by the related work analysis.",
    368       "supported": "moderate"
    369     }
    370   ],
    371   "red_flags": [
    372     {
    373       "flag": "Very small evaluation dataset",
    374       "detail": "Only 9 faulty models across 3 case study systems, with only 5 repetitions each. While the authors acknowledge this and note it is the largest CPS APR evaluation, the sample is too small for robust statistical conclusions. Only 1 of the 9 model comparisons showed statistical significance."
    375     },
    376     {
    377       "flag": "Student-generated faults may not represent industrial bugs",
    378       "detail": "7 of 9 faulty models come from student projects. Section 4.2 acknowledges 'the practice of using students' projects is common in other APR studies' but does not analyze whether student bugs are representative of industrial Stateflow faults. Only 2 models come from prior published work [25]."
    379     },
    380     {
    381       "flag": "Self-implemented baseline",
    382       "detail": "The baseline is the authors' own simplified implementation of their algorithm (without local search). No independent implementation of prior CPS repair tools (e.g., Ariel, Swarmbug) is compared against. The baseline cannot be verified by comparing against a third-party implementation."
    383     },
    384     {
    385       "flag": "Novel components not individually evaluated",
    386       "detail": "The three novel repair objectives and 15 mutation operators are presented as contributions but never ablated. It is unclear whether the improvements come from the search algorithm, the repair objectives, the mutation operators, or their combination."
    387     },
    388     {
    389       "flag": "Manual validation criteria not specified",
    390       "detail": "686 plausible patches were manually validated for 'semantic equivalence with the patch proposed by the developer,' but the criteria for determining semantic equivalence are not described. No inter-rater reliability is reported for this manual assessment."
    391     }
    392   ],
    393   "cited_papers": [
    394     {
    395       "title": "GenProg: A generic method for automatic software repair",
    396       "authors": ["Claire Le Goues", "ThanhVu Nguyen", "Stephanie Forrest", "Westley Weimer"],
    397       "year": 2011,
    398       "relevance": "Seminal search-based APR approach using genetic programming; foundational to understanding automated program repair techniques."
    399     },
    400     {
    401       "title": "ARJA: Automated repair of Java programs via multi-objective genetic programming",
    402       "authors": ["Yuan Yuan", "Wolfgang Banzhaf"],
    403       "year": 2018,
    404       "relevance": "Multi-objective genetic programming approach for Java APR; directly relevant to search-based repair methodology."
    405     },
    406     {
    407       "title": "Automated program repair in the era of large pre-trained language models",
    408       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    409       "year": 2023,
    410       "relevance": "Evaluates LLMs for automated program repair at ICSE 2023; directly relevant to LLM-based APR capabilities."
    411     },
    412     {
    413       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    414       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    415       "year": 2022,
    416       "relevance": "Zero-shot LLM approach for APR; relevant to understanding LLM repair capabilities without fine-tuning."
    417     },
    418     {
    419       "title": "An analysis of the automatic bug fixing performance of ChatGPT",
    420       "authors": ["Dominik Sobania", "Martin Briesch", "Carol Hanna", "Justyna Petke"],
    421       "year": 2023,
    422       "relevance": "Evaluates ChatGPT for automated bug fixing; directly relevant to conversational AI for code repair."
    423     },
    424     {
    425       "title": "Getafix: Learning to fix bugs automatically",
    426       "authors": ["Johannes Bader", "Andrew Scott", "Michael Pradel", "Satish Chandra"],
    427       "year": 2019,
    428       "relevance": "Learning-based APR that mines human patches to learn repair rules; relevant to ML-based program repair."
    429     },
    430     {
    431       "title": "An empirical investigation into learning bug-fixing patches in the wild via neural machine translation",
    432       "authors": ["Michele Tufano", "Cody Watson", "Gabriele Bavota", "Massimiliano Di Penta", "Martin White", "Denys Poshyvanyk"],
    433       "year": 2018,
    434       "relevance": "Neural machine translation approach for learning bug-fixing patches; foundational for deep learning-based APR."
    435     },
    436     {
    437       "title": "Automated repair of feature interaction failures in automated driving systems",
    438       "authors": ["Raja Ben Abdessalem", "Annibale Panichella", "Shiva Nejati", "Lionel C Briand", "Thomas Stifter"],
    439       "year": 2020,
    440       "relevance": "APR for automated driving CPS using search-based approach (Ariel); directly relevant to CPS repair domain."
    441     },
    442     {
    443       "title": "AutoRepair: Automated repair for AI-enabled cyber-physical systems under safety-critical conditions",
    444       "authors": ["Deyun Lyu", "Jiayang Song", "Zhenya Zhang", "Zhijie Wang", "Tianyi Zhang", "Lei Ma", "Jianjun Zhao"],
    445       "year": 2023,
    446       "arxiv_id": "2304.05617",
    447       "relevance": "Repair of neural networks embedded in CPSs; complementary approach to Stateflow repair for AI-enabled CPS."
    448     },
    449     {
    450       "title": "Automated program repair",
    451       "authors": ["Claire Le Goues", "Michael Pradel", "Abhik Roychoudhury"],
    452       "year": 2019,
    453       "relevance": "Comprehensive overview of APR field in Communications of the ACM; provides context for repair objectives and patch overfitting."
    454     },
    455     {
    456       "title": "Automated repair of programs from large language models",
    457       "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"],
    458       "year": 2023,
    459       "relevance": "LLM-based automated program repair at ICSE 2023; relevant to understanding LLM repair capabilities."
    460     },
    461     {
    462       "title": "DEAR: A novel deep learning-based approach for automated program repair",
    463       "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
    464       "year": 2022,
    465       "relevance": "Deep learning approach for APR; represents the neural/deep learning direction in program repair research."
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 1,
    471       "justification": "Niche applicability limited to MATLAB/Simulink users developing CPS Stateflow controllers; requires commercial MATLAB license."
    472     },
    473     "surprise_contrarian": {
    474       "score": 0,
    475       "justification": "Extends search-based APR to a new domain (Stateflow) with expected results; does not challenge conventional wisdom."
    476     },
    477     "fear_safety": {
    478       "score": 0,
    479       "justification": "No AI safety or security concerns raised; the tool repairs CPS controllers, which could improve safety but is not framed as a risk."
    480     },
    481     "drama_conflict": {
    482       "score": 0,
    483       "justification": "No controversy or dramatic claims; straightforward tool evaluation paper."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "Open-source GitHub repository and Zenodo replication package provided, though requires MATLAB license to actually run."
    488     },
    489     "brand_recognition": {
    490       "score": 0,
    491       "justification": "From Mondragon University and Simula Research Laboratory; not widely recognized outside the CPS testing community."
    492     }
    493   }
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs