ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26429B)


      1 {
      2   "paper": {
      3     "title": "Leveraging Rust Types for Program Synthesis",
      4     "authors": [
      5       "Jonáš Fiala",
      6       "Shachar Itzhaky",
      7       "Peter Müller",
      8       "Nadia Polikarpova",
      9       "Ilya Sergey"
     10     ],
     11     "year": 2023,
     12     "venue": "Proceedings of the ACM on Programming Languages (PLDI)",
     13     "doi": "10.1145/3591278"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval", "theoretical"],
     18   "key_findings": "The paper presents Synthetic Ownership Logic (SOL) and RusSOL, the first tool for synthesizing correct-by-construction safe Rust programs from type signatures and functional specifications. RusSOL solves 115 of 117 annotated synthesis tasks (mean 1.8s) and 1328 of 1635 non-primitive type signatures from the top 100 crates. Annotations are on average 27% shorter than the synthesized code, and 172% shorter than equivalent SuSLik specifications, demonstrating that Rust's type system substantially reduces the specification burden and search space for synthesis.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states: 'The software artifact accompanying this paper is available online (Fiala et al. 2023)' with a Zenodo archive at https://doi.org/10.5281/zenodo.7811786 containing 'the source code and build scripts for RusSOL.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The artifact contains 'a corpus of case studies that can be used to reproduce the experimental results described in Sec. 4.' All four benchmark suites (Rust, SuSLik, Verifier, 100-Crates) are included."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The artifact includes 'build scripts for RusSOL' and 'a README file in markdown that provides detailed step-by-step instructions for running RusSOL and the experiments.'"
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The artifact provides 'detailed step-by-step instructions for running RusSOL and the experiments' in the README."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Table 1 reports mean time, mean SOL rules, mean LOC, and mean Code/Spec ratio as point estimates without confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The comparison between RusSOL and SuSLik (e.g., 3.1s vs 2.2s mean time, 172% annotation ratio improvement) is based on comparing raw numbers without any statistical significance test."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports effect sizes with baseline context: 'annotations are on average 27% more concise than the synthesized code,' 'Code/Spec ratio... 172% higher on average,' and direct comparisons of synthesis times (3.1s RusSOL vs 2.2s SuSLik)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for benchmark suite sizes (51 Rust tasks, 19 SuSLik tasks, 47 Verifier tasks). The 100-Crates suite is exhaustive (all supported signatures) but the annotated benchmarks have no size rationale."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Table 1 reports only means and maxima (e.g., 'mean time 1.8s', 'maximum of 28s'). No standard deviations, IQR, or other variance measures are provided across benchmarks."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RusSOL is compared against SuSLik on 19 shared benchmarks, with comparisons on synthesis time, success rate, and annotation overhead (Table 1, Sec. 4.2)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "SuSLik (2019/2021) is the only comparable deductive synthesis tool for heap-manipulating programs. The paper also notes SyRust (2021) as the only other Rust synthesizer but explains why it's not comparable (different goals). These represent the state of the art."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No ablation study is performed to measure the contribution of individual SOL rules or search phases (forward, backward, bridging). The evaluation does not isolate which aspects of the Rust type system provide the most search space reduction."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 1 reports five metrics: number of tasks solved, synthesis time, number of SOL rules in derivation, lines of Rust code, and Code/Spec annotation ratio."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper states: 'For all of these tasks, we manually checked that the synthesized programs indeed match the user intent or the original implementation. This analysis was done as a sanity check and to confirm that the partial specifications were sufficiently strong.'"
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "There is no discussion of whether the benchmarks used for evaluation overlap with those used to develop and tune the search heuristics. The cost function ('a heuristic function of the size of P and Q') could have been tuned on these benchmarks."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 1 provides per-group breakdowns across all four benchmark suites and their sub-categories (e.g., SLL Tutorial, StackOverflow, Custom, Integers, Singly linked list, etc.)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses both failed annotated benchmarks ('one is from StackOverflow and is unsolvable without unsafe code' and 'inability to synthesize a pure arithmetic expression') and analyzes 307 failed 100-Crates tasks with three categorized failure reasons (Sec. 4.3)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.3 explicitly identifies four limitations: no trait support, no arbitrary conditionals, limited arithmetic, and no unsafe code. The push example (Sec. 2.1) shows how a naive specification produces an unintended implementation."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims about simpler specifications (supported by 27% annotation reduction and 172% improvement over SuSLik), search space reduction (supported by no unfolding bounds needed), and effectiveness (115/117 solved, mean 1.8s) are all supported by results in Sec. 4."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The central causal claim — 'the Rust type system reduces the search space for synthesis, which improves performance' — is supported by both theoretical argument (Sec. 2-3 show how type rules restrict applicable operations) and empirical comparison (RusSOL needs no unfolding bounds unlike SuSLik, achieving similar time while exploring a more constrained search space)."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Claims are consistently bounded to 'safe Rust.' The abstract says 'first approach to automatically synthesizing correct-by-construction programs in safe Rust.' Sec. 4.3 explicitly lists four out-of-scope capabilities (traits, conditionals, arithmetic, unsafe code)."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The Discussion (Sec. 4.3) discusses limitations but does not consider alternative explanations for the results. For example, it does not consider whether the benchmarks are biased toward tasks that favor deductive synthesis, or whether the performance improvements over SuSLik are due to the Rust type constraints versus other implementation differences."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper's claims match the granularity of its measurements: it measures synthesis time, solved benchmarks, and annotation size, and claims effectiveness in terms of these same quantities. No broader proxy-outcome gap exists."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use any ML models or LLMs. RusSOL is a deterministic deductive synthesis engine based on formal logic rules."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper does not use prompting. Input is formal type signatures with pre/postconditions, not natural language prompts to an LLM."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The search algorithm uses a cost heuristic described only as 'a heuristic function of the size of P and Q' (Sec. 3.4). The exact cost function formula and any tunable parameters are not specified. The three-phase search structure is described qualitatively but not parameterized."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. RusSOL is a standalone deductive synthesis engine, not an agent-based system."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4.1 documents benchmark assembly: Rust benchmarks from StackOverflow and tutorials; SuSLik tasks with explicit exclusion criteria ('excluded tasks that are not meaningful in safe Rust'); Verifier tasks filtered to 'sufficiently complete functional specifications'; 100-Crates signatures with stated exclusions ('unsupported types, such as slices, raw pointers, characters, and floats')."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.3 (Discussion) contains substantive discussion of four specific limitations: traits, conditionals, arithmetic, and unsafe code. Each is explained with examples and assessment of whether the limitation is fundamental."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4.3 identifies specific, study-relevant threats: 'does not support traits and associated functions, which makes it incomplete when working with generics or datatypes with private fields. This limitation is the main reason for failures on the 100-Crates benchmark suite.' Each limitation is tied to specific observed failures."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly states scope boundaries: 'we target safe Rust' (not unsafe), 'does not implement branch abduction,' traits and unsafe code are out of scope. The last paragraph of Sec. 4.3 distinguishes fundamental from non-fundamental limitations."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The Zenodo artifact (doi:10.5281/zenodo.7811786) contains all benchmark specifications, synthesized programs, and reproduction scripts, enabling independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 4.1 describes how each benchmark suite was assembled: Rust tasks from StackOverflow/tutorials/custom; SuSLik tasks from prior work with exclusion criteria; Verifier tasks from Prusti/Creusot test suites; 100-Crates from top crates on crates.io."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Benchmarks are synthesis tasks extracted from code repositories and prior work, not from recruited subjects."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 4.1 documents the pipeline: source identification, exclusion criteria ('excluded tasks that are not meaningful in safe Rust, either because they are trivial or impossible due to aliasing'), translation to Rust types, and annotation addition. For 100-Crates: automatic extraction from top 100 crates with type filtering."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgments section states: 'This work has been supported by the National Science Foundation under Grant No. 1911149, and by a Singapore MoE Tier 3 grant \"Automated Program Repair\", MOE-MOET32021-0001.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All five authors' institutional affiliations are clearly listed: ETH Zurich (Fiala, Müller), Technion (Itzhaky), UCSD (Polikarpova), NUS (Sergey)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding comes from the National Science Foundation and Singapore Ministry of Education. Neither has a financial interest in the outcome of Rust program synthesis research."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is provided in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper does not evaluate any pre-trained model. RusSOL is a deterministic deductive synthesis engine based on formal logic rules with no training data."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated. The synthesis tool uses formal inference rules, not learned parameters, so train/test overlap is not applicable."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No pre-trained model with training data is involved. The benchmarks are synthesis specifications, not test inputs for a learned model."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The evaluation is entirely automated benchmark testing of a synthesis tool."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants involved in the study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants involved in the study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants involved in the study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants or experimental conditions involving randomization."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants or conditions requiring blinding."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants involved in the study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Synthesis times are reported: mean 1.8s across all annotated benchmarks, up to 28s for the most complex case (rose tree copy). Table 1 provides mean time per benchmark group."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Hardware is specified: 'All experiments were conducted on a consumer-grade laptop with an Intel i7 CPU and 16GB RAM.'"
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": false,
    300         "answer": false,
    301         "justification": "RusSOL is a deterministic deductive synthesis engine. The best-first search with a fixed cost heuristic produces the same result on each run. Random seeds are not applicable."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "As a deterministic system, a single run is definitive. Multiple runs would produce identical results."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "The system has no ML-style hyperparameters to search. The cost heuristic and search phases are part of the algorithm design, not tunable parameters optimized via search."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The cost function is described only as 'a heuristic function of the size of P and Q.' No justification is given for this choice, no alternatives are discussed, and no sensitivity analysis to the cost function is provided."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "RusSOL is compared primarily against SuSLik, which was created by three of the same authors (Itzhaky, Polikarpova, Sergey). The paper does not acknowledge or address the potential bias of evaluating their own system against their own prior system."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "The synthesis tool runs to completion or times out. There is no meaningful compute budget dimension to vary — the search is deterministic and finds the first solution."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The paper assembles benchmarks from four distinct sources to test different aspects: Rust suite tests idiomatic patterns, SuSLik suite enables direct comparison, Verifier suite tests specification handling, and 100-Crates stress-tests real-world type handling. The purpose and limitations of each suite are discussed."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. RusSOL is a standalone synthesis tool, not an agent-based system."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No pre-trained model with training data is involved. RusSOL uses formal inference rules, so temporal leakage between training data and benchmarks is not applicable."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "No machine learning model with features/labels is involved. The synthesis tool operates on formal specifications, not learned feature representations."
    354       },
    355       "non_independence_addressed": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "No train/test split exists. The tool uses formal logic rules, not learned parameters, so independence between training and test data is not applicable."
    359       },
    360       "leakage_detection_method": {
    361         "applies": false,
    362         "answer": false,
    363         "justification": "No machine learning model is evaluated, so data leakage detection methods are not applicable."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "RusSOL successfully synthesizes 115 out of 117 annotated benchmark tasks with a mean time of 1.8 seconds.",
    370       "evidence": "Table 1 (Sec. 4.2) provides detailed per-group results. Two failures are explained: one requires unsafe code, one requires arithmetic synthesis. Mean derivation size is 26.5 SOL rules, mean output is 5.4 LOC.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "User-provided annotations are on average 27% shorter than the synthesized code (measured in AST nodes).",
    375       "evidence": "Table 1 reports Code/Spec ratios per benchmark group. Mean ratio across annotated benchmarks is 1.4 (code is 1.4x the annotation size). Methodology for counting AST nodes is described in Sec. 4.2.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "RusSOL requires 172% less annotation effort than SuSLik on shared benchmarks, because Rust types encode information that SuSLik requires in specifications.",
    380       "evidence": "Table 1 compares Code/Spec ratios on SuSLik benchmarks. RusSOL needs only 3 data type definitions vs. SuSLik's 8 inductive predicates for the same 19 tasks. Each group shows percentage improvement.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "All synthesized programs pass the Rust compiler type checker and Creusot verifier without errors.",
    385       "evidence": "Sec. 4.2 (RQ3): 'We ran all programs synthesized for the annotated benchmarks through the Rust compiler and a supported subset thereof through the Creusot verifier; neither tool uncovered any errors.'",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Rust's type system reduces the synthesis search space, enabling synthesis from type signatures alone in non-trivial cases.",
    390       "evidence": "Sec. 2.1 demonstrates singleton and 100-Crates examples synthesized without functional annotations. Sec. 4.2 explains that RusSOL needs no unfolding depth bounds unlike SuSLik, attributed to type-guided search.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "RusSOL solves 1328 of 1635 non-primitive type signatures from the top 100 Rust crates, with all solutions verified as well-typed by the Rust compiler.",
    395       "evidence": "Table 1 reports 100-Crates results. Three categorized failure reasons are given in Sec. 4.2: need for Unreachable without preconditions, need for unsafe code, and non-constructible return types.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Self-evaluation bias",
    402       "detail": "RusSOL is compared primarily against SuSLik, which was created by three of the five same authors (Itzhaky, Polikarpova, Sergey). The implementation of SuSLik used for comparison, the benchmark selection, and the evaluation methodology are all controlled by overlapping author groups. No independent evaluation is provided."
    403     },
    404     {
    405       "flag": "Author-assembled primary benchmark",
    406       "detail": "The main Rust benchmark suite (51 tasks) was assembled by the authors themselves, including tasks 'written by us.' While the 100-Crates and Verifier suites partially mitigate selection bias, the primary evaluation suite could favor tasks that RusSOL handles well."
    407     }
    408   ],
    409   "cited_papers": [
    410     {
    411       "title": "SyRust: Automatic Testing of Rust Libraries with Semantic-Aware Program Synthesis",
    412       "authors": ["Yoshiki Takashima", "Ruben Martins", "Limin Jia", "Corina S. Păsăreanu"],
    413       "year": 2021,
    414       "doi": "10.1145/3453483.3454084",
    415       "relevance": "Only other Rust program synthesizer; generates code snippets to test unsafe libraries, representing an alternative approach to automated Rust code generation."
    416     },
    417     {
    418       "title": "Structuring the Synthesis of Heap-Manipulating Programs",
    419       "authors": ["Nadia Polikarpova", "Ilya Sergey"],
    420       "year": 2019,
    421       "doi": "10.1145/3290385",
    422       "relevance": "Foundation for deductive program synthesis from separation logic specifications; the SuSLik framework that RusSOL extends for automatic code generation."
    423     },
    424     {
    425       "title": "Leveraging Rust Types for Modular Specification and Verification",
    426       "authors": ["Vytautas Astrauskas", "Peter Müller", "Federico Poli", "Alexander J. Summers"],
    427       "year": 2019,
    428       "doi": "10.1145/3360573",
    429       "relevance": "Prusti verification tool for Rust; establishes the connection between Rust types and separation logic that enables automated code safety guarantees."
    430     },
    431     {
    432       "title": "Verus: Verifying Rust Programs using Linear Ghost Types",
    433       "authors": ["Andrea Lattuada", "Travis Hance", "Chanhee Cho", "Matthias Brun", "Isitha Subasinghe", "Yi Zhou", "Jon Howell", "Bryan Parno", "Chris Hawblitzel"],
    434       "year": 2023,
    435       "relevance": "Recent Rust verification tool using linear ghost types; represents the expanding ecosystem of formal methods for safe systems code generation."
    436     },
    437     {
    438       "title": "Cyclic Program Synthesis",
    439       "authors": ["Shachar Itzhaky", "Hila Peleg", "Nadia Polikarpova", "Reuben N. S. Rowe", "Ilya Sergey"],
    440       "year": 2021,
    441       "doi": "10.1145/3453483.3454087",
    442       "relevance": "Technique for synthesizing recursive and mutually-recursive programs, enabling automatic generation of complex code structures without user-provided recursion schemes."
    443     }
    444   ]
    445 }

Impressum · Datenschutz