ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (24725B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Leveraging Rust Types for Program Synthesis",
      6     "authors": [
      7       "Jonáš Fiala",
      8       "Shachar Itzhaky",
      9       "Peter Müller",
     10       "Nadia Polikarpova",
     11       "Ilya Sergey"
     12     ],
     13     "year": 2023,
     14     "venue": "Proceedings of the ACM on Programming Languages (PLDI)",
     15     "arxiv_id": null,
     16     "doi": "10.1145/3591278"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims are backed by Section 4 results: 115/117 tasks solved supports 'synthesizing non-trivial programs', 27% shorter annotations supports 'simpler specifications', and synthesis times in seconds support 'improves performance'.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The causal claim that Rust types reduce search space is supported both theoretically (SOL restricts derivable programs to type-correct ones) and empirically via the 172% higher code/spec ratio vs SuSLik on shared benchmarks, with the mechanism well-argued.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 4.3 explicitly bounds scope to safe Rust without traits, conditionals requiring branch abduction, non-trivial arithmetic, or unsafe code; claims are restricted to these boundaries throughout.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper acknowledges SuSLik's 'unfair advantage' from user-provided unfolding bounds, but does not systematically consider whether RusSOL's results might reflect implementation effort, benchmark selection, or other confounds rather than the Rust-type-system hypothesis.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Claims are framed as annotation overhead (measured by AST node count ratio), synthesis success rate, and synthesis time — all directly measured; no conflation of proxy metrics with broader productivity or correctness claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 4.3 'Discussion' includes a dedicated limitations subsection listing four specific limitations: traits, conditionals/branch abduction, arithmetic synthesis, and unsafe code.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "There is no threats-to-validity section; the discussion addresses tool capability limits but not experimental validity concerns such as benchmark selection bias (benchmarks partly written by authors), timing measurement methodology, or generalizability of the benchmark suite.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 4.3 explicitly states RusSOL does not handle traits, branch abduction for conditionals, non-trivial arithmetic, or unsafe code; Section 4.1 documents the explicit exclusion criteria for benchmark tasks.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed in the Acknowledgments: NSF Grant No. 1911149 and Singapore MoE Tier 3 grant MOE-MOET32021-0001.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five author affiliations are disclosed on the title page: ETH Zurich, Technion, ETH Zurich, UC San Diego, and National University of Singapore.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "NSF and Singapore MoE are government funding agencies with no commercial stake in the RusSOL synthesis tool outcome.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests or financial interests declaration anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Program synthesis, deductive synthesis, Synthetic Ownership Logic, symbolic heaps, snapshots, prophecy variables, and Rust ownership/borrowing concepts are all defined when introduced.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The Contributions section (end of Section 1) explicitly lists three contributions: SOL (the logic), RusSOL (the tool), and the evaluation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5 (Related Work) systematically situates RusSOL against SuSLik/SSL, SyRust, Prusti, Creusot, Aeneas, Oxide, RustBelt, and RustHornBelt, explaining what each does/doesn't capture and how SOL differs.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The Data Availability section states the artifact with RusSOL source code and build scripts is available at https://doi.org/10.5281/zenodo.7811786.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The artifact contains the corpus of benchmark case studies; benchmark sources (StackOverflow, Prusti/Creusot test suites, crates.io) are publicly available, and the artifact includes reproduction materials.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper only states the tool is 'implemented in Scala' and experiments run on 'a consumer-grade laptop with Intel i7 CPU and 16GB RAM'; no Dockerfile, dependency manifest, or JVM/Scala version is specified in the paper text.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The Data Availability section explicitly states the artifact contains 'a README file in markdown that provides detailed step-by-step instructions for running RusSOL and the experiments'.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Table 1 reports means for synthesis time, rule counts, and LOC, but no confidence intervals, standard deviations, or error bars are reported for any metric.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The comparison of RusSOL vs SuSLik synthesis times (3.1s vs 2.2s on shared benchmarks) is presented as a raw average with no statistical significance test.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported: annotations are 27% more concise than synthesized code, and RusSOL's code/spec ratio is 172% higher than SuSLik's on shared benchmarks, both with baseline context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 117 annotated benchmarks are described by their sources but no power analysis or justification for why this count is sufficient to support the generalization claims is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Only mean synthesis times are reported in Table 1; no standard deviation, interquartile range, or other variance measure is provided.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "SuSLik is used as the primary baseline for 19 shared benchmarks, and Prusti/Creusot test suites serve as a correctness validation baseline.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "SuSLik (Itzhaky et al. 2021) is the current state-of-the-art deductive heap-manipulating synthesizer and an appropriate contemporary comparison for this work.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study is performed to isolate the contribution of individual SOL components (e.g., prophecy variables, blocking sets, search phasing) to annotation reduction or synthesis performance.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table 1 reports five metrics: number of solved tasks, mean synthesis time, mean SOL rules applied, mean synthesized LOC, and code/spec ratio.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "This is a formal program synthesis tool; human evaluation of outputs is not relevant since correctness is verified mechanically by the Rust type checker and Creusot verifier.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is not a prediction/learning system; the evaluation is over a fixed benchmark suite rather than a train/test split.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 1 breaks results down by source (Rust/SuSLik/Verifier/100-Crates) and within Rust by subcategory (SLL tutorial, StackOverflow, Custom), and within SuSLik by data structure type.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 4.2 discusses all two annotated failures (one requiring unsafe code, one needing arithmetic synthesis) and categorizes the 307/1635 100-Crates failures into three root causes.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Failures are reported and explained honestly, including the inability to synthesize list-of-lists length (arithmetic) and the 18.7% failure rate on 100-Crates non-primitive tasks.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": false,
    237           "answer": false,
    238           "justification": "RusSOL is a rule-based symbolic synthesis tool, not an LLM; no model versions apply.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "No language model prompts are used; the system takes formal Rust type signatures and logical annotations as input.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "The search algorithm uses 'a heuristic function of the size of P and Q' for node cost, but the exact heuristic formula, timeout thresholds, and search tree parameters are not specified.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "There is no agentic scaffolding; RusSOL is a deterministic deductive synthesis engine.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.1 documents the extraction of 2671 signatures from top-100 crates.io and the exclusion criteria (unsupported types: slices, raw pointers, characters, floats).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The artifact (zenodo.7811786) contains the corpus of case studies used to reproduce experimental results, as stated in the Data Availability section.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.1 describes all four benchmark sources in detail: Rust (StackOverflow + tutorial + custom), SuSLik suite, Verifier test suites, and automated extraction from top-100 crates.io.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; benchmarks are software artifacts drawn from public sources.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from Rust type signature and annotations → SOL synthesis goal → deductive derivation → pretty-printed Rust output is formally specified in Sections 3.2–3.4 and illustrated in Figure 2.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "RusSOL is a rule-based deductive synthesizer with no learned components; training cutoff is not applicable.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "No machine learning model is evaluated; contamination between training and test data is not relevant.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "No LLM or learned model is evaluated on benchmarks; contamination is not applicable.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Synthesis times are reported throughout: 1.8s mean for annotated benchmarks, up to 28s for rose tree copy, 0.5s mean for 100-Crates non-primitive tasks.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Hardware is specified as 'a consumer-grade laptop with an Intel i7 CPU and 16GB RAM'; minimal but present.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "RusSOL synthesizes 115 out of 117 annotated benchmark tasks, covering a broad range of heap-manipulating Rust programs.",
    375       "evidence": "Table 1 reports results across Rust, SuSLik, and Verifier benchmark categories with 115/117 solved.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Annotations for RusSOL are on average 27% more concise than the synthesized code, reducing specification burden compared to traditional SL synthesizers.",
    380       "evidence": "Table 1 code/spec ratio of 1.4 average; the SuSLik comparison shows 172% higher ratio for RusSOL on shared benchmarks.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "RusSOL synthesizes programs in a matter of seconds (mean 1.8s), with performance comparable to SuSLik despite targeting a harder language.",
    385       "evidence": "Table 1 timing data; SuSLik comparison of 3.1s vs 2.2s on shared benchmarks with noted confound that SuSLik requires user-specified bounds.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "All synthesized programs are well-typed by construction; no errors were found by the Rust compiler across all solved benchmarks including 2671 100-Crates signatures.",
    390       "evidence": "Section 4.2 RQ3 states all programs passed the Rust compiler; a subset passed Creusot verification.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "RusSOL handles 1328 out of 1635 unannotated real-world function signatures from the top-100 crates.io, demonstrating robustness on real-world types.",
    395       "evidence": "Table 1 bottom rows; failures attributed to three specific causes (Unreachable without precondition, unsafe code needs, private-field return types).",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "SOL is the first program logic that captures both Rust's borrowing rules and functional correctness, enabling synthesis of well-typed Rust programs by construction.",
    400       "evidence": "Section 5 surveys prior logics (Prusti, Creusot, Aeneas, Oxide) and shows each lacks one of the two properties; no counterexample to the novelty claim.",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "theoretical"
    407   ],
    408   "key_findings": "RusSOL is the first tool to automatically synthesize safe Rust programs from type signatures and functional specifications, solving 115/117 annotated benchmarks with a mean synthesis time of 1.8 seconds. The core contribution is Synthetic Ownership Logic (SOL), which simultaneously enforces Rust's borrowing rules and functional correctness, enabling correct-by-construction synthesis. Rust's type system demonstrably reduces specification burden: annotations are 27% shorter than synthesized code on average, and the code/spec ratio is 172% higher than the SuSLik baseline on shared benchmarks. All synthesized programs pass the Rust type checker, and a subset were independently verified by Creusot.",
    409   "red_flags": [
    410     {
    411       "flag": "Author-constructed benchmarks",
    412       "detail": "39 of 51 Rust benchmark tasks were written by the paper's own authors, creating potential selection bias toward problems where the approach works well."
    413     },
    414     {
    415       "flag": "No variance on timing",
    416       "detail": "Only mean synthesis times are reported across all benchmark categories; no standard deviation or confidence intervals, making performance comparisons with SuSLik statistically weak."
    417     },
    418     {
    419       "flag": "SuSLik comparison confounded",
    420       "detail": "Authors acknowledge SuSLik has an 'unfair advantage' requiring user-specified bounds; without those bounds SuSLik solves only 14/19 benchmarks. The comparison is not fully controlled."
    421     },
    422     {
    423       "flag": "Manual intent verification",
    424       "detail": "Section 4.2 notes that whether synthesized programs 'match user intent' was checked via manual inspection by the authors, which is a subjective validation without independent replication."
    425     },
    426     {
    427       "flag": "No soundness proof",
    428       "detail": "Section 4.2 explicitly states 'no such formal semantics [for safe Rust] exists', so SOL soundness is validated only empirically via compiler/verifier checks, not by formal proof."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Structuring the Synthesis of Heap-Manipulating Programs (SuSLik)",
    434       "relevance": "Direct predecessor and baseline; RusSOL integrates SOL into SuSLik's proof search framework."
    435     },
    436     {
    437       "title": "Leveraging Rust Types for Modular Specification and Verification (Prusti)",
    438       "relevance": "Key prior work on Rust verification logic that RusSOL builds upon and distinguishes from."
    439     },
    440     {
    441       "title": "The CREUSOT Environment for the Deductive Verification of Rust Programs",
    442       "relevance": "Inspiration for the prophetic encoding of references; used as validation baseline."
    443     },
    444     {
    445       "title": "Aeneas: Rust Verification by Functional Translation",
    446       "relevance": "Prior work formalizing Rust borrowing; SOL's blocking sets are similar to Aeneas loan values."
    447     },
    448     {
    449       "title": "Oxide: The Essence of Rust",
    450       "relevance": "Formalizes Rust type rules but lacks functional reasoning; motivates the need for SOL."
    451     },
    452     {
    453       "title": "RustBelt: Securing the Foundations of the Rust Programming Language",
    454       "relevance": "Foundational Rust semantics work that targets unsafe Rust, contrasted with RusSOL's safe Rust focus."
    455     },
    456     {
    457       "title": "SyRust: Automatic Testing of Rust Libraries with Semantic-Aware Program Synthesis",
    458       "relevance": "The only other Rust synthesizer; contrasted to show RusSOL's greater scope (recursion, functional specs)."
    459     },
    460     {
    461       "title": "Cyclic Program Synthesis",
    462       "relevance": "Provides the cyclic proof mechanism RusSOL reuses from SuSLik for synthesizing recursive functions."
    463     },
    464     {
    465       "title": "Stacked Borrows: An Aliasing Model for Rust",
    466       "relevance": "Formalizes Rust's dynamic aliasing semantics; related to SOL's blocking set approach."
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 2,
    472       "justification": "Could assist Rust developers struggling with the borrow checker, but requires writing formal pre/postconditions which limits immediate practical adoption."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "Counterintuitive central finding: Rust's restrictive type system, widely seen as a barrier for programmers, actually simplifies program synthesis by reducing the specification burden."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "No AI safety concerns; this is a formal methods paper about program synthesis."
    481     },
    482     "drama_conflict": {
    483       "score": 0,
    484       "justification": "No controversy or conflict angle; the paper is technically focused with no disputed claims."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Artifact is publicly available at Zenodo with step-by-step instructions; someone could run the tool on their own Rust signatures."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "ETH Zurich, UCSD, and NUS are respected institutions but not famous AI labs; PLDI is a top PL venue but not widely followed outside the programming languages community."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [],
    497     "top_points": 0,
    498     "total_points": 0,
    499     "total_comments": 0
    500   }
    501 }

Impressum · Datenschutz