scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29892B)
      1 {
      2   "paper": {
      3     "title": "Proving the Coding Interview: A Benchmark for Formally Verified Code Generation",
      4     "authors": [
      5       "Quinn Dougherty",
      6       "Ronak Mehta"
      7     ],
      8     "year": 2025,
      9     "venue": "2025 IEEE/ACM International Workshop on Large Language Models for Code (LLM4Code)",
     10     "arxiv_id": "2502.05714",
     11     "doi": "10.1109/LLM4Code66737.2025.00014"
     12   },
     13   "scan_version": 3,
     14   "active_modules": [
     15     "experimental_rigor",
     16     "data_leakage"
     17   ],
     18   "methodology_tags": [
     19     "benchmark-eval"
     20   ],
     21   "key_findings": "FVAPPS is the largest formal verification benchmark with 4,715 Lean 4 samples (1,083 curated), derived from the APPS programming benchmark. On 406 theorems from 101 randomly sampled problems, Claude 3.5 Sonnet proved 30% and Gemini 1.5 Pro proved 18%, with most successes being trivial or syntactic rather than deeply nontrivial proofs. A human baseliner spent 10 hours on a single problem and completed the definition but no proofs, suggesting formal verification remains extremely challenging for both humans and current LLMs.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper states 'All code and prompts used for the generation can be found at our open-source GitHub repository' (footnote 7: https://github.com/quinn-dougherty/fvapps)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The benchmark dataset is released on HuggingFace at https://huggingface.co/datasets/quinn-dougherty/fvapps, mentioned multiple times including in the abstract."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper specifies Lean 4.12.0 with commit hash and Mathlib commit (809c3fb3b5c8f5d7dace56e200b426187516535a), but does not provide comprehensive environment specifications for the Python side of the pipeline (pytest, Python version, other dependencies). No requirements.txt or Dockerfile is mentioned."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper describes the pipeline at a high level (Section II.B) and references the GitHub repo, but does not include step-by-step reproduction instructions in the paper itself. The pipeline stages are described narratively but without specific commands or scripts to run."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Main results are reported as point estimates only: '30%' and '18.5%' theorem proving rates with no confidence intervals, error bars, or uncertainty measures."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper compares Sonnet (30%) vs Gemini (18.5%) without any statistical significance tests. The claim that Sonnet outperforms Gemini rests solely on comparing two raw percentages."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Raw counts and percentages are given with full context: 'Sonnet accomplished 121 (30%) and Gemini accomplished 74 (18.5%)' out of 406 theorems, and Table II breaks down by quality level. The reader can compute effect sizes from these numbers."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper evaluates 101 randomly sampled problems from 4,715 but does not justify why 101 was chosen or discuss whether this sample size provides sufficient statistical power."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Results are from single experimental runs. No variance, standard deviation, or spread measures are reported across multiple runs or seeds."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Two model baselines (Claude 3.5 Sonnet, Gemini 1.5 Pro) and one human baseline are included. They also attempted open-source models (DeepSeek-Prover-V1.5-RL) which could not make progress."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Claude 3.5 Sonnet (October 2024) and Gemini 1.5 Pro (November 2024) were contemporary frontier models at the time of evaluation."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The scaffolding loop has multiple design decisions (one-theorem-at-a-time approach, max iterations, prompt engineering choices) but no ablation study systematically varies these components. The paper notes they moved to one-at-a-time solving but doesn't report controlled comparisons."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Multiple metrics are reported: definition completion rate, theorem proving rate, qualitative categorization of proof types (Figure 6), number of iterations to convergence (Figures 7-8), and breakdown by quality assurance level (Table II)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Qualitative human assessment of theorem proofs is provided in Figure 6 and Table III, categorizing solutions into buckets like 'Actually nontrivial', 'Bug', 'Too trivial', 'Easy branch of if', etc."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The 101 problems were randomly sampled from the full 4,715-sample benchmark. The benchmark was not used for any model tuning or development decisions."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by quality assurance level (Unguarded/Guarded/Guarded and Plausible) in Table II, and by qualitative proof category in Figure 6 and Table III."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section III.E discusses Sonnet's reliance on imperative programming with Id.run do that doesn't lend itself to proofs. Section III.B describes the human's stack overflow issues. Section III.A notes open-source models couldn't make progress."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Multiple negative results: DeepSeek-Prover failed entirely (Section III.A), the human baseline couldn't complete any proofs in 10 hours (Section III.B), Sonnet diverged after 100 loops on sample 23's second theorem (Section III.C)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of 4,715 samples, 1,083 curated, 30% Sonnet and 18% Gemini theorem proving rates are all supported by the results. Minor discrepancy: abstract says '100 randomly selected samples' while body says 101, but this is a rounding issue."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "Section III.A.1 states 'We found significantly higher success rates by following a number of key insights' regarding prompt engineering, but this claim is based on informal observation without controlled comparison or ablation."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Claims are largely bounded to Lean 4 and the specific models tested. The paper frames results as preliminary baselines on their specific benchmark rather than making broad generalizations about LLM verification capabilities."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not discuss alternative explanations for the low proving rates. Could the low success be due to the quality of the benchmark's theorem statements, the scaffolding design, the prompt quality, or Lean 4's evolving API? These alternatives are not considered."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures sorry-completion rates and reports them as such ('correctly proves 30%'). Claims match the granularity of measurements. The qualitative categorization in Figure 6 explicitly distinguishes trivial proofs from nontrivial ones, avoiding overclaiming."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Claude is specified precisely as 'claude-3-5-sonnet-20241022'. However, Gemini is specified only as 'Gemini 1.5 Pro' with footnote 'Retrieved in early November 2024' — no API version or snapshot date. Per schema rules, marketing names without a snapshot date do not count."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The paper states 'All code and prompts used for the generation can be found at our open-source GitHub repository' (footnote 7). The prompting strategy is described in Section III.A.1 and full prompts are available in the repository."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No mention of temperature, top-p, max tokens, or other sampling parameters for any of the LLM API calls. Only the maximum number of scaffolding loop iterations is reported."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The scaffolding loop is well-described in Section II.B with Figure 3, including error feedback mechanism, max iterations per stage (5 for stages 1-3, 10 for stage 4, 25/50 for baseline), and the one-theorem-at-a-time strategy."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The 5-stage pipeline is described in detail in Section II.B: preprocessing APPS, generating property-based tests with pytest, converting to Lean, adding unit test quality assurance, and Plausible filtering. Sample counts at each stage are given (4715, 2089, 1083)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section V (Discussion) opens with 'Our current presentation has a few limitations, and clear directions for future work' and discusses multiple specific limitations over two paragraphs."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section V discusses specific threats: theorem statements may not correspond to desirable properties, theorems may not cover the full range of ideal properties, particular theorems may be vacuous, and the gap between imperative solutions and proof-amenable structures."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show. While limitations are discussed, there are no explicit scope boundaries like 'our results do not generalize to X' or 'we do not claim Y'. The discussion of limitations is about potential quality issues, not about claims the authors are not making."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The full dataset of 4,715 Lean 4 files is publicly available on HuggingFace with quality assurance tagging. All baseline code is available on GitHub."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section II describes starting from the APPS dataset (scraped from CodeForces and LeetCode), then details the 5-stage pipeline for converting Python problems to Lean 4 theorem specifications."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper includes a human baseline ('a baseliner' who spent 10 hours) but provides no information about who this person was, how they were selected, or their qualifications beyond being able to write Lean."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline from APPS to final benchmark is documented across 5 stages with Table I showing iteration counts, and final sample counts at each quality level (4715 → 2089 → 1083)."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Acknowledgments section states: 'We are also grateful to the Anthropic External Researcher program for providing API credits for benchmark generation, and to FAR.ai for providing operations and administrative support.'"
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are listed: Quinn Dougherty at Beneficial AI Foundation, Ronak Mehta as Independent."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Anthropic provided API credits for benchmark generation, and Anthropic's Claude 3.5 Sonnet is one of the two models evaluated. Anthropic has a financial interest in their model performing well on benchmarks."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is provided in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper states model access dates (October/November 2024) but does not state the training data cutoff dates for either Claude 3.5 Sonnet or Gemini 1.5 Pro."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "APPS problems are derived from CodeForces and LeetCode, which are heavily represented in LLM training data. The paper does not discuss whether the evaluated models may have seen the original problems or their solutions during training."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "APPS was published in 2021 and its source problems are from public competitive programming platforms. Both evaluated models were trained well after 2021 and could have seen these problems. This contamination risk is not discussed, despite the Lean translation preserving the underlying algorithmic problems."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "This is primarily a benchmark paper. The single human baseliner is an informal data point, not a human subjects study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study was conducted. The single human baseline is an informal demonstration, not a research study requiring ethics review."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study. The paper mentions a single 'baseliner' without characterization, but this is not a human study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study was conducted."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study was conducted."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human subjects study was conducted."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human subjects study was conducted."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No API costs, tokens consumed, or wall-clock time are reported for either the benchmark generation pipeline or the baseline evaluation experiments, despite both involving substantial LLM API usage."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget is stated. The paper does not report GPU hours, total API spend, or the aggregate cost of running the benchmark generation or baseline evaluations."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "All results appear to be from single experimental runs. No analysis of sensitivity to random seeds or LLM sampling stochasticity is reported."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of scaffolding loop iterations is stated (25 for defs, 50 per theorem), but the number of independent experimental runs is not stated. Results appear to be from a single run."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Prompt engineering was guided by insights from Victor Taelin (Section III.A.1) but no systematic hyperparameter search was conducted or budgeted. The paper does not report how many prompt variants were tried."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper describes arriving at prompt design choices through informal iteration ('We found significantly higher success rates by following a number of key insights') but does not justify the selection through systematic comparison or validation."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Claude 3.5 Sonnet was used to GENERATE the benchmark (all 5 pipeline stages) and is then evaluated on it. This creates a potential bias where the benchmark may be systematically easier or harder for the generating model. This bias is not acknowledged or discussed."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Both models are given the same iteration budget (25/50 loops) but actual compute (tokens, cost) is not reported or compared. Sonnet and Gemini may have very different token costs per iteration, making the budget effectively unequal."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Section V briefly mentions that theorem statements 'may not correspond to desirable properties' but does not substantively analyze whether the benchmark measures formal verification capability as claimed. No comparison with alternative benchmarks or analysis of what skills the benchmark actually tests."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "Both Sonnet and Gemini use the same scaffolding loop with the same iteration limits ('Both Sonnet and Gemini ran for a maximum of 25 loops for the defs and a max of 50 loops for each theorem'), controlling for scaffold effects."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "APPS was published in 2021 from problems scraped from CodeForces and LeetCode. Models evaluated in late 2024 could have trained on these problems. The temporal leakage risk is not discussed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The scaffolding provides the English-language problem description from APPS alongside the Lean file. If models have seen APPS problems and solutions during training, the English description could leak algorithmic solutions. This is not discussed."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether APPS problems (from CodeForces/LeetCode) or similar problems appeared in the models' training data. The independence of test problems from training data is not verified."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination is performed."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "FVAPPS is the largest formal verification benchmark with 4,715 samples including 1,083 curated and quality-controlled samples.",
    373       "evidence": "Section II.C reports 4,715 successful samples (Unguarded), 2,089 Guarded, 1,083 Guarded and Plausible. Section IV.A compares to prior benchmarks: Clover and dafny-synthesis ~100 programs each, DafnyBench 782.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Claude 3.5 Sonnet correctly proves 30% and Gemini 1.5 Pro correctly proves 18.5% of theorems on 101 randomly selected samples.",
    378       "evidence": "Section III.D: 'Of the 406 theorems attempted, Sonnet accomplished 121 (30%) and Gemini accomplished 74 (18.5%).' Table II provides breakdown by quality level.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Higher-quality (guarded and plausible) samples are somewhat more likely to be solvable by current models.",
    383       "evidence": "Table II shows proportions: Unguarded has 41/69 Sonnet successes (59%), Guarded 12/18 (67%), Guarded and Plausible 7/14 (50%). The paper states 'we roughly find that our higher-quality samples...are somewhat more likely to be solvable.'",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "A full human baseline is not feasible for this benchmark due to the extreme difficulty of formal verification.",
    388       "evidence": "Section III.B: A human spent 10 hours on sample 0023, completed the function definition but no proofs, encountering stack overflow issues.",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "Sonnet's reliance on imperative programming (Id.run do) creates a tension between functional solutions and proof-amenable structures.",
    393       "evidence": "Section III.E observes that 'Sonnet solved many problems via imperative programming with the identity monad (Id.run do)' and notes this 'fails to capture inductive structure that lends itself to proofs.'",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Generator-evaluator conflation",
    400       "detail": "Claude 3.5 Sonnet was used to generate all 4,715 benchmark samples and is then evaluated on the same benchmark. This creates potential bias: the benchmark may be systematically shaped by Sonnet's capabilities and failure modes."
    401     },
    402     {
    403       "flag": "No contamination analysis",
    404       "detail": "APPS problems are derived from CodeForces and LeetCode, which are extensively represented in LLM training corpora. The evaluated models (trained after 2021) may have memorized the underlying problems and solutions, but this contamination risk is completely unaddressed."
    405     },
    406     {
    407       "flag": "Single-run results with no uncertainty",
    408       "detail": "All baseline results come from single experimental runs with no error bars, confidence intervals, or repeated trials. LLM outputs are stochastic, so results could vary substantially across runs."
    409     },
    410     {
    411       "flag": "Tiny human baseline (n=1)",
    412       "detail": "The human baseline consists of a single unnamed person attempting a single problem. This is presented alongside model results as if it provides a meaningful comparison, but n=1 cannot support any conclusions about human performance."
    413     },
    414     {
    415       "flag": "Non-independent funder",
    416       "detail": "Anthropic provided API credits for benchmark generation, and Anthropic's Claude 3.5 Sonnet is one of two evaluated models, scoring highest. While the conflict is disclosed in acknowledgments, it is not highlighted as a potential bias."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Measuring Coding Challenge Competence With APPS",
    422       "authors": ["Dan Hendrycks"],
    423       "year": 2021,
    424       "relevance": "Source dataset for FVAPPS; major benchmark for code generation evaluation."
    425     },
    426     {
    427       "title": "Evaluating large language models trained on code",
    428       "authors": ["Mark Chen"],
    429       "year": 2021,
    430       "arxiv_id": "2107.03374",
    431       "relevance": "Codex paper introducing HumanEval; foundational LLM code generation benchmark."
    432     },
    433     {
    434       "title": "Swe-bench: Can language models resolve real-world github issues?",
    435       "authors": ["Carlos E Jimenez"],
    436       "year": 2023,
    437       "arxiv_id": "2310.06770",
    438       "relevance": "Major benchmark for autonomous software engineering agents."
    439     },
    440     {
    441       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    442       "authors": ["Naman Jain"],
    443       "year": 2024,
    444       "relevance": "Contamination-free code evaluation benchmark addressing temporal leakage."
    445     },
    446     {
    447       "title": "DeepSeek-Prover-V1.5: Harnessing Proof Assistant Feedback for Reinforcement Learning and Monte-Carlo Tree Search",
    448       "authors": ["Huajian Xin"],
    449       "year": 2024,
    450       "arxiv_id": "2408.08152",
    451       "relevance": "State-of-the-art model for formal theorem proving, attempted as baseline in this paper."
    452     },
    453     {
    454       "title": "DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data",
    455       "authors": ["Huajian Xin"],
    456       "year": 2024,
    457       "arxiv_id": "2405.14333",
    458       "relevance": "Formal theorem proving model using synthetic data for training."
    459     },
    460     {
    461       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    462       "authors": ["Xingyao Wang"],
    463       "year": 2024,
    464       "arxiv_id": "2407.16741",
    465       "relevance": "Autonomous software engineering agent platform representative of the agentic coding trend."
    466     },
    467     {
    468       "title": "Code Llama: Open Foundation Models for Code",
    469       "authors": ["Baptiste Rozière"],
    470       "year": 2023,
    471       "relevance": "Open-source code generation foundation model relevant to LLM programming capabilities."
    472     },
    473     {
    474       "title": "Clover: Closed-loop verifiable code generation",
    475       "authors": ["Chuyue Sun"],
    476       "year": 2024,
    477       "relevance": "Directly comparable formal verification benchmark (~100 programs); FVAPPS is positioned as much larger."
    478     },
    479     {
    480       "title": "DafnyBench: A Benchmark for Formal Software Verification",
    481       "authors": ["Chloe Loughridge"],
    482       "year": 2024,
    483       "arxiv_id": "2406.08467",
    484       "relevance": "Formal software verification benchmark (782 programs in Dafny); direct comparison point for FVAPPS."
    485     },
    486     {
    487       "title": "AutoVerus: Automated Proof Generation for Rust Code",
    488       "authors": ["Chenyuan Yang"],
    489       "year": 2024,
    490       "arxiv_id": "2409.13082",
    491       "relevance": "Automated formal verification for Rust, bridging the gap between code generation and formal proofs."
    492     },
    493     {
    494       "title": "Program Synthesis with Large Language Models",
    495       "authors": ["Jacob Austin"],
    496       "year": 2021,
    497       "arxiv_id": "2108.07732",
    498       "relevance": "Early work on LLM-based program synthesis foundational to the code generation field."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 1,
    504       "justification": "The benchmark is useful for formal verification researchers but not immediately applicable to typical software engineering practice."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that LLMs struggle with formal proofs is expected; the paper confirms rather than challenges conventional wisdom."
    509     },
    510     "fear_safety": {
    511       "score": 1,
    512       "justification": "The introduction motivates the work with safety-critical applications (medical, military, autonomous vehicles) but the results themselves are about capability gaps rather than novel risks."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or conflict; straightforward benchmark introduction and baseline evaluation."
    517     },
    518     "demo_ability": {
    519       "score": 2,
    520       "justification": "Dataset is on HuggingFace and code on GitHub; researchers can download and try it immediately, though it requires Lean 4 expertise."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Evaluates Claude and Gemini (known models) but the paper is from a small independent foundation, not a major lab."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs