ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (17975B)


      1 {
      2   "paper": {
      3     "title": "Position: Vibe Coding Needs Vibe Reasoning: Improving Vibe Coding with Formal Verification",
      4     "authors": ["Jacqueline Mitchell", "Yasser Shaaban"],
      5     "year": 2025,
      6     "venue": "Proceedings of the 1st ACM SIGPLAN International Workshop on Language Models and Programming Languages (LMPL '25)",
      7     "arxiv_id": "2511.00202",
      8     "doi": "10.1145/3759425.3763390"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical", "case-study"],
     13   "key_findings": "This position paper argues that vibe coding suffers from constraint-reconciliation decay because LLMs cannot maintain global consistency across long iterative sessions. The authors propose 'Vibe Reasoning' (Type III systems): a side-car architecture that autoformalizes specifications, continuously verifies them, and feeds the verification results back to the LLM. A proof-of-concept TypeScript implementation demonstrates exhaustive switch handling and discriminated union enforcement via Claude Code hooks.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code archive is provided. The proof-of-concept is described but not released."
     20       },
     21       "data_released": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a position paper with a small proof-of-concept; there is no dataset to release."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The proof-of-concept is described as written in TypeScript and integrated with Claude Code, but no environment specifications, dependency versions, or setup details are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided for the proof-of-concept implementation."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Position paper with no quantitative experiments; no numerical results to attach uncertainty measures to."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative quantitative claims are made; this is a theoretical argument with a qualitative proof-of-concept."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative experiments are run; no effect sizes to report."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No empirical study with a sample is conducted."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No repeated experiments are run."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "The paper proposes a conceptual architecture; the proof-of-concept is a qualitative demonstration, not a comparative evaluation."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No baselines are applicable since there is no quantitative evaluation."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The proof-of-concept is a single integrated system; no ablation is applicable to a position paper."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No metrics are reported; the paper is a position/theoretical contribution."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs are evaluated; human evaluation is not relevant to a position paper's claims."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No benchmark evaluation is conducted."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No quantitative results to break down."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 2 extensively discusses failure modes of vibe coding: constraint inconsistency, partial-propagation bugs, state-machine divergence, and duplication debt, with concrete examples (Listing 1 bugs)."
    104       },
    105       "negative_results_reported": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "No experiments are run from which negative results could be reported."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims that vibe coding has limitations (supported by cited developer reports and benchmarks in Sections 1-2), that formal methods can mitigate them (argued in Section 3), and proposes a side-car system (described in Section 4). Claims are appropriately hedged with phrases such as 'we argue' and 'we posit'."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims LLMs' 'inability to reconcile accumulating human-imposed constraints' causes vibe coding pitfalls, citing Kambhampati et al. and benchmarks as support. However, the causal link between constraint-reconciliation decay and the specific reported problems is argued by analogy, not demonstrated empirically."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes broad claims about vibe coding generally but the proof-of-concept covers only TypeScript type-level checks (4 patterns). The title and framing suggest general applicability of formal verification to vibe coding without bounding claims to the narrow domain demonstrated."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not consider alternative explanations for vibe coding failures (e.g., insufficient context windows, poor prompting practices, or that testing alone may suffice). It argues formal methods are needed without substantively considering why lighter-weight alternatives might address the same problems."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No measurements are taken; this is a theoretical position paper."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The proof-of-concept uses 'Claude Code' (Section 5) but no specific model version or snapshot date is provided."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The proof-of-concept uses hooks rather than prompting; no prompts are part of the experimental setup."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No LLM hyperparameters are relevant to the proof-of-concept, which uses syntactic checks and compilation."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 5 describes the side-car architecture: it integrates with Claude Code via hooks on code changes, uses LLM-based analysis for autoformalization of templates (1)-(4), and uses syntactic checks + compilation for verification. The overall architecture is diagrammed in Figure 3."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No data is collected or preprocessed."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The paper acknowledges challenges within the proposal (Sections 4.1-4.3) but does not discuss limitations of its own arguments or proof-of-concept."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed for the position or the proof-of-concept."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly bound what its proposal does NOT cover. The proof-of-concept is limited to TypeScript type-level patterns but this is not framed as a limitation — the paper implies broad applicability."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No data is collected; this is a position paper."
    187       },
    188       "data_collection_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No data collection is performed."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No participants or data samples are recruited."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No data pipeline exists."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Jacqueline Mitchell at University of Southern California, Yasser Shaaban at Workato."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Funding is not disclosed, so independence cannot be assessed. One author works at Workato (a software company), creating potential interest in tools that improve vibe coding, but this is not discussed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is conducted."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is conducted."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Position paper; cost reporting is not applicable."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Position paper; no significant computation is performed."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "LLMs suffer from constraint-reconciliation decay: as vibe coding sessions lengthen and dependencies accumulate, success rates fall.",
    296       "evidence": "Cited multi-turn coding benchmarks [44], repository-level evaluations [10, 22], SWE-bench static vs. dynamic comparison [24, 43], and memory benchmarks [17, 39] in Section 2.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Formal methods can mitigate vibe coding pitfalls by autoformalizing specifications, continuously verifying, and providing actionable feedback.",
    301       "evidence": "Argued theoretically in Sections 3-4, citing existing Type I and Type II systems and proposing Type III. No empirical evidence of improvement is provided.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "The proof-of-concept Type III side-car can detect and fix bugs like missing switch cases and state-machine divergence in TypeScript vibe coding.",
    306       "evidence": "Section 5 describes a working prototype integrated with Claude Code that handles the toy example in Listing 1, detecting both bugs (missing 'shipped' case, missing 'cancelled' action).",
    307       "supported": "weak"
    308     }
    309   ],
    310   "red_flags": [
    311     {
    312       "flag": "Claims significantly outrun evidence",
    313       "detail": "The paper proposes a comprehensive 4-component architecture (autoformalization, verification, integration, developer collaboration) but only demonstrates a narrow proof-of-concept covering 4 TypeScript syntactic patterns. The gap between the vision and the evidence is large."
    314     },
    315     {
    316       "flag": "No quantitative evaluation",
    317       "detail": "The proof-of-concept is demonstrated on a single toy example (Listing 1). No benchmarks, no user studies, no metrics of any kind are reported for the proposed system."
    318     },
    319     {
    320       "flag": "Potential industry affiliation bias",
    321       "detail": "One author works at Workato (a software automation company). The paper advocates for tooling that could benefit such companies, but no conflict of interest is disclosed."
    322     }
    323   ],
    324   "cited_papers": [
    325     {
    326       "title": "SecRepoBench: Benchmarking LLMs for Secure Code Generation in Real-World Repositories",
    327       "authors": ["Connor Dilgren", "Purva Chiniya", "Luke Griffith", "Yu Ding", "Yizheng Chen"],
    328       "year": 2025,
    329       "arxiv_id": "2504.21205",
    330       "relevance": "Repository-level evaluation showing that the strongest LLM generates secure and correct code only 28% of the time."
    331     },
    332     {
    333       "title": "Helping LLMs Improve Code Generation Using Feedback from Testing and Static Analysis",
    334       "authors": ["Greta Dolcetti", "Vincenzo Arceri"],
    335       "year": 2024,
    336       "arxiv_id": "2412.14841",
    337       "relevance": "Combines formal methods feedback with LLM code generation."
    338     },
    339     {
    340       "title": "Position: LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks",
    341       "authors": ["Subbarao Kambhampati"],
    342       "year": 2024,
    343       "relevance": "Foundational argument that LLMs optimize for local plausibility rather than combinatorial planning."
    344     },
    345     {
    346       "title": "CodeFlowBench: A Multi-turn, Iterative Benchmark for Complex Code Generation",
    347       "authors": ["Zhenyu Zhang"],
    348       "year": 2025,
    349       "arxiv_id": "2504.21751",
    350       "relevance": "Multi-turn coding benchmark showing steep performance decline vs single-turn tasks."
    351     },
    352     {
    353       "title": "RepoExec: Evaluate Code Generation with a Repository-Level Executable Benchmark",
    354       "authors": ["Vu Le Hai"],
    355       "year": 2024,
    356       "arxiv_id": "2406.11927",
    357       "relevance": "Repository-level evaluation revealing cross-file integration and dependency breaks."
    358     },
    359     {
    360       "title": "Dissecting the SWE-Bench Leaderboards: Profiling Submitters and Architectures of LLM- and Agent-Based Repair Systems",
    361       "authors": ["Matias Martinez", "Xavier Franch"],
    362       "year": 2025,
    363       "arxiv_id": "2506.17208",
    364       "relevance": "Analysis of SWE-bench showing static benchmark success doesn't transfer to dynamic tasks."
    365     },
    366     {
    367       "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair",
    368       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"],
    369       "year": 2023,
    370       "doi": "10.1145/3611643.3616271",
    371       "relevance": "Type II system combining LLMs with completion engines for program repair."
    372     },
    373     {
    374       "title": "Hidden Technical Debt in Machine Learning Systems",
    375       "authors": ["D. Sculley"],
    376       "year": 2015,
    377       "relevance": "Foundational work on ML technical debt patterns (entanglement, feedback loops) that vibe coding amplifies."
    378     },
    379     {
    380       "title": "Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI",
    381       "authors": ["Ranjan Sapkota"],
    382       "year": 2025,
    383       "arxiv_id": "2505.19443",
    384       "relevance": "Defines and compares vibe coding and agentic coding workflows."
    385     },
    386     {
    387       "title": "MemBench: Towards More Comprehensive Evaluation on the Memory of LLM-based Agents",
    388       "authors": ["Mengchen Li"],
    389       "year": 2025,
    390       "arxiv_id": "2506.21605",
    391       "relevance": "Benchmark showing LLM long-term recall is unreliable and degrades under complexity."
    392     }
    393   ]
    394 }

Impressum · Datenschutz