scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (37947B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Evaluation to Enhancement: Large Language Models for Zero-Knowledge Proof Code Generation",
      6     "authors": [
      7       "Zhantong Xue",
      8       "Pingchuan Ma",
      9       "Zhaoyu Wang",
     10       "Yuguang Zhou",
     11       "Xiaoqin Zhang",
     12       "Shuai Wang",
     13       "Juergen Rahmel"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2509.11708",
     18     "doi": "10.48550/arXiv.2509.11708"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract's key claims are supported: 'strong proficiency in language syntax' matches Finding ① (88.1% accuracy, §3.3), 'struggle when implementing and composing algebraic primitives' matches Findings ③-④ (§3.4), and '20.29% to 87.85% / 28.38% to 97.79%' matches Table 2 exactly.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper's main causal claims are about ZK-Coder's components causing improvement. The ablation study (§5.3) uses controlled single-variable manipulation — each variant removes exactly one component — which adequately supports claims like 'removing either component causes a significant performance drop (over 10%)' (§1).",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper explicitly bounds its scope to Circom and Noir (§2.1: 'In this work, we focus on Circom and Noir') and explains why. Claims are generally tied to specific languages and models. The title ('Zero-Knowledge Proof Code Generation') accurately reflects the scope.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "§7 discusses five specific threats to validity: LLM-generated question bias (①), test case limitations (②-③), prompting strategy influence (④), and security limitations (⑤). These address specific alternative explanations for the observed results.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper explicitly distinguishes between functional correctness (what it measures via test suites) and other ZK program qualities it does not claim: efficiency/circuit size (§7 'ZKP Program Efficiency'), security (§7 threat ⑤), and real-world deployment readiness. It also notes 'compilable alone is insufficient for assessing ZK program quality' (§3.5).",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "§7 'Discussion and Limitations' is a dedicated section covering ZKP program efficiency, LLM training/fine-tuning challenges, real-world representativeness, and five specific threats to validity (①-⑤).",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "§7 identifies five specific threats: ① LLM-aided question generation may introduce bias, ② hand-crafted test cases cannot prove correctness (mitigated by 86-87% mutation detection rates), ③ automated evaluation depends on test coverage, ④ prompting strategy may influence results, ⑤ generated code may have security vulnerabilities. These are specific to this study.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "§7 explicitly states what the results do NOT show: efficiency/circuit size optimization is not addressed ('ZKP Program Efficiency' paragraph), fine-tuning is not explored ('LLM Training and Fine-tuning'), security guarantees are not provided (threat ⑤). §2.1 explicitly bounds scope to Circom and Noir, excluding zkVMs.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section listing grants or sponsors is present.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed: Hong Kong University of Science and Technology, Zhejiang University of Technology, CipherInsight Limited, and HSBC. The dual affiliations with CipherInsight Limited (a ZK-related company) are transparent.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funder is disclosed. Two authors are affiliated with CipherInsight Limited (appears to be a ZK-related company) and one with HSBC. Without funding disclosure, independence cannot be assessed. The paper proposes ZK-Coder which could have commercial relevance to CipherInsight.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement appears in the paper. Two authors have dual affiliations with CipherInsight Limited (a company in the ZK space) but no declaration of financial interests is made.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "ZKPs are formally defined with completeness/soundness/zero-knowledge properties (§2.1), 'algebraic primitives' are defined and catalogued (§3.1.2), and ZKSL is formally specified via grammar (Fig. 8).",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three explicit contributions are enumerated at the end of §1: ZK-Eval (benchmark), ZK-Coder (agentic framework), and empirical evaluation showing substantial gains.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 6 situates the work against LLM-for-DSL-generation, code generation benchmarks, and ZKP software engineering tools, explaining how this work fills the gap of ZKP-specific benchmarking and augmentation.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The paper refers to a 'replication package' and 'Artifact' multiple times (§3.1.1, §13) but provides no repository URL or archive link in the paper text. 'Provided in the Artifact' is a reference to an intended release, not a working URL.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "ZK-Eval benchmark questions and filtered task IDs (Tables 9-10) are described, and the paper says the 'full set of questions is provided in the Artifact' (§3.1.1), but no URL to download the dataset is provided. They use public datasets (HumanEval, LiveCodeBench) but their contributed benchmark has no verifiable release link.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper states 'Linux (Ubuntu 22.04 LTS) server equipped with 256 GiB of RAM' (§5.1) but provides no requirements.txt, Dockerfile, library versions, or dependency specifications sufficient to recreate the environment.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are provided in the paper. §13 says prompts are 'available in the replication package' but no URL or README with commands is given.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All results (Tables 2-3, Figures 4-6, 9) report point estimates only. No confidence intervals, error bars, or ± notation appear despite using 10 samples per task to estimate Pass@1.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "The paper makes many comparative claims (e.g., 'ZK-Coder delivers a significant improvement', 'GPT-o3 leads') based solely on comparing numbers. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported anywhere.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The paper consistently reports improvements with baseline context, e.g., 'improving code generation success rates from 20.29% to 87.85% on Circom and from 28.38% to 97.79% on Noir' (§1), and Table 2 provides both ZK-Coder and baseline accuracy for each model/language combination.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No justification is provided for benchmark sizes: 68 tasks from HumanEval, 34 from LiveCodeBench, 35 algebraic primitives, 172/164 MCQ questions. No power analysis or discussion of whether these sizes are adequate for the claims made.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Despite using 10 samples per task (§3.2), no standard deviations, interquartile ranges, or spread measures are reported. Tables 2-3 and all figures show single point estimates only.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Table 2 compares ZK-Coder against a 'Simple Baseline' (direct generation with grammar summary and few-shot examples). Table 3 adds a 'Repairing Baseline' (baseline + interactive repair loop). §5.2 describes the baseline setup.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "The evaluation uses GPT-o4-mini, GPT-o3, DeepSeek-V3, and Qwen3 — all contemporary models. Baselines use the same models with direct prompting, ensuring a fair comparison of the framework rather than the model.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "§5.3 provides a thorough ablation study with 6 variants (V1-V6): No RAG, No Sketch, No Compile-Repair, No Execute-Repair, Similarity-based RAG, Only Repair. Figure 9 shows results for all variants across both languages.",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "The paper reports: syntactic validity (compilable programs), semantic correctness (Pass@1 on test cases), per-stage metrics (sketch correctness, repair pass rate, program correctness), and average token cost (Table 2). MCQ accuracy is reported separately for language knowledge.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "Human experts validated the benchmark questions (§3.1.1: 'Three human reviewers independently assess each item'), but no human evaluation of ZK-Coder's generated code outputs is performed. All system evaluation is automated via test suites.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "The paper explicitly uses hidden test cases: 'evaluation test cases remain hidden to prevent data-leakage' (§4.2.3). Table 2 measures 'Program correctness' as 'conditioned on all successfully repaired programs, and measures the proportion that pass all hidden test cases.'",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by: language (Circom/Noir), model (4 LLMs), pipeline stage (Table 2), primitive type and error category (Fig. 6), MCQ category (Fig. 4), ablation variant (Fig. 9), dataset type (Table 3), and failure mode (Table 4).",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "§5.5 (RQ7) is a dedicated failure analysis. Table 4 categorizes failures into 5 types: Repair Budget Exceed, Sketched Constraint Incorrect, False Acceptance, False Rejection, and Mixed. §3.4 also analyzes error categories in primitive implementations (Fig. 6).",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "The paper reports that LLMs perform poorly on algebraic primitives (§3.4, even best category only 52%), end-to-end generation drops to 5-20% (§3.5), open-weight models achieve under 10% baseline accuracy on Circom, and the ablation shows significant degradation when components are removed.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "The paper uses 'GPT-o4-mini', 'GPT-o3', 'DeepSeek-V3', and 'Qwen3' (§3.2) without snapshot dates, API versions, or specific model IDs. These are marketing names that do not specify exact model versions.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Fig. 7 shows high-level prompt sketches (e.g., '[Prompt] Given this problem, please formulate the verification constraints using ZKSL...') but not actual prompt text. §13 states 'All prompts used in our experiments are available in the replication package' but no URL is provided in the paper to access them.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Repair budgets are reported (N1=8, N2=3 in §4.2.3) and '10 samples per task' (§3.2), but LLM API parameters (temperature, top-p, max tokens) are not stated anywhere in the paper.",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "ZK-Coder's agentic scaffolding is described in detail: §4.2 covers three stages (constraint formulation, sketch-guided retrieval, interactive refinement), Fig. 7 provides a workflow diagram, retry thresholds are specified (N1=8, N2=3), and the ZKSL grammar is formally defined in Fig. 8.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "The benchmark curation pipeline is well-documented: §3.1.1 describes document collection (51 Circom docs, 40 Noir docs), LLM-aided question generation, and expert validation. §3.1.3 details task filtering criteria (excluding string/float/variable-length tasks), yielding 68 tasks. Tables 9-10 list exact task IDs.",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Per-task results, individual model outputs, and raw Pass@1 data across 10 samples are not provided. Only aggregate statistics appear in the paper. The 'replication package' is mentioned but no URL is given.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "§3.1 provides detailed benchmark construction: sources (official docs, GitHub repos, tutorials), LLM-aided question generation pipeline, algebraic primitive curation from circomlib/zk-kit/Z3Py (§3.1.2), HumanEval task adaptation with three specific transformations (§3.1.3), and quality validation through mutation testing.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants in the study. Three expert reviewers validated the benchmark (§3.1.1) but are likely the authors, not recruited participants. The study evaluates LLMs on benchmarks, not human subjects.",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from raw sources to final benchmark is documented: document collection → LLM question generation → expert validation → deduplication (§3.1.1); library survey → primitive extraction → completeness alignment (§3.1.2); HumanEval filtering → verification reformulation → oracle adaptation → test case generation (§3.1.3). Stage counts are provided (172/164 MCQ, 35 primitives, 68 tasks).",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "No training data cutoff dates are stated for any of the four evaluated models (GPT-o4-mini, GPT-o3, DeepSeek-V3, Qwen3).",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "The paper acknowledges HumanEval is 'potentially contaminated' (§5.4) and uses LiveCodeBench as a contamination-free alternative, but does not analyze train/test overlap for the adapted HumanEval benchmark or ZK-Eval MCQ questions, which are derived from public documentation that models likely trained on.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "The paper acknowledges HumanEval contamination risk and uses LiveCodeBench (§5.4) for RQ6, but the primary evaluation (Table 2, RQ4) uses adapted HumanEval without any contamination mitigation. The ZK-Eval MCQ benchmark is derived from public documentation, which could be in training data, with no contamination analysis.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study. The paper evaluates LLMs on benchmarks.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 2 reports average token costs per task for each model/language/method combination. §5.2 states 'each task requiring fewer than 5,000 tokens on average, corresponding to less than 0.1 USD per task.'",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "The paper mentions hardware ('Linux (Ubuntu 22.04 LTS) server equipped with 256 GiB of RAM' in §5.1) but does not state total API spend, total compute time, or aggregate cost for the full evaluation campaign.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "The paper uses '10 samples per task to estimate Pass@1 rates and reduce variance' (§3.2) but does not report variance across these samples or any seed sensitivity analysis.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": true,
    382           "justification": "§3.2 explicitly states '10 samples per task to estimate Pass@1 rates and reduce variance.' Repair budgets (N1=8, N2=3) are also stated in §4.2.3.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "No hyperparameter search budget is reported. The paper uses 'standard zero-shot and few-shot prompting techniques without extensive prompt engineering' (§13) but does not document how repair budgets (N1=8, N2=3) or other parameters were selected.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "The repair budget thresholds (N1=8 for compile-repair, N2=3 for execute-repair) are stated but their selection is not justified. No exploration of alternative configurations is documented.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "The paper makes numerous comparative claims across 4 models × 2 languages × multiple ablation variants but performs no statistical tests at all, and therefore no multiple comparison correction.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The authors evaluate their own ZK-Coder system against their own baseline implementations. No discussion of author-evaluation bias or independent evaluation. Per Lucic et al. (2018), authors' implementations of baselines systematically underperform.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "Token costs are reported alongside accuracy (Table 2) but performance is not shown as a function of compute budget. The repair budgets are fixed (N1=8, N2=3) with no analysis of how performance varies with different compute allocations.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": true,
    418           "justification": "§7 'Real-World Representativeness' analyzes construct validity: generated programs have average cyclomatic complexity of 22.81 vs 16.8/17.5 in production repos. §10 surveys 37 repositories (408 source files) showing 97.3% coverage for arithmetic and 94.6% for relational primitives. §3.1.3 justifies the HumanEval adaptation rationale.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": true,
    423           "answer": true,
    424           "justification": "When comparing models (Table 2), all models use the same ZK-Coder pipeline, isolating the model effect. The ablation study (§5.3) systematically varies the scaffold components. Baseline comparisons use the same models with and without scaffolding.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "The paper acknowledges HumanEval is 'potentially contaminated' and uses LiveCodeBench for RQ6 (§5.4), but does not discuss temporal leakage for the primary ZK-Eval benchmark or adapted HumanEval evaluation. No model training cutoff dates are provided to assess temporal overlap.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": true,
    438           "justification": "§4.2.3 states 'evaluation test cases remain hidden to prevent data-leakage' and §13 verifies 'no test cases or ground-truth solutions were included in the prompts, preventing information leakage that could artificially inflate success rates.' The ZK-Coder self-generated test cases are separate from evaluation test cases.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of whether training data for the evaluated models contains ZK-related code from circomlib, zk-kit, or the GitHub repositories used to construct the benchmark. The 37 production repositories surveyed (§10) are public and could be in training data.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "LiveCodeBench is used as a temporal decontamination strategy for RQ6 only (§5.4). No concrete detection method (canary strings, membership inference, n-gram overlap) is applied to the primary evaluation benchmarks (ZK-Eval, adapted HumanEval).",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "LLMs achieve near-expert-level accuracy on ZK language knowledge, with reasoning models reaching 87-88% vs. human expert 88.7%",
    459       "evidence": "Figure 4 shows GPT-o4-mini at 88.1%, GPT-o3 at 87.2%, human expert at 88.7% on the 172/164 MCQ benchmark",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "LLM accuracy on algebraic primitive implementation reaches only about 50% even for the best-performing category and falls below 30% for most others, representing a sharp gap from language knowledge",
    464       "evidence": "Figure 6 shows best category (logical ops) at 52% Circom / 49% Noir, with arithmetic and composites near 0-20%; this contrasts with 87-88% language knowledge",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "ZK-Coder improves end-to-end ZKP code generation from 20.29% to 87.85% on Circom and 28.38% to 97.79% on Noir using GPT-o3",
    469       "evidence": "Table 2 reports these exact numbers for ZK-Coder overall vs. baseline Pass@1 for GPT-o3 on both Circom and Noir",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Each ZK-Coder component (sketch, RAG, compile-repair, execute-repair) is indispensable; removing any causes significant accuracy drops",
    474       "evidence": "Figure 9 shows removing compile-repair drops accuracy from 65.66% to 26.04% on Circom; removing RAG drops to 53%; removing sketch drops to 46.37%",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "ZK-Coder generalizes to production ZKP coding patterns with ~91% success rate",
    479       "evidence": "Table 3 shows 90.83% on Circom and 92.09% on Noir for production case studies averaged across 4 models",
    480       "supported": "moderate"
    481     },
    482     {
    483       "claim": "Reasoning models (GPT-o3, GPT-o4-mini) outperform open-weight models (DeepSeek-V3, Qwen3) by 2-3x on end-to-end ZK generation",
    484       "evidence": "Figure 5 shows GPT-o3 at 20% vs. Qwen3 at 5% on Circom semantically correct programs (4x ratio); similar gaps on Noir",
    485       "supported": "strong"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval"
    490   ],
    491   "key_findings": "LLMs demonstrate near-expert competence in ZK DSL syntax and API knowledge (87-88% accuracy) but fail dramatically on algebraic primitive implementation (below 30% for most categories) and end-to-end ZKP program generation (5-32% without augmentation). ZK-Coder, combining constraint sketching via a custom Python-embedded DSL (ZKSL), sketch-guided exact-match retrieval, and interactive compile-test-repair cycles, achieves dramatic improvements: GPT-o3 reaches 87.85% on Circom and 97.79% on Noir. Ablation studies confirm all three pipeline stages are necessary, with interactive compile-repair being the single most critical component. The system generalizes to production ZKP coding patterns (~91%) and harder LiveCodeBench tasks (44-57%), substantially outperforming both direct prompting and repair-only baselines.",
    492   "red_flags": [
    493     {
    494       "flag": "No confidence intervals",
    495       "detail": "All Pass@1 results are point estimates; with 10 samples per task and small task counts (68, 34), uncertainty bounds are not reported, making differences between models statistically unverifiable."
    496     },
    497     {
    498       "flag": "Model versions unspecified",
    499       "detail": "GPT-o3 and GPT-o4-mini are used without API snapshot dates; OpenAI models update silently, making results potentially non-reproducible."
    500     },
    501     {
    502       "flag": "Replication package inaccessible",
    503       "detail": "The paper repeatedly references a 'replication package' and 'Artifact' for benchmark data, raw prompts, and code but provides no URL or access instructions."
    504     },
    505     {
    506       "flag": "HumanEval contamination unmitigated",
    507       "detail": "The paper acknowledges HumanEval as 'potentially contaminated' but uses 68 adapted tasks from it as the primary evaluation; no decontamination analysis is performed for the main results."
    508     },
    509     {
    510       "flag": "Single human expert reference",
    511       "detail": "One unnamed human expert with undisclosed background serves as the human baseline for language knowledge; no inter-rater reliability or expert selection criteria are reported."
    512     },
    513     {
    514       "flag": "Undisclosed potential conflicts",
    515       "detail": "Two authors (Pingchuan Ma, Shuai Wang) are affiliated with CipherInsight Limited, a ZKP company, with no competing interests statement or funding disclosure."
    516     }
    517   ],
    518   "cited_papers": [
    519     {
    520       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    521       "relevance": "Used as the base for ZK-Eval's end-to-end generation benchmark; directly comparable baseline for code generation capability measurement"
    522     },
    523     {
    524       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    525       "relevance": "Used as contamination-free benchmark for generalization evaluation in RQ6; harder and more recent tasks than HumanEval"
    526     },
    527     {
    528       "title": "A Survey on LLM-based Code Generation for Low-resource and Domain-specific Programming Languages",
    529       "relevance": "Directly frames the problem of LLM code generation for DSLs and ZKP languages; motivates ZK-Eval's design rationale"
    530     },
    531     {
    532       "title": "Grammar Prompting for Domain-Specific Language Generation with Large Language Models",
    533       "relevance": "Related technique for DSL generation that ZK-Coder's ablation implicitly competes with; cited as alternative approach"
    534     },
    535     {
    536       "title": "Automated Detection of Under-constrained Circuits in Zero-Knowledge Proofs (QED2)",
    537       "relevance": "Related ZKP security/verification tool; establishes the problem of under-constrained circuits that ZK-Coder must avoid"
    538     },
    539     {
    540       "title": "Practical Security Analysis of Zero-Knowledge Proof Circuits (ZKAP)",
    541       "relevance": "Static analysis tool for ZKP circuit vulnerabilities; situates ZK-Coder's generation challenges in the broader ZKP security ecosystem"
    542     },
    543     {
    544       "title": "SoK: What Don't We Know? Understanding Security Vulnerabilities in SNARKs",
    545       "relevance": "Systematizes ZKP vulnerability types (under/over-constrained circuits) that ZK-Coder's soundness/completeness testing addresses"
    546     },
    547     {
    548       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    549       "relevance": "Foundational for the RAG component of ZK-Coder; motivates sketch-guided retrieval over naive similarity-based approaches"
    550     },
    551     {
    552       "title": "Self-Planning Code Generation with Large Language Models",
    553       "relevance": "Motivates the ZKSL constraint sketching stage as a planning step before final code generation"
    554     },
    555     {
    556       "title": "A Comparative Study of DSL Code Generation: Fine-tuning vs. Optimized Retrieval Augmentation",
    557       "relevance": "Directly related work on DSL code generation with RAG; ZK-Coder extends this approach with domain-specific structural retrieval"
    558     }
    559   ],
    560   "engagement_factors": {
    561     "practical_relevance": {
    562       "score": 2,
    563       "justification": "ZK programming is a real bottleneck for practitioners in privacy/blockchain domains; ZK-Coder directly lowers the barrier for non-experts."
    564     },
    565     "surprise_contrarian": {
    566       "score": 1,
    567       "justification": "The syntax-vs-semantics gap (87% language knowledge but <30% primitive competence) is a noteworthy and somewhat surprising finding, though not broadly counterintuitive."
    568     },
    569     "fear_safety": {
    570       "score": 1,
    571       "justification": "Under-constrained ZK circuits can compromise cryptographic soundness; the paper notes security concerns but doesn't prominently foreground catastrophic risks."
    572     },
    573     "drama_conflict": {
    574       "score": 0,
    575       "justification": "No controversy, competitive tension, or provocative framing; straightforward engineering paper."
    576     },
    577     "demo_ability": {
    578       "score": 2,
    579       "justification": "ZK-Coder processes natural language descriptions into ZKP programs; this would be highly demonstrable if the replication package were accessible."
    580     },
    581     "brand_recognition": {
    582       "score": 0,
    583       "justification": "HKUST and Zhejiang University are respected institutions but not famous AI labs; no famous industry product or prominent brand involved."
    584     }
    585   },
    586   "hn_data": {
    587     "threads": [
    588       {
    589         "hn_id": "43091339",
    590         "title": "DeepSeek Native Sparse Attention",
    591         "points": 16,
    592         "comments": 1,
    593         "url": "https://news.ycombinator.com/item?id=43091339",
    594         "created_at": "2025-02-18T16:17:40Z"
    595       },
    596       {
    597         "hn_id": "43086831",
    598         "title": "Native Sparse Attention: Hardware-Aligned, Natively Trainable Sparse Attention",
    599         "points": 15,
    600         "comments": 2,
    601         "url": "https://news.ycombinator.com/item?id=43086831",
    602         "created_at": "2025-02-18T07:04:47Z"
    603       },
    604       {
    605         "hn_id": "43098140",
    606         "title": "NSA: Hardware-Aligned and Natively Trainable Sparse Attention",
    607         "points": 4,
    608         "comments": 2,
    609         "url": "https://news.ycombinator.com/item?id=43098140",
    610         "created_at": "2025-02-19T03:12:01Z"
    611       },
    612       {
    613         "hn_id": "44304578",
    614         "title": "Serving Large Language Models on Huawei CloudMatrix384",
    615         "points": 3,
    616         "comments": 0,
    617         "url": "https://news.ycombinator.com/item?id=44304578",
    618         "created_at": "2025-06-17T22:18:43Z"
    619       },
    620       {
    621         "hn_id": "45259423",
    622         "title": "Human+AI loops stay stable even with quantization",
    623         "points": 2,
    624         "comments": 1,
    625         "url": "https://news.ycombinator.com/item?id=45259423",
    626         "created_at": "2025-09-16T08:08:10Z"
    627       },
    628       {
    629         "hn_id": "43318708",
    630         "title": "MAML: Towards a Faster Web in Developing Regions",
    631         "points": 2,
    632         "comments": 2,
    633         "url": "https://news.ycombinator.com/item?id=43318708",
    634         "created_at": "2025-03-10T10:03:48Z"
    635       },
    636       {
    637         "hn_id": "46445614",
    638         "title": "Mechanical non-reciprocity programmed by shear jamming in soft composite solids",
    639         "points": 2,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=46445614",
    642         "created_at": "2025-12-31T16:32:15Z"
    643       },
    644       {
    645         "hn_id": "44739937",
    646         "title": "Double Duty: FPGA Architecture to Enable Concurrent LUT and Adder Chain Usage",
    647         "points": 2,
    648         "comments": 0,
    649         "url": "https://news.ycombinator.com/item?id=44739937",
    650         "created_at": "2025-07-30T21:53:00Z"
    651       },
    652       {
    653         "hn_id": "44668806",
    654         "title": "LLMs are Bayesian, in Expectation, not in Realization",
    655         "points": 2,
    656         "comments": 0,
    657         "url": "https://news.ycombinator.com/item?id=44668806",
    658         "created_at": "2025-07-24T09:39:43Z"
    659       },
    660       {
    661         "hn_id": "43773523",
    662         "title": "Robotic Squirrel Pinto: A latched spring actuated robot for jumping and perching",
    663         "points": 2,
    664         "comments": 0,
    665         "url": "https://news.ycombinator.com/item?id=43773523",
    666         "created_at": "2025-04-23T15:51:24Z"
    667       }
    668     ],
    669     "top_points": 16,
    670     "total_points": 50,
    671     "total_comments": 8
    672   }
    673 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs