ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32155B)


      1 {
      2   "paper": {
      3     "title": "From Evaluation to Enhancement: Large Language Models for Zero-Knowledge Proof Code Generation",
      4     "authors": [
      5       "Zhantong Xue",
      6       "Pingchuan Ma",
      7       "Zhaoyu Wang",
      8       "Yuguang Zhou",
      9       "Xiaoqin Zhang",
     10       "Shuai Wang",
     11       "Juergen Rahmel"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2509.11708",
     16     "doi": "10.48550/arXiv.2509.11708"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper refers to a 'replication package' and 'Artifact' multiple times (§3.1.1, §13) but provides no repository URL or archive link in the paper text. 'Provided in the Artifact' is a reference to an intended release, not a working URL."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "ZK-Eval benchmark questions and filtered task IDs (Tables 9-10) are described, and the paper says the 'full set of questions is provided in the Artifact' (§3.1.1), but no URL to download the dataset is provided. They use public datasets (HumanEval, LiveCodeBench) but their contributed benchmark has no verifiable release link."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper states 'Linux (Ubuntu 22.04 LTS) server equipped with 256 GiB of RAM' (§5.1) but provides no requirements.txt, Dockerfile, library versions, or dependency specifications sufficient to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. §13 says prompts are 'available in the replication package' but no URL or README with commands is given."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results (Tables 2-3, Figures 4-6, 9) report point estimates only. No confidence intervals, error bars, or ± notation appear despite using 10 samples per task to estimate Pass@1."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes many comparative claims (e.g., 'ZK-Coder delivers a significant improvement', 'GPT-o3 leads') based solely on comparing numbers. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported anywhere."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper consistently reports improvements with baseline context, e.g., 'improving code generation success rates from 20.29% to 87.85% on Circom and from 28.38% to 97.79% on Noir' (§1), and Table 2 provides both ZK-Coder and baseline accuracy for each model/language combination."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is provided for benchmark sizes: 68 tasks from HumanEval, 34 from LiveCodeBench, 35 algebraic primitives, 172/164 MCQ questions. No power analysis or discussion of whether these sizes are adequate for the claims made."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Despite using 10 samples per task (§3.2), no standard deviations, interquartile ranges, or spread measures are reported. Tables 2-3 and all figures show single point estimates only."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Table 2 compares ZK-Coder against a 'Simple Baseline' (direct generation with grammar summary and few-shot examples). Table 3 adds a 'Repairing Baseline' (baseline + interactive repair loop). §5.2 describes the baseline setup."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The evaluation uses GPT-o4-mini, GPT-o3, DeepSeek-V3, and Qwen3 — all contemporary models. Baselines use the same models with direct prompting, ensuring a fair comparison of the framework rather than the model."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "§5.3 provides a thorough ablation study with 6 variants (V1-V6): No RAG, No Sketch, No Compile-Repair, No Execute-Repair, Similarity-based RAG, Only Repair. Figure 9 shows results for all variants across both languages."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper reports: syntactic validity (compilable programs), semantic correctness (Pass@1 on test cases), per-stage metrics (sketch correctness, repair pass rate, program correctness), and average token cost (Table 2). MCQ accuracy is reported separately for language knowledge."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Human experts validated the benchmark questions (§3.1.1: 'Three human reviewers independently assess each item'), but no human evaluation of ZK-Coder's generated code outputs is performed. All system evaluation is automated via test suites."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper explicitly uses hidden test cases: 'evaluation test cases remain hidden to prevent data-leakage' (§4.2.3). Table 2 measures 'Program correctness' as 'conditioned on all successfully repaired programs, and measures the proportion that pass all hidden test cases.'"
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by: language (Circom/Noir), model (4 LLMs), pipeline stage (Table 2), primitive type and error category (Fig. 6), MCQ category (Fig. 4), ablation variant (Fig. 9), dataset type (Table 3), and failure mode (Table 4)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "§5.5 (RQ7) is a dedicated failure analysis. Table 4 categorizes failures into 5 types: Repair Budget Exceed, Sketched Constraint Incorrect, False Acceptance, False Rejection, and Mixed. §3.4 also analyzes error categories in primitive implementations (Fig. 6)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that LLMs perform poorly on algebraic primitives (§3.4, even best category only 52%), end-to-end generation drops to 5-20% (§3.5), open-weight models achieve under 10% baseline accuracy on Circom, and the ablation shows significant degradation when components are removed."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's key claims are supported: 'strong proficiency in language syntax' matches Finding ① (88.1% accuracy, §3.3), 'struggle when implementing and composing algebraic primitives' matches Findings ③-④ (§3.4), and '20.29% to 87.85% / 28.38% to 97.79%' matches Table 2 exactly."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper's main causal claims are about ZK-Coder's components causing improvement. The ablation study (§5.3) uses controlled single-variable manipulation — the four removal variants (No RAG, No Sketch, No Compile-Repair, No Execute-Repair) each remove exactly one component — which adequately supports claims like 'removing either component causes a significant performance drop (over 10%)' (§1)."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly bounds its scope to Circom and Noir (§2.1: 'In this work, we focus on Circom and Noir') and explains why. Claims are generally tied to specific languages and models. The title ('Zero-Knowledge Proof Code Generation') accurately reflects the scope."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "§7 discusses five specific threats to validity: LLM-generated question bias (①), test case limitations (②-③), prompting strategy influence (④), and security limitations (⑤). These address specific alternative explanations for the observed results."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly distinguishes between functional correctness (what it measures via test suites) and other ZK program qualities it does not claim: efficiency/circuit size (§7 'ZKP Program Efficiency'), security (§7 threat ⑤), and real-world deployment readiness. It also notes 'compilable alone is insufficient for assessing ZK program quality' (§3.5)."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper uses 'GPT-o4-mini', 'GPT-o3', 'DeepSeek-V3', and 'Qwen3' (§3.2) without snapshot dates, API versions, or specific model IDs. These are marketing names that do not specify exact model versions."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Fig. 7 shows high-level prompt sketches (e.g., '[Prompt] Given this problem, please formulate the verification constraints using ZKSL...') but not actual prompt text. §13 states 'All prompts used in our experiments are available in the replication package' but no URL is provided in the paper to access them."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Repair budgets are reported (N1=8, N2=3 in §4.2.3) and '10 samples per task' (§3.2), but LLM API parameters (temperature, top-p, max tokens) are not stated anywhere in the paper."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "ZK-Coder's agentic scaffolding is described in detail: §4.2 covers three stages (constraint formulation, sketch-guided retrieval, interactive refinement), Fig. 7 provides a workflow diagram, retry thresholds are specified (N1=8, N2=3), and the ZKSL grammar is formally defined in Fig. 8."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The benchmark curation pipeline is well-documented: §3.1.1 describes document collection (51 Circom docs, 40 Noir docs), LLM-aided question generation, and expert validation. §3.1.3 details task filtering criteria (excluding string/float/variable-length tasks), yielding 68 tasks. Tables 9-10 list exact task IDs."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "§7 'Discussion and Limitations' is a dedicated section covering ZKP program efficiency, LLM training/fine-tuning challenges, real-world representativeness, and five specific threats to validity (①-⑤)."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "§7 identifies five specific threats: ① LLM-aided question generation may introduce bias, ② hand-crafted test cases cannot prove correctness (mitigated by 86-87% mutation detection rates), ③ automated evaluation depends on test coverage, ④ prompting strategy may influence results, ⑤ generated code may have security vulnerabilities. These are specific to this study."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "§7 explicitly states what the results do NOT show: efficiency/circuit size optimization is not addressed ('ZKP Program Efficiency' paragraph), fine-tuning is not explored ('LLM Training and Fine-tuning'), security guarantees are not provided (threat ⑤). §2.1 explicitly bounds scope to Circom and Noir, excluding zkVMs."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Per-task results, individual model outputs, and raw Pass@1 data across 10 samples are not provided. Only aggregate statistics appear in the paper. The 'replication package' is mentioned but no URL is given."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "§3.1 provides detailed benchmark construction: sources (official docs, GitHub repos, tutorials), LLM-aided question generation pipeline, algebraic primitive curation from circomlib/zk-kit/Z3Py (§3.1.2), HumanEval task adaptation with three specific transformations (§3.1.3), and quality validation through mutation testing."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants in the study. Three expert reviewers validated the benchmark (§3.1.1) but are likely the authors, not recruited participants. The study evaluates LLMs on benchmarks, not human subjects."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline from raw sources to final benchmark is documented: document collection → LLM question generation → expert validation → deduplication (§3.1.1); library survey → primitive extraction → completeness alignment (§3.1.2); HumanEval filtering → verification reformulation → oracle adaptation → test case generation (§3.1.3). Stage counts are provided (172/164 MCQ, 35 primitives, 68 tasks)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section listing grants or sponsors is present."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Hong Kong University of Science and Technology, Zhejiang University of Technology, CipherInsight Limited, and HSBC. The dual affiliations with CipherInsight Limited (a ZK-related company) are transparent."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funder is disclosed. Two authors are affiliated with CipherInsight Limited (appears to be a ZK-related company) and one with HSBC. Without funding disclosure, independence cannot be assessed. The paper proposes ZK-Coder which could have commercial relevance to CipherInsight."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement appears in the paper. Two authors have dual affiliations with CipherInsight Limited (a company in the ZK space) but no declaration of financial interests is made."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the four evaluated models (GPT-o4-mini, GPT-o3, DeepSeek-V3, Qwen3)."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The paper acknowledges HumanEval is 'potentially contaminated' (§5.4) and uses LiveCodeBench as a contamination-free alternative, but does not analyze train/test overlap for the adapted HumanEval benchmark or ZK-Eval MCQ questions, which are derived from public documentation that models likely trained on."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper acknowledges HumanEval contamination risk and uses LiveCodeBench (§5.4) for RQ6, but the primary evaluation (Table 2, RQ4) uses adapted HumanEval without any contamination mitigation. The ZK-Eval MCQ benchmark is derived from public documentation, which could be in training data, with no contamination analysis."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. The paper evaluates LLMs on benchmarks."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Table 2 reports average token costs per task for each model/language/method combination. §5.2 states 'each task requiring fewer than 5,000 tokens on average, corresponding to less than 0.1 USD per task.'"
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper mentions hardware ('Linux (Ubuntu 22.04 LTS) server equipped with 256 GiB of RAM' in §5.1) but does not state total API spend, total compute time, or aggregate cost for the full evaluation campaign."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The paper uses '10 samples per task to estimate Pass@1 rates and reduce variance' (§3.2) but does not report variance across these samples or any seed sensitivity analysis."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "§3.2 explicitly states '10 samples per task to estimate Pass@1 rates and reduce variance.' Repair budgets (N1=8, N2=3) are also stated in §4.2.3."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search budget is reported. The paper uses 'standard zero-shot and few-shot prompting techniques without extensive prompt engineering' (§13) but does not document how repair budgets (N1=8, N2=3) or other parameters were selected."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The repair budget thresholds (N1=8 for compile-repair, N2=3 for execute-repair) are stated but their selection is not justified. No exploration of alternative configurations is documented."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes numerous comparative claims across 4 models × 2 languages × multiple ablation variants but performs no statistical tests at all, and therefore no multiple comparison correction."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate their own ZK-Coder system against their own baseline implementations. No discussion of author-evaluation bias or independent evaluation. Per Lucic et al. (2018), authors' implementations of baselines systematically underperform."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Token costs are reported alongside accuracy (Table 2) but performance is not shown as a function of compute budget. The repair budgets are fixed (N1=8, N2=3) with no analysis of how performance varies with different compute allocations."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "§7 'Real-World Representativeness' analyzes construct validity: generated programs have average cyclomatic complexity of 22.81 vs 16.8/17.5 in production repos. §10 surveys 37 repositories (408 source files) showing 97.3% coverage for arithmetic and 94.6% for relational primitives. §3.1.3 justifies the HumanEval adaptation rationale."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "When comparing models (Table 2), all models use the same ZK-Coder pipeline, isolating the model effect. The ablation study (§5.3) systematically varies the scaffold components. Baseline comparisons use the same models with and without scaffolding."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The paper acknowledges HumanEval is 'potentially contaminated' and uses LiveCodeBench for RQ6 (§5.4), but does not discuss temporal leakage for the primary ZK-Eval benchmark or adapted HumanEval evaluation. No model training cutoff dates are provided to assess temporal overlap."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "§4.2.3 states 'evaluation test cases remain hidden to prevent data-leakage' and §13 verifies 'no test cases or ground-truth solutions were included in the prompts, preventing information leakage that could artificially inflate success rates.' The ZK-Coder self-generated test cases are separate from evaluation test cases."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether training data for the evaluated models contains ZK-related code from circomlib, zk-kit, or the GitHub repositories used to construct the benchmark. The 37 production repositories surveyed (§10) are public and could be in training data."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "LiveCodeBench is used as a temporal decontamination strategy for RQ6 only (§5.4). No concrete detection method (canary strings, membership inference, n-gram overlap) is applied to the primary evaluation benchmarks (ZK-Eval, adapted HumanEval)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "LLMs demonstrate strong competence in ZK language knowledge, achieving accuracy within a few percentage points of human experts (88.1% for GPT-o4-mini vs 88.7% for human expert).",
    371       "evidence": "Fig. 4 shows MCQ benchmark accuracy across 4 categories for Circom and Noir. GPT-o4-mini reaches 88.1%, GPT-o3 87.2%, vs human expert 88.7% (§3.3).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "LLMs perform consistently poorly at implementing and using algebraic primitives, with even the best category (logical operations) reaching only 52% accuracy in Circom and 49% in Noir.",
    376       "evidence": "Fig. 6 shows error distributions across primitive types and error categories. Comparison scored 37%/20%, arithmetic and composites extremely low (§3.4).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "ZK-Coder improves end-to-end code generation success rates from 20.29% to 87.85% on Circom and from 28.38% to 97.79% on Noir using GPT-o3.",
    381       "evidence": "Table 2 provides per-stage and overall accuracy with token costs. Per-stage breakdown: 97.30% sketch, 94.41% repair, 95.64% program correctness for GPT-o3 on Circom (§5.2).",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Each component of ZK-Coder (sketching, RAG, repair) is indispensable; removing any causes over 10% performance drop.",
    386       "evidence": "Fig. 9 shows ablation results: removing RAG drops Circom from 65.66% to 53.04%, removing sketch to 46.37%, removing compile-repair to 26.04% (§5.3).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "ZK-Coder generalizes to harder contamination-free tasks (LiveCodeBench) and production coding patterns.",
    391       "evidence": "Table 3: LiveCodeBench 44.10% Circom / 56.93% Noir vs Simple Baseline 1.72% / 9.89%. Production case studies: 90.83% / 92.09% (§5.4).",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Reasoning models (GPT-o4-mini, GPT-o3) substantially outperform open-weight models (DeepSeek-V3, Qwen3) on ZK code generation, by 2-3x on end-to-end tasks.",
    396       "evidence": "Fig. 5 and Table 2: GPT-o3 achieves 20%/28% on Circom/Noir baselines vs 5-7%/10-16% for open-weight models. With ZK-Coder, GPT-o3 at 87.85%/97.79% vs 42-50%/55-69% for open-weight (§3.5, §5.2).",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "methodology_tags": ["benchmark-eval"],
    401   "key_findings": "ZK-Eval reveals that while LLMs achieve near-expert accuracy on ZK language knowledge (88%), they fail dramatically when implementing algebraic primitives (≤52%) or generating end-to-end ZK programs (<30% baseline). ZK-Coder, an agentic framework combining constraint sketching, guided retrieval, and interactive repair, boosts GPT-o3's success rate from 20% to 88% on Circom and 28% to 98% on Noir. Ablation shows all three components are necessary, with compile-repair being the most critical (removing it drops accuracy to 26%). The dominant failure modes are repair budget exhaustion and incorrect constraint sketching.",
    402   "red_flags": [
    403     {
    404       "flag": "No error bars or statistical tests",
    405       "detail": "Despite 10 samples per task, no variance, confidence intervals, or significance tests are reported for any comparison. All claims of improvement are based on comparing point estimates. The paper makes dozens of comparative claims (4 models × 2 languages × 6 ablation variants) without any statistical rigor."
    406     },
    407     {
    408       "flag": "Primary benchmark contamination acknowledged but not mitigated",
    409       "detail": "The paper acknowledges HumanEval is 'potentially contaminated' (§5.4) but uses it for the primary evaluation (Table 2, RQ4). LiveCodeBench is used only for the generalization study (RQ6). The ZK-Eval MCQ benchmark is derived from public documentation that models likely trained on, with no contamination analysis."
    410     },
    411     {
    412       "flag": "Undisclosed potential commercial interest",
    413       "detail": "Two authors (Pingchuan Ma, Shuai Wang) are affiliated with CipherInsight Limited, which appears to be a company in the zero-knowledge proof space. One author (Juergen Rahmel) is from HSBC. No funding disclosure, competing interests statement, or financial interests declaration appears in the paper, despite the proposed ZK-Coder having potential commercial relevance."
    414     },
    415     {
    416       "flag": "Self-evaluation without bias acknowledgment",
    417       "detail": "The authors evaluate their own ZK-Coder system against their own baseline implementations. Per Lucic et al. (2018), authors' implementations of baselines systematically underperform. No discussion of this bias or independent evaluation is provided."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Evaluating large language models trained on code",
    423       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    424       "year": 2021,
    425       "arxiv_id": "2107.03374",
    426       "relevance": "HumanEval benchmark paper, the basis for ZK-Eval's end-to-end generation tasks and a core code generation evaluation resource."
    427     },
    428     {
    429       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    430       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    431       "year": 2024,
    432       "arxiv_id": "2403.07974",
    433       "relevance": "Contamination-free code generation benchmark used for ZK-Coder's generalization study."
    434     },
    435     {
    436       "title": "A Survey on Large Language Models for Code Generation",
    437       "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen"],
    438       "year": 2025,
    439       "doi": "10.1145/3747588",
    440       "relevance": "Comprehensive survey of LLM-based code generation frameworks and benchmarks."
    441     },
    442     {
    443       "title": "A Survey on Code Generation with LLM-based Agents",
    444       "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"],
    445       "year": 2025,
    446       "arxiv_id": "2508.00083",
    447       "relevance": "Survey of agentic LLM frameworks for code generation, the paradigm ZK-Coder follows."
    448     },
    449     {
    450       "title": "A survey on llm-based code generation for low-resource and domain-specific programming languages",
    451       "authors": ["Sathvik Joel", "Jie JW Wu", "Fatemeh H Fard"],
    452       "year": 2024,
    453       "arxiv_id": "2410.03981",
    454       "relevance": "Directly relevant survey on LLM code generation for DSLs and low-resource languages, the core challenge this paper addresses."
    455     },
    456     {
    457       "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation",
    458       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    459       "year": 2022,
    460       "arxiv_id": "2208.08227",
    461       "relevance": "Multi-language code generation benchmark addressing evaluation across diverse programming languages."
    462     },
    463     {
    464       "title": "Knowledge transfer from high-resource to low-resource programming languages for code llms",
    465       "authors": ["Federico Cassano", "John Gouwar", "Francesca Lucchetti"],
    466       "year": 2024,
    467       "relevance": "Addresses knowledge transfer for low-resource language code generation, an alternative approach to ZK-Coder's RAG strategy."
    468     },
    469     {
    470       "title": "Enhancing code generation for low-resource languages: No silver bullet",
    471       "authors": ["Alessandro Giagnorio", "Alberto Martin-Lopez", "Gabriele Bavota"],
    472       "year": 2025,
    473       "arxiv_id": "2501.19085",
    474       "relevance": "Studies challenges of LLM code generation for low-resource languages, finding no single approach dominates."
    475     },
    476     {
    477       "title": "Retrieval-augmented generation for large language models: A survey",
    478       "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"],
    479       "year": 2023,
    480       "arxiv_id": "2312.10997",
    481       "relevance": "Survey of RAG techniques that underpin ZK-Coder's sketch-guided retrieval component."
    482     },
    483     {
    484       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    485       "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"],
    486       "year": 2023,
    487       "arxiv_id": "2307.16789",
    488       "relevance": "Demonstrates LLM capabilities in API usage and tool integration, related to ZK-Coder's API-aware code generation."
    489     },
    490     {
    491       "title": "Automated detection of under-constrained circuits in zero-knowledge proofs",
    492       "authors": ["Shankara Pailoor", "Yanju Chen", "Franklyn Wang"],
    493       "year": 2023,
    494       "relevance": "Static analysis tool (QED2) for ZK circuit verification, complementary to ZK-Coder's generation approach."
    495     },
    496     {
    497       "title": "Grammar prompting for domain-specific language generation with large language models",
    498       "authors": ["Bailin Wang", "Zi Wang", "Xuezhi Wang"],
    499       "year": 2023,
    500       "relevance": "Grammar prompting technique for DSL generation compared against ZK-Coder's sketch-based approach in ablation."
    501     }
    502   ]
    503 }

Impressum · Datenschutz