scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29780B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Evaluation to Enhancement: Large Language Models for Zero-Knowledge Proof Code Generation",
      6     "authors": [
      7       "Zhantong Xue",
      8       "Pingchuan Ma",
      9       "Zhaoyu Wang",
     10       "Yuguang Zhou",
     11       "Xiaoqin Zhang",
     12       "Shuai Wang",
     13       "Juergen Rahmel"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2509.11708",
     18     "doi": "10.48550/arXiv.2509.11708"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims are supported: LLMs' struggle with algebraic primitives is shown in Fig. 6 (below 30% accuracy), and ZK-Coder improvements (20.29%→87.85% Circom, 28.38%→97.79% Noir with GPT-o3) are documented in Table 2.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims about component contributions are supported by a 6-variant ablation study (Section 5.3/Fig. 9) that systematically removes or replaces each component and quantifies the resulting accuracy drop.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Results are explicitly bounded to Circom and Noir DSLs and four tested LLMs; zkVM-based frameworks are explicitly excluded from scope (Section 2.1), and claims are restricted to adapted HumanEval/LiveCodeBench tasks.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss whether gains could be attributed to extra context tokens rather than the specific ZKSL structure, nor whether GPT-o3/o4 may have encountered ZKP material during fine-tuning.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper explicitly separates syntactic validity from semantic correctness in Fig. 5 and Section 3.5, and uses both acceptance (completeness) and rejection (soundness) test cases to ensure correctness is not conflated with compilation success.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 7 'Discussion and Limitations' contains a dedicated 'Threats to Validity' subsection listing five numbered threats with specific mitigations, plus subsections on efficiency and real-world representativeness.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Five specific threats are identified: LLM-assisted MCQ generation bias, test case coverage limitations, automated evaluation reliability, prompting strategy sensitivity, and deployment security concerns — each with a concrete mitigation.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper explicitly excludes zkVM frameworks (SP1, RISC0), limits to adapted HumanEval algorithmic tasks, and notes circuit efficiency optimization as future work.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding acknowledgment section appears anywhere in the paper; funding is entirely undisclosed.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly stated: HKUST, Zhejiang University of Technology, CipherInsight Limited (commercial ZK company), and HSBC.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Funding is undisclosed; two authors are affiliated with CipherInsight Limited, a ZK-focused commercial entity that could directly benefit from the benchmarks and tools produced.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement is present; the CipherInsight Limited affiliation of two authors is listed but no financial interests declaration accompanies it.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms including ZKP soundness/completeness (Section 2.1), algebraic primitives, ZKSL grammar (Fig. 8 formal grammar), and constraint-level vs. application-level APIs are formally defined.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three explicit contributions are enumerated in Section 1: ZK-Eval benchmark, ZK-Coder agentic framework, and empirical evaluation of four SoTA LLMs.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 6 positions the work relative to DSL code generation (grammar prompting, RAG, iterative refinement), general code generation benchmarks, and ZKP software engineering tools, explaining how the contribution differs from each.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The paper repeatedly references a 'replication package' and 'Artifact' (Sections 3.1.1, 13) but provides no URL or DOI, making actual public availability unverifiable.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "ZK-Eval benchmark data is described as being in the 'Artifact' but without an accessible URL; only filtered task IDs (Tables 9, 10) appear in the paper, not the actual test cases or MCQ questions.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Only 'Linux (Ubuntu 22.04 LTS) server equipped with 256 GiB of RAM' is reported (Section 5.1); no requirements.txt, Dockerfile, Circom/Noir compiler versions, or Python dependencies are specified.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Section 13 notes prompts are 'available in the replication package' but provides no link; no step-by-step instructions for running the evaluation pipeline appear in the paper.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Tables 2 and 3 report only point estimates; despite collecting 10 samples per task, no confidence intervals or error bars accompany any reported accuracy figures.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are applied to any comparative claims between ZK-Coder and baselines or between ablation variants.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Absolute percentage improvements are reported throughout (e.g., 20.29%→87.85% for GPT-o3 on Circom in Table 2), contextualizing magnitude relative to baseline.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The 68 HumanEval and 34 LiveCodeBench tasks are described with selection rationale but no power analysis or statistical justification for sufficiency of this sample size is provided.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The paper collects 10 samples per task but reports only averages in Tables 2 and 3 without standard deviations, confidence intervals, or per-model variance.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Two baselines are provided: a 'Simple Baseline' (direct prompting with grammar summary and few-shot examples) and a 'Repairing Baseline' (adding interactive repair to direct generation) in Section 5.3.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines use the same state-of-the-art models (GPT-o3, GPT-o4-mini, DeepSeek-V3, Qwen3) as ZK-Coder, ensuring fair and contemporary comparisons.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Section 5.3 presents 6 ablation variants (No RAG, No Sketch, No Compile-Repair, No Execute-Repair, Similarity-based RAG, Only Repair) averaged across all four models on both languages.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "The paper reports syntactic validity, semantic correctness, per-stage pipeline rates (sketch correctness, repair pass rate, program correctness), and token cost as separate metrics.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation of system outputs is not applicable; ZKP code correctness is objectively verifiable by compiler checks and test case execution with provable soundness/completeness semantics.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Section 4.2.3 states 'evaluation test cases remain hidden to prevent data-leakage,' and LiveCodeBench tasks are selected specifically as 'contamination-free, and more recent tasks' (Section 5.4).",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Fig. 6 shows error distributions by primitive type and language; Fig. 4 shows MCQ accuracy by question category (Basic Syntax, Advanced Topics, API, Compiler Principles); Table 4 breaks down failure types.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 5.5 (RQ7) provides a dedicated failure analysis categorizing failures into 5 types with distribution statistics in Table 4 and discussion of underlying causes.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "The paper openly reports open-weight model failures (Qwen3: 5%/10% baseline), high failure rates on algebraic primitives (<30%), and specific ZK-Coder failure modes including 31.67% repair budget exhaustion on Circom.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Models are identified only by marketing names ('GPT-o4-mini', 'GPT-o3', 'DeepSeek-V3', 'Qwen3') with no API snapshot dates, model hashes, or parameter counts.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Section 13 explicitly states 'we omit raw prompts from this paper to maintain brevity'; only placeholder structure is described without the actual prompt text or filled values.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "No temperature, top-p, max tokens, or other API hyperparameters are reported; only samples per task (10) and repair budgets (N1=8, N2=3) are stated.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "ZK-Coder's three-stage pipeline is described in detail in Section 4.2 with the ZKSL formal grammar (Fig. 8), exact-match RAG mechanism, compiler oracle integration, and repair loop with specific budget parameters.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Benchmark construction is documented including web-crawling methodology, LLM-aided question generation pipeline, expert validation process, HumanEval task filtering criteria with explicit exclusion rules (Tables 9, 10), and test case generation with mutation testing scores.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Raw LLM outputs and full benchmark question sets are not made available in the paper; they are referenced in an unlinked 'replication package.'",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Benchmark construction is thoroughly described in Sections 3.1.1-3.1.3: document web-crawling, LLM-aided question generation, three-reviewer expert validation, HumanEval adaptation, and test case generation.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participant recruitment required; benchmark validators are co-authors rather than a recruited sample.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The full pipeline from raw documentation/repositories to benchmark questions to evaluation is documented, including the 37-repository survey, algebraic primitive extraction methodology, and mutation-testing-based test quality validation.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "No training data cutoffs are reported for any of the four evaluated models (GPT-o3, GPT-o4-mini, DeepSeek-V3, Qwen3).",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "Section 5.4 explicitly acknowledges HumanEval is 'potentially contaminated' and uses LiveCodeBench as a contamination-free alternative; Section 11.2.1 reformulates production task descriptions to mitigate memory recall.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": true,
    310           "justification": "Contamination is addressed by selecting LiveCodeBench's 'contamination-free, and more recent tasks' (Section 5.4) and reformulating production case descriptions to avoid models recalling memorized ZKP repository code.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participant study.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participant study.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participant study.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participant study.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participant study.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participant study.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participant study.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 2 reports average token costs per model/language, and Section 5.2 provides an absolute cost estimate of 'less than 0.1 USD per task' for ZK-Coder.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Total experiment compute budget is not reported; only per-task token averages and server RAM (256 GiB) are mentioned without total API spend or GPU-hours.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "LLMs achieve near-expert language knowledge for Circom/Noir (~88%) but fail dramatically at algebraic primitive specification (<30% accuracy)",
    377       "evidence": "Fig. 4 shows GPT-o4-mini/o3 at 88.1%/87.2% on MCQs (vs. human expert 88.7%); Fig. 6 shows best primitive category (logical ops) at only 52%/49% with arithmetic/composites near 0%",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "ZK-Coder improves Circom success rate from 20.29% to 87.85% using GPT-o3",
    382       "evidence": "Table 2 reports ZK-Coder Overall Pass@1 87.85% vs. Baseline Pass@1 20.29% for GPT-o3 on Circom",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "ZK-Coder improves Noir success rate from 28.38% to 97.79% using GPT-o3",
    387       "evidence": "Table 2 reports ZK-Coder Overall Pass@1 97.79% vs. Baseline Pass@1 28.38% for GPT-o3 on Noir",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "All three ZK-Coder components (ZKSL sketching, sketch-guided RAG, interactive repair) are individually indispensable",
    392       "evidence": "Fig. 9 ablation shows removing compile-repair causes the largest drop (to 26.04%/27.12%), removing sketch drops to 46.37%/64.63%, and removing RAG drops to 53.04%/72.00% on Circom/Noir",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "ZK-Coder achieves 90.83%/92.09% success rates on production-grade ZKP coding patterns",
    397       "evidence": "Table 3 reports these rates averaged across models on 10 coding patterns from 37 repositories, with Repairing Baseline also performing well at 83.33%/89.17%",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Reasoning models (GPT-o3/o4-mini) substantially outperform open-weight models on ZKP tasks, 2-3x better",
    402       "evidence": "Table 2 shows GPT-o3 at 87.85% vs. Qwen3 at 42.43% on Circom; baseline gap shows GPT-o3 at 20.29% vs. Qwen3 at 4.90%",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "The adapted HumanEval tasks adequately capture the computation-to-verification paradigm shift",
    407       "evidence": "Authors argue by design rationale and cite structural complexity metrics (avg cyclomatic complexity 22.81 exceeds production average), but provide no independent validation of this claim",
    408       "supported": "weak"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "case-study"
    414   ],
    415   "key_findings": "LLMs have strong ZK language knowledge (~88% MCQ accuracy, near human-expert level) but fail catastrophically at implementing algebraic primitives for constraint specification (<30% pass rate), revealing a fundamental gap between syntactic fluency and constraint-level reasoning. ZK-Coder, combining constraint sketching in a new intermediate DSL (ZKSL), sketch-guided exact-match RAG over verified primitive implementations, and an interactive compile-and-test repair loop, achieves 87.85%/97.79% success rates on Circom/Noir (vs. 20.29%/28.38% baseline) using GPT-o3 at under $0.10/task. Ablation confirms all three components are essential, with removing compile-repair causing the largest accuracy drop to ~26%. The system generalizes to harder LiveCodeBench tasks (44%/57%) and production ZKP coding patterns (91%/92%).",
    416   "red_flags": [
    417     {
    418       "flag": "No statistical tests or CIs",
    419       "detail": "All comparative claims use point estimates with no confidence intervals or significance tests, despite collecting 10 samples per task — the variance is measured but not reported."
    420     },
    421     {
    422       "flag": "No model version snapshots",
    423       "detail": "GPT-o3, GPT-o4-mini, DeepSeek-V3, and Qwen3 are cited without API snapshot dates; results are not reproducible as models are updated over time."
    424     },
    425     {
    426       "flag": "Artifact URL absent",
    427       "detail": "The 'replication package' and 'Artifact' are referenced throughout but no URL or DOI is provided; public availability cannot be verified."
    428     },
    429     {
    430       "flag": "Prompts withheld from paper",
    431       "detail": "Section 13 explicitly omits prompts from the paper, citing brevity; actual prompts are unavailable without locating the unlinked replication package."
    432     },
    433     {
    434       "flag": "Undeclared commercial conflict of interest",
    435       "detail": "Two authors (Ma, Wang) are co-affiliated with CipherInsight Limited, a ZK-focused commercial company, with no competing interests declaration."
    436     },
    437     {
    438       "flag": "LLM-contaminated MCQ generation",
    439       "detail": "MCQ questions were initially generated using LLMs (Section 3.1.1), which may introduce bias toward the same model families used in evaluation, inflating language knowledge scores."
    440     },
    441     {
    442       "flag": "Weak baseline comparison",
    443       "detail": "The baseline is simple direct prompting; no comparison to other published agentic approaches, grammar prompting, or fine-tuning methods is provided despite these existing in the DSL literature."
    444     }
    445   ],
    446   "cited_papers": [
    447     {
    448       "title": "Evaluating large language models trained on code (HumanEval)",
    449       "relevance": "Primary benchmark adapted for end-to-end ZKP evaluation tasks; widely used code generation baseline"
    450     },
    451     {
    452       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    453       "relevance": "Used as contamination-free generalization benchmark for harder algorithmic ZKP tasks"
    454     },
    455     {
    456       "title": "A Survey on Large Language Models for Code Generation",
    457       "relevance": "Contextualizes LLM code generation capabilities and motivates the ZKP evaluation gap"
    458     },
    459     {
    460       "title": "A survey on LLM-based code generation for low-resource and domain-specific programming languages",
    461       "relevance": "Directly motivates the paper by identifying lack of standardized DSL benchmarks and LLM challenges with low-resource languages"
    462     },
    463     {
    464       "title": "Grammar prompting for domain-specific language generation with large language models",
    465       "relevance": "Related DSL code generation approach mentioned as an ablation variant; ZK-Coder explicitly advances beyond this"
    466     },
    467     {
    468       "title": "A comparative study of DSL code generation: Fine-tuning vs. optimized retrieval augmentation",
    469       "relevance": "Prior work on RAG for DSL code generation that informs ZK-Coder's retrieval design"
    470     },
    471     {
    472       "title": "Practical Security Analysis of Zero-Knowledge Proof Circuits (USENIX Security 24)",
    473       "relevance": "Background on ZKP circuit vulnerabilities (under/over-constrained) that motivate correctness requirements"
    474     },
    475     {
    476       "title": "Automated detection of under-constrained circuits in zero-knowledge proofs (QED2)",
    477       "relevance": "Complementary static analysis tool for ZKP correctness; motivates the soundness/completeness test distinction"
    478     },
    479     {
    480       "title": "CYCLE: Learning to Self-Refine the Code Generation",
    481       "relevance": "Related iterative code refinement approach that informs ZK-Coder's interactive repair design"
    482     },
    483     {
    484       "title": "Retrieval-augmented generation for large language models: A survey",
    485       "relevance": "Foundation for the RAG component; ZK-Coder uses exact-match rather than similarity-based retrieval"
    486     }
    487   ],
    488   "engagement_factors": {
    489     "practical_relevance": {
    490       "score": 3,
    491       "justification": "ZK programming is a real and growing bottleneck for blockchain/privacy applications; ZK-Coder directly addresses practitioner pain with demonstrated cost under $0.10/task."
    492     },
    493     "surprise_contrarian": {
    494       "score": 2,
    495       "justification": "The finding that LLMs have near-expert language knowledge but fail catastrophically at constraint specification (<30%) directly challenges the assumption that language proficiency implies domain code generation ability."
    496     },
    497     "fear_safety": {
    498       "score": 1,
    499       "justification": "Under-constrained ZKP circuits can break cryptographic soundness in production systems, but the paper focuses on improving correctness rather than highlighting the security risks of LLM-generated ZKP code."
    500     },
    501     "drama_conflict": {
    502       "score": 0,
    503       "justification": "No controversy, competitive drama, or conflict angle in this paper."
    504     },
    505     "demo_ability": {
    506       "score": 2,
    507       "justification": "The Sudoku ZKP example (Fig. 10-11) and the 10 production case studies provide concrete demonstrations of ZK-Coder generating executable Circom/Noir code."
    508     },
    509     "brand_recognition": {
    510       "score": 1,
    511       "justification": "HKUST is a recognized institution and the paper evaluates OpenAI's GPT-o3/o4-mini models, but no major AI lab is directly behind this work."
    512     }
    513   },
    514   "hn_data": {
    515     "threads": [
    516       {
    517         "hn_id": "43091339",
    518         "title": "DeepSeek Native Sparse Attention",
    519         "points": 16,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=43091339",
    522         "created_at": "2025-02-18T16:17:40Z"
    523       },
    524       {
    525         "hn_id": "43086831",
    526         "title": "Native Sparse Attention: Hardware-Aligned, Natively Trainable Sparse Attention",
    527         "points": 15,
    528         "comments": 2,
    529         "url": "https://news.ycombinator.com/item?id=43086831",
    530         "created_at": "2025-02-18T07:04:47Z"
    531       },
    532       {
    533         "hn_id": "43098140",
    534         "title": "NSA: Hardware-Aligned and Natively Trainable Sparse Attention",
    535         "points": 4,
    536         "comments": 2,
    537         "url": "https://news.ycombinator.com/item?id=43098140",
    538         "created_at": "2025-02-19T03:12:01Z"
    539       },
    540       {
    541         "hn_id": "44304578",
    542         "title": "Serving Large Language Models on Huawei CloudMatrix384",
    543         "points": 3,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=44304578",
    546         "created_at": "2025-06-17T22:18:43Z"
    547       },
    548       {
    549         "hn_id": "45259423",
    550         "title": "Human+AI loops stay stable even with quantization",
    551         "points": 2,
    552         "comments": 1,
    553         "url": "https://news.ycombinator.com/item?id=45259423",
    554         "created_at": "2025-09-16T08:08:10Z"
    555       },
    556       {
    557         "hn_id": "43318708",
    558         "title": "MAML: Towards a Faster Web in Developing Regions",
    559         "points": 2,
    560         "comments": 2,
    561         "url": "https://news.ycombinator.com/item?id=43318708",
    562         "created_at": "2025-03-10T10:03:48Z"
    563       },
    564       {
    565         "hn_id": "46445614",
    566         "title": "Mechanical non-reciprocity programmed by shear jamming in soft composite solids",
    567         "points": 2,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=46445614",
    570         "created_at": "2025-12-31T16:32:15Z"
    571       },
    572       {
    573         "hn_id": "44739937",
    574         "title": "Double Duty: FPGA Architecture to Enable Concurrent LUT and Adder Chain Usage",
    575         "points": 2,
    576         "comments": 0,
    577         "url": "https://news.ycombinator.com/item?id=44739937",
    578         "created_at": "2025-07-30T21:53:00Z"
    579       },
    580       {
    581         "hn_id": "44668806",
    582         "title": "LLMs are Bayesian, in Expectation, not in Realization",
    583         "points": 2,
    584         "comments": 0,
    585         "url": "https://news.ycombinator.com/item?id=44668806",
    586         "created_at": "2025-07-24T09:39:43Z"
    587       },
    588       {
    589         "hn_id": "43773523",
    590         "title": "Robotic Squirrel Pinto: A latched spring actuated robot for jumping and perching",
    591         "points": 2,
    592         "comments": 0,
    593         "url": "https://news.ycombinator.com/item?id=43773523",
    594         "created_at": "2025-04-23T15:51:24Z"
    595       }
    596     ],
    597     "top_points": 16,
    598     "total_points": 50,
    599     "total_comments": 8
    600   }
    601 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs