scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24660B)
      1 {
      2   "paper": {
      3     "title": "Reducing Hallucinations in LLM-Generated Code via Semantic Triangulation",
      4     "authors": ["Yihan Dai", "Sijie Liang", "Haotian Xu", "Peichu Xie", "Sergey Mechtaev"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2511.12288",
      8     "doi": "10.48550/arXiv.2511.12288"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper states 'All code, data, and mechanized proofs are available at http://github.com/msv-lab/just-tri-it/' (Section 1)."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper states all code and data are available at the GitHub link. They also use public benchmarks (LiveCodeBench and CodeElo)."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No mention of environment specifications, requirements.txt, Dockerfile, or dependency versions in the paper text."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no instructions on how to run experiments."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Results in Figures 9, 10, 11, and 12 report point estimates (percentages) without confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims just-tri-it outperforms baselines but no statistical significance tests are reported. Comparisons are based on raw percentage differences."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports effect sizes in context, e.g., 'increases reliability of generated code by 21%' (abstract), '29% on average' (RQ1), percentage improvements with baseline values throughout Section 6."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The sample size of 30 programs is justified via entropy stabilization analysis (Appendix C, Figure 14), showing semantic entropy plateaus around N=30."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines compared: Plurality, Majority0.5, CodeT, Postcondition, Syntactic, OffByOne (Section 6)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include CodeT (2023), postcondition autoformalization, and majority voting approaches. These are contemporary and relevant methods for sample consensus."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "RQ4 (Section 6.4) provides an ablation study showing contributions of individual triangulation schemes (FWD-INV, FWD-SINV, ENUM-SINV) and the impact of removing bijective mapping constraints."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics used: Reliable Accuracy, Overall Accuracy, Abstention Rate, Abstention F1-score, and conditional probability of correctness under agreement (Figure 8)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation is included. All evaluation is automated via test suites and correctness judges."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "LiveCodeBench v6 uses problems published Feb-Apr 2025 to ensure contamination-free evaluation. The angelic value threshold was tuned on MBPP (a separate benchmark), not on the test benchmarks."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per-method (Figures 9, 10, 11), per triangulation scheme (Figure 12), and separately for exact (LCB) vs inexact (CEI) problems."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 7 (Threats to Validity) discusses where the approach fails: boolean predicates with intractable domains, optimization problems, and errors in LLM-generated transformations."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that Syntactic and OffByOne transformations 'showed no significant difference compared to plurality' (Section 6.2 footnote). The ablation shows 23% drop when bijective mapping is removed."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of 21% reliable accuracy improvement and selection at 0.14 probability are supported by results in Section 6 (Figures 9 and 11) and by the motivating example (Figure 3, Section 2.3)."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims like 'triangulation increases confidence' are justified through theoretical proofs (Propositions 4.2, 4.3) and controlled ablation experiments (RQ4) that isolate components."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 7 explicitly bounds generalization: 'limited generalization beyond programming-contest problems', mentions that programs interacting with users/environment may need different methods, and notes results may vary across LLM architectures."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 7 discusses alternative explanations: transformation errors from LLM prompting, the possibility that reasoning models may shift program distribution structures, and that not all problems are naturally invertible."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper carefully defines its metrics (reliable accuracy, overall accuracy, abstention F1) in Figure 8 and measures exactly what it claims — correctness of code selection and abstention decisions, not broader 'code quality' claims."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper uses 'GPT-4o' and 'DeepSeek-V3' without specifying exact API versions or snapshot dates. Only marketing names are given, without version identifiers."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Figure 5 shows a prompt template fragment for SINV transformation. The paper states prompts are used for problem transformation and code generation, and the full prompts would be in the released code repository."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Temperature set to 1.0, sample size N=30, angelic threshold T=1/3 (tuned on MBPP pilot study). Reported in Section 6 and Appendix C."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The just-tri-it pipeline is described in detail: problem transformation via LLM prompting, sampling, RANSAC consensus, cascading triangulation, and the STREAM meta-method (Section 5)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "CodeElo-Inexact subset selection is documented: 'we systematically selected 31 inexact problems from CodeElo, choosing those where the acceptability of multiple solutions is explicitly stated' (Section 6). LCB v6 segment selection is also described."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 7 'Threats to Validity' provides a dedicated discussion of limitations across multiple paragraphs."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats discussed: limited to programming-contest problems, boolean predicates have intractable inversions, optimization problems are hard to invert, LLM-generated transformations may introduce errors (Section 7)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper states it focuses on 'generating individual functions... rather than whole projects interacting with users and environment' (Assumption 2), and bounds its scope to pure deterministic functions."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper states 'All code, data, and mechanized proofs are available at http://github.com/msv-lab/just-tri-it/' suggesting raw data is available."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Data collection described: LCB v6 (175 problems, Feb-Apr 2025), CEI (31 inexact problems from CodeElo with manually written correctness judges verified against CodeForces submissions). 100 trials for probability estimation (Figure 2)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants; data comes from standard benchmarks (LiveCodeBench, CodeElo)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: sample 30 programs per problem, transform problems via LLM, sample witnesses, check hyperproperties, apply RANSAC consensus, with the angelic/demonic value handling detailed in Section 5.3 and Figure 7."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information is disclosed. The acknowledgements (Section 10) thank individuals for feedback but do not mention funding sources."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Peking University, Beijing Forestry University, and Independent."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates stated for GPT-4o or DeepSeek-V3."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The paper explicitly addresses this by using LiveCodeBench v6 (problems from Feb-Apr 2025) 'to ensure contamination-free evaluation' (Section 6)."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Using the most recent LCB segment (published Feb-Apr 2025) specifically mitigates contamination. The paper also notes that 'traditional code generation benchmarks HumanEval and MBPP are reaching their saturation' (Section 6)."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The method samples 30 programs plus additional witnesses per problem via GPT-4o/DeepSeek-V3 API calls, but no cost, latency, or token consumption is reported."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget, API spend, or hardware details are stated despite significant API usage (30 samples + witnesses for 175+31 problems across 2 models)."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Results are reported as single-run point estimates. No seed sensitivity analysis or multi-seed results."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "100 trials for probability estimation (Figure 2) and 30 samples per problem are explicitly stated (Section 6, Appendix C)."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The angelic threshold T=1/3 was tuned on MBPP but no search budget (how many values tried) is reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Configuration selection is justified: sample size via entropy analysis (Figure 14), threshold via pilot study on MBPP (a separate benchmark, not the test set), temperature at default 1.0 with justification for challenging benchmarks."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Multiple comparisons across methods, benchmarks, and models without any correction for multiple testing."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors implement all baselines themselves (CodeT, Postcondition, etc.) without acknowledging potential bias in their implementations of competitor methods."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "just-tri-it requires significantly more compute than baselines (multiple problem transformations, additional sampling for witnesses) but this cost difference is not discussed or controlled for."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper discusses construct validity by noting LCB excludes inexact problems and introducing CEI to fill this gap. It also discusses the limitation that programming-contest problems may not generalize (Section 7)."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "The paper evaluates its own tool (just-tri-it) as a post-generation selection method, not comparing models across different scaffolds. The scaffold IS the thing being tested."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "LCB v6 uses problems from Feb-Apr 2025 to ensure they postdate model training. The paper explicitly states this is 'to ensure contamination-free evaluation' (Section 6)."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks answer information through context or whether prompts contain implicit hints."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether LCB or CEI problems share structural similarities with training data problems."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Temporal splitting is used (recent LCB segment) but no concrete leakage detection method (canary strings, membership inference, etc.) is applied."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Semantic triangulation yields plausibility witnesses that increase the probability of correctness by 29% over the strongest baseline on average.",
    363       "evidence": "Figure 10 shows conditional probability of correctness under agreement: ENUM-SINV 91.4%, FWD-INV 88.7%, FWD-SINV 88.3% vs. Postcondition 63.6% and Another solution 59.2% on 175 LCB problems with GPT-4o.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "just-tri-it increases reliable accuracy by 21% compared to Majority0.5 (probability threshold ≥0.5).",
    368       "evidence": "Figure 9: GPT-4o LCB reliable accuracy 60.0% (just-tri-it) vs 52.9% (Majority0.5); DeepSeek-V3 76.0% vs 69.8%. Average improvement ~11% on LCB, higher on CEI.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "just-tri-it can select correct solutions with sampling probabilities as low as 0.14.",
    373       "evidence": "The motivating example (Figure 3, Section 2.3) demonstrates selection of a correct solution with probability 0.07 in the presence of a dominant error with probability 0.23 for CodeElo problem 1999D.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "On inexact problems, just-tri-it achieves 76% higher reliable accuracy than Majority0.5 on average.",
    378       "evidence": "Figure 11: GPT-4o CEI 44.4% vs 40.0%; DeepSeek-V3 30.0% vs 12.5%. The 76% average improvement is mainly driven by the DeepSeek-V3 gap.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Semantic triangulation provably outperforms plurality voting under stochastic parrot and correlated errors assumptions.",
    383       "evidence": "Propositions 4.2 and 4.3 with proofs, mechanized in Lean. The proof uses the rearrangement inequality to show bijective error mapping reduces matching bug probability.",
    384       "supported": "strong"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval", "theoretical"],
    388   "key_findings": "The paper introduces semantic triangulation, a framework that transforms coding problems via partial inversion and answer enumeration to detect LLM code hallucinations. On LiveCodeBench and CodeElo benchmarks with GPT-4o and DeepSeek-V3, the approach achieves ~90% conditional correctness probability under agreement vs ~60% for baselines. It enables reliable code selection even at low sampling probabilities (0.14) and on inexact problems with multiple valid solutions, where prior methods fail. Theoretical proofs (mechanized in Lean) establish that triangulation strictly improves upon plurality voting under correlated error assumptions.",
    389   "red_flags": [
    390     {
    391       "flag": "No cost/compute reporting",
    392       "detail": "The method requires sampling 30 programs plus additional witnesses (enumerators, inverses) per problem, likely costing many times more than baselines. No cost comparison is provided, making practical applicability hard to assess."
    393     },
    394     {
    395       "flag": "No error bars or significance tests",
    396       "detail": "All results are point estimates without confidence intervals or statistical significance tests. Given the relatively small benchmark sizes (175 LCB, 31 CEI), differences may not be statistically significant."
    397     },
    398     {
    399       "flag": "Very small inexact benchmark",
    400       "detail": "The CodeElo-Inexact benchmark has only 31 problems with manually written judges. Claims about inexact problem handling rest on this small sample, making percentage comparisons unreliable."
    401     },
    402     {
    403       "flag": "Self-implemented baselines",
    404       "detail": "All baselines are re-implemented by the authors. The paper does not use official CodeT implementations or acknowledge potential bias in baseline implementation."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Evaluating large language models trained on code",
    410       "authors": ["Mark Chen"],
    411       "year": 2021,
    412       "arxiv_id": "2107.03374",
    413       "relevance": "Introduced HumanEval benchmark and Pass@k metrics for LLM code generation evaluation."
    414     },
    415     {
    416       "title": "CodeT: Code Generation with Generated Tests",
    417       "authors": ["Bei Chen"],
    418       "year": 2023,
    419       "relevance": "Key baseline using dual execution agreement between sampled programs and tests for consensus."
    420     },
    421     {
    422       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    423       "authors": ["Naman Jain"],
    424       "year": 2025,
    425       "relevance": "Primary benchmark used for evaluation, designed for contamination-free LLM code assessment."
    426     },
    427     {
    428       "title": "CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings",
    429       "authors": ["Shanghaoran Quan"],
    430       "year": 2025,
    431       "arxiv_id": "2501.01257",
    432       "relevance": "Source of the inexact problems benchmark (CEI) used for multi-solution evaluation."
    433     },
    434     {
    435       "title": "Correlated Errors in Large Language Models",
    436       "authors": ["Elliot Myunghoon Kim"],
    437       "year": 2025,
    438       "relevance": "Provides empirical evidence for correlated LLM errors and abstention evaluation metrics used in this paper."
    439     },
    440     {
    441       "title": "Exploring and evaluating hallucinations in LLM-powered code generation",
    442       "authors": ["Fang Liu"],
    443       "year": 2024,
    444       "relevance": "Studies hallucination patterns in LLM code generation, directly relevant to the problem addressed."
    445     },
    446     {
    447       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    448       "authors": ["Hammond Pearce"],
    449       "year": 2022,
    450       "relevance": "Documents security vulnerabilities in LLM-generated code, motivating the need for hallucination detection."
    451     },
    452     {
    453       "title": "Can large language models transform natural language intent into formal method postconditions?",
    454       "authors": ["Madeline Endres"],
    455       "year": 2024,
    456       "relevance": "Baseline approach using autoformalized specifications for code verification."
    457     },
    458     {
    459       "title": "Program synthesis with large language models",
    460       "authors": ["Jacob Austin"],
    461       "year": 2021,
    462       "arxiv_id": "2108.07732",
    463       "relevance": "Introduced MBPP benchmark used for hyperparameter tuning in this paper."
    464     },
    465     {
    466       "title": "Calibration and Correctness of Language Models for Code",
    467       "authors": ["Claudio Spiess"],
    468       "year": 2025,
    469       "relevance": "Studies calibration and confidence measures for LLM code generation, related to uncertainty quantification."
    470     },
    471     {
    472       "title": "Incoherence as Oracle-less Measure of Error in LLM-Based Code Generation",
    473       "authors": ["Thomas Valentin"],
    474       "year": 2025,
    475       "relevance": "Proposes uncertainty measure for code correctness that assumes unique correct solutions, a limitation addressed by triangulation."
    476     }
    477   ]
    478 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs