scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30412B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Holistic Evaluation of State-of-the-Art LLMs for Code Generation",
      6     "authors": [
      7       "Le Zhang",
      8       "Suresh Kothari"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2512.18131",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All claims in abstract (evaluation of 6 LLMs on 944 LeetCode problems across 5 languages, metric categories, DeepSeek-R1/GPT-4.1 outperformance) are demonstrated in results sections with detailed tables and figures.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Paper makes minimal causal claims. Section 4.6 tests prompt engineering effect quasi-experimentally on 202 problems, showing optimization hints reduce algorithmic suboptimality by 2-5% for most models.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Paper evaluates only LeetCode algorithmic problems but generalizes conclusions to 'real-world software development tasks' and 'software engineering' broadly. LeetCode is a narrow, artificial benchmark that doesn't represent production code modification, security-critical systems, or maintainability scenarios.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "When Llama-3.3 underperforms, paper doesn't explore whether this is due to: (a) inherent model weakness, (b) poor prompt fit, (c) suboptimal hyperparameters, or (d) task mismatch. Single explanations presented without alternatives.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Pass@1 directly measures 'percentage of problems solved correctly on first attempt.' Metrics (CE, RE, FF, AS) are direct measurements mapped to explicit definitions; claims about 'correctness' and 'efficiency' align with metric granularity.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No dedicated 'Limitations' or 'Threats to Validity' section. Contamination concern mentioned in Section 3.2 and future work in Section 6, but no systematic review of study limitations.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Contamination mitigation ('selected recent problems', 'avoided discussion content') is specific. But systematic threats—selection bias in 944 problems, prompt format bias, LeetCode's artificial time limits vs. real-world constraints—are not addressed.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Paper states what it evaluates ('944 LeetCode problems, 5 languages') but does NOT explicitly state scope boundaries—what results do NOT show (e.g., no claim about real-world code modification, security, maintainability, multi-turn interactions). Future work section partially addresses this.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding statement anywhere in the paper. Authorship shows Iowa State University affiliation, but no disclosure of grant support, sponsorship, or funding source.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Authors listed as 'Department of Computer Science, Iowa State University, Ames, Iowa, USA.' No apparent financial ties to evaluated model companies (OpenAI, Anthropic, DeepSeek, Meta, Alibaba).",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Iowa State is independent of LLM vendors. No evidence of company funding or incentive to bias results toward any model.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No 'Competing Interests' or 'Conflicts of Interest' statement. No disclosure of patents, equity stakes, consulting arrangements, or financial relationships.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms operationalized through metrics: 'correctness' (Pass@1), 'efficiency' (algorithmic suboptimality), 'robustness' (runtime errors). Metrics are formally defined in Section 3.6.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three explicit objectives stated in Introduction: (1) evaluate coding proficiency across languages, (2) benchmark strengths/weaknesses, (3) provide practical guidance. Contributions are unambiguous.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 reviews prior work (Chen et al. 2021, Pearce et al. 2025, Nijkamp et al. 2022). Section 3.2 explicitly contrasts: 'Previous studies [Döderlein, Coignion] are based on older models... Our study provides more comprehensive and up-to-date evaluation.'",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Section 3.7 states: 'All the artifacts of this study, including our datasets, code, and evaluation results, are available in a public repository anonymously: https://figshare.com/s/26448e92798aab34e407'",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "944 LeetCode problems and their results are publicly released via figshare. LeetCode problems are from a public platform. Raw submission reports available.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No requirements.txt, Dockerfile, Python version specs, or dependency list provided. Paper mentions 'custom LeetCode API' but does not document its environment or dependencies.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Experimental setup is described (prompt format, hyperparameters, model names) but no step-by-step instructions to run the pipeline. How to access figshare, invoke the API, generate solutions for all 6 models, and reproduce results is not documented.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "All results reported as single-point percentages (e.g., 'DeepSeek-R1 Python: 89.30%'). No confidence intervals, error bars, standard deviations, or uncertainty quantification.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Comparative claims ('DeepSeek-R1 consistently outperforms'; 'Llama-3.3 shows lowest Pass@1') are presented without statistical significance tests. Differences could be within noise.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Percentages and absolute differences (e.g., DeepSeek-R1 89.3% vs Llama-3.3 54.8% on Python) are reported. These are effect sizes, though not standardized or confidence-bounded.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "944 problems for large dataset, 202 for small dataset. No power analysis, justification for these counts, or discussion of why these sample sizes are adequate.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
     171           "justification": "All results are single-point estimates per model-language combination. No variance, standard deviation, run-to-run variation, or multiple trials. Each configuration appears to have been run once, so variation from sampling (temperature = 0.1, top-p = 0.95) is not captured.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Six models are evaluated against each other, providing pairwise comparisons. Each model serves as a baseline for others.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
     185           "justification": "All six models are from 2024-2025 releases. Includes general-purpose models (GPT-4.1, Claude-3.7, Llama-3.3, DeepSeek-V3), a reasoning-focused model (DeepSeek-R1), and a code-specialized model (Qwen2.5-Coder). Contemporaneous and representative.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Section 4.6 provides one ablation: optimization prompt ('Optimize the time complexity') tested on 202 problems across all models. Shows 2-5% improvement in algorithmic suboptimality for most models.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Five quality metrics (Pass@1, CE, RE, FF, AS) plus token usage and cost analysis. Multiple dimensions of code quality evaluated.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "No human subjects; LeetCode's automated test cases are the evaluation criterion. Not applicable.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
     209           "justification": "LeetCode provides extensive test cases per problem (the count varies by problem, from 7 to 9,558 cases). Models are evaluated against these unseen test cases on submission.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results broken down by model (6 models), language (5 languages), difficulty tier (Easy/Medium/Hard in Table 2), and metric. Per-language and per-model analysis in Figures 2–9.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": false,
    221           "justification": "Abstract mentions 'common failure scenarios such as syntax errors, logical flaws, and suboptimal algorithms,' but no specific case studies, examples, or detailed failure analysis provided in results.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "All models' results reported fully, including poor performance (e.g., Llama-3.3's 54.8% Pass@1 on Python, 41.6% algorithmic suboptimality on JavaScript). No selective reporting.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "Table 1 lists model names and sizes, but lacks exact snapshot dates for proprietary models. GPT-4.1 and Claude-3.7 are referenced to 2025 announcements but no API version or access date specified. Open-source models are better identified (e.g., Llama-3.3-70B-Instruct).",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Figure 1 shows a complete, concrete prompt example with all components: role specification, problem description, constraints, code snippet template, test cases, and instructions. Not a template with placeholders.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Section 3.3 specifies: top-p = 0.95, temperature = 0.1 for all models. Rationale provided (prior research showing these values optimize Pass@1).",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Prompt scaffolding (role, problem, constraints, code snippet, test cases, instructions) is detailed in Section 3.4 with subsections 3.4.1–3.4.5 and a full example.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": false,
    259           "justification": "Problems are described as '944 manually picked' from LeetCode and '202 selected... based on their sensitivity to algorithmic complexity,' but selection criteria and preprocessing steps are not documented.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Section 3.7 states data and results are available in the public figshare repository. Assuming the repository contains the promised datasets and submission results.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "Data source is clear (LeetCode), but 'manually picked' criteria are undefined. What makes a problem worthy of selection? How were 944 chosen from thousands? No documented procedure.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants; benchmark is standard. Not applicable.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": false,
    285           "justification": "High-level pipeline described (select problems, create prompts, submit to LeetCode API, collect results) but detailed steps missing: How are submission reports parsed? How are metrics computed from reports? No pseudocode or detailed algorithm.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "Section 3.2 discusses contamination risk but does not state training data cutoff dates for any model. DeepSeek-R1 (2025), GPT-4.1 (2025), Claude-3.7 (2025)—exact cutoffs unknown.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "Section 3.2 explicitly addresses: 'possibility that some problems or their corresponding solutions may have appeared in the pretraining corpora.' Mitigation strategy: 'intentionally selected relatively recent problems... avoided using any LeetCode discussion content.'",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "Contamination is acknowledged as a risk. Mitigations are described (recent problem selection, no discussion forum content, unseen task treatment). However, 'complete elimination of contamination cannot be guaranteed,' per the text.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human subjects. Not applicable.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human subjects. Not applicable.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human subjects. Not applicable.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human subjects. Not applicable.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human subjects. Not applicable.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human subjects. Not applicable.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human subjects. Not applicable.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": true,
    357           "justification": "Section 4.7.2 reports total costs in USD per model: DeepSeek-R1 (regular $81.75, off-peak $20.52), GPT-4.1 ($44.41), Claude-3.7 ($108.65), etc. Also Figure 8 shows token usage breakdown.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Section 4.7 reports token consumption (input and output tokens per model) and API costs. DeepSeek-R1 uses 36.52M output tokens; GPT-4.1 uses 4.74M. Cost data implicitly captures compute budget.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "DeepSeek-R1 and GPT-4.1 consistently outperform other models in correctness, efficiency, and robustness",
    372       "evidence": "Pass@1 scores: DeepSeek-R1 86.97–89.30% across languages; GPT-4.1 83.26–86.55%. Algorithmic suboptimality (AS): DeepSeek-R1 1.38–2.23%; GPT-4.1 2.54–4.77%. Runtime errors (RE): GPT-4.1 lowest across most languages.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Algorithmic suboptimality is common, especially in Llama-3.3, which defaults to brute-force solutions",
    377       "evidence": "Llama-3.3 AS: 7.63–10.49% vs. DeepSeek-R1 1.38–2.23%. Figure 6 shows Llama-3.3 consistently highest AS across all 5 languages. No direct evidence of 'brute-force' reasoning, only outcome-level observation.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Prompt engineering with optimization hints significantly improves algorithmic efficiency",
    382       "evidence": "Section 4.6: Adding 'Optimize the time complexity' hint reduces AS 2–5% for most models on 202-problem small dataset. DeepSeek-R1 shows consistent improvement across all languages (2–4% reduction).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Python3 and JavaScript yield fewer compile and runtime errors across all models, while stricter languages like C++ and Go reveal more limitations",
     387       "evidence": "Figure 3 (CE): Python and JavaScript show 0% CE for all models; Go shows 2.01–7.10% CE. Figure 4 (RE): Go shows strong performance; JavaScript shows higher RE (1.91–4.34%) than Go. The language difficulty gradient is clear for compile errors but is not borne out for runtime errors.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Larger or more specialized models tend to perform better in programming evaluations",
    392       "evidence": "Larger models (DeepSeek-R1 671B, GPT-4.1 1800B) show higher Pass@1 (86–89%) than Llama-3.3 (52–55%). Qwen2.5-Coder (32B, code-specialized) outperforms Llama-3.3 (70B, general-purpose) in Pass@1 and FF metrics.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "DeepSeek-R1's reasoning capabilities provide advantages in efficiency-sensitive tasks compared to DeepSeek-V3",
    397       "evidence": "DeepSeek-R1 AS: 1.38–2.23%; DeepSeek-V3 AS: 2.65–3.28%. DeepSeek-R1 explicitly designed for 'complex reasoning.' Figure 6 shows consistent advantage across languages.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Prompt responsiveness to optimization cues varies significantly across models; Llama-3.3 shows minimal improvement despite starting from worst baseline",
    402       "evidence": "Figure 7 (optimization hint experiment): DeepSeek-R1 reduces AS 1.98–3.47pp; Llama-3.3 reduces only 1.49–2.49pp despite starting at 40–44% AS. Claude-3.7 shows nearly no improvement.",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Successful LLM deployment in software engineering requires human oversight, not autonomous code generation",
    407       "evidence": "Conclusions (5.1, 5.2): 'Even top models occasionally fail on complex problems or edge cases, underscoring the need for human oversight.' Recommendations: 'Treat LLM-generated code as a first draft. Always review for logical correctness, edge cases, and efficiency.'",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "observational"
    414   ],
    415   "key_findings": "This paper evaluates six state-of-the-art LLMs on 944 LeetCode problems across five programming languages, using five quality metrics. DeepSeek-R1 and GPT-4.1 achieve 86–89% Pass@1 (first-attempt correctness), while Llama-3.3 achieves only 52–55%. Models are most proficient in dynamically-typed languages (Python, JavaScript) and struggle with strict syntactic constraints in Go and C++. Algorithmic efficiency is a widespread weakness: Llama-3.3 shows 7.6–10.5% time-limit exceeded errors, indicating brute-force solutions. Prompt engineering with optimization hints improves performance modestly (2–5%), but responsiveness varies by model. DeepSeek-R1's reasoning-optimized design provides measurable efficiency advantages over its non-reasoning variant. Even top models fail on edge cases, underscoring the need for human code review.",
    416   "red_flags": [
    417     {
    418       "flag": "No statistical significance testing",
    419       "detail": "All results reported as single-point percentages without confidence intervals, p-values, or variance estimates. Claimed differences (e.g., 2–3pp between models) may be within measurement noise on a 944-problem sample."
    420     },
    421     {
    422       "flag": "Overgeneralization from benchmark to real-world",
    423       "detail": "LeetCode problems are algorithmic, isolated, and artificial. Conclusions extrapolate to 'real-world software development,' ignoring code maintenance, security, refactoring, and multi-turn interactions absent from the evaluation."
    424     },
    425     {
    426       "flag": "No dedicated limitations section",
    427       "detail": "Methodological threats (selection bias in 944 problems, LeetCode time-limit realism, contamination risk) are scattered or unaddressed. No explicit statement of what results do NOT show."
    428     },
    429     {
    430       "flag": "Contamination risk unresolved",
    431       "detail": "Paper acknowledges LeetCode problems may have appeared in training corpora but provides no empirical evidence that 'recent problem' selection eliminated this risk. Training cutoff dates not disclosed for any model."
    432     },
    433     {
    434       "flag": "Limited reproducibility documentation",
    435       "detail": "While code and data are released via figshare, no step-by-step reproduction instructions, environment specifications, or Dockerfile provided. Custom LeetCode API is undocumented."
    436     },
    437     {
    438       "flag": "Insufficient data collection transparency",
    439       "detail": "944 problems are 'manually picked' with undefined criteria. How were these selected from thousands? What selection bias might this introduce?"
    440     },
    441     {
    442       "flag": "Minimal ablation study",
    443       "detail": "Only one ablation (optimization prompt) tested on 202 problems. Other design choices (hyperparameters, prompt structure) not systematically tested."
    444     },
    445     {
    446       "flag": "No funding or conflict-of-interest disclosure",
    447       "detail": "Paper includes no statement of funding source or competing interests. No acknowledgment of financial relationships with model vendors."
    448     },
    449     {
    450       "flag": "Lack of failure case analysis",
    451       "detail": "Mentions 'syntax errors, logical flaws, suboptimal algorithms' as failure types but provides zero case studies, code excerpts, or detailed failure analysis to ground findings."
    452     },
    453     {
    454       "flag": "Model version ambiguity for proprietary models",
    455       "detail": "GPT-4.1 and Claude-3.7 lack exact API version, snapshot date, or access timing. Results may be sensitive to model updates within 2025."
    456     }
    457   ],
    458   "cited_papers": [
    459     {
    460       "title": "Evaluating Large Language Models Trained on Code",
    461       "relevance": "Foundational benchmark study (Chen et al. 2021, Codex evaluation); prior work on LLM code generation methodologies"
    462     },
    463     {
    464       "title": "A Performance Study of LLM-Generated Code on LeetCode",
    465       "relevance": "Direct precedent for LeetCode-based evaluation; prior hyperparameter study (Coignion et al. 2024)"
    466     },
    467     {
    468       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    469       "relevance": "Multi-turn code generation paradigm; contrasts with this paper's Pass@1 single-shot evaluation (Nijkamp et al. 2022)"
    470     },
    471     {
    472       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    473       "relevance": "Security vulnerabilities in LLM-generated code; related evaluation dimension (Pearce et al. 2025)"
    474     },
    475     {
    476       "title": "Proving the Coding Interview: A Benchmark for Formally Verified Code Generation",
    477       "relevance": "Formal verification of LLM-generated code; alternative evaluation criterion beyond runtime testing (Dougherty & Mehta 2025)"
    478     },
    479     {
    480       "title": "The Curious Case of Neural Text Degeneration",
    481       "relevance": "Foundational work on top-p and temperature hyperparameters; cited for decoding parameter justification (Holtzman et al. 2019)"
    482     },
    483     {
    484       "title": "LeetCodeDataset: A Temporal Dataset for Robust Evaluation and Efficient Training of Code LLMs",
    485       "relevance": "Contemporary temporal dataset for code LLM evaluation; time-aware benchmark construction (Xia et al. 2025)"
    486     },
    487     {
    488       "title": "A Survey on Large Language Models for Code Generation",
    489       "relevance": "Comprehensive survey of NL2Code paradigm and code generation capabilities (Jiang et al. 2024)"
    490     }
    491   ],
    492   "engagement_factors": {
    493     "practical_relevance": {
    494       "score": 3,
    495       "justification": "Practitioners evaluating LLM adoption for code generation get direct model comparisons, cost analysis, and per-language performance. Actionable for tool selection and prompt design."
    496     },
    497     "surprise_contrarian": {
    498       "score": 1,
    499       "justification": "Results confirm conventional expectations: larger/specialized models outperform smaller ones, strict languages are harder, prompt engineering helps. No counterintuitive findings."
    500     },
    501     "fear_safety": {
    502       "score": 0,
    503       "justification": "Paper does not address security vulnerabilities, alignment, or AI safety concerns. Focuses on code correctness only, not risk."
    504     },
    505     "drama_conflict": {
    506       "score": 0,
    507       "justification": "Straightforward benchmark study with no controversy or competing narratives. Clear winner (DeepSeek-R1) without debate."
    508     },
    509     "demo_ability": {
    510       "score": 3,
    511       "justification": "Results highly reproducible: select any LeetCode problem, run six models via free/paid APIs, compare outputs. Readers can verify findings immediately."
    512     },
    513     "brand_recognition": {
    514       "score": 2,
    515       "justification": "Iowa State University is reputable but not a household name. Study is arXiv preprint, not published in top-tier venue (ICLR, NeurIPS, ACL). Authors less known."
    516     }
    517   },
    518   "hn_data": {
    519     "threads": [
    520       {
    521         "hn_id": "42557775",
    522         "title": "Mulberry: Empowering MLLM with o1-like Reasoning",
    523         "points": 3,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=42557775",
    526         "created_at": "2024-12-31T10:22:36Z"
    527       },
    528       {
    529         "hn_id": "46425525",
    530         "title": "Optimal Software Pipelining and Warp Specialization for Tensor Core GPUs",
    531         "points": 2,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=46425525",
    534         "created_at": "2025-12-29T20:54:07Z"
    535       },
    536       {
    537         "hn_id": "46069881",
    538         "title": "Conformal Prediction for Compositional Data",
    539         "points": 2,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=46069881",
    542         "created_at": "2025-11-27T15:03:53Z"
    543       },
    544       {
    545         "hn_id": "46363503",
    546         "title": "Layout-Aware Text Editing for Efficient Conversion of Academic PDFs to Markdown",
    547         "points": 1,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=46363503",
    550         "created_at": "2025-12-23T08:26:53Z"
    551       },
    552       {
    553         "hn_id": "43150514",
    554         "title": "Intuitive physics understanding emerges from self-supervised pretraining",
    555         "points": 1,
    556         "comments": 1,
    557         "url": "https://news.ycombinator.com/item?id=43150514",
    558         "created_at": "2025-02-23T16:27:24Z"
    559       },
    560       {
    561         "hn_id": "46021507",
    562         "title": "World-in-World: World Models in a Closed-Loop World",
    563         "points": 1,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=46021507",
    566         "created_at": "2025-11-23T07:25:35Z"
    567       }
    568     ],
    569     "top_points": 3,
    570     "total_points": 10,
    571     "total_comments": 1
    572   }
    573 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs