scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35719B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On Evaluating the Efficiency of Source Code Generated by LLMs",
      6     "authors": [
      7       "Changan Niu",
      8       "Ting Zhang",
      9       "Chuanyi Li",
     10       "Bin Luo",
     11       "Vincent Ng"
     12     ],
     13     "year": 2024,
     14     "venue": "AI Foundation Models and Software Engineering (FORGE '24)",
     15     "arxiv_id": "2404.06041",
     16     "doi": "10.1145/3650105.3652295"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims that efficiency is evaluated, that simple prompts help basic problems, and that chain-of-thought helps complex problems. These are supported by Tables 2-4 in the paper.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims 'training strategy and data have an impact on the efficiency of the generated code' based on comparing DeepSeek Coder base vs instruct, but this is a single observational comparison with many confounding differences between the two versions. No controlled experiment isolates training strategy.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims to evaluate 'Source Code Generated by LLMs' broadly, but HumanEval/MBPP are Python-only and LeetCodeEval is C++-only. Claims about 'LLMs' are based on a limited set of models. These scope limitations are not explicitly bounded in the claims.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for key findings. For example, why GPT-3.5 generates more efficient code than GPT-4 could relate to RLHF training, code verbosity preferences, or other factors — none of which are discussed.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper equates 'efficiency' with runtime but does not acknowledge that efficiency could encompass memory usage, energy consumption, or code maintainability. The proxy gap between 'runtime' and the broader 'efficiency' framing is not discussed.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 3 'THREATS TO VALIDITY' discusses potential data leakage and unstable runtime as threats, with specific mitigation strategies described.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The threats section discusses study-specific issues: (1) data leakage because training data contents are unknown, mitigated by temporal filtering for LeetCodeEval, and (2) unstable runtime, mitigated by gem5 simulator and repeated runs.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of scope limitations such as generalizability to other programming languages, problem types, or non-tested models.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The Acknowledgments section lists 'Cooperation Fund of Huawei-NJU Creative Laboratory for the Next Programming, CCF-Huawei Populus Grove Fund, NSF award 2034508.'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed: Nanjing University, Singapore Management University, and University of Texas at Dallas. None of the authors are affiliated with the companies whose models are evaluated.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Huawei funds the research but none of Huawei's own LLM products are evaluated. The paper evaluates OpenAI, Meta, Microsoft, and DeepSeek models. NSF is a government funder with no stake in results.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is included in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "'Efficiency' is defined as runtime execution time, but 'normalized runtime' is introduced in Section 2.1.4 without motivation—why normalize across all models instead of reporting absolute values? 'More efficient code' conflates algorithmic and runtime efficiency without precision.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three contributions stated explicitly: (1) evaluate efficiency of LLM code, (2) propose LeetCode-based benchmark, (3) investigate prompting for efficiency. Intent is clear.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Related work section (Section 4) cites DeepDev-PERF, Madaan et al., Self-Refine, and code quality work. Situates this work as extending prior correctness/quality focus to efficiency. Could be deeper but adequate.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper states 'We also make code, data and other artifacts available online [1]' with reference [1] pointing to https://github.com/NougatCA/EfficiencyEval.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper claims to release code, data, and artifacts online. HumanEval and MBPP are publicly available standard benchmarks, and the newly constructed LeetCodeEval is included in the artifact release.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment specification (requirements.txt, Dockerfile, library versions) is provided in the paper. They mention using the gem5 CPU simulator but do not specify software environment details.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself lacks instructions for replicating the experiments.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 2, 3, and 4 report only point estimates (normalized runtime, Pass@10, speedup) with no confidence intervals or error bars despite repeated measurements.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper makes comparative claims ('code generated by the former is not as efficient as the latter') based solely on comparing numbers without any statistical significance tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The main RQ1 results (Table 2, Table 3) report normalized runtimes and Pass@10 without contextualizing the magnitude of differences. The speedup ratios in Table 4 (RQ2) provide relative magnitudes but the primary comparative claims lack formal effect sizes.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is given for the number of problems (164 HumanEval, 399 MBPP, 166 LeetCodeEval), the number of models tested, or k=10 generations. No power analysis is discussed.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The paper states 'we repeat the execution of each piece of code for 10 times and take the average runtime' and 'repeat the submission of each piece of code for 3 times and record the average results' but never reports standard deviation or any spread measure for these repeated runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple LLMs are compared against each other: GPT-4, GPT-3.5, Phi-2, Code Llama (7B/13B/34B), WizardCoder (7B/13B/34B), and DeepSeek Coder (base/instruct). Each serves as a baseline for the others.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The models evaluated (GPT-4, GPT-3.5, Code Llama, WizardCoder, DeepSeek Coder, Phi-2) were all state-of-the-art or near-SOTA at the time of writing (late 2023/early 2024).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": false,
    193           "answer": false,
    194           "justification": "This is a benchmark evaluation paper comparing existing models, not proposing a system with components to ablate. The RQ2 prompting experiments compare different strategies but are not ablations of a single system.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports Pass@10 (correctness), average normalized runtime (efficiency), and percentage beats (LeetCode-specific efficiency). Multiple complementary metrics are used.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not relevant to the paper's claims about code execution efficiency, which is measured automatically via runtime.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The paper uses established benchmarks (HumanEval, MBPP) and a newly constructed LeetCodeEval as evaluation sets. No tuning or selection is performed against these benchmarks — models are evaluated as-is.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by benchmark (HumanEval vs MBPP vs LeetCodeEval) and by difficulty level within LeetCodeEval (easy, medium, hard). Table 2 and Table 3 show separate results per dataset.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Failed problems are simply excluded from runtime analysis. The paper notes how many problems every model solves (70 on HumanEval, 0 on hard LeetCodeEval) but does not discuss what kinds of problems models fail on or why.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that prompting has minimal effect on simple benchmarks (HumanEval/MBPP speedups near 1.0), that DeepSeek Coder shows almost no improvement from prompting (1.00-1.01x on some benchmarks), and that no model could solve enough hard LeetCode problems for comparison.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "The paper specifies exact API model IDs: 'gpt-3.5-turbo-1106 and gpt-4-1106-preview'. Open-source models are identified with specific sizes (Phi-2 2.7B, Code Llama 7B/13B/34B, WizardCoder 7B/13B/34B, DeepSeek Coder 33B base/instruct).",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 2 provides the LeetCodeEval prompt template, and Figure 3 provides the three prompting strategies for RQ2. For HumanEval/MBPP, they reference Liu et al.'s published source code for prompt generation. The fill values (problem descriptions, constraints) come from publicly available benchmarks.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported for any model. Only k (number of generations: 10 for HumanEval/MBPP, 3 for LeetCodeEval) is stated.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The paper evaluates direct LLM code generation from prompts.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "For LeetCodeEval, the paper documents the filtering pipeline: select problems from May 2023+, filter out problems with images, filter out problems with more downvotes than upvotes, divide by difficulty. Table 1 provides dataset statistics.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The paper states 'We also make code, data and other artifacts available online [1]' at a GitHub repository (https://github.com/NougatCA/EfficiencyEval), suggesting raw data is available for verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "For LeetCodeEval, the paper describes collection of URLs, titles, descriptions, examples, constraints, and code templates from LeetCode, with specific filtering criteria (post-May 2023, no images, positive vote ratio). HumanEval and MBPP are standard benchmarks with known provenance.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard benchmarks (HumanEval, MBPP) and a publicly available programming platform (LeetCode).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline is documented in Section 2.1.2: generate k responses → test correctness → select first passing code → measure runtime via gem5 (10 repetitions) for HumanEval/MBPP, and generate 3 codes → submit to LeetCode → record correctness and runtime (3 repetitions) for LeetCodeEval.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The paper mentions GPT-4's knowledge cutoff as May 2023 for LeetCodeEval construction but acknowledges 'we are unable to get the data cut-offs for the other model.' Training cutoffs are not systematically stated for all evaluated models.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section 3 states: 'Potential data leakage is a threat to construct validity because we can not know if the data used for evaluation is present in the training data of models.' The paper acknowledges this risk explicitly.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The paper takes a concrete mitigation step: constructing LeetCodeEval with problems from May 2023+ (after GPT-4's training cutoff) specifically to reduce contamination risk. However, contamination for HumanEval and MBPP (published 2021) is only acknowledged, not mitigated.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No human participants.",
    316         "source": "haiku",
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in the study.",
    321           "source": "opus"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in the study.",
    327           "source": "opus"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in the study.",
    333           "source": "opus"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in the study.",
    339           "source": "opus"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in the study.",
    345           "source": "opus"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in the study.",
    351           "source": "opus"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants in the study.",
    357           "source": "opus"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No API costs, token counts, or inference latency are reported despite using commercial APIs (GPT-3.5, GPT-4) and generating thousands of code samples.",
    365           "source": "opus"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No total computational budget is stated. The paper uses the gem5 CPU simulator and OpenAI APIs but does not quantify the total compute resources used.",
    371           "source": "opus"
    372         }
    373       },
    374       "experimental_rigor": {
    375         "seed_sensitivity_reported": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "No random seed sensitivity analysis is reported. The paper generates k responses per problem but does not discuss seed variation or its effect on results.",
    379           "source": "opus"
    380         },
    381         "number_of_runs_stated": {
    382           "applies": true,
    383           "answer": true,
    384           "justification": "The paper states 'we repeat the execution of each piece of code for 10 times' on gem5 and 'repeat the submission of each piece of code for 3 times' on LeetCode.",
    385           "source": "opus"
    386         },
    387         "hyperparameter_search_budget": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "No hyperparameter search budget is reported. Generation parameters (temperature, top-p) are not even stated, let alone any search over them.",
    391           "source": "opus"
    392         },
    393         "best_config_selection_justified": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "The choice of k=10 for HumanEval/MBPP and k=3 for LeetCodeEval is not justified. The selection of 'first passing code' for efficiency evaluation is described but the rationale for this specific selection strategy is not discussed.",
    397           "source": "opus"
    398         },
    399         "multiple_comparison_correction": {
    400           "applies": false,
    401           "answer": false,
    402           "justification": "No statistical significance tests are performed at all, so the question of correction for multiple comparisons does not arise.",
    403           "source": "opus"
    404         },
    405         "self_comparison_bias_addressed": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "The paper does not discuss potential evaluation bias. While they evaluate third-party models rather than their own system, they do not acknowledge potential biases in their evaluation methodology (e.g., prompt template design favoring certain models).",
    409           "source": "opus"
    410         },
    411         "compute_budget_vs_performance": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper compares models of vastly different compute requirements (2.7B Phi-2 vs GPT-4) without discussing or normalizing for computational cost. No performance-vs-compute analysis is provided.",
    415           "source": "opus"
    416         },
    417         "benchmark_construct_validity": {
    418           "applies": true,
    419           "answer": true,
    420           "justification": "The paper argues that HumanEval/MBPP have limited test cases that don't adequately reveal efficiency differences, motivating LeetCodeEval: 'Comprehensive test cases on LeetCode can make the runtime benefits of code with real less complexity more significant, and thus more accurately reflect the efficiency.' This is an explicit construct validity argument.",
    421           "source": "opus"
    422         },
    423         "scaffold_confound_addressed": {
    424           "applies": false,
    425           "answer": false,
    426           "justification": "No scaffolding or agentic tools are involved. Models are evaluated via direct prompting.",
    427           "source": "opus"
    428         }
    429       },
    430       "data_leakage": {
    431         "temporal_leakage_addressed": {
    432           "applies": true,
    433           "answer": true,
    434           "justification": "The paper explicitly addresses temporal leakage by constructing LeetCodeEval with problems from 'May 2023 and later (this is the latest GPT-4 knowledge cut-off)' to avoid training data contamination.",
    435           "source": "opus"
    436         },
    437         "feature_leakage_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether the evaluation setup leaks answer information through prompt context, code templates, or test case design.",
    441           "source": "opus"
    442         },
    443         "non_independence_addressed": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No discussion of whether HumanEval, MBPP, or LeetCode problems share structural similarities with training data beyond temporal overlap.",
    447           "source": "opus"
    448         },
    449         "leakage_detection_method": {
    450           "applies": true,
    451           "answer": true,
    452           "justification": "Temporal splitting is used as a concrete leakage prevention method for LeetCodeEval: only problems posted after the model's training cutoff are included.",
    453           "source": "opus"
    454         }
    455       }
    456     }
    457   },
    458   "claims": [
    459     {
    460       "claim": "The ability to generate correct code is not positively correlated with the ability to generate efficient code.",
    461       "evidence": "GPT-4 achieves highest Pass@10 (98.2%) on HumanEval but runtime 8.61, while GPT-3.5 has lower Pass@10 (87.2%) but better runtime 8.35. Same pattern on MBPP (94.2% vs 88.7% Pass@10, 9.14 vs 8.86 runtime).",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Larger number of parameters does not promise higher performance in code efficiency.",
    466       "evidence": "Code Llama 7B, 13B, 34B show nearly identical runtimes (9.95, 9.87, 9.93 on HumanEval). WizardCoder series shows similar invariance to size.",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "Training strategy and data have significant impact on code efficiency.",
    471       "evidence": "DeepSeek Coder 33B Instruct (7.54 runtime) substantially outperforms 33B Base (9.40) on HumanEval, despite identical parameter count. Authors attribute to instruct-tuning on instruction data.",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "Step-by-step prompting improves code efficiency, especially on complex problems.",
    476       "evidence": "Table 4 shows Prompts 2 and 3 achieve 1.16-1.18 speedup on medium LeetCode vs 1.03-1.04 on HumanEval. Larger gaps on complex problems confirm hypothesis.",
    477       "supported": "moderate"
    478     },
    479     {
    480       "claim": "Prompting effectiveness differs across benchmarks and problem difficulty.",
    481       "evidence": "Prompting yields 1.06-1.18 speedup on LeetCodeEval but only 1.01-1.06 on HumanEval/MBPP. Authors explain via constrained optimization space in simpler problems.",
    482       "supported": "moderate"
    483     },
    484     {
    485       "claim": "Comprehensive test cases (as in LeetCode) reveal efficiency benefits more clearly than limited test cases.",
    486       "evidence": "GPT-4 achieves the best %Beats on LeetCodeEval (73.09% on medium problems), in contrast to its weaker relative standing on the simpler benchmarks. Authors attribute this to diverse test cases magnifying algorithmic improvements.",
    487       "supported": "weak"
    488     },
    489     {
    490       "claim": "Data distribution of model training correlates with efficiency across different benchmarks.",
    491       "evidence": "DeepSeek best on HumanEval, WizardCoder best on MBPP despite similar model sizes. Paper speculates correlation with training data distribution but provides no direct evidence.",
    492       "supported": "weak"
    493     }
    494   ],
    495   "methodology_tags": [
    496     "benchmark-eval",
    497     "empirical"
    498   ],
    499   "key_findings": "The paper shows that code generation performance (Pass@10) does not correlate with code efficiency, contradicting expectations. Training strategy significantly impacts efficiency (DeepSeek Instruct vastly outperforms Base), while model size within a family does not. Step-by-step prompting yields modest speedups (1-18%) but works better on complex problems; results vary significantly across benchmarks, suggesting optimization space and test case diversity matter more than model architecture.",
    500   "red_flags": [
    501     {
    502       "flag": "Selection bias in analysis",
    503       "detail": "Only 70/164 HumanEval and 0/33 hard LeetCode problems are solved by all models. Results are restricted to the easiest problems where all models succeed, limiting generalizability of efficiency comparisons."
    504     },
    505     {
    506       "flag": "Normalization obscures absolute performance",
    507       "detail": "Runtime normalized across models makes them appear similar. Reporting absolute times (seconds or CPU cycles) would show real performance gaps and practical impact more clearly."
    508     },
    509     {
    510       "flag": "Missing hyperparameters",
    511       "detail": "Temperature, top-p, max_tokens, repetition_penalty, and other crucial LLM settings not specified. Reproducibility is compromised without these details."
    512     },
    513     {
    514       "flag": "Data contamination unresolved for 2 of 3 benchmarks",
    515       "detail": "HumanEval and MBPP almost certainly in GPT-3.5/4 training data. Only LeetCode mitigation (May 2023+) addresses contamination. Unknown cutoffs for Code Llama, Phi-2, WizardCoder, DeepSeek."
    516     },
    517     {
    518       "flag": "No variance or confidence intervals",
    519       "detail": "10 gem5 runs and 3 LeetCode submissions per code, but only means reported. Std dev, CIs, or error bars absent. Cannot assess uncertainty in rankings."
    520     },
    521     {
    522       "flag": "Prompting improvements are marginal",
    523       "detail": "Speedups of 1.01-1.18 (1-18%) are modest. No significance testing performed. Unclear if improvements are meaningful for practitioners."
    524     },
    525     {
    526       "flag": "Small effective sample size on hard problems",
    527       "detail": "0/33 hard LeetCode problems pass all models means hard problems cannot be compared. Evaluation is limited to easier problems where efficiency differences may be smaller."
    528     },
    529     {
    530       "flag": "Limited practical context",
    531       "detail": "Paper does not discuss whether 1-18% runtime speedups matter in practice. No comparison to other optimization approaches (algorithmic changes, hardware). No user studies or developer adoption data."
    532     },
    533     {
    534       "flag": "Gem5 simulator validity not established",
    535       "detail": "Single validation paper [4] cited for gem5 reliability. Unclear if gem5 accurately models modern CPUs for Python code execution and cache effects."
    536     },
    537     {
    538       "flag": "Language and benchmark scope",
    539       "detail": "Only Python (HumanEval/MBPP) and C++ (LeetCode) tested. Results may not generalize to JavaScript, Go, Rust, or other languages. All benchmarks synthetic, not real-world code."
    540     }
    541   ],
    542   "cited_papers": [
    543     {
    544       "title": "DeepDev-PERF: a deep learning-based approach for improving software performance",
    545       "relevance": "Prior work on automated performance improvement suggestions; complements this efficiency evaluation."
    546     },
    547     {
    548       "title": "Learning Performance-Improving Code Edits (PIE dataset)",
    549       "relevance": "Madaan et al.; introduces dataset of C++ program pairs with runtime and prompting/finetuning approaches for code optimization, directly related to RQ2."
    550     },
    551     {
    552       "title": "Self-Refine: Iterative refinement with self-feedback",
    553       "relevance": "Shows iterative self-feedback improves performance; related to prompting strategies explored in RQ2."
    554     },
    555     {
    556       "title": "Evaluating Large Language Models Trained on Code (HumanEval benchmark)",
    557       "relevance": "Chen et al.; introduces HumanEval benchmark used for RQ1 evaluation of code correctness and efficiency."
    558     },
    559     {
    560       "title": "Program Synthesis with Large Language Models (MBPP benchmark)",
    561       "relevance": "Austin et al.; introduces MBPP benchmark (second standard benchmark) used in RQ1 evaluation."
    562     },
    563     {
    564       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    565       "relevance": "Liu et al.; provides source code and evaluation protocols for HumanEval/MBPP used in this study."
    566     },
    567     {
    568       "title": "Code Llama: Open Foundation Models for Code",
    569       "relevance": "Roziere et al.; describes Code Llama, one of six models evaluated for code efficiency."
    570     },
    571     {
    572       "title": "Validation of the gem5 simulator for x86 architectures",
    573       "relevance": "Akram & Sawalha; validates gem5 CPU simulator used for reproducible runtime measurement in this work."
    574     }
    575   ],
    576   "engagement_factors": {
    577     "practical_relevance": {
    578       "score": 1,
    579       "justification": "Efficiency matters for resource-constrained or real-time systems, but most practitioners prioritize correctness. Improvements are marginal (1-18%) and not contextualized against real-world impact. Not immediately actionable for practitioners."
    580     },
    581     "surprise_contrarian": {
    582       "score": 2,
    583       "justification": "Correctness ≠ efficiency is mildly surprising. Model size not mattering is somewhat unexpected. But that training strategy matters is expected. Not strongly contrarian to community beliefs."
    584     },
    585     "fear_safety": {
    586       "score": 0,
    587       "justification": "Paper focuses on runtime efficiency, not safety, alignment, robustness, or security concerns. No safety-relevant findings."
    588     },
    589     "drama_conflict": {
    590       "score": 0,
    591       "justification": "No controversy, conflict, or drama. Straightforward empirical comparison. No novel claims that challenge conventional wisdom strongly."
    592     },
    593     "demo_ability": {
    594       "score": 2,
    595       "justification": "Code release on GitHub is promised, but reproduction requires the gem5 simulator (significant setup overhead). Difficult for practitioners to try the methodology without infrastructure investment."
    596     },
    597     "brand_recognition": {
    598       "score": 1,
    599       "justification": "Authors from Nanjing University (SOTA for code), SMU, UT Dallas—respectable but not top-tier AI labs. Models tested are well-known (GPT-4, Code Llama, DeepSeek) but evaluators are not famous."
    600     }
    601   },
    602   "hn_data": {
    603     "threads": [
    604       {
    605         "hn_id": "40370779",
    606         "title": "Simultaneous Many-Row Activation in Off-the-Shelf DRAM Chips",
    607         "points": 7,
    608         "comments": 0,
    609         "url": "https://news.ycombinator.com/item?id=40370779",
    610         "created_at": "2024-05-15T18:44:38Z"
    611       },
    612       {
    613         "hn_id": "39368490",
    614         "title": "Keyframer: Empowering Animation Design Using Large Language Models (Apple)",
    615         "points": 6,
    616         "comments": 1,
    617         "url": "https://news.ycombinator.com/item?id=39368490",
    618         "created_at": "2024-02-14T10:48:19Z"
    619       },
    620       {
    621         "hn_id": "40286055",
    622         "title": "Forklift: An Extensible Neural Lifter",
    623         "points": 3,
    624         "comments": 0,
    625         "url": "https://news.ycombinator.com/item?id=40286055",
    626         "created_at": "2024-05-07T14:39:26Z"
    627       },
    628       {
    629         "hn_id": "43426799",
    630         "title": "Aardvark weather: end-to-end data-driven weather forecasting",
    631         "points": 2,
    632         "comments": 0,
    633         "url": "https://news.ycombinator.com/item?id=43426799",
    634         "created_at": "2025-03-20T18:10:12Z"
    635       },
    636       {
    637         "hn_id": "43211832",
    638         "title": "Heat as a Witness of Quantum Properties",
    639         "points": 2,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=43211832",
    642         "created_at": "2025-02-28T21:48:33Z"
    643       },
    644       {
    645         "hn_id": "41245268",
    646         "title": "Dwellers in the Deep: Biological Consequences of Dark Oxygen",
    647         "points": 2,
    648         "comments": 0,
    649         "url": "https://news.ycombinator.com/item?id=41245268",
    650         "created_at": "2024-08-14T12:25:02Z"
    651       },
    652       {
    653         "hn_id": "40948891",
    654         "title": "Fast-moving stars around an intermediate-mass black hole in Omega Centauri",
    655         "points": 2,
    656         "comments": 0,
    657         "url": "https://news.ycombinator.com/item?id=40948891",
    658         "created_at": "2024-07-12T20:03:03Z"
    659       },
    660       {
    661         "hn_id": "39050109",
    662         "title": "Mission: Impossible Language Models",
    663         "points": 2,
    664         "comments": 0,
    665         "url": "https://news.ycombinator.com/item?id=39050109",
    666         "created_at": "2024-01-19T00:38:50Z"
    667       },
    668       {
    669         "hn_id": "39026660",
    670         "title": "Mission: Impossible Language Models",
    671         "points": 2,
    672         "comments": 0,
    673         "url": "https://news.ycombinator.com/item?id=39026660",
    674         "created_at": "2024-01-17T12:11:54Z"
    675       },
    676       {
    677         "hn_id": "41284222",
    678         "title": "Assessing the Learning Limits of LLMs with Synthetic Impossible Languages",
    679         "points": 1,
    680         "comments": 0,
    681         "url": "https://news.ycombinator.com/item?id=41284222",
    682         "created_at": "2024-08-18T18:27:15Z"
    683       }
    684     ],
    685     "top_points": 7,
    686     "total_points": 29,
    687     "total_comments": 1
    688   }
    689 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs