scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28417B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Learning Code Preference via Synthetic Evolution",
      6     "authors": [
      7       "Jiawei Liu",
      8       "Thanh Nguyen",
      9       "Mingyue Shang",
     10       "Hantian Ding",
     11       "Xiaopeng Li"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2410.03837",
     16     "doi": "10.48550/arXiv.2410.03837"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims are verified in the paper: 28.8% improvement (Table 3), 6-9x parameter match and 34x cost savings (Table 4), and 15.1-40.3% unsolved human tasks derived from accuracy scores in Table 3.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims about CODEFAVOR improving preference accuracy, and Section 3.4 provides comprehensive ablation studies comparing individual data sources (Commit-Instruct vs Critic-Evol), modeling choices, and prompt variants across multiple base models.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Human performance findings (e.g., 'generalist programmers may struggle to accurately assess non-functional code properties') are stated broadly, but the study uses only 18 developers on Python tasks; Appendix A.6 acknowledges scope is limited to self-contained code snippets yet the abstract does not caveat this.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether CODEFAVOR improvements might stem from general fine-tuning domain adaptation rather than the specific synthetic data design; self-bias explanations appear only in narrow case studies, not as systematic alternative hypotheses.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes objective oracles (test execution, CPU instruction counts, static analysis) from human preference, and discusses where each proxy measure diverges from the others in Section 3.2-3.3.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Appendix A.6 'Limitation and Future Work' contains three dedicated limitation points covering data scale, context dependency, and benchmark validity.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats named include modest training set size (62,236 samples), restriction to self-contained Python snippets, and subjectivity in human annotation for non-functional properties; these are more specific than boilerplate disclaimers.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "A.6 explicitly states CODEFAVOR focuses on self-contained code snippets and that real-world repository-level context is out of scope, and Table 1 bounds evaluation to four specific objectives.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure statement is present; the acknowledgment section thanks colleagues but does not disclose any funding source or grant.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the title page: UIUC for the first author and AWS AI Labs for the remaining four authors.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Four of five authors are from AWS AI Labs; the work directly relates to Amazon's commercial code assistant (CodeWhisperer is cited in the introduction), making the institutional funder non-independent of the outcome.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper explicitly defines 'code preference task' in Section 1 as choosing one code response over another given a criterion, and defines Commit-Instruct and Critic-Evol with precise input/output descriptions.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three numbered contributions are explicitly listed at the end of Section 1: the CODEFAVOR framework, the CODEPREFBENCH benchmark, and the empirical study of human/LLM preferences.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4 'Related Work' situates CODEFAVOR relative to preference optimization (RLHF, DPO), LLM-as-judge approaches, and prior code preference work (Weyssow et al., McAleese et al.), explaining how this work differs and extends each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper explicitly states 'We release the data and code at https://github.com/amazon-science/llm-code-preference' in the contributions section.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "CODEPREFBENCH is released at the same GitHub URL; seed datasets (EvalPlus, EvalPerf, CyberSecEval, BigCodeBench-Hard) are standard public benchmarks.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Training hardware (8xA100-40G), framework (Axolotl, DeepSpeed ZeRO-3), and vLLM versions are mentioned, but no requirements.txt, Dockerfile, or equivalent formal dependency specification is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Hyperparameters are listed in Table 8 and training setup described in A.2, but no step-by-step instructions for reproducing experiments are provided.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Table 3 reports range values (e.g., '84.9 (±9.4)', '59.7 (±37.0)') for stochastic models run multiple times, indicating variance across seeds.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests (t-tests, bootstrap intervals, etc.) are applied to the accuracy comparisons despite many comparative claims across models.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported as relative percentage improvements (e.g., '9.3∼28.8% relatively', '34× cheaper') with baseline performance values provided for context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 1,364 benchmark tasks and 18-developer annotator pool are described but neither is justified by power analysis or sample size reasoning.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Variance (±ranges) is reported in Table 3 for models run with temperature > 0; entries without variance markers were run deterministically (greedy decoding).",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 3 compares against 10 open-weight and proprietary baselines including Claude 3.5 Sonnet, Gemini 1.5 Pro, Llama-3.1-405B, Mistral Large 2, and human annotators.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All baselines are state-of-the-art at time of writing (October 2024): Claude 3.5 Sonnet, Gemini 1.5, Llama 3.1 405B, Mistral Large 2, DeepSeek V2.5.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 3.4 provides extensive ablation: individual vs combined training data (Table 5), data mixture vs model merging, classification vs generation output, criteria variants (Table 6), draft/critic model pairing (Table 7).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Evaluation uses four distinct metrics: correctness accuracy, efficiency accuracy, security accuracy, and human preference accuracy, plus cost-effectiveness (Table 4).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "18 software developers annotated 145 human-preference pairs (Section 3.2) and served as a comparison baseline across all four benchmark categories.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "CODEPREFBENCH is the held-out evaluation set, explicitly separated from synthetic training data; Appendix A.5 measures and confirms low overlap between training and test sets.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "All results in Tables 3, 5, 6, 7 are broken down by correctness, efficiency, security, and human preference columns.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Appendix A.4 provides extensive case studies of failure modes for LLMs (hallucination, missed edge cases), human annotators (overlooked details, tied on clear cases), and CODEFAVOR models across all three categories.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results include: code comments hurt accuracy (Table 6), using same draft/critic model degrades performance (Table 7), empty criteria substantially reduces security accuracy, and human annotators fail on non-functional objectives.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model versions are named throughout: Gemini 1.5 Pro 001, Gemini 1.5 Flash 001, Llama-3.1-405B-Instruct, Mistral Large 2 (123B), Codestral-22B-v0.1; vLLM versions also specified in A.3.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Complete prompt templates are provided: Listing 1 for evaluation, Figures 3-5 for Commit-Instruct and Critic-Evol data generation with full multi-turn examples.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 8 reports batch size (32), sequence length (2048), learning rate (5×10⁻⁶), scheduler (cosine annealing with 40 warm-up steps), and sequence packing settings.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; models are prompted directly or fine-tuned as classifiers/generators without multi-step orchestration.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Data processing steps are documented: positional bias augmentation (flipping code pair order), code comment clipping for Critic-Evol samples, filtering criteria for both Commit-Instruct (8.1% filtered) and Critic-Evol (17.9% filtered).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "CODEPREFBENCH data and training datasets are released at the GitHub URL, enabling independent access to the raw preference pairs.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 details benchmark construction for each category: source datasets, pair selection method, oracle label generation, tie exclusion criteria, and positional bias shuffling.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The 18 developers' credentials are described (CS degrees, experience, proficiency) but recruitment method (how they were sourced, whether compensated, selection criteria) is not explained.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Both synthetic data pipelines (Commit-Instruct and Critic-Evol) are documented step-by-step in Section 2.2 with full prompt implementations in Figures 3-5.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for the evaluated LLMs (Claude 3.5 Sonnet, Llama 3.1, etc.) are never stated, despite these models being evaluated on benchmarks (EvalPlus, LBPP) that may have appeared in their pretraining.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Appendix A.5 'Quantifying Contamination' uses Levenshtein similarity analysis to measure overlap between synthetic training data and the evaluation benchmark, finding 0.1-1.7% above 80 similarity.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The contamination analysis in A.5 explicitly measures training-test code similarity and references Riddell et al. (2024) as a framework, confirming the evaluation set is largely contamination-free from training data.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned for the human annotation study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB approval or ethics review is mentioned despite involving 18 human participants in a formal annotation study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Section 3.2 reports education (2/3 with CS degrees), experience (95% with 2+ years), and Python proficiency levels (43% advanced, rest middle-level).",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "No formal inclusion or exclusion criteria for the 18-developer annotation team are stated.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": false,
    340           "justification": "Task assignment to annotators is not described as randomized; no randomization procedure is mentioned.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No blinding of annotators to model identities or task order is mentioned.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "No attrition reporting; 145 final preference pairs are described as filtered for non-conflicting preferences, but annotator dropout is not discussed.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 4 provides normalized per-sample costs comparing human annotation ($6.1), Llama-405B (1200x CODEFAVOR cost), and CODEFAVOR models (baseline 1x).",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Training hardware (8xA100-40G) is mentioned but total GPU-hours, wall-clock time, or dollar cost for model training is not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "CODEFAVOR improves model-based code preference accuracy by up to 28.8% relatively over base models",
    375       "evidence": "Table 3 shows Mistral-7B improving from 64.5% to 74.3% average accuracy (+9.8pp, +15.2% relative); Gemma-2-9B improving from 60.1% to 77.4% (+17.3pp, the +28.8% relative stated in abstract/text)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "CODEFAVOR models match the performance of models 6-9x larger while being 34x more cost-effective",
    380       "evidence": "Table 3 shows 12B CODEFAVOR models achieving 77-77.7% average, near Llama-3-70B-Instruct (76.1%); Table 4 shows Mistral Nemo CODEFAVOR is 34x cheaper than Llama-70B",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Human preference is sub-optimal for non-functional code objectives (efficiency and security)",
    385       "evidence": "Table 3: human accuracy on efficiency (74.9%) is surpassed by Mistral Large 2 (81.2%); human security accuracy (59.7%) is the lowest of all approaches due to 73.9% of pairs being marked as tied",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "15.1-40.3% of code preference tasks remain unsolved by developer agreement despite 23.4 person-minutes per task",
    390       "evidence": "Derived from Table 3: correctness 84.9% solved (15.1% unsolved), security 59.7% solved (40.3% unsolved); annotation time of 7.8 min/developer × 3 annotators = 23.4 minutes",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Code comments negatively impact preference model accuracy, particularly for correctness tasks",
    395       "evidence": "Table 6: evaluating default models (trained without comments) with comments causes 6.2-10.4% drop in correctness accuracy; training and evaluating with comments causes 6-7% drop in overall accuracy",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Using different draft and critic models (8B draft, 70B critic) outperforms using the same model for both roles in Critic-Evol",
    400       "evidence": "Table 7: 8B/70B pairing achieves 75.3% average vs 68.2% for 8B/8B and 73.4% for 70B/70B; attributed to LLM self-bias in same-model conditions",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "empirical"
    407   ],
    408   "key_findings": "CODEFAVOR, trained on synthetic code evolution data from commits and LLM critiques, enables 7-12B models to match 70B+ model preference accuracy while being 34x cheaper, achieving up to 28.8% relative improvement over base models. Human annotators excel at code correctness (84.9% accuracy) but are surprisingly weak on non-functional objectives: security accuracy is 59.7% (lower than all LLMs) because 73.9% of security pairs are marked tied, and efficiency accuracy (74.9%) is surpassed by multiple models. Model merging of two complementary training datasets outperforms data mixture for classification models, and code comments systematically hurt preference accuracy by allowing LLMs' self-bias to prefer well-commented but incorrect code.",
    409   "red_flags": [
    410     {
    411       "flag": "Institutional conflict of interest",
    412       "detail": "Four of five authors are from AWS AI Labs; the paper advances technology directly applicable to Amazon's CodeWhisperer product, yet no competing interests statement is provided."
    413     },
    414     {
    415       "flag": "No statistical significance tests",
    416       "detail": "All comparative accuracy claims across models are made without statistical significance tests; many differences (e.g., 1-2pp) may not be meaningful given the benchmark sizes."
    417     },
    418     {
    419       "flag": "Human study lacks ethics disclosure",
    420       "detail": "18 developers participated in a formal annotation study with tracked timing and annotation behavior, but no IRB approval, consent procedures, or compensation information is disclosed."
    421     },
    422     {
    423       "flag": "Training compute cost not reported",
    424       "detail": "Training compute budget (GPU-hours, total cost) is not disclosed, making the 'cost-effective' framing incomplete — CODEFAVOR requires upfront training that is not accounted for in Table 4."
    425     },
    426     {
    427       "flag": "Generalization limited to Python self-contained code",
    428       "detail": "All results are on Python snippets; claims about human limitations and LLM preference behavior are stated broadly but derive from a narrow, self-contained snippet evaluation context."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)",
    434       "relevance": "Primary evaluation benchmark for code correctness preference tasks in CODEPREFBENCH"
    435     },
    436     {
    437       "title": "Evaluating Language Models for Efficient Code Generation (EvalPerf)",
    438       "relevance": "Source of the code efficiency preference subset with CPU instruction count oracles"
    439     },
    440     {
    441       "title": "Purple Llama CyberSecEval: A Secure Coding Benchmark for Language Models",
    442       "relevance": "Source of security-vulnerable code pairs for CODEPREFBENCH's security category"
    443     },
    444     {
    445       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    446       "relevance": "Establishes the LLM-as-judge and pairwise preference evaluation framework this paper extends to code"
    447     },
    448     {
    449       "title": "CodeUltraFeedback: An LLM-as-a-Judge Dataset for Aligning Large Language Models to Coding Preferences",
    450       "relevance": "Closely related prior work on LLM-as-judge for code quality; CODEFAVOR addresses its limitation of not training preference models from scratch"
    451     },
    452     {
    453       "title": "LLM Critics Help Catch LLM Bugs (CriticGPT)",
    454       "relevance": "Related work on training critic models to catch code bugs; CODEFAVOR confirms and extends their findings on human preference limitations"
    455     },
    456     {
    457       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    458       "relevance": "Foundational method (DPO) for preference optimization that the code preference data from this paper could feed"
    459     },
    460     {
    461       "title": "Prometheus: Inducing Fine-Grained Evaluation Capability in Language Models",
    462       "relevance": "Prior work on training LLM judges that the CODEFAVOR pairwise modeling builds upon"
    463     },
    464     {
    465       "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models",
    466       "relevance": "Provides the contamination quantification methodology (Levenshtein similarity) used in Appendix A.5"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Directly applicable to code review automation, RLHF data collection for code LLMs, and replacing expensive human annotation — released code and benchmark enable immediate adoption."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "The finding that human developers are worse than LLMs at evaluating code security (59.7% vs near-saturation for most models) and that code comments hurt LLM preference accuracy are counterintuitive results."
    477     },
    478     "fear_safety": {
    479       "score": 1,
    480       "justification": "The code security evaluation component touches on vulnerability detection but does not raise broader AI risk concerns; security is framed as a benchmark category, not a safety issue."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "Comparison between proprietary models (Claude 3.5 Sonnet, Gemini) and open models provides some competitive interest, but no major controversy angle."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Code and benchmark are released at GitHub, allowing practitioners to run CODEPREFBENCH evaluations and fine-tune models using the released synthetic datasets."
    489     },
    490     "brand_recognition": {
    491       "score": 2,
    492       "justification": "AWS AI Labs affiliation and evaluation of Claude 3.5 Sonnet, Gemini 1.5, and Llama 3.1 405B lend brand recognition; not from a top-tier academic lab but from a major industry player."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "42898158",
    499         "title": "Chrono: A Peer-to-Peer Network with Verifiable Causality",
    500         "points": 8,
    501         "comments": 1,
    502         "url": "https://news.ycombinator.com/item?id=42898158"
    503       },
    504       {
    505         "hn_id": "37882946",
    506         "title": "Next-Generation OS Physical Memory Management for Terabyte-Scale NVMMs",
    507         "points": 3,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=37882946"
    510       },
    511       {
    512         "hn_id": "33314496",
    513         "title": "A study of malicious CVE proof of concept exploits in GitHub",
    514         "points": 3,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=33314496"
    517       },
    518       {
    519         "hn_id": "41746915",
    520         "title": "Code-Survey: An LLM-Driven Methodology for Analyzing Large-Scale Codebases",
    521         "points": 1,
    522         "comments": 1,
    523         "url": "https://news.ycombinator.com/item?id=41746915"
    524       },
    525       {
    526         "hn_id": "42683996",
    527         "title": "Cost-Performance Evaluation of General Compute: AWS, Azure, GCP, and OCI",
    528         "points": 1,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=42683996"
    531       },
    532       {
    533         "hn_id": "42076089",
    534         "title": "Detection of Thermal Emission at mm Wavelengths from Low-Earth Orbit Satellites",
    535         "points": 1,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=42076089"
    538       },
    539       {
    540         "hn_id": "30476106",
    541         "title": "Application-Oriented Performance Benchmarks for Quantum Computing",
    542         "points": 1,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=30476106"
    545       }
    546     ],
    547     "top_points": 8,
    548     "total_points": 18,
    549     "total_comments": 2
    550   }
    551 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs