scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29453B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Large-scale, Independent and Comprehensive study of the power of LLMs for test case generation",
      6     "authors": [
      7       "Wendkûuni C. Ouédraogo",
      8       "Abdoul Kader Kaboré",
      9       "Yinghua Li",
     10       "Haoye Tian",
     11       "Anil Koyuncu",
     12       "Jacques Klein",
     13       "David Lo",
     14       "Tegawendé F. Bissyandé"
     15     ],
     16     "year": 2024,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2407.00225",
     19     "doi": "10.48550/arXiv.2407.00225"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All major abstract claims (GToT improves compilability, hallucination failure rates up to 86%, 0% mutation score) are backed by specific tables and experimental results throughout the paper.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Causal-directional claims like 'GToT enhances test reliability' are supported by controlled comparisons across 5 prompting strategies with 30 iterations each; the study design is adequate for comparative inference even if not for deep causal decomposition.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "GPT-4 was only evaluated on the 31-class CMD dataset due to budget constraints, yet findings about GPT-4 are sometimes stated alongside the broader comparative claims; the threats section acknowledges CMD's small size does not fully represent real-world software complexity.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper explicitly discusses that Defects4J's high LLM coverage may reflect memorization from pretraining rather than genuine reasoning ability (Finding 19), and acknowledges CMD's high coverage is likely due to its small modular structure rather than LLM capability.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper carefully distinguishes syntactic correctness, compilability, code coverage, and actual fault detection—Finding 23 explicitly notes that passed tests often target abstract classes or empty methods, making coverage metrics poor proxies for fault detection.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5 'Threats to Validity' is a dedicated section covering external and internal validity threats.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific threats are named: CMD's 31-class/2-project size, Mistral 7B's knowledge cutoff (2021-2022) vs Mixtral's (Dec 2023), test smell tool false-positive rates per Panichella et al. 2022, and the limitation that only 'Passed' tests were analyzed for mutation testing.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The study explicitly scopes to Java/JUnit class-level generation, four specific models, five prompting strategies, and the study period November 2023–April 2024, with the threats section noting that newer model versions may produce different results.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Funding is explicitly disclosed: Luxembourg National Research Fund (FNR) grant reference 17185670 and European Research Council (ERC) under Horizon 2020 grant agreement No. 949014.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are listed: University of Luxembourg (SnT), Singapore Management University, University of Melbourne, and Bilkent University.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Funders are national research agencies (Luxembourg FNR, ERC), independent of OpenAI and MistralAI whose products are evaluated.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The Declarations section explicitly states: 'The authors declare that they have no conflict of interest.'",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "MSR, CSR, all five prompting techniques (ZSL, FSL, CoT, ToT, GToT), test smells, and hallucination are defined in Section 2 with illustrative examples.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Six explicitly numbered contributions are listed in the introduction, covering prompt engineering investigation, structural analysis, execution/fault detection, readability/maintainability, multi-dataset generalizability, and a public replication package.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 6 has five subsections explicitly comparing findings to Siddiq et al. (2024), Tang et al. (2024), Yang et al. (2024), Yuan et al. (2024), and Gu et al. (2024), identifying specific gaps each prior work left unaddressed.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "A replication package is provided at https://anonymous.4open.science/r/LLM4TS-0F76/ containing prompt templates, evaluation scripts, and results.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "Defects4J and SF110 are established public benchmarks; CMD is included in the replication package. The Data Availability Statement confirms datasets are in the repository.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Hardware is described (AMD EPYC 7552, 640GB RAM) and specific tools are named (JaCoCo, Pitest, TsDetect, Checkstyle, PMD, SpotBugs, Javalang), but no requirements file, Dockerfile, or version-pinned dependency list is provided.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "The paper references the replication package for scripts but provides no step-by-step reproduction instructions in the paper itself; a reader cannot follow the pipeline without independently exploring the package.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Distribution statistics (min, Q1, median, Q3, max, mean) are reported for coverage and readability, but no confidence intervals or error bars are reported for any comparative claim.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical hypothesis tests are performed despite numerous comparative claims (e.g., 'GToT significantly enhances...'). The 30-iteration runs are used for averages but not formal significance testing.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Percentage differences are consistently reported (e.g., 'LLM-generated test suites improve readability over EvoSuite by 21–40%', compilation gap between syntactic correctness and actual compilation).",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The dataset sizes (690 classes, 216,300 test files) are described but not justified in terms of statistical power; CMD's 31 classes from 2 projects is acknowledged as small without justification for why it was kept.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": true,
    177           "justification": "Quartile distributions (Q1, Q3) are reported for coverage and readability metrics across all conditions, providing a measure of spread.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "EvoSuite (state-of-the-art SBST) is used as the primary baseline across all datasets and metrics.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "EvoSuite is the standard Java SBST tool and is referenced in recent comparative studies (Tang et al. 2024, Yang et al. 2024); it remains the relevant baseline for this task.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "There is no ablation study isolating which components of GToT (collaborative reasoning, guided steps, multi-expert simulation) drive improvements over CoT and ToT.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "The study uses MSR, CSR, syntactic correctness, compilability, line/instruction/method coverage, readability score, test smells (18 types), mutation score, and execution outcome (pass/fail/timeout).",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": false,
    209           "justification": "Readability is assessed with an automated model (Scalabrino et al. 2018) correlated with human evaluation, but no actual human evaluation of generated test suites was conducted in this study.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": false,
    214           "answer": false,
    215           "justification": "This is a code generation evaluation task, not a prediction task requiring held-out test splits.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Results are broken down by model, dataset (SF110, Defects4J, CMD), and prompting technique across all 7 research questions, with detailed tables for each combination.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Compilation errors are categorized into 22 error types (Table 8), runtime failures into 12 types (Table 18), and Finding 23 discusses why mutation testing yielded 0% scores for most passed tests.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Multiple negative results are prominently reported: 0% mutation score for most tests, Mistral 7B and Mixtral 8x7B failing 100% of CMD execution tests, and compilation rates as low as 0.44% for Mistral 7B on Defects4J with ZSL.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Table 1 gives model names with release dates but not specific API snapshot IDs (e.g., gpt-3.5-turbo-0125); for API-based models this matters for reproducibility since behavior changes across snapshots.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Full prompts with placeholders are shown verbatim in Figures 3 and 4 for CoT, ToT, and GToT; the NLD structure is described in detail for all techniques.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature is set to 0.7 for all models; token limits per model are stated in Table 1; EvoSuite's 3-minute budget and 30-iteration runs are specified.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": false,
    258           "answer": false,
    259           "justification": "No agentic scaffolding is used; models are called directly via API with structured prompts.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "CMD curation criteria are documented (post-May 2023, buggy-fixed pairs, token limit compliance); Javalang parsing and JVM compilation steps are described; tiktoken token counting is specified.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The Data Availability Statement confirms datasets and code are in the replication repository at anonymous.4open.science.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "CMD collection is described in detail (ODC and Conductor OSS projects, selection criteria including post-May 2023 commits, buggy-fixed pairing); SF110 and Defects4J are referenced with citations.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants; datasets are software benchmarks.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "Figure 2 shows the full 11-step pipeline from LLM prompting through code extraction, compilation, coverage measurement, readability, test smell detection, and mutation testing.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "The threats section states that Mistral 7B has a knowledge base from 2021-2022 and notes that model knowledge cutoff dates influenced results; release dates are also given in Table 1.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "Finding 19 explicitly states 'Defects4J's high coverage likely stems from pretraining exposure,' and the CMD dataset was specifically designed to exclude projects released before May 2023 to mitigate this.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": true,
    311           "justification": "The paper cites Sallou et al. (2023) showing ChatGPT can retrieve detailed Defects4J bug information, and responds by creating CMD with post-May-2023 projects; contamination is acknowledged as a 'persistent challenge.'",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants; the Declarations section explicitly confirms this.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "Paper states: 'This article does not contain any studies with human participants or animals performed by any of the authors.'",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "Budget constraints are mentioned (GPT-4 and Mixtral were only run on CMD due to cost) but actual API cost figures are never reported.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "EvoSuite hardware is specified but total compute budget (API call count, GPU hours, dollar cost) is never stated for LLM evaluations.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Only 7.2% of all LLM-generated test suites and 12% of syntactically correct ones successfully compile; class-level dependencies drive this gap over prior method-level studies.",
    378       "evidence": "Table 7 shows compilability rates peaking at 9.67% (GPT-3.5/GToT/SF110) with averages consistently below 7%; Finding 7 states the overall 7.2%/12% figures.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Reasoning-based prompting (GToT) consistently improves structural adherence (MSR), extractability (CSR), syntactic correctness, and compilability over ZSL and FSL.",
    383       "evidence": "Tables 4, 5, 6, 7 all show GToT achieving highest rates across datasets and models for GPT-3.5; Findings 2, 3, 6 summarize these improvements.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "LLM-generated tests outperform EvoSuite in readability by 21-40%.",
    388       "evidence": "Table 12 shows GPT-3.5/GToT mean readability 84.75% vs EvoSuite 50.77% on combined Defects4J+SF110; automated readability model correlated with human assessment is cited.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Mutation testing yields 0% mutation scores for most LLM-generated tests because passed tests target abstract classes, interfaces, or empty methods rather than executable logic.",
    393       "evidence": "Finding 23 and Section 4.6 explain that Pitest could not run on most Passed tests due to non-executable targets; line coverage as low as 3.57% confirms lack of meaningful code interaction.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Hallucination-driven 'Cannot Find Symbol' errors dominate compilation failures across all models (67-86% of compilation errors).",
    398       "evidence": "Table 8 shows CFS rates: GPT-3.5-Turbo 86.47%, GPT-4 76.99%, Mistral 7B 72.41%, Mixtral 8x7B 67.91% in CMD.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Magic Number Test smell appears in nearly 100% of test suites across all models and prompting strategies, and no technique eliminates it.",
    403       "evidence": "Table 19 shows MNT at 84.41-100% for all model/dataset/prompt combinations; Finding 26 states 'all strategies consistently struggle with Magic Number smells (85.64-100%).'",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "EvoSuite outperforms all LLMs in code coverage (median 94.34% line coverage vs. LLM best of 75.47% for GPT-3.5/CoT).",
    408       "evidence": "Table 14 shows EvoSuite median line coverage 94.34% vs GPT-3.5/CoT 75.47%, with most LLM configurations far lower; Finding 17 confirms EvoSuite superiority.",
    409       "supported": "strong"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval"
    414   ],
    415   "key_findings": "LLM-generated Java unit tests have extremely low compilation rates (7.2% overall) despite higher syntactic correctness (~79%), primarily due to hallucinated dependencies manifesting as 'Cannot Find Symbol' errors (67-86% of compilation failures). Reasoning-based prompting—especially GToT—consistently improves structural quality, readability (21-40% above EvoSuite), and compilability, but the improvement is insufficient for production use. Most critically, mutation testing reveals 0% mutation scores for the vast majority of 'passed' tests because they target abstract classes, interfaces, or empty methods rather than executable logic, meaning LLMs currently fail at the fundamental purpose of testing: fault detection. Magic Number Test smells appear in nearly 100% of all generated tests regardless of model or prompting strategy.",
    416   "red_flags": [
    417     {
    418       "flag": "GPT-4 budget limitation",
    419       "detail": "GPT-4 was only evaluated on CMD (31 classes, 2 projects) due to budget constraints, making comparisons between GPT-4 and other models unreliable and preventing assessment on the larger SF110 and Defects4J datasets."
    420     },
    421     {
    422       "flag": "No statistical significance tests",
    423       "detail": "Despite claims of 'significant' improvements for GToT across multiple findings, no formal statistical hypothesis tests (Mann-Whitney, t-test, etc.) are performed on any comparative result."
    424     },
    425     {
    426       "flag": "Model versions imprecise",
    427       "detail": "Only marketing names with release dates are provided (not API snapshot IDs like gpt-3.5-turbo-0125), making exact reproduction impossible since OpenAI updates models in-place."
    428     },
    429     {
    430       "flag": "CMD dataset too small",
    431       "detail": "CMD has only 31 classes from 2 projects; several cells in results tables are empty or report single-point statistics, and the authors acknowledge it limits the generalizability of observations."
    432     },
    433     {
    434       "flag": "Anonymous replication package",
    435       "detail": "The replication package is on anonymous.4open.science, which may not provide permanent access; if the paper is published, a permanent DOI-linked archive would be needed."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "Using Large Language Models to Generate JUnit Tests: An Empirical Study",
    441       "relevance": "Primary comparison point for LLM test generation quality; this paper extends Siddiq et al.'s work on ChatGPT compilability to include execution and maintainability."
    442     },
    443     {
    444       "title": "ChatGPT vs SBST: A Comparative Assessment of Unit Test Suite Generation",
    445       "relevance": "Direct predecessor comparing ChatGPT to EvoSuite; this paper uses the same baseline and extends it with 5 prompting strategies and mutation testing."
    446     },
    447     {
    448       "title": "On the Evaluation of Large Language Models in Unit Test Generation",
    449       "relevance": "Cross-LLM evaluation that this paper contradicts regarding CoT's effectiveness; Yang et al. found reasoning prompting had limited impact on code-specialized LLMs."
    450     },
    451     {
    452       "title": "Breaking the Silence: The Threats of Using LLMs in Software Engineering",
    453       "relevance": "Evidence that Defects4J is contaminated in LLM pretraining; used to justify CMD dataset creation and contextualize Defects4J coverage results."
    454     },
    455     {
    456       "title": "Exploring the Impact of the Output Format on the Evaluation of Large Language Models for Code Translation",
    457       "relevance": "Source of MSR/CSR metrics used to evaluate LLM output structure and extractability in this study."
    458     },
    459     {
    460       "title": "TestArt: Improving LLM-Based Unit Test via Co-Evolution of Automated Generation and Repair Iteration",
    461       "relevance": "Iterative LLM test refinement approach that this paper compares against, arguing that execution-based and maintainability evaluation is also needed."
    462     },
    463     {
    464       "title": "Software Testing with Large Language Models: Survey, Landscape, and Vision",
    465       "relevance": "Survey of LLM testing approaches providing context for this study's positioning as a comprehensive empirical evaluation."
    466     },
    467     {
    468       "title": "A Comprehensive Model for Code Readability",
    469       "relevance": "The automated readability model used to assess LLM-generated test readability, validated against human assessments.",
    470       "source": "haiku"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Directly addresses whether LLMs can replace or supplement manual unit test writing, a high-priority concern for software engineering practitioners."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "Contradicts Yang et al.'s claim that reasoning-based prompting has limited impact, and the 0% mutation score finding is a stark negative result that challenges optimistic LLM narratives."
    481     },
    482     "fear_safety": {
    483       "score": 0,
    484       "justification": "No AI safety or risk concerns; purely a software engineering quality study."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Mild conflict with prior papers (Yang et al. 2024) but framed constructively rather than adversarially."
    489     },
    490     "demo_ability": {
    491       "score": 2,
    492       "justification": "GToT prompts are fully provided and could be directly applied by practitioners with API access to GPT-3.5 or GPT-4 today."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "Uses OpenAI GPT-3.5/GPT-4 which are well-known, but no involvement from major AI labs as authors."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "39960717",
    503         "title": "Mixture-of-Depths: Dynamically allocating compute in transformers",
    504         "points": 281,
    505         "comments": 83,
    506         "url": "https://news.ycombinator.com/item?id=39960717",
    507         "created_at": "2024-04-07T13:42:05Z"
    508       },
    509       {
    510         "hn_id": "40389576",
    511         "title": "GDPR: Is It Worth It?",
    512         "points": 72,
    513         "comments": 205,
    514         "url": "https://news.ycombinator.com/item?id=40389576",
    515         "created_at": "2024-05-17T13:22:06Z"
    516       },
    517       {
    518         "hn_id": "39927422",
    519         "title": "Mixture-of-Depths: Dynamically allocating compute in transformer language models",
    520         "points": 5,
    521         "comments": 2,
    522         "url": "https://news.ycombinator.com/item?id=39927422",
    523         "created_at": "2024-04-04T07:11:29Z"
    524       },
    525       {
    526         "hn_id": "39932637",
    527         "title": "Mixture-of-Depths: Dynamically allocating compute in transformers",
    528         "points": 4,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=39932637",
    531         "created_at": "2024-04-04T16:31:27Z"
    532       },
    533       {
    534         "hn_id": "39058537",
    535         "title": "ChatQA: Building GPT-4 Level Conversational QA Models",
    536         "points": 3,
    537         "comments": 2,
    538         "url": "https://news.ycombinator.com/item?id=39058537",
    539         "created_at": "2024-01-19T17:47:24Z"
    540       },
    541       {
    542         "hn_id": "39940557",
    543         "title": "DeepMind: Mixture-of-Depths: Dynamically allocating compute in transformers",
    544         "points": 2,
    545         "comments": 1,
    546         "url": "https://news.ycombinator.com/item?id=39940557",
    547         "created_at": "2024-04-05T09:51:16Z"
    548       },
    549       {
    550         "hn_id": "39949473",
    551         "title": "Dynamically allocating compute in transformer-based language models",
    552         "points": 2,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=39949473",
    555         "created_at": "2024-04-06T02:20:21Z"
    556       },
    557       {
    558         "hn_id": "32145329",
    559         "title": "Pile of Law: Learning Responsible Data Filtering from the Law",
    560         "points": 2,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=32145329",
    563         "created_at": "2022-07-18T23:12:30Z"
    564       },
    565       {
    566         "hn_id": "40320577",
    567         "title": "Aligning Large Language Models with Recommendation Knowledge",
    568         "points": 1,
    569         "comments": 0,
    570         "url": "https://news.ycombinator.com/item?id=40320577",
    571         "created_at": "2024-05-10T16:06:18Z"
    572       },
    573       {
    574         "hn_id": "39992163",
    575         "title": "Understanding Physical Breakdowns in Virtual Reality",
    576         "points": 1,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=39992163",
    579         "created_at": "2024-04-10T15:56:57Z"
    580       }
    581     ],
    582     "top_points": 281,
    583     "total_points": 373,
    584     "total_comments": 293
    585   }
    586 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs