ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (29150B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On the Evaluation of Large Language Models in Unit Test Generation",
      6     "authors": [
      7       "Lin Yang",
      8       "Chen Yang",
      9       "Shutao Gao",
     10       "Weijing Wang",
     11       "Bo Wang",
     12       "Qihao Zhu",
     13       "Xiao Chu",
     14       "Jianyi Zhou",
     15       "Guangtai Liang",
     16       "Qianxiang Wang",
     17       "Junjie Chen"
     18     ],
     19     "year": 2024,
     20     "venue": "ASE 2024",
     21     "arxiv_id": "2406.18181",
     22     "doi": "10.1145/3691620.3695529"
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All abstract claims (first empirical study with 17 Java projects and 5 open-source LLMs, influence of prompt factors, LLM vs GPT-4 vs Evosuite comparison, identified limitations) are directly supported by the paper's experiments and findings sections.",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Causal claims about prompt design and ICL methods affecting performance are supported by controlled ablation experiments with Wilcoxon rank sum tests and rank-biserial correlation effect sizes, which is adequate for the comparative claims made.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper explicitly bounds its results to 17 Java projects from Defects4J 2.0 and five specific open-source LLMs; the threats section acknowledges these scope limitations and notes the ablation finds only a locally optimal prompt setting.",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper consistently offers single mechanistic explanations for findings (e.g., training data style alignment, code comprehension ability) without systematically considering alternative interpretations or confounds.",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper explicitly distinguishes between test coverage metrics (what is measured) and readability/maintainability (a separate quality dimension not measured), noting that Evosuite's high coverage comes at the cost of poor readability.",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 4 'Threats to Validity' covers internal, external, and construct validity threats across multiple paragraphs.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Specific threats are enumerated: the ablation removes only one feature at a time (not all combinations), data leakage is checked via exact-match comparison with specific numbers (3.70 vs 2.41 average unit tests), and the CoT/RAG adaptations are acknowledged as potentially suboptimal.",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper explicitly states the prompt variant found is 'the locally optimal setting from our ablation experiment' (not global optimum) and acknowledges results may not extend to non-Java languages or projects outside Defects4J.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The acknowledgments section explicitly discloses funding from the National Natural Science Foundation of China (four grant numbers) and CCF-Huawei PopulusGrove Fund.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All author affiliations are disclosed on the title page, including four authors from Huawei Cloud Computing Co. Ltd. and others from Tianjin University, Beijing Jiaotong University, and Peking University.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The work is partly funded by the CCF-Huawei PopulusGrove Fund, and four co-authors are from Huawei Cloud Computing Co. Ltd.; while Huawei products are not directly evaluated, this institutional entanglement represents a potential conflict of interest that is not addressed by a competing interests statement.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "There is no competing interests statement or declaration of financial interests (patents, equity, consulting) beyond the acknowledgment of funding sources.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Key terms are defined precisely: focal method, focal class, related classes, six code features (FM_b, FM_p, FC_c, FC_f, FC_m, RC_c), CSR, CovL, CovB, NDD, and the two description styles (NL vs CL) are all explicitly defined.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper explicitly states three contributions: first empirical study of open-source LLMs for unit test generation, comprehensive evaluation across four aspects (prompt, comparison, ICL, defect detection), and nine major findings with actionable implications.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section 6 systematically positions this work against prior unit test generation approaches (traditional, DL-based, LLM-based), explicitly explaining how it differs: prior work used closed-source LLMs with fixed prompting, while this work investigates open-source LLMs with varied prompting strategies.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper states 'All of our code and data are available at our project homepage' with a GitHub URL (github.com/LeonYang95/LLM4UT) provided as reference [5].",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The study uses Defects4J 2.0, a publicly available standard benchmark; experimental data is also stated to be available at the project homepage.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The paper specifies PyTorch 2.0.0, transformers 4.34.1, VLLM library, Ubuntu 18.04 LTS, Intel Xeon Gold 6240C CPU, 512GB RAM, and NVIDIA A100 GPUs as the environment.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "The paper mentions releasing code 'for replication' but provides no step-by-step reproduction instructions within the paper itself; readers must consult the external GitHub repository.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Results in Tables 1–6 are reported as point estimates (percentages, counts) without confidence intervals or error bars.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Wilcoxon rank sum tests with significance level 0.05 are applied to compare NL vs CL description styles and all prompt variant pairs across LLMs.",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Rank-biserial correlation scores are computed as effect sizes alongside p-values, with a threshold of >0.3 for meaningful difference explicitly stated.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The 17 Java projects and 778 focal methods are taken from the existing Defects4J benchmark without explicit sample size justification or power analysis; scale is noted by GPU hours spent but not by statistical power.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Results are reported as point estimates only; temperature is set to 0 for determinism but no variance metrics (std dev, IQR) are reported across runs or across projects.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Evosuite (traditional search-based approach) and GPT-4 (state-of-the-art commercial LLM) serve as explicit baselines for comparison in Table 4.",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Evosuite is the widely-adopted state-of-the-art traditional tool and GPT-4 was the leading commercial LLM at the time of evaluation (2024).",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Section 3.1 performs systematic ablation on code features by removing each of five features (FM_p, FC_c, FC_f, FC_m, RC_c) individually from the full prompt to assess their contributions.",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Four metrics are used: Compilation Success Rate (CSR), Line Coverage (CovL), Branch Coverage (CovB), and Number of Detected Defects (NDD).",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Four authors with 4+ years Java experience manually labeled the reasons for undetected defects in RQ4, achieving Cohen's Kappa of 0.95 for inter-rater reliability.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": false,
    217           "answer": false,
    218           "justification": "The study evaluates generative LLM behavior on an established benchmark rather than a prediction/training task, so a train/test split is not applicable.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Results are broken down per LLM (5 open-source + GPT-4), per prompt variant (5 code feature ablations + 2 description styles), per ICL method, and per defect-failure reason category.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Section 3.2 identifies and quantifies three types of compilation failures (unresolved symbol 30.68%, parameter mismatch 17.25%, abstract instantiation 10.38%); Section 3.4 analyzes three categories of undetected defects with concrete examples (Math-53, Compress-34).",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "RAG consistently reduces performance for all five open-source LLMs; CoT hurts all three CodeLlama models; these negative results are prominently reported in Table 5 and Findings 6–7 rather than buried.",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Open-source model names include version identifiers (e.g., Phind-CodeLlama-34B-v2) but GPT-4 is referenced without a snapshot date or API version, making that portion of the study non-reproducible.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "The paper describes prompt components and design choices conceptually but does not include actual prompt text or templates; readers must consult the GitHub repository.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": false,
    256           "justification": "Only temperature (set to 0) is mentioned; other inference parameters (top-p, max new tokens, beam width) are not reported.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": false,
    261           "answer": false,
    262           "justification": "The paper uses direct prompting without agentic scaffolding; post-processing steps (AST extraction, test class assembly, compilation retry) are described but this is not scaffolding.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The full preprocessing pipeline is described: tree-sitter AST extraction of generated tests, integration into a test class, import resolution, and recursive removal of test methods causing compilation errors until successful compilation.",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The paper claims all code and data are available at the project homepage (github.com/LeonYang95/LLM4UT).",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Data collection from Defects4J 2.0 is described: 835 real-world defects from 17 projects, filtered to 778 public focal methods involving 413 defects, with the selection rationale (patched methods, public access only) explained.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No participant/sample recruitment needed; the study uses a standard public benchmark (Defects4J 2.0) with no human subjects.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "The full pipeline is described: focal method selection → prompt construction → LLM generation → AST-based extraction → test class assembly → compilation → coverage measurement via JaCoCo → defect detection evaluation.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No training data cutoff dates are stated for any of the evaluated models (CodeLlama, DeepSeek-Coder, or GPT-4).",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Section 4 explicitly discusses potential data leakage, comparing LLM-generated tests to original benchmark tests and finding no exact matches, with average test counts (3.70 generated vs 2.41 original) as additional evidence.",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": true,
    314           "justification": "The paper uses exact-match comparison between LLM-generated and benchmark-provided unit tests as a contamination proxy check, finding no exact matches, though this is acknowledged as only a partial mitigation.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants; the manual labeling by four authors is internal analysis methodology, not a human subjects study.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "The paper reports 'approximately 3,000 NVIDIA A100 GPU-hours' for open-source model experiments, giving practitioners a concrete cost estimate.",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": true,
    372           "justification": "Total computational budget is explicitly stated as approximately 3,000 NVIDIA A100 GPU-hours across four servers with eight A100 GPUs each.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "All studied LLMs including GPT-4 underperform Evosuite in test coverage (GPT-4: 40.43% line coverage vs Evosuite: 78.91%)",
    381       "evidence": "Table 4 shows CSR, CovL, and CovB for all models vs Evosuite; the gap is large and attributed to hallucination-induced invalid tests (34–62% invalid across models)",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Description style alignment with training data significantly affects performance: CL-7B/CL-13B perform better with NL style, DeepSeek-Coder models are style-robust",
    386       "evidence": "Table 1 with Wilcoxon tests and rank-biserial correlation effect sizes showing statistically significant differences (p<0.05, effect>0.3) for CL-7B/CL-13B but not DC models",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Including other class methods (FC_m) in prompts improves syntactic validity but reduces coverage by consuming token budget",
    391       "evidence": "Tables 2–3: removing FC_m significantly reduces CSR but increases CovL; the average number of generated tests rises from 3,654 to 5,434 when FC_m is removed",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "CoT improves DeepSeek-Coder models but hurts CodeLlama models depending on code comprehension ability",
    396       "evidence": "Table 5 shows DC-7B +2.72% CovL and DC-33B +0.69% with CoT vs CL-7B -3.04% and CL-13B -6.45%; manual analysis confirms DeepSeek provides more accurate code descriptions",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "RAG as adapted from code generation consistently reduces unit test generation effectiveness across all five open-source LLMs",
    401       "evidence": "Table 5 shows negative CovL increments for all models (CL-7B: -5.57%, CL-13B: -6.03%, PD-34B: -9.28%, DC-7B: -5.80%, DC-33B: -3.34%); attributed to mismatch between retrieved (12.10 LOC avg) and generated (5.60 LOC avg) tests",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "LLM defect detection ability is severely limited: 87.13% of defects yield no valid tests, and among testable defects only 47.28% are detected",
    406       "evidence": "Table 6 shows NTD vs NDD ratios; Section 3.4 provides three-category failure analysis with manual annotation (Cohen's Kappa 0.95)",
    407       "supported": "strong"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval",
    412     "observational"
    413   ],
    414   "key_findings": "All LLMs including GPT-4 substantially underperform Evosuite in unit test coverage (GPT-4: 40.43% vs Evosuite: 78.91% line coverage), primarily because hallucination causes 34–62% of generated tests to be syntactically invalid. Prompt design critically affects performance: description style must align with each model's training data, and including other class methods (FC_m) in prompts improves validity but reduces coverage by consuming the token budget. Both CoT and RAG show mixed or negative results when adapted from other code tasks, with RAG consistently hurting all five models due to a mismatch between retrieved and LLM-preferred test styles. Defect detection is severely limited, with 87.13% of defects yielding no valid tests at all, and the primary barrier for the remaining defects is missing specific defect-triggering inputs rather than insufficient coverage.",
    415   "red_flags": [
    416     {
    417       "flag": "GPT-4 version unspecified",
    418       "detail": "GPT-4 is evaluated without a snapshot date or API version identifier, making this portion of the study non-reproducible as GPT-4 behavior changes across versions."
    419     },
    420     {
    421       "flag": "Prompts not provided in paper",
    422       "detail": "Actual prompt templates and text are described only conceptually in the paper; readers must consult the external GitHub repository to understand exactly what was tested."
    423     },
    424     {
    425       "flag": "No confidence intervals on main results",
    426       "detail": "All main results (CSR, CovL, CovB, NDD) are reported as point estimates without confidence intervals or standard errors, obscuring uncertainty in measurements."
    427     },
    428     {
    429       "flag": "Funder-author overlap (Huawei)",
    430       "detail": "Four of eleven authors are from Huawei Cloud Computing, and the work is partly funded by the CCF-Huawei PopulusGrove Fund; no competing interests statement is provided."
    431     },
    432     {
    433       "flag": "Non-exhaustive ablation",
    434       "detail": "Code feature ablation removes one feature at a time from the full set rather than exploring all combinations; the paper acknowledges the globally optimal prompt was not found, and non-additive interaction effects are unexplored."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    440       "relevance": "Direct predecessor (TestPilot/Schäfer et al.) evaluating GPT-3.5 for unit test generation in JavaScript; this paper extends to open-source LLMs and broader prompt investigation"
    441     },
    442     {
    443       "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation",
    444       "relevance": "Evaluates ChatGPT (ChatTester) for unit test generation with CoT; key baseline comparison paper, primary prior work this study extends to open-source LLMs"
    445     },
    446     {
    447       "title": "Exploring the Effectiveness of Large Language Models in Generating Unit Tests",
    448       "relevance": "Evaluates GPT-3.5 and Codex for unit test generation; directly related work this paper extends to open-source LLMs with varied prompting"
    449     },
    450     {
    451       "title": "EvoSuite: automatic test suite generation for object-oriented software",
    452       "relevance": "Key baseline tool (evolutionary search-based) against which all LLMs are compared; represents state-of-the-art traditional approach and outperforms all LLMs"
    453     },
    454     {
    455       "title": "Unit Test Case Generation with Transformers",
    456       "relevance": "AthenaTest — early DL-based unit test generation using BART; represents the DL-based approach this work supersedes and builds upon"
    457     },
    458     {
    459       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    460       "relevance": "Foundation paper for the CoT methodology investigated in RQ3; key ICL method tested and found model-dependent in this study"
    461     },
    462     {
    463       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    464       "relevance": "Foundation paper for RAG methodology adapted and evaluated in RQ3; found consistently ineffective for unit test generation in this study"
    465     },
    466     {
    467       "title": "Enhancing LLM-based Test Generation for Hard-to-Cover Branches via Program Analysis",
    468       "relevance": "Most recent related work (TELPA) improving LLM-based test generation with bidirectional analysis; direct competitor using PD-34B only"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 3,
    474       "justification": "Unit test generation is a high-priority SE task; the paper gives concrete, actionable guidance on model selection, prompt design, and ICL methods backed by 3,000 A100 GPU-hours of experiments."
    475     },
    476     "surprise_contrarian": {
    477       "score": 2,
    478       "justification": "The finding that all LLMs including GPT-4 are beaten by decade-old Evosuite, and that RAG consistently hurts performance contrary to expectations from other code tasks, challenges common assumptions about LLM superiority."
    479     },
    480     "fear_safety": {
    481       "score": 0,
    482       "justification": "No safety or risk implications discussed; purely a software engineering methodology study."
    483     },
    484     "drama_conflict": {
    485       "score": 1,
    486       "justification": "The Huawei authorship and funding alongside evaluation of non-Huawei models creates a minor institutional tension, but no explicit controversy."
    487     },
    488     "demo_ability": {
    489       "score": 2,
    490       "justification": "Code is released on GitHub with Defects4J as the public benchmark; practitioners can reproduce or extend the evaluation with publicly available model weights."
    491     },
    492     "brand_recognition": {
    493       "score": 1,
    494       "justification": "Evaluates GPT-4 (recognizable) alongside CodeLlama and DeepSeek; published at ASE 2024 (top SE venue) with Huawei industry involvement."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [
    499       {
    500         "hn_id": "39499207",
    501         "title": "Hallucination is inevitable: An innate limitation of large language models",
    502         "points": 308,
    503         "comments": 474,
    504         "url": "https://news.ycombinator.com/item?id=39499207"
    505       },
    506       {
    507         "hn_id": "28230092",
    508         "title": "A Dyson sphere around a black hole",
    509         "points": 214,
    510         "comments": 231,
    511         "url": "https://news.ycombinator.com/item?id=28230092"
    512       },
    513       {
    514         "hn_id": "39888769",
    515         "title": "Mini-Gemini: Mining the Potential of Multi-Modality Vision Language Models",
    516         "points": 83,
    517         "comments": 7,
    518         "url": "https://news.ycombinator.com/item?id=39888769"
    519       },
    520       {
    521         "hn_id": "42531993",
    522         "title": "Empirical Study of Test Generation with LLM's",
    523         "points": 40,
    524         "comments": 36,
    525         "url": "https://news.ycombinator.com/item?id=42531993"
    526       },
    527       {
    528         "hn_id": "41022645",
    529         "title": "Modal Effect Types",
    530         "points": 4,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=41022645"
    533       },
    534       {
    535         "hn_id": "39314708",
    536         "title": "Hallucination Is Inevitable: An Innate Limitation of Large Language Models",
    537         "points": 3,
    538         "comments": 2,
    539         "url": "https://news.ycombinator.com/item?id=39314708"
    540       },
    541       {
    542         "hn_id": "40390670",
    543         "title": "Acoustic Manipulation of Underwater Data Center Operations, Resource Management",
    544         "points": 1,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=40390670"
    547       },
    548       {
    549         "hn_id": "40190640",
    550         "title": "Holographic Parallax Improves 3D Perceptual Realism",
    551         "points": 1,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=40190640"
    554       },
    555       {
    556         "hn_id": "39899945",
    557         "title": "Turning News Graphics into TikToks by Adjusting Narrative Beats and Pacing",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=39899945"
    561       },
    562       {
    563         "hn_id": "39503420",
    564         "title": "An Empirical Evaluation of LLMs for Solving Offensive Security Challenges",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=39503420"
    568       }
    569     ],
    570     "top_points": 308,
    571     "total_points": 656,
    572     "total_comments": 750
    573   }
    574 }

Impressum · Datenschutz