scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29668B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On the Evaluation of Large Language Models in Unit Test Generation",
      6     "authors": [
      7       "Lin Yang",
      8       "Chen Yang",
      9       "Shutao Gao",
     10       "Weijing Wang",
     11       "Bo Wang",
     12       "Qihao Zhu",
     13       "Xiao Chu",
     14       "Jianyi Zhou",
     15       "Guangtai Liang",
     16       "Qianxiang Wang",
     17       "Junjie Chen"
     18     ],
     19     "year": 2024,
     20     "venue": "ASE 2024",
     21     "arxiv_id": "2406.18181",
     22     "doi": "10.1145/3691620.3695529"
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All abstract claims — first empirical study on open-source LLMs for unit test generation, coverage of 17 Java projects/5 LLMs, prompt factor influence, comparison with GPT-4 and Evosuite — are substantiated by corresponding experimental sections (RQ1–RQ4).",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Causal claims such as 'FCm negatively impacts effectiveness due to extensive length' and 'description style alignment with training data improves performance' are backed by ablation experiments (Tables 2–3) with statistical tests and effect sizes.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper explicitly bounds results to Defects4J 2.0 (17 Java projects), five specific open-source LLMs, and the tested prompting strategies; threats-to-validity section acknowledges external generalizability limits.",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper discusses alternative explanations throughout, e.g., why CL-7B outperforms CL-13B (repetition artifact), why removing FCm paradoxically improves coverage (more generation space), and why RAG fails (retrieval-generation gap).",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper uses CSR, line/branch coverage, and NDD as proxies for test quality but does not explicitly discuss the gap between these metrics and broader developer utility; readability is mentioned only as a known Evosuite weakness, not measured.",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 4 'Threats to Validity' is a dedicated section covering internal, external, and construct validity threats across multiple paragraphs.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Specific threats named include: only public methods considered, ablation doesn't cover all feature combinations ('locally optimal not globally optimal'), temperature=0 assumption, exact-match data leakage check limitations, and the project-specific RAG retrieval constraint.",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper explicitly states results are bounded to Java/Defects4J, five specific LLM architectures and sizes, and that the prompt ablation explores single-feature removal rather than all combinations — acknowledged as a scope limit.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Acknowledgments disclose National Natural Science Foundation of China grants (62322208, 62202040, 62232001, 12411530122) and CCF-Huawei PopulusGrove Fund.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All author affiliations are listed in the header: Tianjin University, Beijing Jiaotong University, Peking University, and Huawei Cloud Computing Co. Ltd., making the Huawei connection explicit.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Huawei Cloud Computing co-funds the work (CCF-Huawei PopulusGrove Fund) and four co-authors are Huawei employees; while the paper evaluates third-party LLMs rather than Huawei products, the funder is not independent.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests statement is present; the acknowledgment section lists funding but does not declare any financial interests, patents, or equity holdings.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Key terms are defined: unit testing, focal method/class, code features (FM_b, FM_p, FC_c, FC_f, FC_m, RC_c), metrics (CSR, CovL, CovB, NDD), CoT, RAG, and description styles (NL vs CL) are all precisely defined.",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper explicitly states it performs 'the first empirical study' on open-source LLMs for unit test generation across four RQs covering prompt design, model comparison, ICL methods, and defect detection.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section 6 engages substantively with prior work including ChatTester, TestPilot, CODAMOSA, AthenaTest, and ChatUniTest, distinguishing this work's focus on open-source LLMs and prompt variation from closed-source fixed-prompting prior studies.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper states 'All of our code and data are available at our project homepage' with reference [5] pointing to https://github.com/LeonYang95/LLM4UT.",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Defects4J 2.0 is a standard public benchmark; the paper also claims all data is on their project homepage.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Specific versions provided: PyTorch 2.0.0, transformers 4.34.1, VLLM, Ubuntu 18.04 LTS, and hardware (Intel Xeon Gold 6240C, 512GB RAM, 8× NVIDIA A100 GPUs) are stated.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "The paper does not include step-by-step reproduction instructions; it points to the project homepage for code and data, but no procedural instructions appear in the paper itself.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Tables report single point estimates (coverage percentages, CSR) without confidence intervals or error bars; Wilcoxon tests provide significance but not CI for main results.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Wilcoxon rank sum tests with significance level 0.05 are applied for all pairwise comparisons in Tables 1–3 and 5.",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Rank-biserial correlation is computed as effect size for all comparisons, with threshold >0.3 for meaningful differences, clearly reported.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "778 focal methods from 413 defects are used but no power analysis or sample size justification is provided; selection appears driven by the benchmark's contents and cost constraints.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "All tables report single values per condition without standard deviation or variance; temperature is set to 0 to reduce randomness but per-run variance is never reported.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Evosuite (search-based traditional approach) and GPT-4 (state-of-the-art commercial LLM) are used as explicit baselines in RQ2.",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Evosuite remains the leading search-based tool; GPT-4 was the leading commercial LLM at time of writing; both are appropriate contemporary references.",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Section 3.1 performs an ablation by creating five prompt variants each removing one code feature (FM_p, FC_c, FC_f, FC_m, RC_c) and comparing to the full prompt.",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Four metrics used: Compilation Success Rate (CSR), Line Coverage (CovL), Branch Coverage (CovB), and Number of Detected Defects (NDD).",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Four authors with 4+ years of Java experience manually labeled undetected defects by failure reason (Cohen's Kappa = 0.95); this evaluates system outputs to diagnose defect detection failures.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": false,
    217           "answer": false,
    218           "justification": "This is a benchmark evaluation study, not a prediction task with train/test split; the Defects4J benchmark serves as the evaluation corpus throughout.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Results are broken down per model (Tables 1–6), per prompt variant, per ICL method, and Table 7 breaks down undetected defects by failure category per model.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Section 3.2 categorizes compilation errors (unresolved symbol, parameter mismatch, abstract instantiation) and Section 3.4 categorizes defect detection failures (insufficient coverage, missing inputs, improper assertions) with proportions.",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Major negative results are prominently reported: all LLMs underperform Evosuite, CoT hurts CodeLlama models, and RAG hurts all studied models.",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Open-source models are specified with full names and version numbers (e.g., Phind-CodeLlama-34B-v2, DeepSeekCoder-33B-Instruct); GPT-4 lacks a snapshot date but model names are otherwise specific.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Prompt design is described in detail (description styles, which code features to include) but actual prompt text/templates are not shown in the paper; they are referenced at the project homepage.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Temperature is explicitly set to 0 for all experiments; VLLM is used for inference; hardware and library versions are specified, though top-p and other generation parameters are not mentioned.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": false,
    261           "answer": false,
    262           "justification": "This is not an agentic pipeline; LLMs are called once per focal method with no multi-step scaffolding beyond single-pass generation.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Preprocessing is documented: AST parsing with tree-sitter to extract tests, integration into a test class, automatic import injection, recursive removal of failing methods for coverage collection.",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Authors state all code and data are at their project homepage (GitHub); Defects4J 2.0 is also independently publicly available.",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Section 2.5 describes benchmark collection: Defects4J 2.0, selection criteria (public methods patched for defects), handling of multi-method bugs, resulting in 778 focal methods from 413 defects across 17 projects.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No participant recruitment; the study uses a standard public benchmark with no human subjects.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "Section 2.6 documents the full pipeline: LLM inference → AST extraction → test class integration → compilation → JaCoCo coverage collection → defect detection via pass/fail on buggy vs. fixed versions.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Training data cutoffs are not stated for any of the five open-source models or GPT-4; the paper does not report when training data was collected.",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Section 4 discusses potential data leakage, comparing LLM-generated tests to original benchmark tests: 'no exact match between them'; acknowledged as a threat even if the check is superficial.",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": true,
    314           "justification": "The paper explicitly acknowledges Defects4J is a publicly available benchmark that predates model training and treats contamination as a construct validity threat, including a no-exact-match comparison as a weak mitigation.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human subjects study; the manual labeling by authors is an analysis task, not a human participant study.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants; no IRB needed.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "The paper reports 'around 3,000 NVIDIA A100 GPU-hours' for the complete experimental evaluation.",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": true,
    372           "justification": "3,000 A100 GPU-hours is explicitly stated; hardware configuration (four servers, 8× A100 each) is also provided.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "Prompt description style significantly affects CodeLlama models: CL-7B and CL-13B perform significantly better with natural language style, while DeepSeek-Coder and Phind models are robust to style choice.",
    381       "evidence": "Table 1 with Wilcoxon tests showing statistically significant NL superiority for CL-7B/CL-13B across all three metrics (p<0.05, effect size>0.3), with no significant difference for DC/PD models.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Including other class methods (FCm) in the prompt improves syntactic validity but hurts test coverage because it consumes context window space, reducing the number of generated tests.",
    386       "evidence": "Tables 2–3 show removing FCm causes coverage improvement (e.g., CL-7B CovL +9.26%) while reducing CSR for most models, with explanation that average test count increases from 3,654 to 5,434 when FCm is removed.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "All studied LLMs, including GPT-4, significantly underperform traditional Evosuite in test coverage (GPT-4: 40.43% line coverage vs. Evosuite: 78.91%).",
    391       "evidence": "Table 4 shows Evosuite achieving 85.71% CSR, 78.91% CovL, 76.59% CovB vs. GPT-4's 52.96%/40.43%/31.78% and all open-source LLMs performing worse.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "CoT improves unit test generation for DeepSeek-Coder models but hurts CodeLlama models, due to differences in code comprehension ability.",
    396       "evidence": "Table 5 shows DC-7B gains +2.72% CovL with CoT while CL-7B loses -3.04% CovL; manual analysis confirms DeepSeek models produce more accurate focal method descriptions.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "RAG adapted from code generation degrades performance for all five studied LLMs in unit test generation.",
    401       "evidence": "Table 5 shows negative coverage deltas for all models with RAG (e.g., PD-34B -9.28% CovL); analysis attributes this to a large gap between retrieved tests (12.10 LOC avg) and LLM-generated tests (5.60 LOC avg).",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "On average, 87.13% of defects have no valid LLM-generated tests due to compilation failures, severely limiting defect detection ability.",
    406       "evidence": "Table 6 NTD row shows very few testable defects (e.g., CL-7B: 41, CL-13B: 28 out of 413 total defects); the paper derives the 87.13% figure from these counts.",
    407       "supported": "strong"
    408     },
    409     {
    410       "claim": "Among defects with valid tests, 74.99% remain undetected primarily because LLMs fail to generate the specific inputs needed to trigger defects.",
    411       "evidence": "Table 7 shows 'Missing Specific Inputs' accounts for the majority of undetected defects across all models (e.g., 26 of DC-7B's and 24 of GPT-4's undetected defects); the 74.99% figure is stated explicitly.",
    412       "supported": "moderate"
    413     }
    414   ],
    415   "methodology_tags": [
    416     "benchmark-eval",
    417     "empirical",
    418     "ablation"
    419   ],
    420   "key_findings": "Five open-source code LLMs (7B–34B parameters) were evaluated on unit test generation using Defects4J 2.0 (778 focal methods, 17 Java projects). All LLMs, including GPT-4, substantially underperform traditional Evosuite in test coverage (GPT-4: 40% vs. Evosuite: 79%) due to high rates of syntactically invalid tests caused by hallucination. Prompt design — specifically description style alignment with training data and code feature selection — significantly impacts effectiveness, while directly adapted CoT and RAG methods from other tasks fail to improve (and often hurt) unit test generation. Defect detection is severely limited: on average, LLMs fail to generate any valid test for 87% of defects, and among defects that do receive valid tests, most remain undetected, primarily because the generated inputs do not trigger the specific bug.",
    421   "red_flags": [
    422     {
    423       "flag": "GPT-4 version unspecified",
    424       "detail": "GPT-4 is used as a primary baseline without specifying snapshot date or API version, making results unreproducible since GPT-4 is regularly updated."
    425     },
    426     {
    427       "flag": "Funder-author conflict",
    428       "detail": "Four co-authors are Huawei employees and the CCF-Huawei PopulusGrove Fund co-funds the work; while Huawei products are not directly evaluated, the conflict is not acknowledged or addressed."
    429     },
    430     {
    431       "flag": "No variance reported",
    432       "detail": "All coverage results are single point estimates; while temperature=0 reduces stochasticity, no variance across runs or focal methods is reported, preventing assessment of result stability."
    433     },
    434     {
    435       "flag": "Prompts not in paper",
    436       "detail": "Actual prompt templates are not shown in the paper — only described structurally — requiring readers to trust the project homepage for reproducibility."
    437     },
    438     {
    439       "flag": "Superficial contamination check",
    440       "detail": "The data leakage check consists only of verifying no exact match between LLM outputs and original benchmark tests, which does not address whether the Defects4J projects (published 2014) were in training data."
    441     },
    442     {
    443       "flag": "Locally optimal prompts",
    444       "detail": "The paper acknowledges that single-feature ablation may not find globally optimal prompt configurations, potentially underrepresenting model capabilities in the comparison against Evosuite."
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "EvoSuite: automatic test suite generation for object-oriented software",
    450       "relevance": "Primary baseline for test coverage comparison; the key finding that all LLMs underperform Evosuite is central to the paper."
    451     },
    452     {
    453       "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation (ChatTester)",
    454       "relevance": "Key prior work on LLM-based unit test generation with closed-source ChatGPT that this paper directly extends and contrasts with."
    455     },
    456     {
    457       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation (TestPilot)",
    458       "relevance": "Most directly comparable prior empirical study; uses GPT-3.5 for JavaScript, while this paper focuses on open-source models for Java."
    459     },
    460     {
    461       "title": "Code Llama: Open Foundation Models for Code",
    462       "relevance": "Foundation model for three of the five evaluated LLMs (CL-7B, CL-13B, PD-34B)."
    463     },
    464     {
    465       "title": "DeepSeek Coder: Let the Code Write Itself",
    466       "relevance": "Foundation model for two of the five evaluated LLMs (DC-7B, DC-33B)."
    467     },
    468     {
    469       "title": "Defects4J: a database of existing faults to enable controlled testing studies for Java programs",
    470       "relevance": "The primary evaluation benchmark used for all experiments."
    471     },
    472     {
    473       "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    474       "relevance": "Related approach combining LLMs with evolutionary search for test generation, compared in related work."
    475     },
    476     {
    477       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    478       "relevance": "Foundation for the CoT ICL method evaluated in RQ3."
    479     },
    480     {
    481       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    482       "relevance": "Foundation for the RAG ICL method evaluated in RQ3."
    483     }
    484   ],
    485   "engagement_factors": {
    486     "practical_relevance": {
    487       "score": 3,
    488       "justification": "Directly actionable for practitioners: provides specific guidance on prompt design, LLM selection, and warnings that current LLMs cannot replace Evosuite for test coverage."
    489     },
    490     "surprise_contrarian": {
    491       "score": 2,
    492       "justification": "Counterintuitive findings include: CoT hurts some models, including more code context hurts coverage, and 10-year-old Evosuite beats all LLMs including GPT-4."
    493     },
    494     "fear_safety": {
    495       "score": 0,
    496       "justification": "No AI safety or risk concerns; this is a software engineering productivity study."
    497     },
    498     "drama_conflict": {
    499       "score": 1,
    500       "justification": "The 'GPT-4 loses to Evosuite' result has mild drama potential but the paper presents it matter-of-factly."
    501     },
    502     "demo_ability": {
    503       "score": 2,
    504       "justification": "Code and data are publicly available on GitHub; practitioners can run their own LLMs against Defects4J using the released scripts."
    505     },
    506     "brand_recognition": {
    507       "score": 1,
    508       "justification": "GPT-4 and Llama/DeepSeek are named, but the paper is from Tianjin University/Huawei rather than a top-tier AI lab."
    509     }
    510   },
    511   "hn_data": {
    512     "threads": [
    513       {
    514         "hn_id": "39499207",
    515         "title": "Hallucination is inevitable: An innate limitation of large language models",
    516         "points": 308,
    517         "comments": 474,
    518         "url": "https://news.ycombinator.com/item?id=39499207",
    519         "created_at": "2024-02-25T09:28:39Z"
    520       },
    521       {
    522         "hn_id": "28230092",
    523         "title": "A Dyson sphere around a black hole",
    524         "points": 214,
    525         "comments": 231,
    526         "url": "https://news.ycombinator.com/item?id=28230092",
    527         "created_at": "2021-08-19T03:40:00Z"
    528       },
    529       {
    530         "hn_id": "39888769",
    531         "title": "Mini-Gemini: Mining the Potential of Multi-Modality Vision Language Models",
    532         "points": 83,
    533         "comments": 7,
    534         "url": "https://news.ycombinator.com/item?id=39888769",
    535         "created_at": "2024-03-31T22:38:09Z"
    536       },
    537       {
    538         "hn_id": "42531993",
    539         "title": "Empirical Study of Test Generation with LLM's",
    540         "points": 40,
    541         "comments": 36,
    542         "url": "https://news.ycombinator.com/item?id=42531993",
    543         "created_at": "2024-12-28T16:10:24Z"
    544       },
    545       {
    546         "hn_id": "41022645",
    547         "title": "Modal Effect Types",
    548         "points": 4,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=41022645",
    551         "created_at": "2024-07-21T05:31:06Z"
    552       },
    553       {
    554         "hn_id": "39314708",
    555         "title": "Hallucination Is Inevitable: An Innate Limitation of Large Language Models",
    556         "points": 3,
    557         "comments": 2,
    558         "url": "https://news.ycombinator.com/item?id=39314708",
    559         "created_at": "2024-02-09T13:37:34Z"
    560       },
    561       {
    562         "hn_id": "40390670",
    563         "title": "Acoustic Manipulation of Underwater Data Center Operations, Resource Management",
    564         "points": 1,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=40390670",
    567         "created_at": "2024-05-17T15:00:15Z"
    568       },
    569       {
    570         "hn_id": "40190640",
    571         "title": "Holographic Parallax Improves 3D Perceptual Realism",
    572         "points": 1,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=40190640",
    575         "created_at": "2024-04-28T18:27:41Z"
    576       },
    577       {
    578         "hn_id": "39899945",
    579         "title": "Turning News Graphics into TikToks by Adjusting Narrative Beats and Pacing",
    580         "points": 1,
    581         "comments": 0,
    582         "url": "https://news.ycombinator.com/item?id=39899945",
    583         "created_at": "2024-04-01T22:04:16Z"
    584       },
    585       {
    586         "hn_id": "39503420",
    587         "title": "An Empirical Evaluation of LLMs for Solving Offensive Security Challenges",
    588         "points": 1,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=39503420",
    591         "created_at": "2024-02-25T18:35:55Z"
    592       }
    593     ],
    594     "top_points": 308,
    595     "total_points": 656,
    596     "total_comments": 750
    597   }
    598 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs