scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29932B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Knowledge Matters: Injecting Project and Testing Knowledge into LLM-based Unit Test Generation",
      6     "authors": [
      7       "Anji Li",
      8       "Mingwei Liu",
      9       "Zhenxi Chen",
     10       "Zheng Pei",
     11       "Zike Li",
     12       "Dekun Dai",
     13       "Yanlin Wang",
     14       "Zibin Zheng"
     15     ],
     16     "year": 2026,
     17     "venue": "ICSE 2026",
     18     "arxiv_id": "2511.14224",
     19     "doi": "10.1145/3744916.3787769"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All key quantitative abstract claims are directly supported by Table 2: 5.03% EPR improvement (KTester 76.41% vs HITS 71.38%), 11.67% LC improvement (63.94% vs 52.27%), fewer test cases (7.33 vs 15.78), and less time (152.68s vs 625.69s). Human study findings are supported by RQ3 results.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Causal claims about component contributions are backed by a systematic ablation study (RQ2) with 4 variants, each removing one component while keeping others constant, providing adequate evidence for causal attribution within the pipeline.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Abstract and conclusions claim KTester 'significantly outperforms existing methods' without adequately bounding scope — evaluation covers only 110 Java methods with cyclomatic complexity >10 from 10 open-source projects, yet the paper frames contributions broadly without consistently qualifying this narrow scope.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper does not discuss alternative explanations for performance gains, such as whether improvements stem from richer prompts generally rather than structured knowledge injection specifically, or whether the HITS benchmark characteristics particularly advantage their approach.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper explicitly distinguishes coverage metrics (proxy for thoroughness) from execution pass rate (correctness) and adds a human study for readability/maintainability. The threats section explicitly acknowledges 'correctness and coverage metrics may not fully capture clarity and structure.'",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5 'Threats to Validity' is a dedicated section addressing limitations around LLM fairness, Java-focus generalizability, and user study subjectivity.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Threats are mostly generic: the Java focus is acknowledged but not quantified ('Extending to other frameworks mainly requires prompt and library updates'), and the user study limitation ('participant bias and limited sampling') lacks specifics about what sample size would be adequate or what biases were observed.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Evaluation is explicitly bounded to Java, complex methods (cyclomatic complexity >10), and the HITS dataset of 10 open-source projects, stated clearly in Section 3.1.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Acknowledgments disclose funding from NSFC (Grant 62402113), Guangdong Province Natural Science Foundation (2025A1515011631), Social Science Planning Project (SZ2025A002), and GMCC-SYSU Joint Lab.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All 8 authors are explicitly affiliated with Sun Yat-sen University's School of Software Engineering and Zhuhai Key Laboratory of Trusted Large Language Models, disclosed in the author block.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Primary funders are government grants (NSFC, Guangdong Province) independent of the KTester outcome. GMCC-SYSU Joint Lab is noted, but the paper evaluates KTester on open-source projects rather than any GMCC product.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement appears in the paper. Acknowledgments list funders but do not address patents, equity, or consulting arrangements.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Key terms are defined with examples: 'focal method,' 'test oracle,' 'test prefix,' 'cyclomatic complexity,' 'project structure knowledge,' and 'project usage knowledge' are all explicitly introduced and illustrated.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Three bullet-point contributions are explicitly listed at the end of Section 1: the KTester framework, multi-perspective prompting with design/generation separation, and extensive empirical evaluation.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4 (Related Work) substantively engages with prior methods (ChatTester, ChatUniTest, HITS, ASTER, TestPilot, CodaMosa, RAG approaches) explaining how KTester differs from and builds on each.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The paper states 'All data/code used in this study is provided in the package [9]' linking to https://github.com/SYSUSELab/KTester, a live public GitHub repository.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The HITS dataset [52] is a published benchmark from Wang et al. (ASE 2024) and the replication package includes it. Standard public benchmark used as-is.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "The paper names tools (Spoon, JaCoCo, GPT-4o-mini) but provides no requirements file, Dockerfile, or explicit version specifications for the Java/library stack within the paper itself.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "The paper only states 'All data/code used in this study is provided in the package [9]' without step-by-step reproduction instructions in the paper text. Instructions may exist in the GitHub repo but are not described here.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Tables 2-4 report only average results from 5 repetitions without confidence intervals, standard deviations, or error bars. All results are presented as point estimates.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical significance tests are performed for any comparative claims. Differences in coverage and pass rates between methods are presented as raw averages without p-values or hypothesis testing.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Concrete percentage improvements with baseline context are reported throughout: '5.03% EPR improvement,' '11.67% LC improvement,' 'LC dropping by 13.39%' in ablation. Baseline values always accompany improvements.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The 110-task HITS dataset is adopted from prior work without justification for whether this sample size is sufficient to detect meaningful differences between the evaluated methods.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Tables 2-4 report only means across 5 runs. No standard deviations, ranges, or other variance measures are provided despite explicitly running each experiment 5 times to mitigate LLM randomness.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Four baselines are included: ChatUniTest, ChatTester, HITS (pure LLM-based), and UTGen (SBST+LLM hybrid), covering the main paradigms of prior automated test generation work.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "All baselines are from 2024-2025: HITS (ASE 2024), ChatTester (FSE 2024), ChatUniTest (FSE 2024), UTGen (ICSE 2025). These are recent competitive methods.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "RQ2 presents a systematic ablation study with 4 variants (KTester-UTE, KTester-FMR, KTester-MVG, KTester-DGT), each removing one component while keeping others intact, measured on all 8 metrics.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Eight automated metrics are used: CPR, EPR, LC, BC, LCP, BCP, AvT, AvTC. Plus a qualitative human study on correctness, readability, and maintainability using 4-point Likert scales.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "RQ3 includes a user study with 15 participants (5 PhD, 10 Master's students with 2-5 years Java experience) evaluating generated test correctness, readability, and maintainability on 10 tasks using blinded, randomized presentation.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": false,
    214           "answer": false,
    215           "justification": "KTester is a prompting-based generation system, not a trained model; there is no train/test split. The HITS benchmark is the evaluation set but there is no held-out partition relevant to this generative task.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": false,
    221           "justification": "Table 1 describes the 10 projects but Tables 2-4 report only aggregate results across all 110 tasks. No per-project or per-domain breakdown of performance is provided.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Figures 1-3 show concrete baseline failure cases (incorrect object construction, insufficient assertions, hard-coded values), and Figure 10 provides a qualitative comparison of HITS vs. KTester output.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "The ablation study explicitly reports performance drops when components are removed. The paper also reports that UTGen outperforms KTester on EPR (90.05% vs 76.41%), an unfavorable comparison that is not suppressed.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "The primary experimental model 'gpt-4o-mini' is cited without a snapshot date (e.g., gpt-4o-mini-2024-07-18). While claude-3-5-haiku-20241022 is versioned, the main model lacks a pinned version for reproducibility.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Figures 7, 8, and 9 provide detailed prompt templates for all three main generation steps, including task descriptions, input/output structure, and specific instructional content.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature=0.5 and maximum repair iterations=5 are explicitly reported. The paper notes gpt-4o-mini with temperature=0.5 was used consistently across all methods.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "The 5-step pipeline (framework generation, test case design, method transformation, integration, refinement) is described in full detail including static analysis, CFG construction, Jaccard similarity retrieval, and LLM-based repair feedback loops.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Section 2.1 documents the offline knowledge extraction: AST parsing via Spoon, control flow graph construction, intra-procedural slicing, and function-level index construction. Project source code versions are specified in Table 1.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The paper states 'All data/code used in this study is provided in the package [9]' (GitHub), implying generated tests and evaluation outputs are available.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 3.1.1 describes the HITS dataset: 110 focal methods with cyclomatic complexity >10 from 10 open-source Java projects, with project versions specified in Table 1.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "User study recruitment is described in Section 3.4.1: 'public invitation distributed across the computer science departments of six universities,' with selection criteria (2-5 years Java experience) and compensation noted.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The complete pipeline from project source code through static analysis, knowledge extraction, LLM-based generation, repair, and JaCoCo-based coverage measurement is documented in Sections 2 and 3.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The training cutoff of GPT-4o-mini is not stated. This matters because the open-source Java libraries in the benchmark (Commons-CLI, Gson, Commons-CSV, etc.) are publicly available and almost certainly in GPT's training data.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "The paper does not discuss whether GPT-4o-mini's training data includes the focal methods from the 10 evaluated projects. Memorization of these popular libraries could inflate pass rates for all LLM-based methods.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "The HITS benchmark uses well-known open-source Java libraries predating GPT-4o-mini's training cutoff. Benchmark contamination is never discussed despite its relevance to evaluating LLM-based test generation.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "No pre-registration of the user study is mentioned in the paper.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": true,
    324           "answer": false,
    325           "justification": "No IRB or ethics approval is mentioned for the user study involving 15 compensated human participants.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": true,
    330           "answer": true,
    331           "justification": "Participants are described as 15 individuals (5 PhD, 10 Master's students) with 2-5 years of Java development experience who received compensation. Relevant professional demographics are reported.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": true,
    336           "answer": true,
    337           "justification": "Inclusion criteria are stated: participants selected from CS departments of six universities with 2-5 years Java development experience.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": true,
    342           "answer": true,
    343           "justification": "'The identities of the methods were fully anonymised, and the presentation order was randomised' (Section 3.4.1), describing anonymisation of method identity and randomisation of presentation order.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": true,
    348           "answer": true,
    349           "justification": "'The identities of the methods were fully anonymised' — participants did not know which test generation method produced each test class, constituting single-blind evaluation.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": true,
    354           "answer": false,
    355           "justification": "Attrition is not reported. The paper inconsistently states 15 participants in Section 3.4.1 but '14 professional developers' in Section 5, suggesting possible undisclosed dropout with no explanation.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Average time per task (AvT) is explicitly reported as a metric: KTester 152.68s vs HITS 625.69s vs ChatTester 354.83s, directly quantifying practical inference cost.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "Total compute budget (dollar cost, total API calls, or total compute hours) for the full evaluation is not reported. Only per-task time averages are provided.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "KTester improves execution pass rate by 5.03% over the strongest LLM-based baseline (HITS) while generating 8.45 fewer test cases on average.",
    378       "evidence": "Table 2: KTester EPR=76.41% vs HITS EPR=71.38% (difference=5.03%); KTester AvTC=7.33 vs HITS AvTC=15.78. Verified arithmetic match.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "KTester achieves 63.94% line coverage, an 11.67% improvement over HITS (52.27%), while requiring much less time (152.68s vs 625.69s per task).",
    383       "evidence": "Table 2: KTester LC=63.94%, HITS LC=52.27%; KTester AvT=152.68s, HITS AvT=625.69s. Verified arithmetic match.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Modular test case transformation (separation of design and code generation) is the most critical component for test coverage adequacy.",
    388       "evidence": "Table 4: KTester-DGT removal causes LC to drop 13.39% (63.94→50.55%) and BC 11.15% (55.46→44.31%), the largest coverage drops among all 4 ablation variants.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Usage trace extraction is the primary contributor to test correctness.",
    393       "evidence": "Table 4: KTester-UTE removal causes EPR to drop 14.41% (76.41→62.00%) and CPR 6.37% (100→93.63%), the largest correctness drops across all ablation variants.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Human evaluators rate KTester tests significantly higher in correctness, readability, and maintainability than all baselines.",
    398       "evidence": "Figure 11: KTester has the highest 'Strongly Agree' proportions (0.34 correctness, 0.57 readability, 0.39 maintainability) in a 15-participant user study on 10 tasks, though no significance tests are reported.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "KTester generalizes across LLM backends, with Claude-3.5-haiku and DeepSeek-v3.1 achieving higher coverage than GPT-4o-mini.",
    403       "evidence": "Table 3: KTester-deepseek achieves 71.22% LC and 65.75% BC vs KTester-gpt 63.94% LC and 55.46% BC; KTester-claude achieves 66.46% LC.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "case-study"
    410   ],
    411   "key_findings": "KTester integrates project-specific static analysis (structure and usage knowledge extracted via AST/CFG) with testing domain knowledge (multi-perspective prompting, test design/implementation separation) in a 5-step LLM pipeline for Java unit test generation. On 110 complex focal methods (cyclomatic complexity >10) from 10 open-source projects, KTester outperforms all LLM-based baselines on coverage and correctness: 63.94% LC vs 52.27% for HITS, 76.41% EPR vs 71.38% for HITS, while generating fewer tests and running 4x faster. Ablation identifies usage trace extraction as critical for test correctness and modular test transformation as critical for coverage. A 15-participant user study confirms KTester tests are perceived as more correct, readable, and maintainable, though the EPR comparison with the hybrid baseline UTGen (UTGen 90.05% vs KTester 76.41%) is unfavorable to KTester and underemphasized.",
    412   "red_flags": [
    413     {
    414       "flag": "No statistical significance testing",
    415       "detail": "All comparative claims rest on averages from 5 runs across 110 tasks with no confidence intervals, standard deviations, or statistical tests (e.g., Wilcoxon). Differences of 5pp EPR may not exceed run-to-run noise."
    416     },
    417     {
    418       "flag": "No variance reported",
    419       "detail": "Tables 2-4 report only means across 5 runs. Standard deviations are omitted, making it impossible to assess whether differences between methods exceed normal LLM sampling variance."
    420     },
    421     {
    422       "flag": "GPT-4o-mini version unpinned",
    423       "detail": "The primary evaluation model is 'gpt-4o-mini' without a snapshot date (contrast: claude-3-5-haiku-20241022 is pinned). OpenAI can silently update model weights, making exact replication uncertain."
    424     },
    425     {
    426       "flag": "Benchmark contamination unaddressed",
    427       "detail": "The HITS benchmark uses popular open-source Java libraries (Commons-CLI, Gson, Commons-CSV, etc.) that predate GPT-4o-mini's training cutoff. LLM memorization of these methods could inflate pass rates, either uniformly across all methods or unevenly in favor of some."
    428     },
    429     {
    430       "flag": "Inconsistent user study participant count",
    431       "detail": "Section 3.4.1 reports 15 participants; Section 5 (Threats) reports '14 professional developers.' The discrepancy is unexplained and suggests possible undisclosed attrition or an error."
    432     },
    433     {
    434       "flag": "UTGen EPR comparison underemphasized",
    435       "detail": "UTGen achieves 90.05% EPR vs KTester's 76.41%, but the abstract claims 'outperforms existing methods' and compares EPR improvement only to HITS. The framing obscures that a hybrid baseline significantly beats KTester on one of the six primary metrics."
    436     },
    437     {
    438       "flag": "No per-project results breakdown",
    439       "detail": "All quantitative results are aggregate across 10 heterogeneous projects. Per-project breakdowns would reveal whether gains are uniform or concentrated in specific project types (e.g., event-ruler vs. commons-cli)."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing",
    445       "relevance": "Primary baseline and source of the evaluation dataset; closest prior work using method decomposition for LLM test generation"
    446     },
    447     {
    448       "title": "Evaluating and Improving ChatGPT for Unit Test Generation (ChatTester)",
    449       "relevance": "Key LLM-based baseline using incremental context construction; directly compared in RQ1 and RQ2"
    450     },
    451     {
    452       "title": "ChatUniTest: A Framework for LLM-Based Test Generation",
    453       "relevance": "Key LLM-based baseline with iterative repair; directly compared in RQ1 and ablation"
    454     },
    455     {
    456       "title": "Leveraging Large Language Models for Enhancing the Understandability of Generated Unit Tests (UTGen)",
    457       "relevance": "Hybrid SBST+LLM baseline combining EvoSuite with LLM refinement; outperforms KTester on EPR"
    458     },
    459     {
    460       "title": "ASTER: Natural and Multi-Language Unit Test Generation with LLMs",
    461       "relevance": "Related prompt-based LLM test generation approach across multiple languages"
    462     },
    463     {
    464       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-Trained LLMs",
    465       "relevance": "Related hybrid approach integrating LLM seeds with evolutionary search for coverage-driven generation"
    466     },
    467     {
    468       "title": "An empirical evaluation of using large language models for automated unit test generation (TestPilot)",
    469       "relevance": "Related work incorporating documentation and usage examples into LLM test generation context"
    470     },
    471     {
    472       "title": "CoverUp: Effective High Coverage Test Generation for Python",
    473       "relevance": "Coverage-guided dialogue-based test generation, related approach in Python context"
    474     },
    475     {
    476       "title": "Unit test case generation with transformers and focal context (AthenaTest)",
    477       "relevance": "Fine-tuning baseline framing test generation as sequence-to-sequence task"
    478     },
    479     {
    480       "title": "Evosuite: automatic test suite generation for object-oriented software",
    481       "relevance": "SBST baseline used within UTGen; foundational tool for search-based Java test generation"
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 3,
    487       "justification": "Directly usable tool for software developers with released code, evaluated on real-world Java projects targeting a common pain point in the development workflow."
    488     },
    489     "surprise_contrarian": {
    490       "score": 1,
    491       "justification": "Confirms the expected hypothesis that richer project context improves LLM test generation; no finding challenges conventional wisdom."
    492     },
    493     "fear_safety": {
    494       "score": 0,
    495       "justification": "No AI safety concerns; this is a software engineering productivity tool."
    496     },
    497     "drama_conflict": {
    498       "score": 1,
    499       "justification": "Competes with and outperforms several published methods, but no methodological controversy or paradigm conflict."
    500     },
    501     "demo_ability": {
    502       "score": 2,
    503       "justification": "Code released on GitHub and could be applied to other Java projects, but requires setting up GPT API access and the full pipeline infrastructure."
    504     },
    505     "brand_recognition": {
    506       "score": 0,
    507       "justification": "Sun Yat-sen University is not a widely recognized AI lab brand; no famous industry lab affiliation."
    508     }
    509   },
    510   "hn_data": {
    511     "threads": [
    512       {
    513         "hn_id": "34136879",
    514         "title": "Peekaboo: Text to Image Diffusion Models Are Zero-Shot Segmentors",
    515         "points": 106,
    516         "comments": 5,
    517         "url": "https://news.ycombinator.com/item?id=34136879"
    518       },
    519       {
    520         "hn_id": "38415451",
    521         "title": "Double the performance per MAC unit in ML accelerators",
    522         "points": 1,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=38415451"
    525       },
    526       {
    527         "hn_id": "38408983",
    528         "title": "Fast DNN Accelerator Architectures",
    529         "points": 1,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=38408983"
    532       },
    533       {
    534         "hn_id": "34766761",
    535         "title": "SRIFTY: Swift and Thrifty Distributed Training on the Cloud (rev.3 2022)",
    536         "points": 1,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=34766761"
    539       },
    540       {
    541         "hn_id": "33726398",
    542         "title": "One Venue, Two Conferences: Separation of Chinese and American Citation Networks",
    543         "points": 1,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=33726398"
    546       }
    547     ],
    548     "top_points": 106,
    549     "total_points": 110,
    550     "total_comments": 5
    551   }
    552 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs