ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (24508B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Efficient Knowledge Infusion via KG-LLM Alignment",
      6     "authors": [
      7       "Zhouyu Jiang",
      8       "Ling Zhong",
      9       "Mengshu Sun",
     10       "Jun Xu",
     11       "Rui Sun"
     12     ],
     13     "year": 2024,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2406.03746",
     16     "doi": "10.48550/arXiv.2406.03746"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims the approach outperforms baselines on two biomedical QA datasets; Table 1 shows ROUGE and BLEU improvements over most baselines on both CMedQA and BioASQ, with the exceptions of GAP on BioASQ ROUGE-L and RAG on BioASQ BLEU.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about each component's contribution (K-LoRA, AKGF, KG retrieval) are backed by ablation experiments in Table 2, which is adequate for the scope of the claim.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The Limitations section explicitly states 'we only conducted experiments on medical domain texts. This limitation may pose a risk to the generalized ability of our findings in other scenarios.'",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether performance gains could stem from additional fine-tuning steps (more compute/data exposure) rather than the KG alignment specifically; only one interpretation is presented.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "ROUGE/BLEU scores are used to measure 'knowledge correctness' and 'quality of generation' without adequately discussing that these metrics are poor proxies for domain accuracy or hallucination reduction.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section is present, discussing graph quality dependency, noise handling, and domain restriction.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named: dependency on KG construction quality, incomplete KG limiting error detection, conservative AKGF strategy restricting optimization space, and restriction to medical domain only.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Scope explicitly bounded to domain-specific text generation in the medical domain under limited sample scenarios; results on other domains are flagged as not demonstrated.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed as affiliated with Ant Group, with institutional email addresses provided.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All authors are Ant Group (industry) employees evaluating their own method with no independent external evaluation.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Knowledge mismatch' and 'poor information compliance' are both explicitly defined in the Introduction with concrete characterizations of what each problem entails.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Two numbered contributions are stated in the Introduction: the modular knowledge infusion framework and the two novel strategies (pre-learning and AKGF).",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Related work discusses retrieval-augmented LLMs and LLM-augmented KG construction, and the experimental section directly compares against GAP and RAG baselines to position the contribution.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository is referenced or released; only a footnote to an existing third-party text embedding library is provided.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Evaluation uses standard public benchmarks (BioASQ and CMedQA) that are publicly available, though the derived domain KGs themselves are not released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Hardware (A100/V100 GPUs) and hyperparameters are listed in Appendix D/Table 5, but no requirements file, Dockerfile, or dependency specification is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the methodology description is conceptual and lacks commands or scripts needed to replicate results.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1–3 are point estimates with no confidence intervals or error bars reported.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative result despite multiple baseline comparisons.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute improvement values are reported in-text (e.g., '1.03 ROUGE-L improvement', '1.12 improvement in ROUGE-L') with baseline context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of 500 training and 1,000 test samples is described as simulating a limited-data scenario but no power analysis or justification for these numbers is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Single-run results only; no standard deviation or variance across runs is reported for any table.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Six baselines are included: ChatGPT-3.5 (zero-shot and 2-shot), LLM-base, LLM-base-SFT, LLM-CP-SFT, GAP, and RAG.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include ChatGPT-3.5 and Llama2-chat-7B (contemporary at submission), alongside GAP (2022) as the most relevant prior KG-to-text method.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 2 presents four ablation conditions: removing K-LoRA only, AKGF only, both K-LoRA & AKGF, and KG retrieval.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Automatic metrics (ROUGE-1, ROUGE-2, ROUGE-L, BLEU) plus five-dimensional manual ranking evaluation (fluency, relevance, viewpoint, diversity, hallucination) are both used.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "200 BioASQ entries were manually ranked across five dimensions by human evaluators; results shown in Figure 2.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "1,000 instances per dataset are designated as the test set, held out from the 500-sample training set.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by dataset (CMedQA vs. BioASQ) and by ablation variant; Table 3 provides breakdown by KG size.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 5.3 discusses KG sparsity causing performance degradation, and the Limitations section identifies noise handling and incomplete KG as sources of failure.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports lower BLEU scores vs. RAG on BioASQ and notes in Section 5.3 that sparse KGs can hurt performance below no-KG baseline.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "ChatGLM2-6B and Llama2-chat-7B include HuggingFace links, but ChatGPT-3.5 is referenced by marketing name only with no API version or snapshot date.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The SFT input template is shown but the knowledge extraction prompts used with the LLM for KG construction are not provided.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 5 in Appendix D reports batch size, epochs, LoRA rank, LoRA target, learning rate, max input/output length, KL-div β, top-p, and temperature for all stages and both datasets.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; the paper evaluates standard fine-tuning and retrieval pipelines.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The four-step error removal process for KG construction is documented, entity resolution procedure is described, and dataset subsampling approach is stated.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The constructed domain KGs, training subsets, and annotation outputs are not released; only the public benchmark names are given.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Appendix A describes the annotation process: 100 samples per dataset, two blind annotators plus QC personnel, inter-annotator agreement 0.9, acceptance accuracy 0.97.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Human annotators for KG annotation and manual evaluation are mentioned but their recruitment, qualifications, and compensation are not described.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The KG construction pipeline (extraction → error removal → entity resolution) and the downstream SFT data pipeline are documented in Sections 3.1–3.4.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs are not stated for either ChatGPT-3.5 or Llama2-chat-7B anywhere in the paper.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The possibility that BioASQ or CMedQA questions appeared in Llama2 or ChatGPT pre-training data is never discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "BioASQ 2022 data predates Llama2 training; the paper does not address whether model pre-training included these benchmarks.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human subjects study; human annotators perform evaluation tasks, not participant studies.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "NA — no human subjects research.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "NA — no human subjects research.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "NA — no human subjects research.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "NA — no human subjects research.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "NA — no human subjects research.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "NA — no human subjects research.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference latency or cost figures are reported; only training hardware is mentioned.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "GPU types are listed (A100 80GB, V100 32GB) but total training time or GPU-hours are not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "ELPF significantly outperforms all baselines on CMedQA and BioASQ in limited-sample settings",
    375       "evidence": "Table 1 shows ELPF achieves the highest ROUGE-L on CMedQA (15.44 vs. 14.71 for LLM-CP-SFT) but slightly trails GAP on BioASQ ROUGE-L (24.21 vs. 24.37); BLEU improvements are more pronounced.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "K-LoRA pre-learning is the most impactful component, contributing most to performance",
    380       "evidence": "Ablation Table 2 shows removing K-LoRA causes the largest ROUGE/BLEU drop; Figure 3 shows faster convergence and lower initial loss with K-LoRA.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "AKGF reduces hallucinations and improves knowledge diversity even though its effect on ROUGE/BLEU is limited",
    385       "evidence": "Manual evaluation (Figure 2) shows ELPF outperforms w/o AKGF on hallucination and diversity dimensions; ROUGE/BLEU differences in Table 2 are small.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Domain-specific KG can be efficiently constructed with only ~100 annotated examples at >85% precision",
    390       "evidence": "Quality assessment on 200 extracted samples reports precision 0.85 (CMedQA) and 0.89 (BioASQ); only precision is measured, not recall.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "LLM-based KG construction outperforms traditional supervised extraction methods",
    395       "evidence": "Preliminary experiments with BERT-based joint extraction at >2000 samples achieved ~0.80 precision vs. their 0.85; comparison is indirect and marginal.",
    396       "supported": "weak"
    397     }
    398   ],
    399   "methodology_tags": [
    400     "benchmark-eval"
    401   ],
    402   "key_findings": "The ELPF framework combines efficient LLM-based domain KG construction (~100 annotated examples) with a three-stage alignment pipeline (K-LoRA pre-learning, SFT with KG retrieval, AKGF) to improve biomedical QA under limited-data conditions. K-LoRA pre-learning is the dominant contributor, improving both automatic metrics and KG compliance, while AKGF primarily reduces hallucinations and improves knowledge diversity rather than ROUGE/BLEU. Improvements over the best baseline are modest (approximately 1 ROUGE-L point) and no statistical significance tests were applied. The framework is limited to medical domain text generation and the constructed KGs and code are not publicly released.",
    403   "red_flags": [
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "All results in Tables 1–3 are point estimates without p-values, confidence intervals, or variance across runs, making it impossible to assess whether reported improvements are reliable."
    407     },
    408     {
    409       "flag": "Modest gains claimed as 'significant'",
    410       "detail": "Improvements of ~1 ROUGE-L point over baselines are described as 'significant improvements' without statistical grounding; ELPF loses to GAP on BioASQ ROUGE-L."
    411     },
    412     {
    413       "flag": "ChatGPT-3.5 unversioned",
    414       "detail": "ChatGPT-3.5 is used via API with no snapshot date or version pinning, making the comparison unreproducible."
    415     },
    416     {
    417       "flag": "No code or KG artifacts released",
    418       "detail": "The code, constructed domain KGs, extraction models, and fine-tuned adapters are not released, preventing reproduction of the main results."
    419     },
    420     {
    421       "flag": "ROUGE/BLEU as hallucination proxy",
    422       "detail": "The paper claims to reduce hallucinations but primarily measures this via ROUGE/BLEU, which do not reliably capture factual accuracy; the manual evaluation covers only 200 BioASQ samples."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    428       "relevance": "Foundational RAG baseline directly compared against in experiments"
    429     },
    430     {
    431       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    432       "relevance": "Core parameter-efficient fine-tuning method used throughout the ELPF pipeline"
    433     },
    434     {
    435       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    436       "relevance": "Training strategy for the AKGF alignment stage"
    437     },
    438     {
    439       "title": "GAP: A Graph-Aware Language Model Framework for Knowledge Graph-to-Text Generation",
    440       "relevance": "Primary KG-to-text baseline compared in experiments"
    441     },
    442     {
    443       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    444       "relevance": "Base model for BioASQ experiments"
    445     },
    446     {
    447       "title": "Overview of BioASQ 2022: The Tenth BioASQ Challenge",
    448       "relevance": "One of two evaluation benchmarks used"
    449     },
    450     {
    451       "title": "Unifying Large Language Models and Knowledge Graphs: A Roadmap",
    452       "relevance": "Survey of the KG-LLM integration space this work contributes to"
    453     }
    454   ],
    455   "engagement_factors": {
    456     "practical_relevance": {
    457       "score": 2,
    458       "justification": "Domain-specific KG infusion with minimal annotation is directly applicable to enterprise NLP settings where labeled data is scarce."
    459     },
    460     "surprise_contrarian": {
    461       "score": 1,
    462       "justification": "The finding that pre-learning on triples-to-text outweighs RLHF-style feedback is mildly interesting, but the overall KG+LLM direction is well-established."
    463     },
    464     "fear_safety": {
    465       "score": 0,
    466       "justification": "No AI safety or risk concerns raised."
    467     },
    468     "drama_conflict": {
    469       "score": 0,
    470       "justification": "No controversy; incremental improvement paper on a known problem."
    471     },
    472     "demo_ability": {
    473       "score": 1,
    474       "justification": "The system cannot be tried without the unreleased code and KGs; only a conceptual understanding is accessible."
    475     },
    476     "brand_recognition": {
    477       "score": 1,
    478       "justification": "Ant Group (Alibaba affiliate) is a recognizable industry lab in the ML community."
    479     }
    480   },
    481   "hn_data": {
    482     "threads": [
    483       {
    484         "hn_id": "41541053",
    485         "title": "LLMs Will Always Hallucinate, and We Need to Live with This",
    486         "points": 291,
    487         "comments": 261,
    488         "url": "https://news.ycombinator.com/item?id=41541053"
    489       },
    490       {
    491         "hn_id": "41333011",
    492         "title": "An exploration of Bluesky's public opening",
    493         "points": 28,
    494         "comments": 45,
    495         "url": "https://news.ycombinator.com/item?id=41333011"
    496       },
    497       {
    498         "hn_id": "41541888",
    499         "title": "Complexity as Design Material",
    500         "points": 5,
    501         "comments": 0,
    502         "url": "https://news.ycombinator.com/item?id=41541888"
    503       },
    504       {
    505         "hn_id": "41519163",
    506         "title": "LLMs Will Always Hallucinate, and We Need to Live with This",
    507         "points": 4,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=41519163"
    510       },
    511       {
    512         "hn_id": "39190527",
    513         "title": "Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon",
    514         "points": 4,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=39190527"
    517       },
    518       {
    519         "hn_id": "41619018",
    520         "title": "Facial Recognition Technology Detects Entrepreneurs, Outperforming Human Experts",
    521         "points": 3,
    522         "comments": 1,
    523         "url": "https://news.ycombinator.com/item?id=41619018"
    524       },
    525       {
    526         "hn_id": "39403991",
    527         "title": "A Fuzzy Approach to Record Linkages",
    528         "points": 3,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=39403991"
    531       },
    532       {
    533         "hn_id": "31684450",
    534         "title": "A Survey on the Fairness of Recommender Systems",
    535         "points": 3,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=31684450"
    538       },
    539       {
    540         "hn_id": "40066890",
    541         "title": "Warning Affects Human Perception and Engagement Regarding LLM Hallucinations",
    542         "points": 2,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=40066890"
    545       },
    546       {
    547         "hn_id": "39848438",
    548         "title": "Probing for Passwords: Privacy Implications of SSIDs in Probe Requests (2022)",
    549         "points": 2,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=39848438"
    552       }
    553     ],
    554     "top_points": 291,
    555     "total_points": 345,
    556     "total_comments": 307
    557   }
    558 }

Impressum · Datenschutz