scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33134B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Efficient Knowledge Infusion via KG-LLM Alignment",
      6     "authors": [
      7       "Zhouyu Jiang",
      8       "Ling Zhong",
      9       "Mengshu Sun",
     10       "Jun Xu",
     11       "Rui Sun"
     12     ],
     13     "year": 2024,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2406.03746",
     16     "doi": "10.48550/arXiv.2406.03746"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 'our approach outperforms existing baselines' on 'two biomedical question-answering datasets' with 'a limited-sample setting.' Table 1 confirms ELPF achieves highest scores across most metrics on both datasets.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims like 'Removing K-LoRA leads to the most significant performance drop' (Section 5.1) are supported by controlled ablation experiments in Table 2, where individual components are removed while holding others constant.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper frames ELPF as a general 'modular knowledge infusion framework' and the title 'Efficient Knowledge Infusion via KG-LLM Alignment' implies domain-general applicability, but experiments are limited to two biomedical QA datasets with two specific LLMs. While the Limitations section acknowledges 'we only conducted experiments on medical domain texts,' the framing throughout exceeds the tested scope.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not substantively consider alternative explanations for ELPF's improvements. For example, it does not discuss whether the gains come simply from having more training stages (three-stage pipeline vs. single-stage baselines), or from additional data exposure during pre-learning, rather than the specific KG alignment mechanism.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims to produce 'comprehensive, logical, and low-hallucination responses' (Section 3) and improve 'knowledge correctness,' but the primary metrics are ROUGE and BLEU which measure surface text overlap, not knowledge correctness or logical coherence. The human evaluation partially addresses this gap but the disconnect between automated metrics and claimed outcomes is not discussed.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section appears after the Conclusions, discussing KG quality dependency, incomplete KG detection issues, conservative AKGF strategy, and single-domain evaluation.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The Limitations section discusses threats specific to this study: 'the ELPF method is highly dependent on the quality of the graph construction' with 'inevitably noises,' 'it is challenging to detect knowledge errors unless they conflict with known knowledge,' and the conservative AKGF strategy 'somewhat limits the optimization space.'",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The Limitations section explicitly states: 'we only conducted experiments on medical domain texts. This limitation may pose a risk to the generalized ability of our findings in other scenarios.' This bounds the scope to the medical domain.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source, grant numbers, or acknowledgments section is present in the paper. All authors are affiliated with Ant Group, a major fintech company.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All eight authors are listed with Ant Group affiliation and institutional email addresses on the first page.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All authors are employees of Ant Group. As a technology company that could deploy such methods in production, Ant Group has a commercial interest in demonstrating the effectiveness of their knowledge infusion approach. No funding independence statement is provided.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms 'knowledge mismatch' and 'poor information compliance' are explicitly defined in the introduction; 'KG-LLM alignment' is explained through the three-stage framework description.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists two main contributions: (1) a modular knowledge infusion framework addressing knowledge mismatch and poor compliance, and (2) two innovative strategies ('pre-learning' and 'AKGF').",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The related works section explicitly situates the work relative to retrieval-augmented LLMs (RAG, ChainRAG, WebBrain) and LLM-augmented KG construction (AutoKG, PiVe), explaining how ELPF differs from and builds on each approach.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The experiments use two publicly available datasets: CMedQA (Cui and Han, 2020) and BioASQ (Nentidis et al., 2022). The constructed domain KGs and specific train/test splits are not released, but the base datasets are public.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Appendix D mentions 'four A100 80GB GPUs and two V100 32GB GPUs' and HuggingFace model URLs, but no requirements.txt, Dockerfile, or detailed dependency/library version specifications are provided.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described algorithmically but there are no runnable instructions.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '15.44 ROUGE-L') with no confidence intervals, error bars, or ± notation.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims ELPF 'outperforms existing baselines' and shows 'significant performance improvement' based solely on comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests).",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper reports absolute improvements with baseline context: '1.03 ROUGE-L improvement and a 1.03 BLEU improvement compared to the vanilla LoRA-based SFT method' (Section 4.5). Table 1 provides all baseline values for comparison.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses 500 training and 1000 test instances per dataset, described as simulating 'a scenario with limited samples,' but provides no justification for why 500 specifically was chosen and no power analysis.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run numbers.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 1 compares against multiple baselines: ChatGPT-3.5 (0-shot and 2-shot), LLM-base, LLM-base-SFT, LLM-CP-SFT (continual pre-trained), LLM-base-SFT(RAG), and GAP.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include ChatGPT-3.5 (2022), GAP (Colas 2022), RAG (Lewis 2020 but still standard), and contemporaneous base models (ChatGLM2-6B, Llama-2-chat-7B). For a 2024 paper, these are reasonably current.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 2 presents ablation experiments removing K-LoRA, AKGF, KG retrieval, and both K-LoRA & AKGF simultaneously. Figure 2 shows corresponding human evaluation ablations.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports ROUGE-1, ROUGE-2, ROUGE-L, and BLEU (n=4) for automated evaluation, plus five-dimensional human evaluation (fluency, relevance, viewpoint, diversity, hallucination).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Section 4.2 describes manual evaluation on 200 sampled entries, ranked across five dimensions: fluency, relevance to question, correctness of core viewpoint, diversity & completeness, and knowledge hallucination. Results shown in Figure 2.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "The paper creates a 500 train / 1000 test split but mentions no separate validation set. Table 6 shows β parameter comparison on BioASQ which may have been done on the test set. It is unclear whether test data was used for any hyperparameter selection decisions.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down across two datasets (CMedQA and BioASQ), multiple metrics, and five human evaluation dimensions. Table 3 shows performance across different KG completeness levels.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 4.5 discusses where ELPF underperforms RAG on BLEU for BioASQ due to KG information loss. Section 5.3 shows sparse KGs (20%) lead to worse performance than no KG. The case study in Figure 6 compares successful and less successful outputs.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 3 shows that reducing KG to 20% yields worse performance than 0% KG, demonstrating that noise in sparse KGs hurts performance. Section 4.5 notes ELPF achieves lower BLEU than RAG on BioASQ.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "ChatGLM2-6B and Llama-2-7b-chat-hf are specified with exact HuggingFace URLs (Section 4.3), but ChatGPT-3.5 is referenced without a specific API version or snapshot date (e.g., gpt-3.5-turbo-0613). Model behavior varies across API versions.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Section 3.3 provides the KG-augmented input template format ('[KG]: {gq} [Instruction]: Refer to the KG and answer the following question: {q}'), but the prompts used for KG extraction (Section 3.1), the triples-to-text pre-learning format, and the AKGF generation prompts are not fully provided.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 5 (Appendix D) provides detailed hyperparameters for all three stages across both datasets, including batch size, epochs, LoRA rank, LoRA target, learning rate, max input/output length, KL-div β, top-p, and temperature.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The system is a multi-stage fine-tuning pipeline with KG retrieval, not an agentic system.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.1 documents the KG construction pipeline in detail: extraction, four post-processing error removal steps (format errors, hallucinated entities, invalid relations, self-loops), and entity resolution via embedding similarity. Section 4.1 describes dataset preparation (500 train, 1000 test, corpus selection).",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The constructed domain KGs, specific train/test splits, extraction training data, and DPO preference pairs are not released. Only the base public datasets (CMedQA, BioASQ) are available.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.1 describes dataset sourcing. Appendix A details the annotation process: reference schemas (CMeIE v2, BioRED), manual annotation of 100 samples from corpora, two annotators for blind labeling plus one QC inspector, inter-annotator agreement of 0.9, acceptance accuracy of 0.97.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants in the study. Data sources are standard public benchmarks (CMedQA, BioASQ). Annotators for KG construction are employed staff, not study participants.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is documented: corpus selection → extraction by fine-tuned LLM → four post-processing steps → entity resolution → KG construction (Section 3.1). Table 4 provides statistics (subjects, triples, precision). Dataset splitting is described in Section 4.1.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for ChatGLM2-6B, Llama-2-chat-7B, or ChatGPT-3.5. CMedQA and BioASQ are public datasets that may have been in the pre-training corpora.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether CMedQA or BioASQ data appeared in the pre-training data of ChatGLM2-6B, Llama-2, or GPT-3.5. The zero-shot baselines are particularly vulnerable to this confound.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "CMedQA and BioASQ were published before the training cutoffs of all models used. No contamination analysis or decontamination steps are discussed.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in the study. The human evaluators rank model outputs but are not study subjects.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants. The study evaluates LLM fine-tuning methods on public datasets.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in the study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in the study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in the study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in the study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in the study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or per-example cost is reported despite the method involving multi-stage processing (KG retrieval + LLM generation).",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Appendix D mentions 'four A100 80GB GPUs and two V100 32GB GPUs' but does not report total GPU hours, training time, or computational cost for any stage.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No results across multiple random seeds are reported. All results appear to be from single runs despite random train/test splitting.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Table 6 compares three β values for DPO, but no overall hyperparameter search budget is reported. The selection process for other parameters (learning rate, LoRA rank, epochs) is not described.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "Table 6 shows β parameter comparison for BioASQ only, but it is unclear whether this selection was done on test or validation data (no validation set is mentioned). Other hyperparameter selections are not justified.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": false,
    397           "answer": false,
    398           "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors compare their ELPF system against baselines without acknowledging the systematic bias of evaluating their own system. No independent evaluation is conducted.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "ELPF requires three stages of training (K-LoRA, SFT, DPO) plus KG construction, substantially more compute than single-stage baselines. This compute disparity is never discussed or controlled for.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "ROUGE and BLEU are used to evaluate biomedical QA quality without discussing whether these surface overlap metrics validly measure knowledge correctness or response quality in the medical domain.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No agentic scaffolding is used. The system is a fine-tuning pipeline, not an agentic architecture.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of whether the pre-trained models' training data temporally overlaps with CMedQA or BioASQ benchmark data.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information. For CMedQA, the non-selected QA pairs are used as KG construction corpus, meaning the KG may encode answer patterns from the same distribution.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "For CMedQA, the corpus for KG construction comes from the same QA dataset (non-selected pairs). The structural similarity between training corpus and test data is not addressed.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection or prevention method (canary strings, membership inference, decontamination) is applied.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "ELPF outperforms existing baselines on biomedical QA, achieving 1.03–1.12 ROUGE-L improvement over vanilla LoRA SFT",
    457       "evidence": "Table 1: ELPF achieves ROUGE-L 15.44 vs 14.41 (CMedQA) and 24.21 vs 23.09 (BioASQ) for no-retrieval SFT baseline",
    458       "supported": "moderate"
    459     },
    460     {
    461       "claim": "K-LoRA pre-learning is the most critical component, causing the largest performance drop when removed",
    462       "evidence": "Table 2 ablation: removing K-LoRA drops CMedQA ROUGE-L by 0.39 points vs 0.13 for AKGF removal; human evaluation in Figure 2 shows larger gap on diversity and viewpoint dimensions",
    463       "supported": "moderate"
    464     },
    465     {
    466       "claim": "Domain-specific KGs can be efficiently constructed with ~100 labeled samples at >85% precision",
    467       "evidence": "Quality assessment on 200 extracted samples shows precision 0.85 (CMedQA) and 0.89 (BioASQ) with ~100 training annotations, vs ~2000 samples needed for conventional BERT-based methods achieving only 0.80 precision",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "AKGF improves knowledge correctness and reduces hallucinations without requiring manual scoring",
    472       "evidence": "Figure 2 human evaluation shows ELPF outranks ablated models on hallucination and viewpoint correctness; ROUGE/BLEU impact is small because AKGF optimizes for correctness, not reference matching",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Sparse KGs (20% of nodes) perform worse than no KG retrieval at all",
    477       "evidence": "Table 3: 20% KG achieves 14.98 ROUGE-L on CMedQA vs 15.04 for 0% KG; attributed to noise overwhelming signal in sparse graphs",
    478       "supported": "strong"
    479     }
    480   ],
    481   "methodology_tags": [
    482     "benchmark-eval"
    483   ],
    484   "key_findings": "ELPF, a three-stage KG-LLM alignment framework (K-LoRA pre-learning, SFT with KG retrieval, AKGF DPO feedback), improves biomedical QA by roughly 1.0–1.1 ROUGE-L points (1.03 on CMedQA, 1.12 on BioASQ) over the vanilla LoRA SFT baseline, using only ~100 labeled samples to construct domain KGs at 0.85–0.89 precision. The K-LoRA pre-learning stage is the most critical component, enabling faster domain adaptation and better KG information compliance as evidenced by loss convergence curves and ablation. AKGF's primary benefit appears in human evaluation dimensions (hallucination reduction, knowledge diversity) rather than ROUGE/BLEU, which the authors correctly interpret as alignment objectives diverging from reference matching. A notable negative result: very sparse KGs (20% nodes) perform worse than no KG retrieval, highlighting that noise management is essential for KG-augmented approaches.",
    485   "red_flags": [
    486     {
    487       "flag": "No statistical tests",
    488       "detail": "Performance improvements of 1–2 ROUGE points are described as 'significant' without any confidence intervals, significance tests, or variance across runs. With no error bars, it is impossible to assess whether differences are meaningful."
    489     },
    490     {
    491       "flag": "No code or KG release",
    492       "detail": "Neither implementation code nor the constructed domain KGs (the primary novel artifacts) are released, making independent reproduction impossible from the paper alone."
    493     },
    494     {
    495       "flag": "Single-domain evaluation only",
    496       "detail": "All experiments are on biomedical QA (CMedQA, BioASQ); title and conclusions frame contributions generally as 'domain-specific knowledge infusion' but evidence is restricted to one domain."
    497     },
    498     {
    499       "flag": "Benchmark contamination ignored",
    500       "detail": "No discussion of whether BioASQ and CMedQA test examples appeared in LLM pre-training data; both datasets predate the training cutoffs of the models evaluated."
    501     },
    502     {
    503       "flag": "ChatGPT-3.5 snapshot undated",
    504       "detail": "ChatGPT-3.5 API is used as a baseline without a snapshot date, making this comparison potentially inconsistent across time and irreproducible."
    505     },
    506     {
    507       "flag": "No funding disclosure",
    508       "detail": "All authors are Ant Group employees but no funding source or competing interests statement is included in the paper."
    509     }
    510   ],
    511   "cited_papers": [
    512     {
    513       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    514       "relevance": "Foundational RAG method used as a primary baseline and conceptual basis for the retrieval-augmented approach"
    515     },
    516     {
    517       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    518       "relevance": "DPO training strategy adopted for the AKGF alignment stage to avoid sensitivity to reward values"
    519     },
    520     {
    521       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    522       "relevance": "Parameter-efficient fine-tuning method used throughout all training stages (K-LoRA, SFT, AKGF)"
    523     },
    524     {
    525       "title": "GAP: A Graph-Aware Language Model Framework for Knowledge Graph-to-Text Generation",
    526       "relevance": "State-of-the-art KG-to-text baseline compared against in main experiments"
    527     },
    528     {
    529       "title": "Unifying Large Language Models and Knowledge Graphs: A Roadmap",
    530       "relevance": "Survey contextualizing the KG-LLM integration research area and identifying open challenges"
    531     },
    532     {
    533       "title": "Overview of BioASQ 2022: The Tenth BioASQ Challenge",
    534       "relevance": "One of the two primary evaluation datasets used in experiments"
    535     },
    536     {
    537       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    538       "relevance": "Base model (Llama2-chat-7B) used for all BioASQ experiments"
    539     },
    540     {
    541       "title": "SKILL: Structured Knowledge Infusion for Large Language Models",
    542       "relevance": "Prior work on structured knowledge infusion that motivates the problem setting"
    543     }
    544   ],
    545   "engagement_factors": {
    546     "practical_relevance": {
    547       "score": 2,
    548       "justification": "The KG-LLM alignment framework is applicable to domain-specific LLM deployment, but requires substantial setup (KG construction, multi-stage training) with no released code."
    549     },
    550     "surprise_contrarian": {
    551       "score": 0,
    552       "justification": "Confirms expected intuitions that structured knowledge helps LLMs and that alignment improves knowledge correctness."
    553     },
    554     "fear_safety": {
    555       "score": 0,
    556       "justification": "No safety or security concerns raised; focuses on improving domain QA quality."
    557     },
    558     "drama_conflict": {
    559       "score": 0,
    560       "justification": "No controversy or provocative claims; standard incremental improvement paper."
    561     },
    562     "demo_ability": {
    563       "score": 0,
    564       "justification": "No code, demo, or models released."
    565     },
    566     "brand_recognition": {
    567       "score": 1,
    568       "justification": "Ant Group is a well-known fintech company but not a prominent AI research lab."
    569     }
    570   },
    571   "hn_data": {
    572     "threads": [
    573       {
    574         "hn_id": "41541053",
    575         "title": "LLMs Will Always Hallucinate, and We Need to Live with This",
    576         "points": 291,
    577         "comments": 261,
    578         "url": "https://news.ycombinator.com/item?id=41541053"
    579       },
    580       {
    581         "hn_id": "41333011",
    582         "title": "An exploration of Bluesky's public opening",
    583         "points": 28,
    584         "comments": 45,
    585         "url": "https://news.ycombinator.com/item?id=41333011"
    586       },
    587       {
    588         "hn_id": "41541888",
    589         "title": "Complexity as Design Material",
    590         "points": 5,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=41541888"
    593       },
    594       {
    595         "hn_id": "41519163",
    596         "title": "LLMs Will Always Hallucinate, and We Need to Live with This",
    597         "points": 4,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=41519163"
    600       },
    601       {
    602         "hn_id": "39190527",
    603         "title": "Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon",
    604         "points": 4,
    605         "comments": 0,
    606         "url": "https://news.ycombinator.com/item?id=39190527"
    607       },
    608       {
    609         "hn_id": "41619018",
    610         "title": "Facial Recognition Technology Detects Entrepreneurs, Outperforming Human Experts",
    611         "points": 3,
    612         "comments": 1,
    613         "url": "https://news.ycombinator.com/item?id=41619018"
    614       },
    615       {
    616         "hn_id": "39403991",
    617         "title": "A Fuzzy Approach to Record Linkages",
    618         "points": 3,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=39403991"
    621       },
    622       {
    623         "hn_id": "31684450",
    624         "title": "A Survey on the Fairness of Recommender Systems",
    625         "points": 3,
    626         "comments": 0,
    627         "url": "https://news.ycombinator.com/item?id=31684450"
    628       },
    629       {
    630         "hn_id": "40066890",
    631         "title": "Warning Affects Human Perception and Engagement Regarding LLM Hallucinations",
    632         "points": 2,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=40066890"
    635       },
    636       {
    637         "hn_id": "39848438",
    638         "title": "Probing for Passwords: Privacy Implications of SSIDs in Probe Requests (2022)",
    639         "points": 2,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=39848438"
    642       }
    643     ],
    644     "top_points": 291,
    645     "total_points": 345,
    646     "total_comments": 307
    647   }
    648 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs