scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27887B)
      1 {
      2   "paper": {
      3     "title": "GLM-Dialog: Noise-tolerant Pre-training for Knowledge-grounded Dialogue Generation",
      4     "authors": [
      5       "Jing Zhang",
      6       "Xiaokang Zhang",
      7       "Daniel Zhang-Li",
      8       "Jifan Yu",
      9       "Zijun Yao",
     10       "Zeyao Ma",
     11       "Yiqi Xu",
     12       "Haohua Wang",
     13       "Xiaohan Zhang",
     14       "Nianyi Lin",
     15       "Sunrui Lu",
     16       "Juanzi Li",
     17       "Jie Tang"
     18     ],
     19     "year": 2023,
     20     "venue": "KDD (Knowledge Discovery and Data Mining)",
     21     "arxiv_id": "2302.14401",
     22     "doi": "10.1145/3580305.3599832"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor", "data_leakage"],
     26   "methodology_tags": ["benchmark-eval"],
     27   "key_findings": "GLM-Dialog, a 10B-parameter Chinese knowledge-grounded dialogue model fine-tuned from GLM10B, outperforms CDial-GPT, EVA2.0, PLATO-2, and untuned GLM10B/130B on most automatic metrics (on the DuSincR benchmark) and in human evaluations (explicit and implicit). Ablation studies confirm that each component (continual pre-training, knowledge injection with classification, iterative bootstrap training) contributes meaningfully. A novel implicit human evaluation platform, where users simultaneously converse with multiple anonymous bots, is argued to reduce annotation bias compared to traditional explicit rating schemes, though no empirical comparison between the two evaluation methods is provided.",
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "GitHub repository provided: https://github.com/RUCKBReasoning/GLM-Dialog (footnote 3, abstract). The paper states 'We release both the model checkpoint and source code.'"
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper releases the DuSincR benchmark, model checkpoint, and toolkit. Benchmark datasets used (KDConv, DuConv, NaturalConv, DuSinc, etc.) are publicly available. The evaluation platform is deployed online."
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper mentions '8×80G A100 server' for training but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No step-by-step reproduction instructions are provided in the paper. Code is released but the paper lacks a 'Reproducing Results' section or specific commands to run experiments."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Tables 1-5 report only point estimates with no confidence intervals, error bars, or ± notation for any results."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No statistical significance tests are reported. Claims of superiority (e.g., 'GLM-Dialog outperforms the baselines on most of the automatic metrics') are based solely on comparing raw numbers."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Only raw metric values are reported in tables. No effect sizes (Cohen's d, percentage improvements with baseline context) are computed or discussed."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No justification for sample sizes: 50 chit-chat utterances, 100 knowledge-grounded utterances, 20 annotators for implicit evaluation, 3 annotators for explicit evaluation. No power analysis."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No standard deviations, variance, or spread measures reported across any experimental runs. All results appear to be single-run numbers."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 5.1 compares against CDial-GPT (104M), EVA2.0 (2.8B), PLATO-2 (11B), GLM10B, and GLM130B. Additional comparisons include GLM10B with knowledge prompting and a pre-classifier variant."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Baselines are contemporary for Chinese dialogue models circa 2022-2023: EVA2.0 (2022), GLM (2022), PLATO-2 (2021). The paper notes that updated versions of EVA and PLATO 'do not share their models or source codes' so cannot be compared."
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Sections 5.3-5.4 present extensive ablations: w/o stage-2 training, w/o knowledge injection, w/o knowledge classification, w/o iterative knowledge injection, w/o query generation. Each component's contribution is isolated."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Automatic: BLEU-4, F1, Rouge-L, Rouge-1, Rouge-2, BertScore (Table 1). Human: coherence, informativeness, safety, inspiration, hallucination, engagingness, faithfulness, knowledgeability (Tables 2-4)."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Three types of human evaluation: (1) explicit evaluation on self-chat dialogues (Table 2), (2) explicit evaluation on human-bot dialogues (Table 3), (3) implicit evaluation with 20 annotators and 10,000 selections (Figure 4a)."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Automatic evaluation uses the DuSincR test set (built on DuSinc test set, 2,309 sessions). Query/knowledge similarity is evaluated on 9,353 DuSinc test cases. Training uses separate data."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Results are broken down by: chit-chat vs knowledge-grounded opening utterances (Tables 2-3), per-metric (all tables), utterance-level vs session-level metrics, and ablation component (Table 4)."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Hallucination is identified as a weakness: 'we still need to lessen the model's hallucination.' Tables 41-42 in the appendix show case studies with both noisy and helpful knowledge injection, illustrating where the system struggles."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "GLM130B 'always repeats its own words when speaking to itself' (Section 5.2). Ablation results show clear degradation when components are removed. Hallucination is acknowledged as an unsolved problem."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Abstract claims about comprehensive evaluations demonstrating advantages are supported by Tables 1-4 and Figure 4a. Claims about the novel evaluation method are supported by Section 4.3."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Causal claims are primarily from ablation studies (Table 4): removing knowledge classification, knowledge injection, stage-2 training each degrades performance. These are controlled single-variable manipulations adequate for the causal claims made."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper consistently scopes claims to Chinese dialogue: 'knowledge-grounded conversation in Chinese,' evaluations on Chinese benchmarks. The title and claims do not overreach beyond the tested setting."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "No discussion of alternative explanations for the observed results. The paper does not consider confounds such as differences in training data quantity, model architecture advantages, or whether the improvements are due to the knowledge integration strategy vs. simply more fine-tuning data."
    145       },
    146       "proxy_outcome_distinction": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 4.3 explicitly states 'The automatic evaluation cannot faithfully reflect the quality of the dialogues' and supplements with human evaluation. The paper distinguishes between automatic metrics (proxy) and dialogue quality (outcome)."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Specific model names with sizes: GLM10B (10B, Du et al. 2022), GLM130B (130B, Zeng et al. 2022), CDial-GPT (104M), EVA2.0 (2.8B), PLATO-2 (11B). These are specific open-source checkpoints referenced by their papers."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Actual prompt text provided in Chinese with English translation: P_q for query generation ('对话：...此时应该去检索[sMask]'), P_r for response generation, P_kr for knowledge-infused response generation (Section 3)."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 5.1: learning rate 5×10^-5 with cosine decay, batch size 256, max input length 512. Section 3.2: λ=1 for auxiliary loss weight, m=1 for knowledge pool size at deployment."
    167       },
    168       "scaffolding_described": {
    169         "applies": false,
    170         "answer": false,
    171         "justification": "The system uses a deterministic pipeline (query generation → search engine → response generation) rather than agentic scaffolding with retry logic, feedback mechanisms, or memory management."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 3.1 describes corpus preparation: social media conversations compiled from responses and timing info, benchmark data converted into dialogue form, online service data augmented with Wikipedia knowledge via entity linking (HOSMEL). Table 6 provides data statistics."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No dedicated limitations section. The paper moves directly from experiments (Section 5) to conclusion (Section 6) without discussing limitations."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No threats to validity discussed anywhere in the paper. The hallucination issue is mentioned as an area for improvement but not framed as a threat to the validity of the evaluation."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper does not explicitly state what its results do NOT show. No explicit boundaries on what populations, settings, or claims are excluded from the conclusions."
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "Raw evaluation data (individual annotator ratings, conversation logs from implicit evaluation) are not made available. The social media and online service training data are not released. Only aggregated results are presented."
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 3.1 and Table 6 describe data sources: Weibo (6.8M), Bilibili (10K), Baidu Tieba (300K), Zhihu (4.1M), Douban (280K) for social media; named benchmarks; XDAI online service data from Sept 1 to Dec 15, 2022 (800K dialogues)."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "For human evaluation: 'We hire three annotators' and 'We employ 20 annotators' with no description of recruitment channels, qualifications, compensation, or potential selection bias. WeChat deployment had '100 users' with no recruitment details."
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The two-stage training pipeline is documented: Stage 1 (social media pre-training with anti-forgetting) → Stage 2 (knowledge-infused fine-tuning with classification loss + iterative bootstrap). Data augmentation via entity linking is described step by step."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding or acknowledgments section is present in the paper. No grants or sponsors disclosed."
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Author affiliations listed: Renmin University of China, Tsinghua University, and Zhipu.AI. The GLM backbone is associated with Tsinghua/Zhipu.AI."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No funding is disclosed. One author is from Zhipu.AI, which is commercially associated with the GLM model family being evaluated. The independence of funding cannot be assessed."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No competing interests statement in the paper. Given that Zhipu.AI is a commercial entity associated with GLM (the model being fine-tuned and evaluated), potential financial interests are undisclosed."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The pre-training data cutoff for the GLM10B backbone is not stated. Only the online service data collection period (Sept 1 - Dec 15, 2022) is given."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No discussion of potential overlap between GLM10B's pre-training data and the DuSinc/DuSincR test sets. The paper fine-tunes on DuSinc training data and tests on DuSincR but does not verify pre-training contamination."
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "DuSinc was published in 2022, and GLM10B was also published in 2022. No discussion of whether DuSinc data appeared in GLM's pre-training corpus."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No pre-registration mentioned. The evaluation protocol, including the novel implicit evaluation method, was not registered in advance."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No IRB or ethics board approval mentioned, despite hiring annotators and deploying a public-facing chatbot on WeChat with 100+ users."
    267       },
    268       "demographics_reported": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No demographics reported for the 3 explicit evaluation annotators, 20 implicit evaluation annotators, or 100+ WeChat users."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No criteria described for annotator selection or WeChat user inclusion. 'We hire three annotators' and 'We employ 20 annotators' are stated without any qualification requirements."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "Not an experimental study comparing conditions with human assignment. All annotators evaluate all models (within-subjects design for implicit evaluation)."
    282       },
    283       "blinding_described": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Section 4.3: 'the name of the bot is not disclosed to users and the order of messages will be shuffled before being displayed on the platform in order to prevent potential annotation bias.'"
    287       },
    288       "attrition_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No attrition data reported. The paper states 'Only dialogues lasting more than five turns are regarded as useful information' but does not report how many were excluded or reasons for dropout."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Table 5 reports average online time cost: query generation 1.09s, search 0.92s, response generation 1.64s, overall 4.22s. Compared against GLM10B (2.25s) and pre-classifier (4.17s)."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Hardware mentioned ('8×80G A100 server') but no total GPU hours, training time, or API costs reported."
    304       }
    305     },
    306     "experimental_rigor": {
    307       "seed_sensitivity_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No mention of random seeds or seed sensitivity analysis. All results appear to be from single runs."
    311       },
    312       "number_of_runs_stated": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The number of experimental runs is never stated. No indication of whether results are from single or multiple runs."
    316       },
    317       "hyperparameter_search_budget": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Hyperparameters are set (λ=1 'empirically', lr=5×10^-5) but no search budget, method, or number of configurations tried is reported."
    321       },
    322       "best_config_selection_justified": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Key hyperparameters like λ=1 are set 'empirically' with no justification. No description of how the final configuration was selected or whether validation data was used."
    326       },
    327       "multiple_comparison_correction": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    331       },
    332       "self_comparison_bias_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Authors evaluate their own GLM-Dialog system against baselines without acknowledging author-evaluation bias. Baselines are used as-is (pre-trained checkpoints), but the comparison is still between their optimized system and others' released models."
    336       },
    337       "compute_budget_vs_performance": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No analysis of performance as a function of compute budget. GLM-Dialog requires additional inference for query generation and search (Table 5) but performance is not compared at matched compute."
    341       },
    342       "benchmark_construct_validity": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "DuSincR is introduced and described in detail, but no analysis of whether the benchmark actually measures knowledge-grounded dialogue quality vs. surface-level text overlap. The paper acknowledges automatic metrics don't faithfully reflect quality but does not address construct validity of DuSincR itself."
    346       },
    347       "scaffold_confound_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "The paper explicitly addresses that GLM-Dialog has knowledge retrieval while baselines don't by: (1) testing 'GLM10B with knowledge prompting' using the same knowledge (Table 4), and (2) ablating the query generation and knowledge components to isolate their contributions."
    351       }
    352     },
    353     "data_leakage": {
    354       "temporal_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of temporal leakage. DuSinc benchmark data may have been available before GLM10B's pre-training, but this is not addressed."
    358       },
    359       "feature_leakage_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of feature leakage. The evaluation provides knowledge snippets from search results that may contain answer information, but this is by design rather than a leakage concern."
    363       },
    364       "non_independence_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No verification that training and test data are independent. Social media data, benchmark data, and online service data may share content with DuSincR test cases."
    368       },
    369       "leakage_detection_method": {
    370         "applies": true,
    371         "answer": false,
    372         "justification": "No leakage detection or prevention method applied. No decontamination pipeline, canary strings, or overlap analysis."
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "GLM-Dialog outperforms existing open-source Chinese dialogue models on most automatic metrics",
    379       "evidence": "Table 1: GLM-Dialog achieves highest F1 (22.010), Rouge-L (19.464), and BertScore (0.630) on DuSincR. BLEU-4 (4.190) is comparable to GLM130B (4.177). Section 5.2.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "GLM-Dialog produces the most engaging and informative responses in human evaluation",
    384       "evidence": "Tables 2-3: GLM-Dialog achieves highest informativeness and engagingness scores in both self-chat and human-bot evaluations. Figure 4a: highest selection rate (10,000 total selections) in implicit evaluation with 20 annotators.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Each training component (continual pre-training, knowledge injection, classification, iterative training) contributes to performance",
    389       "evidence": "Table 4: Ablation on 100 DuSincR dialogues shows progressive degradation when components are removed. w/o stage-2: informativeness drops from 1.840 to 1.413; w/o knowledge classification: informativeness drops to 1.633.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The query generation module produces high-quality queries with mean similarity of 0.85 to ground-truth queries",
    394       "evidence": "Section 5.4 and Figure 4b: Cosine similarity between generated and ground-truth queries on 9,353 DuSinc test cases averages 0.85. Retrieved knowledge similarity averages 0.86.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "The implicit evaluation method reduces annotation bias compared to explicit rating",
    399       "evidence": "Section 4.3 argues this conceptually (central conversation, implicit rating) but provides no empirical comparison between implicit and explicit evaluation methods to demonstrate bias reduction.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No error bars or statistical tests",
    406       "detail": "All results across Tables 1-5 are single point estimates with no uncertainty quantification. Claims of superiority are based on raw number comparisons without any significance testing."
    407     },
    408     {
    409       "flag": "No limitations section",
    410       "detail": "The paper has no dedicated limitations or threats-to-validity discussion despite having 13 authors and being a system paper with complex evaluation design."
    411     },
    412     {
    413       "flag": "Undisclosed conflicts of interest",
    414       "detail": "One author is from Zhipu.AI, which is commercially associated with the GLM model family. No funding disclosure and no competing interests statement. This is a company-adjacent evaluation of a company-related product."
    415     },
    416     {
    417       "flag": "Small annotator pool for explicit evaluation",
    418       "detail": "Only 3 annotators for explicit human evaluation (Tables 2-3). Inter-annotator agreement is not reported; only averages of 3 annotators are used as final scores."
    419     },
    420     {
    421       "flag": "Unfair baseline comparison",
    422       "detail": "GLM-Dialog benefits from knowledge retrieval (search engine integration) while baselines do not have this capability. Although addressed partially through ablation, the main results tables compare a knowledge-augmented system against non-knowledge-augmented baselines."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Lamda: Language models for dialog applications",
    428       "authors": ["Romal Thoppilan", "Daniel De Freitas"],
    429       "year": 2022,
    430       "arxiv_id": "2201.08239",
    431       "relevance": "Google's large-scale dialogue LLM, key comparison point for knowledge-grounded dialogue systems."
    432     },
    433     {
    434       "title": "Godel: Large-scale pre-training for goal-directed dialog",
    435       "authors": ["Baolin Peng", "Michel Galley"],
    436       "year": 2022,
    437       "arxiv_id": "2206.11309",
    438       "relevance": "Microsoft's knowledge-grounded dialogue model using pre-training on multiple datasets."
    439     },
    440     {
    441       "title": "Blenderbot 3: a deployed conversational agent that continually learns to responsibly engage",
    442       "authors": ["Kurt Shuster", "Jing Xu"],
    443       "year": 2022,
    444       "arxiv_id": "2208.03188",
    445       "relevance": "Meta's deployed dialogue agent combining LLM with external knowledge; demonstrates knowledge-grounded dialogue at scale."
    446     },
    447     {
    448       "title": "GLM: General Language Model Pretraining with Autoregressive Blank Infilling",
    449       "authors": ["Zhengxiao Du", "Yujie Qian"],
    450       "year": 2022,
    451       "relevance": "The backbone LLM used for GLM-Dialog; demonstrates a hybrid attention mechanism for generation tasks."
    452     },
    453     {
    454       "title": "GLM-130B: An Open Bilingual Pre-trained Model",
    455       "authors": ["Aohan Zeng", "Xiao Liu"],
    456       "year": 2022,
    457       "arxiv_id": "2210.02414",
    458       "relevance": "Larger-scale version of GLM used as a baseline; demonstrates open bilingual pre-training."
    459     },
    460     {
    461       "title": "Plato-xl: Exploring the large-scale pre-training of dialogue generation",
    462       "authors": ["Siqi Bao", "Huang He"],
    463       "year": 2021,
    464       "arxiv_id": "2109.09519",
    465       "relevance": "Large-scale Chinese dialogue model (11B parameters), key baseline for Chinese dialogue research."
    466     },
    467     {
    468       "title": "EVA2.0: Investigating open-domain Chinese dialogue systems with large-scale pre-training",
    469       "authors": ["Yuxian Gu", "Jiaxin Wen"],
    470       "year": 2022,
    471       "arxiv_id": "2203.09313",
    472       "relevance": "2.8B Chinese dialogue model trained on 0.4B dialogue sessions, key baseline in Chinese dialogue."
    473     },
    474     {
    475       "title": "Survey of hallucination in natural language generation",
    476       "authors": ["Ziwei Ji", "Nayeon Lee"],
    477       "year": 2022,
    478       "relevance": "Foundational survey on LLM hallucination, directly relevant to the knowledge grounding problem addressed in GLM-Dialog."
    479     },
    480     {
    481       "title": "XDAI: A Tuning-free Framework for Exploiting Pre-trained Language Models in Knowledge Grounded Dialogue Generation",
    482       "authors": ["Jifan Yu", "Xiaohan Zhang"],
    483       "year": 2022,
    484       "relevance": "Prior system from the same group that provided online service data for GLM-Dialog training."
    485     },
    486     {
    487       "title": "Wizard of wikipedia: Knowledge-powered conversational agents",
    488       "authors": ["Emily Dinan", "Stephen Roller"],
    489       "year": 2018,
    490       "arxiv_id": "1811.01241",
    491       "relevance": "Foundational work on knowledge-grounded dialogue using retrieval, established the paradigm GLM-Dialog follows."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs