scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27156B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GLM-Dialog: Noise-tolerant Pre-training for Knowledge-grounded Dialogue Generation",
      6     "authors": [
      7       "Jing Zhang",
      8       "Xiaokang Zhang",
      9       "Daniel Zhang-Li",
     10       "Jifan Yu",
     11       "Zijun Yao",
     12       "Zeyao Ma",
     13       "Yiqi Xu",
     14       "Haohua Wang",
     15       "Xiaohan Zhang",
     16       "Nianyi Lin",
     17       "Sunrui Lu",
     18       "Juanzi Li",
     19       "Jie Tang"
     20     ],
     21     "year": 2023,
     22     "venue": "Knowledge Discovery and Data Mining",
     23     "arxiv_id": "2302.14401",
     24     "doi": "10.1145/3580305.3599832"
     25   },
     26   "checklist": {
     27     "claims_and_evidence": {
     28       "abstract_claims_supported": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The abstract claims GLM-Dialog outperforms open-source Chinese dialogue models, supported by Tables 1-3 showing consistent improvements on automatic metrics and human evaluation. The evaluation platform claim is supported by the online deployment described in Section 4.3.",
     32         "source": "haiku"
     33       },
     34       "causal_claims_justified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Ablation studies in Section 5.3 systematically remove components (stage-2 training, knowledge injection, knowledge classification, iterative injection) and measure resulting performance drops, providing adequate support for causal claims about each component's contribution.",
     38         "source": "haiku"
     39       },
     40       "generalization_bounded": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Claims are appropriately bounded to Chinese-language open-domain dialogue; the paper explicitly positions contributions relative to Chinese language community challenges and only evaluates on Chinese benchmarks.",
     44         "source": "haiku"
     45       },
     46       "alternative_explanations_discussed": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper does not discuss whether performance gains could be attributed to the GLM10B backbone quality, the larger total training data volume, or the search engine itself independent of the noise-tolerant training strategy.",
     50         "source": "haiku"
     51       },
     52       "proxy_outcome_distinction": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper explicitly acknowledges automatic metrics cannot faithfully reflect dialogue quality (Section 4.3) and supplements with human evaluation across multiple dimensions including coherence, informativeness, hallucination, and engagingness.",
     56         "source": "haiku"
     57       }
     58     },
     59     "limitations_and_scope": {
     60       "limitations_section_present": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "There is no dedicated limitations or threats-to-validity section; the only limitation acknowledged is hallucination mentioned briefly in Section 5.2 as speculation, and the conclusion lacks any formal limitations discussion.",
     64         "source": "haiku"
     65       },
     66       "threats_to_validity_specific": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No specific threats to validity are discussed; the potential confound of DuSinc data appearing in both training and the DuSincR test set is never addressed, nor are annotator qualification or preference biases beyond the implicit evaluation design.",
     70         "source": "haiku"
     71       },
     72       "scope_boundaries_stated": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No explicit scope boundaries are stated regarding what the results do not show; the paper does not clarify limitations to Chinese, to specific dialogue types, or to the tested model scale.",
     76         "source": "haiku"
     77       }
     78     },
     79     "conflicts_of_interest": {
     80       "funding_disclosed": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No funding source is disclosed anywhere in the paper; there is no acknowledgments section mentioning grants or institutional support.",
     84         "source": "haiku"
     85       },
     86       "affiliations_disclosed": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Author affiliations are clearly disclosed: Renmin University of China, Tsinghua University, and Zhipu.AI, the company that commercializes GLM.",
     90         "source": "haiku"
     91       },
     92       "funder_independent_of_outcome": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "One author (Xiaohan Zhang) is affiliated with Zhipu.AI, which commercializes GLM—the backbone evaluated favorably in this paper—creating a non-independent relationship between institutional affiliation and outcome.",
     96         "source": "haiku"
     97       },
     98       "financial_interests_declared": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No competing interests statement or financial interests declaration is present despite an author's affiliation with Zhipu.AI, a commercial entity with direct interest in GLM's perceived quality.",
    102         "source": "haiku"
    103       }
    104     },
    105     "scope_and_framing": {
    106       "key_terms_defined": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper formally defines Dialogue History, External Knowledge Pool, and the knowledge-grounded dialogue generation task with mathematical notation in Section 2.2, and distinguishes helpful vs. noisy knowledge with concrete examples in Figure 1.",
    110         "source": "haiku"
    111       },
    112       "intended_contribution_clear": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 1 clearly enumerates four contributions: noise-tolerant training techniques, an evaluation platform, an open-source dialogue model, and an easy-to-use toolkit for Chinese knowledge-grounded dialogue.",
    116         "source": "haiku"
    117       },
    118       "engagement_with_prior_work": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 2.1 contextualizes GLM-Dialog relative to LaMDA, GODEL, BlenderBot3, CDial-GPT, EVA2.0, and PLATO-XL, explicitly identifying the gap (limited high-quality non-English datasets) that this work addresses.",
    122         "source": "haiku"
    123       }
    124     }
    125   },
    126   "type_checklist": {
    127     "empirical": {
    128       "artifacts": {
    129         "code_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Source code is released on GitHub (https://github.com/RUCKBReasoning/GLM-Dialog) and model checkpoints are explicitly stated as released.",
    133           "source": "haiku"
    134         },
    135         "data_released": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Key training data (800K online service dialogues from XDAI) and the DuSincR evaluation benchmark are not clearly released for download; while public benchmarks are used for training, the paper's novel data contributions are not confirmed publicly available.",
    139           "source": "haiku"
    140         },
    141         "environment_specified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Only hardware (8×80G A100) and basic training parameters are specified; no requirements.txt, Dockerfile, or software environment specification is provided.",
    145           "source": "haiku"
    146         },
    147         "reproduction_instructions": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No step-by-step reproduction instructions are provided; the paper describes the methodology but a reader would need to infer the full pipeline from scattered descriptions across Sections 3.1-3.3.",
    151           "source": "haiku"
    152         }
    153       },
    154       "statistical_methodology": {
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No confidence intervals or error bars are reported in any of the tables (Tables 1-5); only point estimates are given for all metrics.",
    159           "source": "haiku"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No statistical significance tests are performed for any comparative claims despite multiple pairwise model comparisons across many metrics.",
    165           "source": "haiku"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Absolute metric values with baselines are reported in all tables, allowing readers to directly assess the magnitude of improvements (e.g., F1 22.010 vs. 13.548 for CDial-GPT).",
    171           "source": "haiku"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The use of 100 conversations for ablation studies, 50+100 opening utterances for human evaluation, and 20 annotators for implicit evaluation are not justified with power analysis or sample size rationale.",
    177           "source": "haiku"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No variance, standard deviation, or inter-annotator agreement statistics are reported for any human or automatic evaluation results.",
    183           "source": "haiku"
    184         }
    185       },
    186       "evaluation_design": {
    187         "baselines_included": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Five baselines are included: CDial-GPT, EVA2.0, PLATO-2, GLM10B, and GLM130B, covering small, medium, and large models.",
    191           "source": "haiku"
    192         },
    193         "baselines_contemporary": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "The paper notes EVA and PLATO have newer versions but those lack released code; the selected baselines are the best available with accessible model weights as of early 2023.",
    197           "source": "haiku"
    198         },
    199         "ablation_study": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Sections 5.3-5.4 present systematic ablation removing stage-2 training, knowledge injection, knowledge classification, iterative injection, and query generation with human evaluation results.",
    203           "source": "haiku"
    204         },
    205         "multiple_metrics": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Evaluation uses six automatic metrics (BLEU-4, F1, Rouge-L, Rouge-1, Rouge-2, BertScore) and seven human metrics (coherence, informativeness, safety, inspiration, hallucination, engagingness, faithfulness).",
    209           "source": "haiku"
    210         },
    211         "human_evaluation": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Both explicit human evaluation (3 annotators rating self-chat and human-bot dialogues) and a novel implicit human evaluation (20 annotators selecting responses from 6 deployed bots) are conducted.",
    215           "source": "haiku"
    216         },
    217         "held_out_test_set": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "DuSincR is used as the held-out test benchmark, built by modifying the DuSinc test set, and results are reported on this set rather than training data.",
    221           "source": "haiku"
    222         },
    223         "per_category_breakdown": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Human evaluation results are broken down by chit-chat (50 opening utterances) vs. knowledge-grounded (100 opening utterances) dialogue types in Tables 2 and 3.",
    227           "source": "haiku"
    228         },
    229         "failure_cases_discussed": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "The paper briefly notes hallucination as an ongoing concern (Section 5.2) but provides no systematic discussion or examples of failure cases where the model produces incorrect or harmful outputs.",
    233           "source": "haiku"
    234         },
    235         "negative_results_reported": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "The ablation study effectively reports negative results by showing performance drops when each component is removed, confirming each component's necessity.",
    239           "source": "haiku"
    240         }
    241       },
    242       "setup_transparency": {
    243         "model_versions_specified": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "GLM10B is specified with the exact parameter count, backbone paper citation (Du et al. 2022), and reference to the GitHub release; all comparison models cite specific papers.",
    247           "source": "haiku"
    248         },
    249         "prompts_provided": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "The actual Chinese-language prompts 𝑃𝑞, 𝑃𝑟, and 𝑃𝑘𝑟 are provided verbatim in Sections 3.1-3.2, in both the original Chinese and English translation.",
    253           "source": "haiku"
    254         },
    255         "hyperparameters_reported": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Learning rate (5×10⁻⁵ with cosine decay), batch size (256), max input length (512), auxiliary loss weight (λ=1), and knowledge pool size (m=1 at inference) are all reported.",
    259           "source": "haiku"
    260         },
    261         "scaffolding_described": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 3.3 details the three-component inference pipeline: query generation module → external search engine → response generation, including time costs in Table 5.",
    265           "source": "haiku"
    266         },
    267         "data_preprocessing_documented": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Data preprocessing steps are documented including entity linking via HOSMEL, negative knowledge sampling using low-confidence entities, knowledge labeling strategy, and dialogue compilation from social media timing information.",
    271           "source": "haiku"
    272         }
    273       },
    274       "data_integrity": {
    275         "raw_data_available": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "The 800K online service dialogues used for training and the DuSincR test set annotations are not released as raw data files; only the code and model checkpoint are confirmed released.",
    279           "source": "haiku"
    280         },
    281         "data_collection_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Data collection is described: social media from Weibo/Bilibili/Zhihu/Douban, public benchmarks listed in Table 6, and online service data from XDAI deployment Sept 1–Dec 15, 2022.",
    285           "source": "haiku"
    286         },
    287         "recruitment_methods_described": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "The paper states '3 annotators' and '20 annotators' were hired but provides no description of how they were recruited, their qualifications, or compensation.",
    291           "source": "haiku"
    292         },
    293         "data_pipeline_documented": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The full pipeline from data collection through entity linking, knowledge labeling, knowledge pool construction, and iterative bootstrap training is documented in Sections 3.1-3.2.",
    297           "source": "haiku"
    298         }
    299       },
    300       "contamination": {
    301         "training_cutoff_stated": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "The GLM10B backbone training data cutoff is not stated; the online service data cutoff (Dec 15, 2022) is mentioned but this does not address the pre-training corpus cutoff relevant to benchmark contamination.",
    305           "source": "haiku"
    306         },
    307         "train_test_overlap_discussed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "DuSinc data appears in both training (as benchmark data) and evaluation (DuSincR is modified DuSinc test set); this overlap is never acknowledged or addressed.",
    311           "source": "haiku"
    312         },
    313         "benchmark_contamination_addressed": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "The paper does not address whether DuSinc test cases—the basis for DuSincR evaluation—may have been seen during training given DuSinc's inclusion in the training benchmark data.",
    317           "source": "haiku"
    318         }
    319       },
    320       "human_studies": {
    321         "pre_registered": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "No pre-registration is mentioned for the human evaluation studies.",
    325           "source": "haiku"
    326         },
    327         "irb_or_ethics_approval": {
    328           "applies": true,
    329           "answer": false,
    330           "justification": "No IRB or ethics approval is mentioned despite hiring annotators and deploying a live system with real user interactions.",
    331           "source": "haiku"
    332         },
    333         "demographics_reported": {
    334           "applies": true,
    335           "answer": false,
    336           "justification": "No demographics (age, gender, language background, expertise) are reported for the annotators used in explicit or implicit evaluation.",
    337           "source": "haiku"
    338         },
    339         "inclusion_exclusion_criteria": {
    340           "applies": true,
    341           "answer": false,
    342           "justification": "No inclusion/exclusion criteria for annotators are stated beyond the implicit requirement that they participate in Chinese-language dialogue.",
    343           "source": "haiku"
    344         },
    345         "randomization_described": {
    346           "applies": true,
    347           "answer": true,
    348           "justification": "The implicit evaluation explicitly states that bot response ordering is shuffled before display to prevent position bias, as described in Section 4.3.",
    349           "source": "haiku"
    350         },
    351         "blinding_described": {
    352           "applies": true,
    353           "answer": true,
    354           "justification": "The implicit evaluation uses anonymous bots with identities not disclosed to users, preventing brand preference bias as explicitly stated in Section 4.3.",
    355           "source": "haiku"
    356         },
    357         "attrition_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "This is a rating/selection task without longitudinal follow-up; attrition is not applicable, though the 5-turn minimum filter is mentioned.",
    361           "source": "haiku"
    362         }
    363       },
    364       "cost_and_practicality": {
    365         "inference_cost_reported": {
    366           "applies": true,
    367           "answer": true,
    368           "justification": "Table 5 reports online time costs for each pipeline stage: query generation (1.09s), search (0.92s), response generation (1.64s), and overall latency including network (4.22s).",
    369           "source": "haiku"
    370         },
    371         "compute_budget_stated": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Only hardware type (8×80G A100) is mentioned without total compute budget, training duration, or GPU-hours, making it impossible to assess total training cost.",
    375           "source": "haiku"
    376         }
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "GLM-Dialog outperforms all open-source Chinese dialogue baselines on most automatic evaluation metrics on DuSincR",
    383       "evidence": "Table 1 shows GLM-Dialog achieves highest BLEU-4 (4.190), F1 (22.010), Rouge-L (19.464), and BertScore (0.630) among all models, though not highest on Rouge-1 and Rouge-2",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Knowledge classification and iterative injection each contribute independently to dialogue quality",
    388       "evidence": "Table 4 ablation shows removing knowledge classification drops coherence from 1.820 to 1.730 and informativeness from 1.840 to 1.633; removing iterative injection drops coherence to 1.757",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "The generated queries achieve high quality with mean cosine similarity 0.85 to ground-truth queries",
    393       "evidence": "Figure 4(b) shows frequency histogram of query similarity scores on 9,353 DuSinc test cases with mean 0.85; retrieved knowledge similarly scores mean 0.86",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Implicit human evaluation reduces annotation bias compared to explicit rating",
    398       "evidence": "The paper argues that selection-based implicit rating removes explicit scoring bias but provides no empirical comparison of bias levels between methods—this is a design argument, not a measured claim",
    399       "supported": "weak"
    400     },
    401     {
    402       "claim": "GLM-Dialog achieves highest informativeness among comparison models due to external knowledge injection",
    403       "evidence": "Tables 2 and 3 show GLM-Dialog consistently achieves highest informativeness scores across both chit-chat and knowledge-grounded categories in self-chat and human-bot evaluation",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "GLM-Dialog receives the most user selections in implicit evaluation among six deployed models",
    408       "evidence": "Figure 4(a) shows GLM-Dialog achieves highest score among models in the 10,000 total selections by 20 annotators, though exact numbers are not shown clearly",
    409       "supported": "moderate"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval",
    414     "case-study"
    415   ],
    416   "key_findings": "GLM-Dialog, a 10B-parameter Chinese knowledge-grounded dialogue model, outperforms open-source baselines (CDial-GPT, EVA2.0, PLATO-2) on most automatic and human evaluation metrics through a two-stage training strategy combining continual dialogue pre-training and noise-tolerant knowledge-infused fine-tuning. The auxiliary knowledge classification loss and iterative bootstrap injection each contribute incrementally to performance, with knowledge classification being the most critical component. An implicit human evaluation platform where annotators select from anonymous bot responses is introduced as a bias-reducing alternative to explicit multi-dimensional rating. Hallucination remains a persistent problem despite noise-tolerant training, and the model takes approximately 4 seconds total latency for online inference.",
    417   "red_flags": [
    418     {
    419       "flag": "Train-test overlap",
    420       "detail": "DuSinc benchmark data is included in training (Table 6) while DuSincR—the primary evaluation benchmark—is a modified version of the DuSinc test set. This potential contamination is never acknowledged."
    421     },
    422     {
    423       "flag": "No statistical tests",
    424       "detail": "No confidence intervals, error bars, or significance tests are reported for any of the comparative claims despite multiple model comparisons across numerous metrics."
    425     },
    426     {
    427       "flag": "Undisclosed conflict of interest",
    428       "detail": "Author Xiaohan Zhang is affiliated with Zhipu.AI, which commercializes GLM—the backbone system being evaluated favorably—but no competing interests are declared."
    429     },
    430     {
    431       "flag": "No limitations section",
    432       "detail": "The paper has no dedicated limitations or threats-to-validity section; hallucination is briefly speculated about in one paragraph but no systematic limitations are discussed."
    433     },
    434     {
    435       "flag": "Ablation sample size",
    436       "detail": "Ablation studies use only 100 randomly selected conversations with no justification for this sample size and no variance reported, making it difficult to assess reliability of component-level findings."
    437     },
    438     {
    439       "flag": "Implicit evaluation not validated",
    440       "detail": "The claim that implicit evaluation reduces annotation bias is made by design argument only; no empirical comparison of bias levels between implicit and explicit methods is provided."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "LaMDA: Language Models for Dialog Applications",
    446       "relevance": "Major English knowledge-grounded dialogue system used as motivation and implicit benchmark for Chinese counterpart"
    447     },
    448     {
    449       "title": "GODEL: Large-Scale Pre-Training for Goal-Directed Dialog",
    450       "relevance": "Contemporary knowledge-grounded dialogue baseline representing English dialogue LLM state-of-the-art"
    451     },
    452     {
    453       "title": "BlenderBot 3: A Deployed Conversational Agent that Continually Learns to Responsibly Engage",
    454       "relevance": "Deployed knowledge-grounded dialogue system with internet access, direct architectural parallel to GLM-Dialog"
    455     },
    456     {
    457       "title": "GLM: General Language Model Pretraining with Autoregressive Blank Infilling",
    458       "relevance": "Backbone model used for GLM-Dialog; provides the 10B parameter pre-trained checkpoint"
    459     },
    460     {
    461       "title": "EVA2.0: Investigating Open-Domain Chinese Dialogue Systems with Large-Scale Pre-Training",
    462       "relevance": "Primary Chinese dialogue baseline used for comparison"
    463     },
    464     {
    465       "title": "PLATO-XL: Exploring the Large-Scale Pre-Training of Dialogue Generation",
    466       "relevance": "Large-scale Chinese dialogue pre-training model discussed as prior work; the evaluated PLATO baseline is the earlier PLATO-2"
    467     },
    468     {
    469       "title": "Link the World: Improving Open-domain Conversation with Dynamic Spatiotemporal-aware Knowledge (DuSinc)",
    470       "relevance": "Primary evaluation benchmark; DuSincR is built by extending DuSinc test set"
    471     },
    472     {
    473       "title": "Wizard of Wikipedia: Knowledge-Powered Conversational Agents",
    474       "relevance": "Foundational knowledge-grounded dialogue task formulation that this work extends to Chinese"
    475     },
    476     {
    477       "title": "XDAI: A Tuning-free Framework for Exploiting Pre-trained Language Models in Knowledge Grounded Dialogue Generation",
    478       "relevance": "Online service from which the 800K training dialogues were collected"
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 3,
    484       "justification": "System is deployed as a live WeChat chatbot with 100+ real users and an open evaluation platform, making it immediately usable by practitioners."
    485     },
    486     "surprise_contrarian": {
    487       "score": 0,
    488       "justification": "Results confirm expected patterns—larger models with knowledge grounding outperform smaller baselines—with no surprising or counterintuitive findings."
    489     },
    490     "fear_safety": {
    491       "score": 0,
    492       "justification": "The paper briefly notes hallucination as a limitation but raises no substantive AI safety concerns."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "Standard system paper with no controversy; the implicit evaluation novelty is presented modestly."
    497     },
    498     "demo_ability": {
    499       "score": 3,
    500       "justification": "A live WeChat demo and online evaluation platform are both publicly deployed and directly usable."
    501     },
    502     "brand_recognition": {
    503       "score": 2,
    504       "justification": "Tsinghua University and the GLM/ChatGLM lineage are well-known in the Chinese AI research community; Zhipu.AI has commercial visibility."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [],
    509     "top_points": 0,
    510     "total_points": 0,
    511     "total_comments": 0
    512   }
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs