scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29812B)
      1 {
      2   "paper": {
      3     "title": "Mix-of-Granularity: Optimize the Chunking Granularity for Retrieval-Augmented Generation",
      4     "authors": [
      5       "Zijie Zhong",
      6       "Hanwen Liu",
      7       "Xiaoya Cui",
      8       "Xiaofan Zhang",
      9       "Zengchang Qin"
     10     ],
     11     "year": 2024,
     12     "venue": "International Conference on Computational Linguistics",
     13     "arxiv_id": "2406.00456",
     14     "doi": "10.48550/arXiv.2406.00456"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "MoG dynamically selects optimal chunking granularity for RAG retrieval via a trained router, improving accuracy over fixed-granularity MedRAG baselines across five medical QA benchmarks and five backbone LLMs (average 5% over MedRAG on smaller models). MoGG extends MoG by reorganizing documents as graphs, enabling retrieval of distantly situated snippets with even fewer training samples. Performance gains are reported to be larger on weaker LLMs (GLM3, Qwen1.5). MoG does not consistently outperform chain-of-thought prompting without retrieval, which the authors attribute to noise injection from unfiltered retrieved snippets.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract states: 'The code of both MoG and MoGG are released in https://github.com/ZGChung/Mix-of-Granularity.' A specific GitHub URL is provided."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All corpora (PubMed, StatPearls, Textbooks, Wikipedia) and QA datasets (MMLU-Med, MedQA-US, MedMCQA, PubMedQA*, BioASQ-Y/N) are publicly available standard benchmarks described in Section 4.1 and Appendix B."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions PyTorch framework and Nvidia GeForce 3090/4090 GPUs (Section 4.2), but provides no requirements.txt, Dockerfile, or detailed library versions sufficient to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are included in the paper. The experimental setup (Section 4.2) describes parameters but not a reproducible workflow."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Tables 1 and 2 report ± notation for all results (e.g., '0.5198±0.04'). These appear to be confidence intervals based on dataset size rather than variance across runs, but CI notation is present."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are reported. The paper claims MoG 'consistently enhanced the performance' (Section 4.3) based solely on comparing accuracy numbers without any p-values, t-tests, or other significance testing."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 4.5 states 'MoG shows an average improvement of 5% compared to MedRAG and 8.7% compared to CoT' with baseline context. Tables provide both baseline and improved numbers enabling effect size calculation."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification for dataset sizes or power analysis. The paper uses existing benchmark datasets (500-4183 questions per dataset) without discussing whether these sizes are adequate for the claimed comparisons."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The ± values in tables are nearly constant per dataset column across all methods (e.g., MMLU is always ±0.03 or ±0.04), indicating that they are binomial proportion confidence intervals based on dataset size, not variance across experimental runs. No multi-run variance, standard deviation, or seed-based variability is reported."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Two baselines are included: CoT (chain-of-thought prompting without retrieval) and MedRAG (fixed single-granularity RAG system from Xiong et al., 2024), described in Section 4.3."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "MedRAG (Xiong et al., 2024) is a contemporary baseline from the same year. The MIRAGE benchmark framework is also current."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table 9 (Appendix K) presents an ablation removing the router from MoGG. Appendix E compares different soft label construction methods (TF-IDF, RoBERTa, hitrate). Appendix F varies the number of candidate snippets."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Only exact matching accuracy is used as the evaluation metric across all experiments. While results are broken down by dataset, the metric is always the same."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of system outputs is performed. Appendix I mentions manual verification of ~10% of degraded samples to diagnose causes, but this is error analysis, not human evaluation of output quality."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Standard benchmark test sets are used. For MedMCQA, they specifically use the 'dev' set (Appendix B.3). The other datasets use their standard evaluation splits."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by dataset (5 QA benchmarks), by backbone LLM (5 models), by training corpus (4 corpora in Table 7), and by number of candidate snippets (Appendix F)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Appendix I provides detailed analysis of degraded samples. The authors manually verified ~10% of degraded cases and found 95% were caused by noise injection from retrieved snippets, not MoG design flaws."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that MoG does not consistently beat CoT ('though not necessarily better than CoT', Section 4.3), and that noise from retrieved snippets can degrade performance. MoGG also does not consistently outperform MoG across all settings."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims MoG/MoGG 'effectively predict optimal granularity levels, significantly enhancing the performance of the RAG system.' Tables 1 and 2 show improvements over MedRAG baselines in average accuracy across most configurations."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The ablation study in Table 9 removes the router component from MoGG, showing it degrades to MedRAG-level performance. This controlled single-variable manipulation provides adequate evidence for the causal claim that the router mechanism drives the improvement."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims to 'Optimize the Chunking Granularity for Retrieval-Augmented Generation' generally, but all experiments are on medical QA datasets only. Section 4.1 further states 'We posit that significant improvements demonstrated by the tests on this knowledge-intensive field suggest MoG's potential effectiveness in other domains' — an unbounded generalization claim."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper discusses noise as a cause of degradation (Appendix I) but does not consider alternative explanations for the observed improvements — e.g., whether improvements stem from ensemble effects of multi-granularity retrieval rather than optimal granularity selection, or whether the soft label training signal is confounded with corpus characteristics."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures exact matching accuracy on multiple-choice QA tasks and frames claims at this same granularity. No proxy gap exists between measurement and claim."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Table 4 (Appendix C) lists exact model versions: 'gpt-3.5-turbo-16k', 'Meta-Llama-3-8B', 'internlm2-123b', 'chatglm3-6b', 'Qwen1.5-MoE-A2.7B' with URLs to specific model pages."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "No actual prompt text is provided. The paper states snippets are 'injected into the backbone LLM via prompt' (Section 3.1) but never shows the actual prompt template or text used for the QA task."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.2 reports: Adam optimizer, learning rate 0.001, top-3 retrieval per granularity, top-2 final snippet selection, 1000 epochs per training session, ~12GB GPU memory for training, 6GB for inference."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. MoG is a retrieval optimization method with a trained router, not an agent-based system."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 3.2.1 describes the chunking procedure (non-overlapping, each coarser level = 2× previous, 5 levels). Section 4.1 states chunking sizes relative to MedRAG ({1/2, 1, 2, 4, 8}×). Section 3.3 describes graph construction for MoGG. Soft label construction is detailed in Section 3.2.2."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 5 'Limitations and Broader Impacts' provides a dedicated discussion of three specific limitations and a security concern."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 5 lists specific limitations: (1) granularity levels are manually assigned requiring grid search, (2) the router uses only semantic query information without query type or expected response length, (3) lack of length normalization in retrieval scoring, (4) security risk from a compromised router."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state what the results do NOT show. Instead, it generalizes from medical QA to other domains: 'We posit that significant improvements demonstrated by the tests on this knowledge-intensive field suggest MoG's potential effectiveness in other domains with lower knowledge dependencies and higher error tolerances' (Section 4.1)."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw experimental data (model predictions, retrieved snippets, router weights per query) is released. Only aggregated accuracy numbers are reported in tables."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4.1 and Appendix B describe all corpora and QA datasets with statistics (Table 3): document counts, snippet counts, average lengths, domains, dataset sizes, and number of options."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. All data comes from standard public benchmarks and corpora."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline is documented: corpora → chunking at 5 granularity levels → BM25 indexing → soft label construction → router training → inference. For MoGG, the graph construction step is additionally described (Section 3.3). Corpus statistics are provided in Table 3."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "The acknowledgements thank 'supervisors and colleagues at Shanghai Artificial Intelligence Laboratory' and reviewers, but no explicit funding sources, grants, or sponsors are listed."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed on the first page: Shanghai AI Laboratory, Beihang University, and theSight Technology."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Cannot be determined since funding is not disclosed. The work is affiliated with Shanghai AI Laboratory which may have interests in RAG system improvements."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial disclosure statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the five backbone LLMs (GPT-3.5, InternLM2, Llama3, GLM3, Qwen1.5) despite evaluating them on public benchmarks."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of whether the QA benchmark questions (MMLU, MedQA, etc.) appeared in the training data of the backbone LLMs. These are well-known public benchmarks likely present in training corpora."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "MMLU (2021), MedQA (2018), MedMCQA (2022), PubMedQA (2019), and BioASQ (2015-2023) were all publicly available before the training cutoffs of the tested LLMs. No contamination discussion is provided."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Appendix M (Table 11) reports wall-clock execution times for different numbers of granularity levels. Section 4.5 states that the router requires ~12GB of GPU memory for training and 6GB for inference."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 4.2 states: '35 training sessions and over a hundred inferences, each training session taking around 4 hours for 1000 epochs.' GPU types (3090, 4090) and memory requirements are specified. Storage overhead is 2.7× corpus size."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. Results appear to be from single runs, with ± values being dataset-size-based confidence intervals rather than seed-based variance."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never stated. 'Each training job is run until the convergence of the loss value' (Section 4.2) suggests single runs per configuration."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search budget is reported. The learning rate (0.001), soft label values (0.8/0.2), kr values, and kgraph are presented without documenting how many configurations were tried."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Key design choices such as the soft label values are mentioned ('[0.8, 0.2, 0] or [0.7, 0.3, 0] yields similar results', Section 3.2.2), but the selection process and the number of alternatives tested are not systematically documented."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper runs comparisons across 5 LLMs × 5 datasets × multiple methods with no correction for multiple comparisons applied."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate their own MoG/MoGG system against their own implementation of baselines (MedRAG, CoT) without acknowledging or mitigating author-evaluation bias."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Appendix M (Table 11) reports execution time across different numbers of granularity levels, and Section 4.5 discusses the marginal cost increase with more granularity levels relative to the performance gains."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "No discussion of whether the MIRAGE benchmark QA tasks actually measure the claimed retrieval quality improvements, or whether exact matching accuracy is an adequate construct for evaluating chunking optimization."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No agentic scaffolding is involved. MoG is a retrieval optimization method, not an agent-based system."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of whether LLM training data includes information from after the benchmark creation dates. All five benchmarks predate the tested LLMs' training."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "Section 4.3 states: 'To prevent knowledge leakage, only the question is used (options excluded) to retrieve reference documents from the external knowledge database.' This explicitly addresses one form of feature leakage in the retrieval setup."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether training and test examples in the QA datasets share structural similarities or whether the soft-label training data overlaps with evaluation data."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap analysis) is applied. The option exclusion is a design choice, not a detection method for training data contamination."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "MoG consistently enhances RAG system performance over MedRAG baseline across different backbone LLMs",
    371       "evidence": "Table 1 shows MoG outperforms MedRAG on average accuracy for all 5 backbone LLMs when trained with MedCorp corpus (GLM3: 0.4923 vs 0.4804, GPT-3.5: 0.6631 vs 0.6484, InternLM: 0.6057 vs 0.5428, Llama3: 0.6648 vs 0.5900, Qwen1.5: 0.5174 vs 0.4558).",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "MoG improves accuracy more on weaker LLMs than stronger ones",
    376       "evidence": "Section 4.3 notes this pattern. From Table 1: Qwen1.5 gains +6.2pp, GLM3 gains +1.2pp over MedRAG averages, while GPT-3.5 gains +1.5pp. The pattern is directionally consistent but not universal (InternLM gains +6.3pp despite being larger).",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "MoGG brings more significant improvement over MedRAG than MoG when trained on smaller corpora",
    381       "evidence": "Section 4.4 claims this by comparing Tables 1 and 2, noting MoGG trained on Textbooks (0.2% of MedCorp) still yields improvements. However, MoGG does not consistently outperform MoG across all LLM-dataset combinations in Table 2.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "The router mechanism is the key component driving MoGG's improvement",
    386       "evidence": "Table 9 (Appendix K) shows ablation: MoGG without router (0.5120 avg) performs similarly to MedRAG (0.5127 avg), while MoGG with router achieves 0.6262 avg on Llama3.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Increasing granularity levels adds only marginal execution time",
    391       "evidence": "Table 11 (Appendix M) shows global average time increases from 13.16s (1 level with router) to 17.93s (5 levels), with the bottleneck being LLM API call time.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Performance degradation from MoG is caused by noise in retrieved snippets, not the routing mechanism",
    396       "evidence": "Appendix I reports manual verification of ~10% of degraded samples, finding in 95% of cases the snippets were correctly retrieved but the LLM changed its answer based on noisy content.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No statistical significance testing",
    403       "detail": "All performance comparisons rely on comparing point estimates. The ± values in tables are binomial CIs based on dataset size (they are constant per dataset across methods), not variance across experimental runs. No significance tests are applied to any comparison despite claiming 'statistically and practically significant' improvements."
    404     },
    405     {
    406       "flag": "Single-domain evaluation with general claims",
    407       "detail": "All experiments use medical QA datasets only, yet the paper title and claims frame the contribution as generally applicable to RAG systems. The paper explicitly extrapolates: 'We posit that significant improvements... suggest MoG's potential effectiveness in other domains.'"
    408     },
    409     {
    410       "flag": "No benchmark contamination analysis",
    411       "detail": "Five public benchmarks (MMLU 2021, MedQA 2018, PubMedQA 2019, MedMCQA 2022, BioASQ 2015-2023) are used with LLMs that likely saw them during training. The paper's main contribution is a retrieval method, but contaminated baseline LLM knowledge confounds the comparison between RAG-enhanced and non-RAG (CoT) conditions."
    412     },
    413     {
    414       "flag": "Inconsistent results presented as consistent improvement",
    415       "detail": "MoG does not beat CoT on several LLM-dataset combinations (acknowledged in Section 4.3), and MoGG does not consistently beat MoG. The abstract's claim of 'significantly enhancing the performance' overstates the evidence."
    416     },
    417     {
    418       "flag": "Single-run results",
    419       "detail": "No evidence of multiple experimental runs. Training converges once per configuration, and the ± values are not run-based variance. Result stability across seeds or initializations is unknown."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    425       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    426       "year": 2020,
    427       "relevance": "Foundational RAG paper defining the retrieval-augmented generation paradigm for enhancing LLMs with external knowledge."
    428     },
    429     {
    430       "title": "Benchmarking retrieval-augmented generation for medicine",
    431       "authors": ["Guangzhi Xiong", "Qiao Jin", "Zhiyong Lu", "Aidong Zhang"],
    432       "year": 2024,
    433       "arxiv_id": "2402.13178",
    434       "relevance": "MIRAGE benchmark used as the primary evaluation framework; MedRAG serves as the main baseline for comparison."
    435     },
    436     {
    437       "title": "RAPTOR: Recursive abstractive processing for tree-organized retrieval",
    438       "authors": ["Parth Sarthi", "Salman Abdullah", "Aditi Tuli"],
    439       "year": 2024,
    440       "arxiv_id": "2401.18059",
    441       "relevance": "Hierarchical retrieval method organizing snippets as trees with multi-granularity access, directly related to granularity optimization in RAG."
    442     },
    443     {
    444       "title": "Retrieval-augmented generation for large language models: A survey",
    445       "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"],
    446       "year": 2024,
    447       "arxiv_id": "2312.10997",
    448       "relevance": "Comprehensive RAG survey covering retrieval strategies, graph indexing, and generation techniques relevant to the survey scope."
    449     },
    450     {
    451       "title": "Active retrieval augmented generation",
    452       "authors": ["Zhengbao Jiang", "Frank F. Xu", "Luyu Gao"],
    453       "year": 2023,
    454       "arxiv_id": "2305.06983",
    455       "relevance": "Adaptive retrieval strategy that dynamically decides when to retrieve, relevant to dynamic RAG optimization research."
    456     },
    457     {
    458       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    459       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    460       "year": 2022,
    461       "relevance": "CoT prompting technique used as a baseline comparison across all experiments in this paper."
    462     },
    463     {
    464       "title": "Demonstrate-search-predict: Composing retrieval and language models for knowledge-intensive NLP",
    465       "authors": ["Omar Khattab", "Keshav Santhanam", "Xiang Lisa Li"],
    466       "year": 2023,
    467       "arxiv_id": "2212.14024",
    468       "relevance": "DSP framework for composing retrieval and generation, representing information fusion techniques for RAG systems."
    469     },
    470     {
    471       "title": "Improving language models by retrieving from trillions of tokens",
    472       "authors": ["Sebastian Borgeaud", "Arthur Mensch", "Jordan Hoffmann"],
    473       "year": 2022,
    474       "arxiv_id": "2112.04426",
    475       "relevance": "RETRO architecture integrating retrieval at intermediate layers of LLMs, a key related work on retrieval-augmented language modeling."
    476     },
    477     {
    478       "title": "Atlas: Few-shot learning with retrieval augmented language models",
    479       "authors": ["Gautier Izacard", "Patrick Lewis", "Maria Lomeli"],
    480       "year": 2022,
    481       "arxiv_id": "2208.03299",
    482       "relevance": "Multiple retrieval strategy for RAG systems relevant to the multi-granularity retrieval approach proposed in MoG."
    483     },
    484     {
    485       "title": "Language models are few-shot learners",
    486       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    487       "year": 2020,
    488       "relevance": "GPT-3 paper establishing in-context learning capabilities that underpin RAG-based approaches to knowledge augmentation."
    489     }
    490   ],
    491   "engagement_factors": {
    492     "practical_relevance": {
    493       "score": 2,
    494       "justification": "RAG chunking optimization is directly relevant to practitioners building retrieval systems, with released code, though limited to medical domain validation."
    495     },
    496     "surprise_contrarian": {
    497       "score": 1,
    498       "justification": "Dynamic chunking is a reasonable extension of existing RAG practices rather than a surprising finding; the MoE-inspired routing is a natural architectural choice."
    499     },
    500     "fear_safety": {
    501       "score": 0,
    502       "justification": "No AI safety or security concerns raised as findings; the router security mention in limitations is speculative, not a demonstrated risk."
    503     },
    504     "drama_conflict": {
    505       "score": 0,
    506       "justification": "No controversy or conflict with established methods or labs; the paper presents incremental improvements over existing baselines."
    507     },
    508     "demo_ability": {
    509       "score": 2,
    510       "justification": "Code is released on GitHub (https://github.com/ZGChung/Mix-of-Granularity), enabling practitioners to try the method."
    511     },
    512     "brand_recognition": {
    513       "score": 1,
    514       "justification": "Shanghai AI Laboratory has moderate recognition in the Chinese AI research community but is not a top-tier globally recognized lab."
    515     }
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs