scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30570B)
      1 {
      2   "paper": {
      3     "title": "Mobility-Aware Cache Framework for Scalable LLM-Based Human Mobility Simulation",
      4     "authors": [
      5       "Hua Yan",
      6       "Heng Tan",
      7       "Yingxue Zhang",
      8       "Yu Yang"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.16727"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "MobCache introduces a reconstructible latent-space cache framework that stores LLM reasoning steps as embeddings and recombines them via tree-structured search, achieving 42-52% reduction in inference time and 79-93% cost reduction compared to direct LLM calls while maintaining comparable mobility simulation quality on Beijing and NYC datasets. A lightweight decoder trained with mobility law-constrained distillation replaces the full LLM for decoding, providing the main efficiency gain. Cross-city transfer from Beijing to NYC cache shows acceptable but degraded performance, with ~6% of users requiring fresh LLM calls.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The evaluation uses two public datasets: the Beijing mobility dataset from [32] and the NYC POI check-in dataset from [41], with user profiles simulated from U.S. Census data [3]. However, the generated synthetic training data and cache are not released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 4.2.2 specifies Python 3.10, PyTorch 2.1.0, LLaMA 3.2-3B and LLaMA 3.2-1B models, NVIDIA A100 and A6000 GPUs, Adam optimizer with learning rate 5e-5. This provides enough detail to broadly recreate the environment, though no requirements.txt is provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The implementation section describes components but not how to run them end-to-end."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Tables 1, 2, and 3 all report single point estimates with no confidence intervals, error bars, or ± notation."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims improvements like '42.20% reduction in inference time' and 'comparable performance' based solely on comparing numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used for any comparison."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports percentage improvements with baseline context visible in Tables 1-2 (e.g., 'at least a 42.20% reduction in inference time' from 2.3410s to 1.3530s, '79.71% increase in tokens per second'). The raw numbers and relative improvements provide sufficient magnitude context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper uses 4,000 people for cache construction, 13,000 synthetic trajectories, and 10,000 test trajectories, but provides no justification for these specific numbers and no power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviation, variance, or spread measures are reported anywhere. All results in Tables 1-3 appear to be single-run point estimates."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 4.2.3 compares against four baselines: CoPB [32], Urban-Mobility-LLM [1], Geo-LLaMA [20], and LLMob [14], covering both methods requiring and not requiring real-world training data."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All four baselines were published in 2024: CoPB (2024), Urban-Mobility-LLM (2024), Geo-LLaMA (2024), LLMob (NeurIPS 2024). These are recent and represent the state of the art in LLM-based mobility simulation."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 4.8 presents ablation removing three components: latent-space evaluator (w/o LE), mobility law distillation (w/o MD), and lightweight decoder (w/o LD). Results in Table 3 show contribution of each component."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 4.2.4 defines 4 efficiency metrics (inference time, tokens/s, throughput, cost) and 5 quality metrics (radius of gyration, stay duration, jump length, location frequency, OD similarity); the quality metrics are all measured using JSD."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Section 4.6 uses GPT-4o as an LLM evaluator to judge whether decoded mobility sequences are plausible (91% pass rate). This is LLM-as-judge, not human evaluation. No human judges evaluated the generated trajectories."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 4.2.1 states: 'we further sample 10,000 real trajectories as the test set, ensuring no user overlap with the cache.' Clear separation between cache construction data and evaluation data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down across two datasets (Beijing Table 1, NYC Table 2), across efficiency vs quality dimensions, per-metric breakdowns for all 5 quality metrics, and ablation variants (Table 3). Cross-city results are also shown separately."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "Section 2.2 discusses logic inconsistency problems of language-space recombination, but this is for a strawman approach, not their system. The cross-city experiment notes ~6% of users fail to match, but no systematic analysis of when or why MobCache's own outputs fail is provided."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 4.9 shows that exploration rate 0.3 gives 11.10% lower quality than 0.5, demonstrating a tradeoff. Section 4.5 shows cross-city transfer degrades performance. Table 3 ablation (w/o LD) shows the lightweight decoder sacrifices quality for efficiency."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of '42.20% reduction in inference time, 79.71% increase in tokens per second, 28.56% improvement in throughput, 42.46% reduction in cost' are supported by Tables 1 and 2. The case study claims of '66.93% reduction in inference speed and 93.18% reduction in cost' are supported by Section 4.4 and Figure 5."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper claims components 'improve' performance. The ablation study (Section 4.8, Table 3) uses controlled single-variable removal of the latent-space evaluator, mobility law distillation, and lightweight decoder, which is adequate for these causal claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'Scalable' simulation and Section 3.1 targets 'tens or hundreds of thousands of individuals,' but experiments use 10,000 trajectories with cost projections extrapolated to 100K. The cross-city experiment (Beijing→NYC) shows degradation but the paper frames it positively as 'acceptable.' Generalization beyond two urban datasets is not bounded."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for why MobCache performs comparably to baselines. For instance, the latent-space representation may simply be memorizing patterns rather than enabling meaningful reasoning recombination. No robustness checks or alternative interpretations are considered."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures JSD of 5 mobility statistics (radius of gyration, stay duration, jump length, location frequency, OD similarity) and frames these as demonstrating 'simulation quality' and 'fidelity.' Whether these aggregate distributional metrics capture the full meaning of realistic mobility simulation (individual trajectory plausibility, temporal coherence, activity diversity) is not discussed."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "LLaMA 3.2-3B and LLaMA 3.2-1B are specified with sizes. However, Section 3.3.1 says 'we use GPT as the generator' for data initialization without specifying which GPT model or version. Section 4.6 uses 'GPT-4o' for evaluation without a snapshot date or API version. Per schema rules, marketing names without snapshot dates do not count."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Appendix A.1 provides a prompt template with placeholders ({profile}, {today_date}, {home_poi}, {work_poi}). The template structure and requirements are detailed, but actual fill values are not provided. Per schema rules, templates with placeholders do not count unless fill values are also provided."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.2.2 reports: Adam optimizer, learning rate 5e-5, exploration rate tested in [0.3, 0.5, 0.7] (selected 0.5), λ tested in [0.01, 0.03, 0.05, 0.07] (selected 0.05), search rounds randomly sampled between 1 and 3, batch size 8 for throughput evaluation."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "MobCache is a caching and decoding framework, not an agentic scaffolding system. There is no tool use, retry logic, or memory/context management in the agentic sense."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.2.1 documents the pipeline: select 4,000 people from real data → generate 13,000 synthetic trajectories via LLM → fine-tune LLaMA for latent-space reasoning → build cache → sample 10,000 test trajectories with no user overlap. Section 3.3.1 describes how the initial training data is formatted."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 'Limitations and ethical considerations' includes a dedicated 'Limitation' subsection discussing the framework's constraints."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 states a specific limitation: 'it only applies to models that expose interpretable reasoning steps. In particular, the simulation model must provide accessible step-by-step reasoning, either as text or structured latent representations.' This is specific to this system's design."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Beyond the single limitation about reasoning step requirement, the paper does not state what the results do not show — e.g., no mention that results are limited to two urban datasets, that the cost projections assume specific pricing, or that the approach has not been tested at true large scale (100K+)."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The evaluation datasets (Beijing [32], NYC [41]) are public, but the paper's own synthetic training data, latent-space embeddings, cache contents, and trained model weights are not released. Independent verification of the generated results is not possible."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The Beijing dataset is described as 'collected via a social networking platform' covering Oct 1-Dec 31 2019 with 'mobility trajectories and user profile information.' NYC is a 'POI check-in dataset' with profiles simulated from U.S. Census demographics. Synthetic data generation via GPT is described in Section 3.3.1."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants were recruited for this study. The paper uses pre-existing public datasets (Beijing mobility [32], NYC check-ins [41]) and census data for profile simulation."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 4.2.1 documents the full pipeline: 4,000 people selected → 13,000 synthetic trajectories generated by LLM → fine-tuning for latent-space reasoning → cache construction → 10,000 test trajectories sampled with no user overlap. The stages are clear."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding sources, grants, or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: Lehigh University (Yan, Tan, Yang) and SUNY Binghamton (Zhang). No product conflict exists since they do not evaluate a product from their own institution."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence of the funder cannot be verified. The absence of a funding disclosure is itself a transparency gap."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial disclosure is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper uses LLaMA 3.2-3B (fine-tuned) and GPT (for data generation) without stating the training data cutoff for either model. The Beijing evaluation data is from 2019, which predates LLaMA 3.2's training cutoff, so exposure during pre-training cannot be ruled out."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper ensures no user overlap between cache and test set (Section 4.2.1), but does not discuss whether LLaMA's pre-training data could include information from the evaluation datasets or their source publications."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The Beijing mobility dataset [32] was published in 2024 and its data is from 2019. LLaMA 3.2 may have been trained on data that includes information about these mobility patterns. No contamination analysis is provided."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The paper evaluates a computational framework using existing datasets."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. Ethics discussion in Section 5 addresses privacy of existing data, not human subjects research."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study. The simulated user profiles (age, gender, occupation) are inputs to the simulation, not research participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants recruited for this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 4.2.4 defines monetary cost per trajectory for 100K trajectories. Tables 1 and 2 report cost in $/trajectory (×10⁻³) for all methods. API costs use GPT-4o pricing; local GPU costs use $0.5/hour A6000 rate."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is listed (A100, A6000) but total training time, GPU hours for fine-tuning, cache construction time, and total computational budget are not reported."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single runs with no assessment of variability across seeds."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs producing the reported results is never stated. It is unclear whether results are from single runs or averaged over multiple."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 4.2.2 lists search spaces (exploration rate [0.3, 0.5, 0.7], λ [0.01, 0.03, 0.05, 0.07]) but does not report the total compute spent on hyperparameter search or the search method used."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Section 4.2.2 says 'select 0.5 based on the efficiency and quality trade-off' and 'select 0.05 based on the best performance,' but does not state whether selection was done on a validation set separate from the test set, or what the exact selection criterion was."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement and evaluate their own system against baselines. No discussion of author-evaluation bias or whether baseline implementations might be suboptimal."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Tables 1-2 report both efficiency and quality, but comparisons are not at matched compute budgets. MobCache uses a local fine-tuned LLaMA while baselines use GPT API calls — fundamentally different compute profiles that are not equalized."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses JSD on 5 mobility statistics following existing work [12, 32, 33] but does not discuss whether these metrics actually measure simulation fidelity. Whether matching distributional statistics implies realistic individual trajectories is not questioned."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "MobCache is a complete framework being evaluated as a bundled system, not a model comparison within a scaffold. The framework IS the thing being tested."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The Beijing data is from October-December 2019, published in a 2024 paper. LLaMA 3.2 was trained on data that likely includes information post-2019. No discussion of whether the model could have seen related mobility data or patterns during pre-training."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information. For instance, the profile similarity matching during inference uses features (profile, date) that could carry information about the expected trajectory distribution."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 4.2.1 explicitly states 'ensuring no user overlap with the cache' between the training/cache data (4,000 people) and test data (10,000 trajectories). This addresses independence between their training and test splits."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "The user-level split prevents overlap within their pipeline but no formal leakage detection method (canary strings, membership inference, decontamination) is applied to check whether the pre-trained LLaMA model has seen related data."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "MobCache achieves at least 42.20% reduction in inference time compared to the fastest baseline",
    369       "evidence": "Table 1 shows MobCache at 1.3530s vs Geo-LLaMA at 2.3410s (Beijing). Table 2 shows 1.2981s vs Geo-LLaMA at 2.6788s (NYC, 51.54% reduction).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "MobCache achieves 79.71% or greater increase in tokens per second",
    374       "evidence": "Table 1: 121.7636 vs Urban-Mobility-LLM's 67.3117 (80.89%). Table 2: 125.8562 vs Urban-Mobility-LLM's 70.0324 (79.71%). Supported by both datasets.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "MobCache maintains comparable quality to state-of-the-art LLM-based methods",
    379       "evidence": "Tables 1-2 show MobCache quality metrics are within range of baselines on most metrics, though not consistently best. On Beijing, MobCache wins on Duration, LocFreq, OdSim but not Radius or Jump Length vs some baselines. No statistical tests confirm 'comparable.'",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Applying MobCache to Urban-Mobility-LLM achieves 66.93% reduction in inference time and 93.18% reduction in cost without sacrificing quality",
    384       "evidence": "Section 4.4 and Figure 5 show inference time drops from 8.080s to 2.672s and cost from $5.44×10⁻³ to $3.71×10⁻⁴. Figure 6 shows quality metrics remain comparable.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "91% of decoded sequences are considered appropriate by LLM evaluator vs 82% without the latent-space evaluator",
    389       "evidence": "Section 4.6 reports GPT-4o binary evaluation. The 91% vs 82% comparison is reported as a point estimate with no confidence interval or number of evaluated sequences.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "Cache built from Beijing data can effectively accelerate simulations in New York with acceptable quality",
    394       "evidence": "Section 4.5 and Figures 7-8 show cross-city transfer works but with lower efficiency (6% of users need fresh LLM calls) and lower quality than NYC-specific cache. 'Acceptable' is not defined quantitatively.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Limited cache diversity reduces simulation realism, motivating reconstructible caches",
    399       "evidence": "Section 2.1 compares group-based method with 1,000 cached trajectories against 10,000 real trajectories. Figures 2-3 show distributional differences in stay duration and location coverage.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No error bars or uncertainty quantification",
    406       "detail": "All results in Tables 1-3 are point estimates with no standard deviation, confidence intervals, or any indication of result stability across runs. It is impossible to tell whether the reported differences are statistically meaningful."
    407     },
    408     {
    409       "flag": "Unspecified GPT model for critical data generation step",
    410       "detail": "Section 3.3.1 states 'we use GPT as the generator' for the initial training data without specifying which GPT model or version. This data is the foundation for the entire cache — the choice of model significantly affects data quality and downstream results."
    411     },
    412     {
    413       "flag": "Unfair cost comparison across fundamentally different compute regimes",
    414       "detail": "Tables 1-2 cost comparisons mix API-based methods (GPT pricing at $10/1M output tokens) with local fine-tuned models (GPU rental at $0.5/hr). The API costs represent 'strict lower bounds' (ignoring input tokens) while GPU costs use retail cloud rates. These are not comparable — the training cost of MobCache's fine-tuned models is excluded."
    415     },
    416     {
    417       "flag": "LLM-as-judge used without validation",
    418       "detail": "Section 4.6 uses GPT-4o to judge whether decoded mobility sequences are 'plausible.' No validation of this LLM judge against human judgment is provided, and no inter-rater reliability with human evaluators is established."
    419     },
    420     {
    421       "flag": "No code release for a complex multi-component system",
    422       "detail": "MobCache involves fine-tuned LLMs, latent-space evaluators, lightweight decoders, and cache search algorithms. Without released code, the implementation details cannot be verified or reproduced."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Chain-of-planned-behaviour workflow elicits few-shot mobility generation in llms",
    428       "authors": ["Chenyang Shao", "Fengli Xu", "Bingbing Fan", "Jingtao Ding", "Yuan Yuan", "Meng Wang", "Yong Li"],
    429       "year": 2024,
    430       "arxiv_id": "2402.09836",
    431       "relevance": "Core baseline — demonstrates LLM-based mobility simulation via structured reasoning chains, the paradigm MobCache builds upon."
    432     },
    433     {
    434       "title": "Large language models as urban residents: An llm agent framework for personal mobility generation",
    435       "authors": ["Jiawei Wang", "Renhe Jiang", "Chuang Yang", "Zengqing Wu"],
    436       "year": 2024,
    437       "relevance": "NeurIPS 2024 baseline (LLMob) — LLM agent framework combining self-consistency and retrieval for mobility generation."
    438     },
    439     {
    440       "title": "Urban mobility assessment using llms",
    441       "authors": ["Prabin Bhandari", "Antonios Anastasopoulos", "Dieter Pfoser"],
    442       "year": 2024,
    443       "relevance": "Baseline (Urban-Mobility-LLM) — demonstrates LLM-based travel survey synthesis, used in the case study for MobCache integration."
    444     },
    445     {
    446       "title": "Geo-llama: Leveraging llms for human mobility trajectory generation with spatiotemporal constraints",
    447       "authors": ["Siyu Li", "Toan Tran", "Haowen Lin", "John Krumm"],
    448       "year": 2024,
    449       "arxiv_id": "2408.13918",
    450       "relevance": "Baseline requiring real-world training data — fine-tunes LLMs for trajectory generation under spatiotemporal constraints."
    451     },
    452     {
    453       "title": "Training large language models to reason in a continuous latent space",
    454       "authors": ["Shibo Hao", "Sainbayar Sukhbaatar", "DiJia Su", "Xian Li"],
    455       "year": 2024,
    456       "arxiv_id": "2412.06769",
    457       "relevance": "Core methodological inspiration — the latent-space reasoning approach that MobCache adapts for mobility simulation."
    458     },
    459     {
    460       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    461       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    462       "year": 2023,
    463       "arxiv_id": "2305.05176",
    464       "relevance": "Addresses LLM cost reduction, the central motivation for MobCache's caching approach."
    465     },
    466     {
    467       "title": "OpenCity: A scalable platform to simulate urban activities with massive llm agents",
    468       "authors": ["Yuwei Yan", "Qingbin Zeng", "Zhiheng Zheng"],
    469       "year": 2024,
    470       "arxiv_id": "2410.21286",
    471       "relevance": "Addresses scalability of LLM-based urban simulation through I/O multiplexing and TCP connection pooling."
    472     },
    473     {
    474       "title": "AgentSociety: Large-scale simulation of llm-driven generative agents advances understanding of human behaviors and society",
    475       "authors": ["Jinghua Piao", "Yuwei Yan", "Jun Zhang"],
    476       "year": 2025,
    477       "arxiv_id": "2502.08691",
    478       "relevance": "Large-scale LLM agent simulation using distributed execution and message reuse for social simulation."
    479     },
    480     {
    481       "title": "On the limits of agency in agent-based models",
    482       "authors": ["Ayush Chopra", "Shashank Kumar", "Nurullah Giray-Kuru", "Ramesh Raskar", "Arnau Quera-Bofarull"],
    483       "year": 2024,
    484       "arxiv_id": "2409.10568",
    485       "relevance": "Proposes group-based LLM agent approach (LLM-archetypes) that MobCache argues reduces behavioral diversity."
    486     },
    487     {
    488       "title": "From individual to society: A survey on social simulation driven by large language model-based agents",
    489       "authors": ["Xinyi Mou", "Xuanwen Ding", "Qi He"],
    490       "year": 2024,
    491       "arxiv_id": "2412.03563",
    492       "relevance": "Survey of LLM-driven social simulation, covering the broader research area MobCache contributes to."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs