scan.json (31491B)
1 { 2 "paper": { 3 "title": "GeoAnalystBench: A GeoAI benchmark for assessing large language models for spatial analysis workflow and code generation", 4 "authors": [ 5 "Qianheng Zhang", 6 "Song Gao", 7 "Chen Wei", 8 "Yibo Zhao", 9 "Ying Nie", 10 "Ziru Chen", 11 "Shijie Chen", 12 "Yu Su", 13 "Huan Sun" 14 ], 15 "year": 2025, 16 "venue": "Trans. GIS", 17 "arxiv_id": "2509.05881", 18 "doi": "10.1111/tgis.70135" 19 }, 20 "scan_version": 3, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval"], 23 "key_findings": "GeoAnalystBench evaluates 6 LLMs on 50 Python-based GIS tasks, finding a clear proprietary-open source gap: ChatGPT-4o-mini achieves 95% validity and 0.390 CodeBLEU while DeepSeek-R1-7B manages only 48.5% validity and 0.272 CodeBLEU. Tasks requiring spatial reasoning (site selection, spatial relationship detection) are hardest across all models. Domain knowledge and dataset descriptions improve workflow accuracy for proprietary models. Llama-3.1-8B is the only open-source model approaching proprietary performance.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper provides a GitHub repository: https://github.com/GeoDS/GeoAnalystBench (Section 1, paragraph 3)." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The 50-task benchmark dataset is released via the GitHub repository. Table 1 describes the dataset structure including instructions, domain knowledge, dataset descriptions, and human-designed workflows." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper describes hardware (NVIDIA RTX 2080 Super, Intel Core i7, Tesla P40) and mentions vLLM for local inference, but provides no requirements.txt, Dockerfile, or detailed library version specifications sufficient to recreate the environment." 
40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided in the paper. The evaluation framework is described at a high level but lacks specific commands or scripts to reproduce the results." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "Table 2 reports standard deviations for MAD values across tasks, but the primary metrics (valid rate and text similarity in Table 2; CodeBLEU in Table 3) are reported as point estimates without confidence intervals or error bars." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests are used despite claims like 'ChatGPT-4o-mini leading with the best average score' and 'proprietary models consistently outperform open-source counterparts.' Comparisons are made solely by comparing raw numbers." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper reports raw scores and absolute values (e.g., 95% validity, 0.390 CodeBLEU) but does not compute formal effect sizes such as Cohen's d or percentage improvements with baseline context between models." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The benchmark contains 50 tasks with no justification for why 50 tasks are sufficient. No power analysis or discussion of whether 50 tasks provide adequate statistical power for the claims made." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Three outputs per prompt are collected, but variance across these runs is not reported for any metric. The Std columns in Table 2 appear to be standard deviation of MAD across the 50 tasks, not across experimental runs. CodeBLEU scores in Table 3 have no variance measures." 
72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper compares 3 proprietary models (ChatGPT-4o-mini, Claude 3.5 Sonnet, Gemini 1.5 Flash) and 3 open-source models (DeepSeek-R1-7B, Llama-3.1-8B, CodeLlama-7B). Human-designed workflows serve as the reference baseline." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "The models are contemporary at time of writing: ChatGPT-4o-mini (2024-07-18), Claude 3.5 Sonnet (2024-06-20), Gemini 1.5 Flash (2024-05-14), DeepSeek-R1-7B (2025-01-20), Llama-3.1-8B (2024-07-23)." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper systematically varies input conditions across 4 settings: task alone (WK), with domain knowledge (DK), with dataset descriptions (DD), and with both. Table 2 shows MAD values for each condition, demonstrating the effect of each component." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Multiple metrics are used: valid rate, mean absolute deviation (MAD), text similarity (cosine similarity via sentence transformers), and CodeBLEU with 4 sub-components (n-gram, weighted n-gram, syntax AST, semantic data-flow)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.1 states 'validity is assessed through a combination of automated parsing and manual checking to ensure that the generated workflow constitutes a coherent and complete sequence of geoprocessing steps.' GIS expert annotators assess workflow quality." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The 50 benchmark tasks are evaluated directly without any model fine-tuning or selection. All 4 prompt variants are reported rather than selecting the best, making the entire benchmark effectively a clean test set." 
104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Figure 5 shows MAD performance breakdown across 6 spatial analysis categories (Understanding where, Measuring size/shape/distribution, Determining relations, Finding locations/paths, Detecting patterns, Spatial interpolation). Section 5.3 discusses per-category performance." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "The case studies in Section 6 discuss specific failures: KDE cell size being twice the optimal value (Section 6.1.2), point-based vs. road-based hotspot analysis divergence (Section 6.2.3), and LLM parameter selection limitations." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "DeepSeek-R1-7B (48.5% validity) and CodeLlama-7B (32.7% validity) show poor performance. Section 5.1 discusses why distilled models fail. Section 6 shows LLM-generated code producing incorrect spatial analysis results despite valid workflows." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims are supported: ChatGPT-4o-mini at 95% validity and 0.39 CodeBLEU matches Table 2 and Table 3. DeepSeek-R1-7B at 48.5% validity and 0.272 CodeBLEU matches. Spatial reasoning tasks being most challenging is supported by Figure 5." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The claim that domain knowledge and dataset descriptions improve accuracy is supported by controlled comparisons (same models tested with/without DK/DD in Table 2), constituting adequate single-variable manipulation. The paper avoids strong causal claims about model architecture effects." 
131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper tests 6 models on 50 tasks but makes broad claims about 'current LLMs in GIS automation' (Section 8). The conclusion states 'The most advanced LLMs have similar levels of performance for spatial analysis tasks' which overgeneralizes from 6 models to all 'current LLMs.'" 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper proposes explanations for performance differences (training data quality, instruction tuning, distillation) but does not consider alternatives such as task design bias favoring proprietary models, evaluation metric limitations, or API vs. local inference confounds." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper measures CodeBLEU, text similarity, and validity as proxies for GIS workflow capability, but does not systematically discuss the gap between these metrics and actual GIS task completion quality. The case studies show workflows can produce different results despite similar structure, but this disconnect is not formally addressed." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5 provides version dates for all models: ChatGPT-4o-mini (2024-07-18), Claude 3.5 Sonnet (2024-06-20), Gemini 1.5 Flash (2024-05-14), DeepSeek-R1-7B (2025-01-20), Llama-3.1-8B (2024-07-23), CodeLlama-7B (2023-07-18)." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Prompt templates for both workflow and code generation are provided in Section 4.1 and 4.2. The Appendix provides complete prompts with actual content for 2 case studies. The full benchmark dataset with all fill values is released on GitHub, allowing reconstruction of all prompts." 
158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4 states 'a temperature setting of 0.7 to balance consistency and response diversity' for all models. Section 5 reiterates this. However, only temperature is reported — no top-p, max tokens, or other sampling parameters are mentioned." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The paper sends direct prompts to LLMs and collects single-turn outputs." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3.1 describes the annotation process in detail with Figure 2: annotators executed tasks in GIS software, converted tutorials to workflow format, translated to Python scripts, and documented domain knowledge and data descriptions." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 8 (Conclusion) contains a substantive limitations discussion spanning a full paragraph, covering task scope, linear workflow structure, data compatibility issues, and potential training data contamination." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Specific threats discussed include: tasks with predefined parameters may not generalize to more complex problems, linear workflow structure limits applicability to parallel workflows, LLMs face data compatibility issues with 'incorrect parameter passing and incompatible data types,' and tasks from public tutorials may appear in training data." 
185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 8 states: 'the study focused on evaluating LLM-generated workflows for certain GIS tasks with predefined parameters, and thus the performance may vary when applied to new or more complicated real-world problems in different scientific domains' and 'the GeoAnalystBench is still limited by the coverage of real-world scenarios.'" 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "The benchmark dataset including tasks, instructions, domain knowledge, dataset descriptions, and human-designed workflows is available at https://github.com/GeoDS/GeoAnalystBench." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3.1 describes curation from 'publicly available GIS tutorials (e.g., Esri Learn platform, university lab instructions) prior to year 2025.' The multi-step annotation process (Figure 2) is documented: execute in GIS software → convert to workflow → translate to Python → document metadata." 202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": false, 206 "justification": "The paper states 'Three annotators read through the GIS online materials' but does not describe how these annotators were selected, their specific qualifications beyond 'GIS experts,' or whether selection could introduce bias." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The pipeline from tutorial collection → GIS software execution → workflow documentation → Python code generation → metadata annotation is documented in Section 3.1 and Figure 2. The evaluation pipeline (prompt construction → LLM generation → automated/manual scoring) is described in Section 4." 
212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funding is disclosed: 'NSF, Award Number: 2112606' in the header and 'National Science Foundation funded AI institute [Grant No. 2112606] for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE)' in Acknowledgements." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All authors' affiliations are listed: University of Wisconsin-Madison (Geospatial Data Science Lab, Geography) and Ohio State University (Computer Science and Engineering). The authors evaluate third-party models, not their own products." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": true, 228 "justification": "NSF is an independent US government funding agency with no financial interest in the relative performance of any evaluated LLM." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interest declaration is present in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "The paper provides model release dates (e.g., ChatGPT-4o-mini 2024-07-18) but does not state training data cutoff dates for any of the 6 models evaluated." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Section 8 acknowledges: 'Some tasks were adapted from publicly available tutorials, so we can't completely rule out the possibility that similar examples exist in LLM training data.' They argue performance differences suggest tasks aren't memorized, but this is a weak defense." 
246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": false, 250 "justification": "Tasks are derived from ESRI Learn platform and other public GIS tutorials that predate all models tested. The paper acknowledges contamination risk but uses only the weak argument that 'performance differences across models and spatial analysis categories suggest that the tasks remain sufficiently challenging.' No decontamination methods, canary strings, or temporal analysis are applied." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the evaluation. The paper evaluates LLM outputs against expert-designed benchmarks." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The study evaluates LLMs on a programmatic benchmark." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in the study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No API costs, tokens consumed, or wall-clock inference times are reported despite using both paid API services and local GPU inference across 6 models and 600 prompt configurations." 
295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Hardware is described (RTX 2080 Super GPU, Intel Core i7, Tesla P40) but no total compute budget (GPU hours, total API spend, or runtime) is provided." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "Three outputs per prompt are collected at temperature 0.7, but no analysis of variance across these runs is reported. Results appear to aggregate the 3 outputs without reporting sensitivity." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 4 states: 'Three outputs per prompt are collected for evaluation.'" 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "Temperature 0.7 was selected 'to balance consistency and response diversity' but no alternative settings were tried or reported. No search budget is stated." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "All 4 prompt configurations (WK, DK, DD, both) are reported in Table 2 rather than cherry-picking the best. The single temperature setting is applied uniformly across all models." 322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors created the benchmark and evaluate models on it. They do not discuss whether benchmark design choices might favor certain models, or how annotator subjectivity in the validity assessment could bias results." 
332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Proprietary models (unknown size, cloud-served) are compared against 7-8B parameter open-source models without any discussion of the massive compute disparity or whether performance differences are simply a function of model scale." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper uses CodeBLEU and text similarity as proxies for GIS code generation quality but does not discuss whether these metrics actually measure spatial analysis capability. The case studies show that similar workflows can produce very different geospatial outputs, undermining the metrics' construct validity." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding is used. Direct single-turn prompting is applied uniformly across all models." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Tasks are derived from ESRI Learn tutorials and academic publications accessed in 2024, all predating the evaluated models. The paper acknowledges contamination risk in the conclusion but does not perform temporal analysis of when source materials became publicly available relative to model training cutoffs." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "Not discussed. The prompts include task instructions, domain knowledge, and dataset descriptions, but there is no analysis of whether these inputs provide excessive information that would not be available in realistic use." 
359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "Several tasks share the same domain (e.g., tasks 12, 21, 22, 32, 45 all involve mountain lion corridors; tasks 6 and 43 are the same elk home range task in different frameworks). Non-independence between these related tasks is not discussed." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No concrete leakage detection or prevention methods are applied. No canary strings, membership inference tests, or n-gram overlap analysis between benchmark content and model training data." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Proprietary LLMs (ChatGPT-4o-mini, Claude 3.5 Sonnet, Gemini 1.5 Flash) consistently outperform open-source models in GIS workflow generation with valid rates over 93% and text similarity scores over 0.54.", 375 "evidence": "Table 2 shows proprietary models at 93.5-96% valid rates and 0.54-0.56 text similarity, while open-source models range from 32.7-95.3% validity and 0.08-0.39 similarity (Section 5.1).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "ChatGPT-4o-mini achieves the highest CodeBLEU score (0.390) among all evaluated models.", 380 "evidence": "Table 3 shows ChatGPT-4o-mini at 0.390 CodeBLEU, compared to Claude (0.370), Gemini (0.358), Llama (0.340), CodeLlama (0.319), DeepSeek (0.272) (Section 5.2).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Domain knowledge and dataset descriptions improve workflow generation accuracy across all proprietary LLMs.", 385 "evidence": "Table 2 shows MAD decreasing from WK to 'With Both' conditions for all proprietary models (e.g., Gemini: 2.30→1.33, ChatGPT: 1.81→1.49) (Section 5.1).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Tasks requiring spatial reasoning—finding best locations/paths (F) and determining how places are related (DR)—are the most challenging across all models.", 390 "evidence": 
"Figure 5 shows highest MAD values for categories F and DR across both proprietary and open-source models (Section 5.3).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "DeepSeek-R1-7B's poor performance (48.5% validity) is attributable to knowledge distillation compressing reasoning capability.", 395 "evidence": "Section 5.1 argues distillation 'transfers knowledge from a larger teacher model to a smaller student model' at the cost of 'decreased reasoning depth and generalization capacity,' but no controlled experiment isolates distillation as the cause versus model size or training data.", 396 "supported": "weak" 397 }, 398 { 399 "claim": "ArcPy-based code evaluation shows better CodeBLEU scores than open-source package-based evaluation.", 400 "evidence": "Figure 6 shows ArcPy vs. open-source package CodeBLEU scores, with ArcPy generally higher. The paper attributes this to ArcPy's standardized syntax (Section 5.4).", 401 "supported": "moderate" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "High contamination risk from public tutorials", 407 "detail": "Benchmark tasks are derived from ESRI Learn tutorials and university lab instructions that are freely available online and almost certainly present in LLM training data. The paper's only defense is that performance varies across categories, which does not rule out partial memorization." 408 }, 409 { 410 "flag": "No statistical significance testing", 411 "detail": "All comparative claims between models (e.g., proprietary vs. open-source) are based solely on comparing raw numbers without any statistical tests, making it impossible to determine whether differences are meaningful or within noise." 412 }, 413 { 414 "flag": "Unfair compute comparison", 415 "detail": "Large proprietary models of unknown size (ChatGPT-4o-mini, Claude 3.5 Sonnet, Gemini 1.5 Flash) are compared against 7-8B parameter open-source models. 
The conclusion that 'proprietary outperforms open-source' conflates model scale with accessibility." 416 }, 417 { 418 "flag": "Non-independent tasks inflate sample size", 419 "detail": "Multiple tasks share the same domain or are near-duplicates (e.g., 5 mountain lion corridor tasks numbered 12, 21, 22, 32, 45; two elk home range tasks 6 and 43). This non-independence is not acknowledged, so the nominal sample of 50 tasks overstates the number of independent evaluation points." 420 }, 421 { 422 "flag": "Construct validity gap between metrics and actual GIS output quality", 423 "detail": "The case studies demonstrate that LLM code producing similar CodeBLEU scores to reference code can yield substantially different and incorrect geospatial outputs (e.g., wrong KDE cell size, point-based vs. road-based analysis). This undermines the paper's primary evaluation metrics." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "ScienceAgentBench: Toward rigorous assessment of language agents for data-driven scientific discovery", 429 "authors": ["Ziru Chen", "Shijie Chen"], 430 "year": 2024, 431 "arxiv_id": "2410.05080", 432 "relevance": "Benchmark for evaluating language agents on scientific tasks including GIS; directly compared to GeoAnalystBench." 433 }, 434 { 435 "title": "Autonomous GIS: the next-generation AI-powered GIS", 436 "authors": ["Zhenlong Li", "Huan Ning"], 437 "year": 2023, 438 "relevance": "Pioneering work on LLM-powered autonomous GIS (LLM-Geo) that decomposes spatial analysis into structured workflows." 439 }, 440 { 441 "title": "GeoGPT: An assistant for understanding and processing geospatial tasks", 442 "authors": ["Yifan Zhang", "Chen Wei"], 443 "year": 2024, 444 "relevance": "Framework integrating LLMs with GIS tools for end-to-end geospatial workflow generation and execution." 445 }, 446 { 447 "title": "GeoBenchX: Benchmarking LLMs for Multistep geospatial Tasks", 448 "authors": ["V. Krechetova", "D. 
Kochedykov"], 449 "year": 2025, 450 "arxiv_id": "2503.18129", 451 "relevance": "Multi-step geospatial benchmark incorporating tool invocation and hallucination challenges, complementary to GeoAnalystBench." 452 }, 453 { 454 "title": "CodeBLEU: a method for automatic evaluation of code synthesis", 455 "authors": ["Shuo Ren"], 456 "year": 2020, 457 "relevance": "The code evaluation metric used as a primary measure in GeoAnalystBench for comparing LLM-generated code quality." 458 }, 459 { 460 "title": "Chain-of-thought prompting elicits reasoning in large language models", 461 "authors": ["Jason Wei"], 462 "year": 2022, 463 "relevance": "Foundation for the reasoning approaches used in GeoAnalystBench prompts and evaluation of LLM spatial reasoning." 464 }, 465 { 466 "title": "The Llama 3 herd of models", 467 "authors": ["Aaron Grattafiori"], 468 "year": 2024, 469 "arxiv_id": "2407.21783", 470 "relevance": "Describes Llama 3.1, one of the open-source models evaluated in the benchmark." 471 }, 472 { 473 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via Reinforcement Learning", 474 "authors": ["DeepSeek-AI"], 475 "year": 2025, 476 "arxiv_id": "2501.12948", 477 "relevance": "Describes DeepSeek-R1, the reasoning-focused model evaluated in the benchmark whose distilled 7B version performed poorly." 478 }, 479 { 480 "title": "Evaluation of code LLMs on geospatial code generation", 481 "authors": ["Piotr Gramacki", "Bruno Martins", "Piotr Szymański"], 482 "year": 2024, 483 "relevance": "Prior benchmark evaluating LLMs on geospatial Python code generation with 20 tasks and 77 examples, a direct predecessor to GeoAnalystBench." 484 }, 485 { 486 "title": "GIS Copilot: Towards an autonomous GIS agent for spatial analysis", 487 "authors": ["Temitope Akinboyewa", "Zhenlong Li"], 488 "year": 2025, 489 "relevance": "Autonomous GIS agent integrated with QGIS that generates and executes spatial analysis workflows." 
490 }, 491 { 492 "title": "GeoTool-GPT: A trainable method for facilitating large language models to master GIS tools", 493 "authors": ["Chen Wei", "Yifan Zhang"], 494 "year": 2024, 495 "relevance": "Fine-tuned LLM for operating GIS tools, demonstrating domain adaptation for geospatial applications." 496 }, 497 { 498 "title": "Evaluating large language models on geospatial tasks: a multiple geospatial task benchmarking study", 499 "authors": ["Lei Xu"], 500 "year": 2025, 501 "relevance": "Multi-task geospatial benchmark covering twelve spatial reasoning tasks, revealing variability in LLM performance across domains." 502 } 503 ], 504 "engagement_factors": { 505 "practical_relevance": { 506 "score": 2, 507 "justification": "GIS practitioners could use GeoAnalystBench to evaluate LLMs for their geospatial workflows, though the benchmark is domain-specific." 508 }, 509 "surprise_contrarian": { 510 "score": 1, 511 "justification": "Results largely confirm expectations that proprietary models outperform smaller open-source ones; the spatial reasoning difficulty finding is moderately novel." 512 }, 513 "fear_safety": { 514 "score": 0, 515 "justification": "No AI safety or security concerns are raised." 516 }, 517 "drama_conflict": { 518 "score": 0, 519 "justification": "No controversy or provocative framing." 520 }, 521 "demo_ability": { 522 "score": 2, 523 "justification": "GitHub repository with benchmark tasks is available for researchers to test their own models." 524 }, 525 "brand_recognition": { 526 "score": 1, 527 "justification": "Evaluates well-known models (ChatGPT, Claude, Gemini) but from an academic GIS research group, not a major AI lab." 528 } 529 } 530 }