scan-v5.json (24985B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Evaluation of Code LLMs on Geospatial Code Generation", 6 "authors": [ 7 "Piotr Gramacki", 8 "Bruno Martins", 9 "Piotr Szymański" 10 ], 11 "year": 2024, 12 "venue": "GeoAI@SIGSPATIAL", 13 "arxiv_id": "2410.04617", 14 "doi": "10.1145/3687123.3698286" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All abstract claims (benchmark construction, task categorization, model evaluation, public release) are backed by the paper's content in Sections 3–4.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": false, 26 "answer": false, 27 "justification": "The paper makes observational comparisons across task types and models but does not make causal claims requiring special study design.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "Claims like 'models have a weak understanding of the geospatial aspect' and 'An AI coding assistant which is unable to use popular tools is not very useful' go beyond what 77 samples and 7B/8B-only models can support.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "No alternative explanations are considered for observed performance differences, such as 4-bit quantization effects, prompt sensitivity, or library version mismatches.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper measures functional correctness via test-case pass rates (accuracy, pass@1, pass_any@1) and explicitly claims these evaluate code generation capability, which is a direct rather than proxy measure.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "A dedicated 'Limitations' paragraph appears in Section 5, distinct from the conclusion prose.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "The limitations mention only computational constraints restricting model size and the need to expand task coverage; no specific threats such as quantization effects on validity, test-case adequacy, or coverage gaps are discussed.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "The authors explicitly bound scope to 7B/8B models and state 'our work is just the first steps towards the construction of a comprehensive geospatial code generation benchmark.'", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding section or acknowledgment of funding sources appears anywhere in the paper.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations are clearly stated on the title page: Wrocław University of Science and Technology / Kraina.AI and INESC-ID / Instituto Superior Técnico.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding is disclosed, so independence cannot be assessed.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial disclosure statement is present in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "The four benchmark dimensions (task complexity, input type, tools usage, task framing) are explicitly defined with enumerated values in Section 3.1.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper clearly states its contributions: a new geospatial code generation benchmark dataset and a comparative evaluation of seven code LLMs on it.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 explicitly positions the benchmark relative to HumanEval, DS-1000, APPS, and prior geospatial LLM work, explaining how this benchmark addresses gaps they identified.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "A public GitHub repository (https://github.com/kraina-ai/geospatial-code-llms-dataset) is linked in the abstract footnote with both dataset and evaluation code.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "The 77-sample benchmark dataset is released on the same public GitHub repository.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper mentions Python, transformers, and bitsandbytes but provides no requirements.txt, Dockerfile, or pinned dependency list; library versions used in evaluation are not specified.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": true, 140 "justification": "Section 4.1 describes the evaluation pipeline in sufficient detail: code trimming procedure, virtual environment creation, library discovery and import, and hardware configuration.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "No confidence intervals or error bars are reported for any result tables; only point estimates are given.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "Comparative claims (e.g., StarCoder2 outperforms Gemma, single-step easier than multi-step) are made without any statistical significance tests.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Percentage pass@1 scores with HumanEval as reference context are reported, providing interpretable effect magnitudes (e.g., StarCoder2 32.47% vs. Gemma 9.09%).", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The dataset size of 77 samples (20 unique tasks) is explained procedurally via augmentation but not justified statistically or by power analysis.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "Greedy decoding produces a single deterministic output per sample; no multiple runs are performed and no variance across runs is reported.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Seven models are compared against each other, and HumanEval scores from public leaderboards are included as reference baselines.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "All tested models are 2023–2024 releases (StarCoder2, CodeLlama, Llama-3, Mistral-7B, Gemma, CodeGemma), representing the contemporary 7B/8B tier.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": false, 191 "answer": false, 192 "justification": "The paper evaluates existing pretrained models without proposing a new system; ablation is not applicable.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Three metrics are used: accuracy (partial test-case pass rate), pass@1 (all tests pass), and pass_any@1 (at least one test passes).", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "Evaluation is entirely automated via functional test cases; human evaluation of model outputs is not used and not relevant given the code-correctness focus.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "The entire 77-sample dataset serves as a held-out test set for pre-trained models that were not fine-tuned on it.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results are broken down across all four benchmark dimensions in separate tables: complexity (Table 3), task framing (Table 4), input format (Table 5), tools (Table 6), and geometry format (Table 7).", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Specific failure modes are discussed: Gemma models generate repetitive hallucinated code, and some models generate placeholder stubs (Listing 4) for unfamiliar libraries like MovingPandas.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "OSMNX and MovingPandas yield 0% pass@1 for nearly all models, which is explicitly reported and discussed in Table 6.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": true, 236 "justification": "Exact HuggingFace model IDs are provided for all seven models (e.g., bigcode/starcoder2-7b, meta-llama/Meta-Llama-3-8B), which are specific version identifiers.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "The prompt format is shown in Figure 1 and Listings 1–3, including function signatures, type hints, and docstrings as actually used in evaluation.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": true, 248 "justification": "Greedy decoding, max_length=200, and 4-bit quantization via bitsandbytes are all specified in Section 4.1.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": false, 253 "answer": false, 254 "justification": "No agentic scaffolding is used; models receive prompts directly and generate single completions.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "The evaluation pipeline documents code trimming (searching for second 'def' occurrence), virtual environment creation, and automatic library discovery and import before test execution.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "The benchmark dataset including all prompts and test cases is publicly available on the GitHub repository linked in the abstract.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 3.3 describes the manual task creation process: starting from 20 unique tasks and augmenting via dimension variations to 77 samples, with examples in Listing 1.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants or external sample recruitment; all tasks were manually created by the paper's authors.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The full pipeline from manual task design through augmentation to test-case creation and automated evaluation is documented across Sections 3.2–3.4.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No training data cutoffs are reported for any of the seven evaluated models, despite this being relevant for assessing whether benchmark content could have been in training data.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "The authors claim prompts are 'human-written to ensure they were not present in any training data' but provide no formal verification or overlap analysis.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "Geospatial library documentation and examples that form the basis of the tasks are publicly available and could have been in training corpora; this is not discussed.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants in this study.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants in this study.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants in this study.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants in this study.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants in this study.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants in this study.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants in this study.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "Hardware used (GTX 1080 8GB and A100 80GB) is described but no inference latency, time-per-sample, or monetary cost figures are reported.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "Hardware is described but no total compute budget (GPU-hours, wall-clock time, or cost) is stated for the experiments.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Code generation LLMs perform significantly worse on geospatial tasks than on generic programming tasks (HumanEval).", 373 "evidence": "Table 2: CodeLlama-Python scores 40.48% on HumanEval but only 24.68% pass@1 on geospatial tasks; CodeGemma scores 40.13% on HumanEval but only 12.99% geospatial pass@1.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "Multi-step geospatial tasks are substantially harder for all tested models than single-step tasks.", 378 "evidence": "Table 3: StarCoder2 drops from 45.45% (simple) to 15.15% (complex) pass@1; the gap is consistent across all seven models.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Models fail almost completely on OSMNX and MovingPandas but handle Shapely reasonably well.", 383 "evidence": "Table 6: Six of seven models score 0% on OSMNX; all score 0% on MovingPandas; all score 57–86% on Shapely.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "HumanEval performance rankings do not translate directly to geospatial task performance rankings.", 388 "evidence": "StarCoder2 ranks 4th on HumanEval but 1st on geospatial; Gemma/CodeGemma rank high on HumanEval but near last on geospatial tasks.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Operation-framed tasks are generally easier for models than semantically framed tasks.", 393 "evidence": "Table 4 shows most models score higher on operation framing, but two models (Mistral, CodeLlama) score higher on semantic framing, making the pattern inconsistent.", 394 "supported": "weak" 395 } 396 ], 397 "methodology_tags": [ 398 "benchmark-eval", 399 "observational" 400 ], 401 "key_findings": "Seven 7B/8B code LLMs all perform poorly on a 77-sample geospatial benchmark (best model: StarCoder2 at 32.47% pass@1), substantially below their HumanEval scores. Tool knowledge is highly uneven: Shapely and H3 are handled moderately well, while OSMNX and MovingPandas yield near-zero success across all models. Multi-step tasks are consistently harder than single-step tasks. HumanEval rankings are a poor predictor of geospatial code generation performance, suggesting the domain requires specialized evaluation.", 402 "red_flags": [ 403 { 404 "flag": "Tiny benchmark", 405 "detail": "Only 77 samples from 20 unique tasks; conclusions about model capabilities are drawn from very small per-category sample sizes (e.g., 3 OSMNX samples, 4 H3 samples)." 406 }, 407 { 408 "flag": "No significance testing", 409 "detail": "All comparative claims across models and task categories are made without statistical tests; differences of a few percentage points are treated as meaningful." 410 }, 411 { 412 "flag": "Single greedy run", 413 "detail": "Greedy decoding with no repeated runs means no variance estimation; results could differ substantially with sampling-based generation." 414 }, 415 { 416 "flag": "7B/8B models only", 417 "detail": "Computational constraints restricted evaluation to quantized 7B/8B models; conclusions about 'code LLMs' cannot extend to larger frontier models (GPT-4, Claude, etc.)." 418 }, 419 { 420 "flag": "Contamination not formally addressed", 421 "detail": "Authors claim prompts are human-written but provide no overlap analysis with training corpora; library documentation could appear in training data." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 427 "relevance": "Primary reference benchmark for code generation evaluation; used as comparison baseline throughout." 428 }, 429 { 430 "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation", 431 "relevance": "Most closely related prior benchmark for domain-specific code generation; directly motivates the geospatial benchmark." 432 }, 433 { 434 "title": "Large Language Models Meet NL2Code: A Survey", 435 "relevance": "Survey of code generation LLMs that frames the broader context for this evaluation." 436 }, 437 { 438 "title": "StarCoder 2 and The Stack v2: The Next Generation", 439 "relevance": "One of the evaluated models; best performer on the geospatial benchmark." 440 }, 441 { 442 "title": "Code Llama: Open Foundation Models for Code", 443 "relevance": "Two variants evaluated; represents dedicated code models vs generic LLMs." 444 }, 445 { 446 "title": "GPT4GEO: How a Language Model Sees the World's Geography", 447 "relevance": "Related work evaluating LLMs on geospatial knowledge tasks, situating this benchmark in the GeoAI evaluation space." 448 }, 449 { 450 "title": "GeoGPT: An assistant for understanding and processing geospatial tasks", 451 "relevance": "Related work on LLM-based geospatial tool use, directly relevant to the tools-usage dimension of the benchmark." 452 }, 453 { 454 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 455 "relevance": "Directly cited for its approach of using larger LLMs to extend test cases, mentioned as future work direction." 456 } 457 ], 458 "engagement_factors": { 459 "practical_relevance": { 460 "score": 2, 461 "justification": "Directly useful to geospatial data scientists evaluating which 7B/8B models to use as coding assistants, and the public dataset enables future benchmarking." 462 }, 463 "surprise_contrarian": { 464 "score": 1, 465 "justification": "The near-complete failure on OSMNX and MovingPandas despite moderate Shapely performance is a notable finding, but the general 'models are worse on specialized domains' result is expected." 466 }, 467 "fear_safety": { 468 "score": 0, 469 "justification": "No AI safety or risk concerns raised." 470 }, 471 "drama_conflict": { 472 "score": 0, 473 "justification": "No controversy or conflicting claims with established work." 474 }, 475 "demo_ability": { 476 "score": 2, 477 "justification": "Public GitHub repo with dataset and evaluation code allows practitioners to test their own models immediately." 478 }, 479 "brand_recognition": { 480 "score": 0, 481 "justification": "Academic paper from Polish and Portuguese universities; no famous lab or product association." 482 } 483 }, 484 "hn_data": { 485 "threads": [ 486 { 487 "hn_id": "24767717", 488 "title": "DiffTune: Optimizing CPU Simulator Parameters with Differentiable Surrogates", 489 "points": 5, 490 "comments": 0, 491 "url": "https://news.ycombinator.com/item?id=24767717", 492 "created_at": "2020-10-13T17:29:40Z" 493 }, 494 { 495 "hn_id": "45533732", 496 "title": "Agentic Context Engineering", 497 "points": 4, 498 "comments": 0, 499 "url": "https://news.ycombinator.com/item?id=45533732", 500 "created_at": "2025-10-09T22:30:41Z" 501 }, 502 { 503 "hn_id": "45522649", 504 "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving LMs", 505 "points": 4, 506 "comments": 0, 507 "url": "https://news.ycombinator.com/item?id=45522649", 508 "created_at": "2025-10-09T01:56:20Z" 509 }, 510 { 511 "hn_id": "42367885", 512 "title": "Semantic Retrieval at Walmart", 513 "points": 2, 514 "comments": 1, 515 "url": "https://news.ycombinator.com/item?id=42367885", 516 "created_at": "2024-12-09T16:54:59Z" 517 }, 518 { 519 "hn_id": "45578786", 520 "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving LLMs", 521 "points": 2, 522 "comments": 0, 523 "url": "https://news.ycombinator.com/item?id=45578786", 524 "created_at": "2025-10-14T11:35:40Z" 525 }, 526 { 527 "hn_id": "45554565", 528 "title": "Agentic Context Engineering: Evolving Contexts for SelfImproving Language Models", 529 "points": 2, 530 "comments": 0, 531 "url": "https://news.ycombinator.com/item?id=45554565", 532 "created_at": "2025-10-12T02:15:40Z" 533 }, 534 { 535 "hn_id": "45516763", 536 "title": "Agentic Context Engineering: Evolving Contexts for SelfImproving Language Models", 537 "points": 2, 538 "comments": 0, 539 "url": "https://news.ycombinator.com/item?id=45516763", 540 "created_at": "2025-10-08T14:44:57Z" 541 }, 542 { 543 "hn_id": "34409379", 544 "title": "Red-Teaming the Stable Diffusion Safety Filter", 545 "points": 1, 546 "comments": 0, 547 "url": "https://news.ycombinator.com/item?id=34409379", 548 "created_at": "2023-01-17T05:12:51Z" 549 } 550 ], 551 "top_points": 5, 552 "total_points": 22, 553 "total_comments": 1 554 } 555 }