scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24985B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluation of Code LLMs on Geospatial Code Generation",
      6     "authors": [
      7       "Piotr Gramacki",
      8       "Bruno Martins",
      9       "Piotr Szymański"
     10     ],
     11     "year": 2024,
     12     "venue": "GeoAI@SIGSPATIAL",
     13     "arxiv_id": "2410.04617",
     14     "doi": "10.1145/3687123.3698286"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims (benchmark construction, task categorization, model evaluation, public release) are backed by the paper's content in Sections 3–4.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "The paper makes observational comparisons across task types and models but does not make causal claims requiring special study design.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Claims like 'models have a weak understanding of the geospatial aspect' and 'An AI coding assistant which is unable to use popular tools is not very useful' go beyond what 77 samples and 7B/8B-only models can support.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No alternative explanations are considered for observed performance differences, such as 4-bit quantization effects, prompt sensitivity, or library version mismatches.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures functional correctness via test-case pass rates (accuracy, pass@1, pass_any@1) and explicitly claims these evaluate code generation capability, which is a direct rather than proxy measure.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated 'Limitations' paragraph appears in Section 5, distinct from the conclusion prose.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The limitations mention only computational constraints restricting model size and the need to expand task coverage; no specific threats such as quantization effects on validity, test-case adequacy, or coverage gaps are discussed.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The authors explicitly bound scope to 7B/8B models and state 'our work is just the first steps towards the construction of a comprehensive geospatial code generation benchmark.'",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding section or acknowledgment of funding sources appears anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated on the title page: Wrocław University of Science and Technology / Kraina.AI and INESC-ID / Instituto Superior Técnico.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The four benchmark dimensions (task complexity, input type, tools usage, task framing) are explicitly defined with enumerated values in Section 3.1.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states its contributions: a new geospatial code generation benchmark dataset and a comparative evaluation of seven code LLMs on it.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 explicitly positions the benchmark relative to HumanEval, DS-1000, APPS, and prior geospatial LLM work, explaining how this benchmark addresses gaps they identified.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "A public GitHub repository (https://github.com/kraina-ai/geospatial-code-llms-dataset) is linked in the abstract footnote with both dataset and evaluation code.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The 77-sample benchmark dataset is released on the same public GitHub repository.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions Python, transformers, and bitsandbytes but provides no requirements.txt, Dockerfile, or pinned dependency list; library versions used in evaluation are not specified.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Section 4.1 describes the evaluation pipeline in sufficient detail: code trimming procedure, virtual environment creation, library discovery and import, and hardware configuration.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No confidence intervals or error bars are reported for any result tables; only point estimates are given.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Comparative claims (e.g., StarCoder2 outperforms Gemma, single-step easier than multi-step) are made without any statistical significance tests.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Percentage pass@1 scores are reported alongside HumanEval scores as reference context, providing interpretable effect magnitudes (e.g., StarCoder2 32.47% vs. Gemma 9.09%).",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The dataset size of 77 samples (20 unique tasks) is explained procedurally via augmentation but not justified statistically or by power analysis.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Greedy decoding produces a single deterministic output per sample; no multiple runs are performed and no variance across runs is reported.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Seven models are compared against each other, and HumanEval scores from public leaderboards are included as reference baselines.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "All tested models are 2023–2024 releases (StarCoder2, CodeLlama, Llama-3, Mistral-7B, Gemma, CodeGemma), representing the contemporary 7B/8B tier.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": false,
    191           "answer": false,
    192           "justification": "The paper evaluates existing pretrained models without proposing a new system; ablation is not applicable.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Three metrics are used: accuracy (partial test-case pass rate), pass@1 (all tests pass), and pass_any@1 (at least one test passes).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Evaluation is entirely automated via functional test cases; human evaluation of model outputs is not used and not relevant given the code-correctness focus.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The entire 77-sample dataset serves as a held-out test set for pre-trained models that were not fine-tuned on it.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down across all four benchmark dimensions, plus a further breakdown by geometry format, in separate tables: complexity (Table 3), task framing (Table 4), input format (Table 5), tools (Table 6), and geometry format (Table 7).",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Specific failure modes are discussed: Gemma models generate repetitive hallucinated code, and some models generate placeholder stubs (Listing 4) for unfamiliar libraries like MovingPandas.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "OSMNX and MovingPandas yield 0% pass@1 for nearly all models; this is explicitly reported in Table 6 and discussed by the authors.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact HuggingFace model IDs are provided for all seven models (e.g., bigcode/starcoder2-7b, meta-llama/Meta-Llama-3-8B), which are specific version identifiers.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "The prompt format is shown in Figure 1 and Listings 1–3, including function signatures, type hints, and docstrings as actually used in evaluation.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Greedy decoding, max_length=200, and 4-bit quantization via bitsandbytes are all specified in Section 4.1.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used; models receive prompts directly and generate single completions.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The evaluation pipeline documents code trimming (searching for second 'def' occurrence), virtual environment creation, and automatic library discovery and import before test execution.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The benchmark dataset including all prompts and test cases is publicly available on the GitHub repository linked in the abstract.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3.3 describes the manual task creation process: starting from 20 unique tasks and augmenting via dimension variations to 77 samples, with examples in Listing 1.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants or external sample recruitment; all tasks were manually created by the paper's authors.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from manual task design through augmentation to test-case creation and automated evaluation is documented across Sections 3.2–3.4.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training data cutoffs are reported for any of the seven evaluated models, despite this being relevant for assessing whether benchmark content could have been in training data.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The authors claim prompts are 'human-written to ensure they were not present in any training data' but provide no formal verification or overlap analysis.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Geospatial library documentation and examples that form the basis of the tasks are publicly available and could have been in training corpora; this is not discussed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "Hardware used (GTX 1080 8GB and A100 80GB) is described but no inference latency, time-per-sample, or monetary cost figures are reported.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Hardware is described but no total compute budget (GPU-hours, wall-clock time, or cost) is stated for the experiments.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Code generation LLMs perform significantly worse on geospatial tasks than on generic programming tasks (HumanEval).",
    373       "evidence": "Table 2: CodeLlama-Python scores 40.48% on HumanEval but only 24.68% pass@1 on geospatial tasks; CodeGemma scores 40.13% on HumanEval but only 12.99% geospatial pass@1.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Multi-step geospatial tasks are substantially harder for all tested models than single-step tasks.",
    378       "evidence": "Table 3: StarCoder2 drops from 45.45% (simple) to 15.15% (complex) pass@1; the gap is consistent across all seven models.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Models fail almost completely on OSMNX and MovingPandas but handle Shapely reasonably well.",
    383       "evidence": "Table 6: Six of seven models score 0% on OSMNX; all score 0% on MovingPandas; all score 57–86% on Shapely.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "HumanEval performance rankings do not translate directly to geospatial task performance rankings.",
    388       "evidence": "StarCoder2 ranks 4th on HumanEval but 1st on geospatial; Gemma/CodeGemma rank high on HumanEval but near last on geospatial tasks.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Operation-framed tasks are generally easier for models than semantically framed tasks.",
    393       "evidence": "Table 4 shows most models score higher on operation framing, but two models (Mistral, CodeLlama) score higher on semantic framing, making the pattern inconsistent.",
    394       "supported": "weak"
    395     }
    396   ],
    397   "methodology_tags": [
    398     "benchmark-eval",
    399     "observational"
    400   ],
    401   "key_findings": "Seven 7B/8B code LLMs all perform poorly on a 77-sample geospatial benchmark (best model: StarCoder2 at 32.47% pass@1), substantially below their HumanEval scores. Tool knowledge is highly uneven: Shapely and H3 are handled moderately well, while OSMNX and MovingPandas yield near-zero success across all models. Multi-step tasks are consistently harder than single-step tasks. HumanEval rankings are a poor predictor of geospatial code generation performance, suggesting the domain requires specialized evaluation.",
    402   "red_flags": [
    403     {
    404       "flag": "Tiny benchmark",
    405       "detail": "Only 77 samples from 20 unique tasks; conclusions about model capabilities are drawn from very small per-category sample sizes (e.g., 3 OSMNX samples, 4 H3 samples)."
    406     },
    407     {
    408       "flag": "No significance testing",
    409       "detail": "All comparative claims across models and task categories are made without statistical tests; differences of a few percentage points are treated as meaningful."
    410     },
    411     {
    412       "flag": "Single greedy run",
    413       "detail": "Greedy decoding with no repeated runs means no variance estimation; results could differ substantially with sampling-based generation."
    414     },
    415     {
    416       "flag": "7B/8B models only",
    417       "detail": "Computational constraints restricted evaluation to quantized 7B/8B models; conclusions about 'code LLMs' cannot extend to larger frontier models (GPT-4, Claude, etc.)."
    418     },
    419     {
    420       "flag": "Contamination not formally addressed",
    421       "detail": "Authors claim prompts are human-written but provide no overlap analysis with training corpora; library documentation could appear in training data."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    427       "relevance": "Primary reference benchmark for code generation evaluation; used as comparison baseline throughout."
    428     },
    429     {
    430       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    431       "relevance": "Most closely related prior benchmark for domain-specific code generation; directly motivates the geospatial benchmark."
    432     },
    433     {
    434       "title": "Large Language Models Meet NL2Code: A Survey",
    435       "relevance": "Survey of code generation LLMs that frames the broader context for this evaluation."
    436     },
    437     {
    438       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    439       "relevance": "One of the evaluated models; best performer on the geospatial benchmark."
    440     },
    441     {
    442       "title": "Code Llama: Open Foundation Models for Code",
    443       "relevance": "Two variants evaluated; represents dedicated code models vs generic LLMs."
    444     },
    445     {
    446       "title": "GPT4GEO: How a Language Model Sees the World's Geography",
    447       "relevance": "Related work evaluating LLMs on geospatial knowledge tasks, situating this benchmark in the GeoAI evaluation space."
    448     },
    449     {
    450       "title": "GeoGPT: An assistant for understanding and processing geospatial tasks",
    451       "relevance": "Related work on LLM-based geospatial tool use, directly relevant to the tools-usage dimension of the benchmark."
    452     },
    453     {
    454       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    455       "relevance": "Directly cited for its approach of using larger LLMs to extend test cases, mentioned as future work direction."
    456     }
    457   ],
    458   "engagement_factors": {
    459     "practical_relevance": {
    460       "score": 2,
    461       "justification": "Directly useful to geospatial data scientists evaluating which 7B/8B models to use as coding assistants, and the public dataset enables future benchmarking."
    462     },
    463     "surprise_contrarian": {
    464       "score": 1,
    465       "justification": "The near-complete failure on OSMNX and MovingPandas despite moderate Shapely performance is a notable finding, but the general 'models are worse on specialized domains' result is expected."
    466     },
    467     "fear_safety": {
    468       "score": 0,
    469       "justification": "No AI safety or risk concerns raised."
    470     },
    471     "drama_conflict": {
    472       "score": 0,
    473       "justification": "No controversy or conflicting claims with established work."
    474     },
    475     "demo_ability": {
    476       "score": 2,
    477       "justification": "Public GitHub repo with dataset and evaluation code allows practitioners to test their own models immediately."
    478     },
    479     "brand_recognition": {
    480       "score": 0,
    481       "justification": "Academic paper from Polish and Portuguese universities; no famous lab or product association."
    482     }
    483   },
    484   "hn_data": {
    485     "threads": [
    486       {
    487         "hn_id": "24767717",
    488         "title": "DiffTune: Optimizing CPU Simulator Parameters with Differentiable Surrogates",
    489         "points": 5,
    490         "comments": 0,
    491         "url": "https://news.ycombinator.com/item?id=24767717",
    492         "created_at": "2020-10-13T17:29:40Z"
    493       },
    494       {
    495         "hn_id": "45533732",
    496         "title": "Agentic Context Engineering",
    497         "points": 4,
    498         "comments": 0,
    499         "url": "https://news.ycombinator.com/item?id=45533732",
    500         "created_at": "2025-10-09T22:30:41Z"
    501       },
    502       {
    503         "hn_id": "45522649",
    504         "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving LMs",
    505         "points": 4,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=45522649",
    508         "created_at": "2025-10-09T01:56:20Z"
    509       },
    510       {
    511         "hn_id": "42367885",
    512         "title": "Semantic Retrieval at Walmart",
    513         "points": 2,
    514         "comments": 1,
    515         "url": "https://news.ycombinator.com/item?id=42367885",
    516         "created_at": "2024-12-09T16:54:59Z"
    517       },
    518       {
    519         "hn_id": "45578786",
    520         "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving LLMs",
    521         "points": 2,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=45578786",
    524         "created_at": "2025-10-14T11:35:40Z"
    525       },
    526       {
    527         "hn_id": "45554565",
    528         "title": "Agentic Context Engineering: Evolving Contexts for SelfImproving Language Models",
    529         "points": 2,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=45554565",
    532         "created_at": "2025-10-12T02:15:40Z"
    533       },
    534       {
    535         "hn_id": "45516763",
    536         "title": "Agentic Context Engineering: Evolving Contexts for SelfImproving Language Models",
    537         "points": 2,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=45516763",
    540         "created_at": "2025-10-08T14:44:57Z"
    541       },
    542       {
    543         "hn_id": "34409379",
    544         "title": "Red-Teaming the Stable Diffusion Safety Filter",
    545         "points": 1,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=34409379",
    548         "created_at": "2023-01-17T05:12:51Z"
    549       }
    550     ],
    551     "top_points": 5,
    552     "total_points": 22,
    553     "total_comments": 1
    554   }
    555 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs