scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32967B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluation of Code LLMs on Geospatial Code Generation",
      6     "authors": [
      7       "Piotr Gramacki",
      8       "Bruno Martins",
      9       "Piotr Szymański"
     10     ],
     11     "year": 2024,
     12     "venue": "GeoAI@SIGSPATIAL",
     13     "arxiv_id": "2410.04617",
     14     "doi": "10.1145/3687123.3698286"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims are modest and supported: they propose a benchmark (described in Section 3), test existing models (Section 4), and share code/data (GitHub link). No overclaiming is evident.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes implicit causal claims about why models fail (e.g., 'models that are worse at geospatial code generation find it especially difficult to solve multi-step tasks,' suggesting task complexity causes performance drops) without controlling for confounds like task-specific difficulty or sample size per category.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper explicitly bounds its scope: 'Due to computational constraints we limited ourselves to 7B/8B scale LLMs, which we also aim to extend in the future' (Section 5). The title and abstract reference 'a selection of existing code generation LLMs' rather than making broad claims.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No alternative explanations are discussed for the observed results. For example, the poor performance on OSMNX/MovingPandas could be due to library documentation quality, training data volume, or task difficulty rather than just model capability, but this is not explored.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures functional correctness via test cases and frames its claims in terms of code correctness and task-solving capability. The claims match the measurement granularity — they don't overclaim 'productivity' or 'developer effectiveness' from pass@1 scores.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 5 contains a dedicated 'Limitations' subsection discussing the benchmark's current scope and the computational constraints on model selection.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper identifies specific limitations: 'the current version of the benchmark should be expanded to cover more typical tasks and tools,' and 'Due to computational constraints we limited ourselves to 7B/8B scale LLMs' (Section 5). These are specific to this study.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states what is not covered: no polygon inputs, no larger models, no fine-tuning experiments, no incorrect input handling, no code infilling tasks, limited to 4 tool libraries and 5 input formats. Stated in Sections 3 and 5.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All author affiliations are clearly stated: Wrocław University of Science and Technology (Gramacki, Szymański) and INESC-ID/Instituto Superior Técnico, University of Lisbon (Martins). Kraina.AI affiliation is also listed.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding information is disclosed. Without knowing the funding source, independence cannot be verified.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is included in the paper. The Kraina.AI affiliation is listed but no conflict-of-interest declaration is made.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms including task complexity dimensions, evaluation metrics (accuracy, pass@1, pass_any@1), and task framing types (operation vs. semantic) are all explicitly defined in Sections 3 and 4.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states its contributions: a multi-dimensional geospatial code generation benchmark and a comparative evaluation of seven contemporary code LLMs on this benchmark.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 reviews code generation benchmarks (HumanEval, DS-1000, APPS) and LLMs applied to geospatial tasks, explicitly positioning the new benchmark relative to existing work and gaps it addresses.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "GitHub repository provided: https://github.com/kraina-ai/geospatial-code-llms-dataset (footnote 1). The abstract states 'We share our dataset and reproducible evaluation code on a public GitHub repository.'",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The benchmark dataset of 77 samples is shared on the same GitHub repository. The abstract confirms the dataset is publicly released.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions using the transformers library, bitsandbytes for 4-bit quantization, and specific GPU hardware, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The paper describes the evaluation pipeline methodology (Section 4.1) including code trimming, import handling, and testing procedure, but does not provide step-by-step commands or a reproduction guide. The GitHub repo is referenced but the paper itself lacks concrete instructions.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables 2-7 are reported as single point estimates (e.g., '32.47%') with no confidence intervals, error bars, or uncertainty measures.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper makes comparative claims (e.g., 'best results have been achieved by starcoder2-7b', 'models work better with geodataframes') based solely on comparing raw percentages without any statistical significance tests.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "Results are reported as raw percentages in tables. While HumanEval scores are shown alongside benchmark scores for context, no formal effect sizes (Cohen's d, relative improvement, etc.) are computed or discussed.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The benchmark has 77 samples from 20 unique tasks. No justification is given for why this sample size is sufficient, and no power analysis is discussed.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "All experiments use greedy decoding producing a single deterministic output per sample. No variance, standard deviation, or spread measures are reported across any experimental condition.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Seven models are compared against each other, and HumanEval scores from external leaderboards are included as reference baselines (Table 2).",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Models include StarCoder2, CodeLlama, Llama 3, Mistral 7B, Gemma, and CodeGemma — all released in 2023-2024 and representing contemporary open-source code LLMs at the 7B/8B scale.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": false,
    191           "answer": false,
    192           "justification": "The contribution is a benchmark dataset, not a system with components to ablate. The multi-dimensional analysis (complexity, framing, input format, tools) serves a similar diagnostic purpose but is not an ablation study.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Three metrics are used: Accuracy (percentage of passed test cases), Pass@1 (percentage of fully correct solutions), and Pass_any@1 (percentage of solutions passing at least one test case). Described in Section 4.1.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Evaluation is entirely automated via test case pass/fail. No human evaluation of code quality, readability, or correctness is performed. The paper explicitly chose automated testing over reference-solution comparison (Section 3.4).",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The entire benchmark was hand-crafted specifically for this evaluation: 'The prompts are human-written to ensure that they were not present in any training data for existing models' (Section 3.2). No model tuning was done on this data.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Extensive breakdowns are provided in five tables: task complexity (Table 3), task framing (Table 4), input format (Table 5), tool usage (Table 6), and point format (Table 7).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "The paper discusses gemma/codegemma models hallucinating and generating repetitive code blocks, models generating placeholder functions with NotImplementedError for unknown libraries (Listing 4), and near-complete failure on OSMNX and MovingPandas.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Multiple negative results reported: all models fail on OSMNX and MovingPandas (Table 6), gemma models perform very poorly overall, models struggle with GeoJSON and Shapefile formats compared to GeoDataFrames, and HumanEval performance does not predict geospatial performance.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact HuggingFace model identifiers are provided for all 7 models (e.g., 'bigcode/starcoder2-7b', 'meta-llama/CodeLlama-7b-hf', 'meta-llama/Meta-Llama-3-8B'). These are specific, versioned model checkpoints.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "The exact prompt format is provided in Figure 1 and Listings 1-4, including function signatures, type hints, and docstrings. The format 'remains unchanged between tested models' (Section 3.2). All 77 prompts follow this template and are available in the GitHub repository.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Key hyperparameters are stated: greedy decoding (temperature=0), max_length=200, 4-bit quantization via bitsandbytes. Reported in Section 4.1 under 'Hyperparameters.'",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. Models generate code completions directly from function signatures via single-pass inference.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 4.1 describes the evaluation pipeline: trimming responses to a single function by searching for the second 'def' string, searching generated code for library imports, installing them in a virtual environment, and importing before testing.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The complete benchmark dataset (77 samples with prompts and test cases) is available on the public GitHub repository. Generated outputs could be verified against this data.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3.3 describes the sample creation process: starting with task definition on the complexity dimension, then augmenting via input format and framing changes to produce variants. The categorization scheme is detailed in Section 3.1.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. The data is a manually created benchmark dataset, and the evaluated systems are publicly available LLMs.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline from task conception to final dataset is documented: define complexity dimension → select tools/input types → write task → augment across dimensions → create test cases (Section 3). The evaluation pipeline is documented in Section 4.1.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training data cutoff dates are not stated for any of the 7 evaluated models. The paper relies on hand-crafted tasks as contamination mitigation but does not state when each model's training data ends.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": true,
    300           "justification": "The paper states: 'The prompts are human-written to ensure that they were not present in any training data for existing models' (Section 3.2). This directly addresses train/test overlap through benchmark design.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
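    306           "justification": "Contamination is addressed by design: the benchmark was manually created with human-written prompts 'to ensure that they were not present in any training data for existing models' (Section 3.2). The paper does not state the models' training cutoffs, so the benchmark's timing relative to training is not independently established.",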
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study. The evaluation is entirely automated benchmark testing of LLMs.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. The study evaluates LLMs on a code generation benchmark.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or tokens consumed are reported. The paper mentions hardware used but not per-sample or total inference costs.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Hardware is mentioned (GTX 1080 8GB, A100 80GB) but no total GPU hours, wall-clock time, or compute budget is quantified. Section 4.1 describes the machines but not how long experiments took.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Greedy decoding produces deterministic outputs, so there is no seed sensitivity analysis. The paper does not explore whether results change with non-greedy sampling strategies.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "The paper states they use greedy decoding to produce 'a single output' per sample (Section 4.1 under 'Hyperparameters'), making the number of runs (1) explicit.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search is reported. The max_length=200 was 'verified to be enough' but no search budget or alternative configurations are discussed.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": true,
    390           "justification": "A uniform configuration is used across all models (greedy decoding, max_length=200, 4-bit quantization), eliminating configuration cherry-picking. The max_length was verified to be sufficient for all models and tasks.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": false,
    395           "answer": false,
    396           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors created the benchmark and evaluate models on it. No discussion of whether benchmark design choices might favor or disadvantage particular models, or acknowledgment of self-comparison bias.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": false,
    407           "answer": false,
    408           "justification": "All models are 7B/8B parameters with the same 4-bit quantization, making compute differences negligible across the comparison.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper does not discuss whether 77 samples across 20 tasks adequately measure geospatial code generation capability, nor whether the four-dimensional categorization captures the full range of geospatial coding challenges.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved. Models perform single-pass code completion from function signatures.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": true,
    428           "justification": "The benchmark tasks were newly written for this evaluation: 'The prompts are human-written to ensure that they were not present in any training data for existing models' (Section 3.2). This is a temporal leakage prevention strategy, although the paper gives no model training cutoff dates against which to confirm it.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the prompt format, function signatures, or docstring style could leak information about expected solutions. The evaluation pipeline also adds all needed imports (Section 4.1), which could make the task easier than real-world usage.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "The 77 samples are augmented from 20 unique tasks (Section 3.3), meaning many samples share the same underlying logic. This non-independence is not discussed as a potential source of inflated evaluation.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No formal leakage detection method (canary strings, membership inference, n-gram overlap analysis) is employed. The paper relies solely on manual creation as a prevention measure.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "LLM performance on geospatial coding tasks does not correlate with HumanEval performance—gemma/codegemma score high on HumanEval but low on geospatial tasks, while StarCoder2 outperforms despite a lower HumanEval ranking.",
    455       "evidence": "Table 2: CodeGemma-7b has 40.13% HumanEval but only 12.99% geospatial pass@1; StarCoder2-7b has 34.09% HumanEval but best geospatial pass@1 at 32.47%.",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Multi-step geospatial tasks are significantly more challenging than single-step tasks for all evaluated models.",
    460       "evidence": "Table 3: pass@1 drops dramatically for every model from single to complex tasks—StarCoder2: 45.45% → 15.15%; CodeGemma: 20.45% → 3.03%.",
    461       "supported": "moderate"
    462     },
    463     {
    464       "claim": "Current code LLMs largely fail on specialized geospatial libraries (OSMNX, MovingPandas) while handling basic geometry (Shapely) reasonably well.",
    465       "evidence": "Table 6: OSMNX achieves at most 16.67% and MovingPandas 0% across all models, while Shapely achieves 57-86% pass@1.",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Task framing (operation vs. semantic description) has inconsistent effects on model performance with no clear uniform advantage.",
    470       "evidence": "Table 4: StarCoder2 scores 36.36% operation vs. 24.24% semantic, but CodeLlama-7b-hf reverses (18.18% vs. 24.24%); gemma shows no difference.",
    471       "supported": "weak"
    472     },
    473     {
    474       "claim": "Models perform better with Shapely point objects than with raw lat/lon coordinate pairs.",
    475       "evidence": "Table 7: StarCoder2 scores 33.33% (lat/lon) vs. 66.67% (shapely); CodeLlama-7b-hf scores 4.76% vs. 33.33%.",
    476       "supported": "moderate"
    477     }
    478   ],
    479   "methodology_tags": [
    480     "benchmark-eval"
    481   ],
    482   "key_findings": "StarCoder2-7b achieves the best geospatial code generation performance (32.47% pass@1) despite not topping generic HumanEval rankings, while gemma-class models underperform sharply relative to their HumanEval scores—demonstrating that domain-specific benchmarks are necessary and generic coding benchmarks are poor predictors of geospatial capability. All models fail almost completely on specialized libraries (OSMNX ≤17%, MovingPandas 0%) while achieving reasonable performance on basic Shapely geometry (57-86%). Multi-step tasks show dramatically lower performance than single-step tasks across all models, and most models handle GeoPandas GeoDataFrame inputs better than Shapefile or GeoJSON formats.",
    483   "red_flags": [
    484     {
    485       "flag": "Tiny benchmark",
    486       "detail": "Only 77 samples from 20 unique tasks—far too small for reliable statistical conclusions about model capabilities across multiple sub-dimensions simultaneously, especially when per-cell counts drop to 2-4 samples."
    487     },
    488     {
    489       "flag": "No statistical testing",
    490       "detail": "All comparative claims (including the word 'significant') are made without significance tests or confidence intervals despite comparing percentage differences on very small sample sizes."
    491     },
    492     {
    493       "flag": "Training cutoffs unstated",
    494       "detail": "No training data cutoffs are stated for any of the seven models, making it impossible to verify the claim that human-written prompts were not present in training data."
    495     },
    496     {
    497       "flag": "No environment specification",
    498       "detail": "Library versions are not pinned—geospatial libraries (geopandas, shapely, movingpandas) evolve rapidly and results may not reproduce with current versions."
    499     },
    500     {
    501       "flag": "No funding or COI disclosure",
    502       "detail": "No funding sources or competing interests are declared, even though the authors list a Kraina.AI affiliation alongside their university affiliations."
    503     }
    504   ],
    505   "cited_papers": [
    506     {
    507       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    508       "relevance": "Primary reference benchmark; geospatial results are compared against HumanEval pass@1 scores throughout the paper."
    509     },
    510     {
    511       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    512       "relevance": "Most directly related prior benchmark covering general data science Python tasks; inspired the geospatial benchmark's design principles."
    513     },
    514     {
    515       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    516       "relevance": "Cited for using larger LLMs to generate additional test cases—relevant to future benchmark construction methodology."
    517     },
    518     {
    519       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    520       "relevance": "Best-performing evaluated model; the paper's main finding centers on StarCoder2 outperforming higher-HumanEval models."
    521     },
    522     {
    523       "title": "Code Llama: Open Foundation Models for Code",
    524       "relevance": "Two CodeLlama variants are evaluated; results show Python-specific fine-tuning provides mixed geospatial benefit."
    525     },
    526     {
    527       "title": "Autonomous GIS: the next-generation AI-powered GIS",
    528       "relevance": "Key motivation paper: LLMs generating code to solve geospatial tasks is the primary use case the benchmark targets."
    529     },
    530     {
    531       "title": "GPT4GEO: How a Language Model Sees the World's Geography",
    532       "relevance": "Directly related evaluation of LLM spatial knowledge; establishes that LLMs have geospatial knowledge gaps the new benchmark can measure."
    533     },
    534     {
    535       "title": "Large Language Models Meet NL2Code: A Survey",
    536       "relevance": "Survey of code generation with LLMs; provides broader context and methodology background for the evaluation."
    537     }
    538   ],
    539   "engagement_factors": {
    540     "practical_relevance": {
    541       "score": 2,
    542       "justification": "Practitioners building geospatial coding assistants can directly use benchmark results to select appropriate base models and identify which library integrations need improvement."
    543     },
    544     "surprise_contrarian": {
    545       "score": 2,
    546       "justification": "The reversal of HumanEval rankings in the geospatial domain—Gemma underperforms while StarCoder2 leads—challenges the assumption that generic coding benchmarks predict domain-specific performance."
    547     },
    548     "fear_safety": {
    549       "score": 0,
    550       "justification": "No AI safety or risk concerns raised."
    551     },
    552     "drama_conflict": {
    553       "score": 0,
    554       "justification": "No controversy or conflict angle."
    555     },
    556     "demo_ability": {
    557       "score": 2,
    558       "justification": "The benchmark is publicly available on GitHub and experiments can be reproduced on consumer-grade hardware (GTX 1080 sufficient per the paper)."
    559     },
    560     "brand_recognition": {
    561       "score": 1,
    562       "justification": "Evaluates well-known models (Llama, Mistral, Gemma) but the authoring group is a small academic/startup team without strong brand recognition."
    563     }
    564   },
    565   "hn_data": {
    566     "threads": [
    567       {
    568         "hn_id": "24767717",
    569         "title": "DiffTune: Optimizing CPU Simulator Parameters with Differentiable Surrogates",
    570         "points": 5,
    571         "comments": 0,
    572         "url": "https://news.ycombinator.com/item?id=24767717",
    573         "created_at": "2020-10-13T17:29:40Z"
    574       },
    575       {
    576         "hn_id": "45533732",
    577         "title": "Agentic Context Engineering",
    578         "points": 4,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=45533732",
    581         "created_at": "2025-10-09T22:30:41Z"
    582       },
    583       {
    584         "hn_id": "45522649",
    585         "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving LMs",
    586         "points": 4,
    587         "comments": 0,
    588         "url": "https://news.ycombinator.com/item?id=45522649",
    589         "created_at": "2025-10-09T01:56:20Z"
    590       },
    591       {
    592         "hn_id": "42367885",
    593         "title": "Semantic Retrieval at Walmart",
    594         "points": 2,
    595         "comments": 1,
    596         "url": "https://news.ycombinator.com/item?id=42367885",
    597         "created_at": "2024-12-09T16:54:59Z"
    598       },
    599       {
    600         "hn_id": "45578786",
    601         "title": "Agentic Context Engineering: Evolving Contexts for Self-Improving LLMs",
    602         "points": 2,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=45578786",
    605         "created_at": "2025-10-14T11:35:40Z"
    606       },
    607       {
    608         "hn_id": "45554565",
    609         "title": "Agentic Context Engineering: Evolving Contexts for SelfImproving Language Models",
    610         "points": 2,
    611         "comments": 0,
    612         "url": "https://news.ycombinator.com/item?id=45554565",
    613         "created_at": "2025-10-12T02:15:40Z"
    614       },
    615       {
    616         "hn_id": "45516763",
    617         "title": "Agentic Context Engineering: Evolving Contexts for SelfImproving Language Models",
    618         "points": 2,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=45516763",
    621         "created_at": "2025-10-08T14:44:57Z"
    622       },
    623       {
    624         "hn_id": "34409379",
    625         "title": "Red-Teaming the Stable Diffusion Safety Filter",
    626         "points": 1,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=34409379",
    629         "created_at": "2023-01-17T05:12:51Z"
    630       }
    631     ],
    632     "top_points": 5,
    633     "total_points": 22,
    634     "total_comments": 1
    635   }
    636 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs