scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24879B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GeoCode-GPT: A Large Language Model for Geospatial Code Generation Tasks",
      6     "authors": [
      7       "Shuyang Hou",
      8       "Zhangxiao Shen",
      9       "Anqi Zhao",
     10       "Jianyuan Liang",
     11       "Zhipeng Gui",
     12       "Xuefeng Guan",
     13       "Rui Li",
     14       "Huayi Wu"
     15     ],
     16     "year": 2024,
     17     "venue": "International Journal of Applied Earth Observation and Geoinformation",
     18     "arxiv_id": "2410.17031",
     19     "doi": "10.1016/j.jag.2025.104456"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The abstract claims GeoCode-GPT 'outperforms other models' in code generation by 1.2%–25.1%, but Table 7 shows GPT-4 (0.710) and GPT-3.5 (0.644) both outperform GeoCode-GPT-7B (0.636) in overall code generation; the body text also omits GPT-3.5's superior performance while noting only the GPT-4 gap.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper claims QLoRA pretraining plus LoRA fine-tuning jointly improve performance, but no ablation separates the contributions of each stage; the comparison is only base model vs. fully fine-tuned model, preventing attribution of gains.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Evaluation is entirely on GeoCode-Eval, a benchmark built by the same authors from overlapping source material, yet the paper asserts that GeoCode-GPT 'advances the application and development of LLMs in geospatial code generation' without bounding claims to this specific evaluation.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper does not consider alternative explanations for observed gains, such as train-test overlap (GeoCode-Eval was derived from similar sources as GeoCode-PT/SFT) or the possibility that any domain fine-tuning would produce similar gains.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Multiple-choice accuracy and LLM-judged summarization scores are treated as direct evidence for claims about geospatial code generation 'capability' and 'productivity' without acknowledging the gap between these proxy measures and real-world utility.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 6.1 is explicitly titled 'Limitations' and discusses the gap with GPT-4, instruction data scale, and executability room for improvement.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Section 6.1 reads as a future-work list rather than a validity analysis; it does not address specific threats such as train-test contamination, evaluator bias (GPT-4 used both as baseline and judge), or expert evaluator subjectivity.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No explicit scope boundaries are stated about what the results do NOT show; the paper does not clarify that findings are limited to the specific platforms, benchmark format, or evaluation design used.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding acknowledgement or grant information appears anywhere in the paper.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All authors list affiliations with Wuhan University on the title page.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial disclosure statement is included in the paper.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Geospatial code, 'refusal to code,' and 'coding hallucination' are defined with examples in the introduction; the NL2Code task and corpus types are also explained.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Four numbered contributions are explicitly listed: the model, open-sourced corpora, the QLoRA+LoRA training strategy, and the evaluation framework.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 2 covers domain LLM specialization, instruction data generation, fine-tuning strategies, and code evaluation approaches, explicitly positioning GeoCode-GPT against Code Llama, WizardCoder, Self-Instruct, and ICE-Score.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "A GitHub URL (https://github.com/whuhsy/GeoCode-GPT) is provided and the paper states the corpora and model are open-sourced.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "GeoCode-PT, GeoCode-SFT, and GeoCode-Eval are stated to be released via the same GitHub repository.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Hardware (2× NVIDIA A100 40GB) and quantization precision (int4) are mentioned, but no requirements.txt, Dockerfile, or software dependency list is provided.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "Hyperparameter tables are provided, but no step-by-step instructions for reproducing training or evaluation runs are included in the paper.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "All tables (5, 6, 7) report only point estimates with no confidence intervals or error bars on any metric.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical significance tests are applied to any comparative claims across the paper.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Tables include delta columns showing absolute score differences between GeoCode-GPT-7B and each baseline, providing effect size context.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The evaluation uses 3,000 MC questions, 500 summarization, and 500 generation tasks; no power analysis or justification for these numbers is provided.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No standard deviation, variance, or run-to-run variability is reported for any metric in any table.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Nine baselines are included: GPT-4, GPT-3.5, ERNIE 4.0, LLaMA 2-7B, LLaMA 3-8B, CodeGemma-7B, StarCoder 2-7B, CodeGeeX 2-6B, and Code Llama-7B (and 13B for reference).",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include 2024-era models (LLaMA 3-8B, CodeGemma-7B, StarCoder 2-7B) and current commercial models (GPT-4, GPT-3.5), which are competitive at the time of writing.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "No ablation is conducted to separate the contributions of QLoRA pretraining vs. LoRA fine-tuning, or the individual data components (GeoCode-PT, GeoCode-SFT, Alpaca).",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Multiple-choice accuracy, completeness, accuracy, readability, and executability metrics are used across the three evaluation task types.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Experts run generated code to measure executability and perform blind ranking of readability across models in Section 5.2.2.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "GeoCode-Eval is formally separate from GeoCode-PT and GeoCode-SFT, though drawn from overlapping source pools; it was not used during training.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Multiple-choice results are broken down into six dimensions (OK, DK, PTK, PTR, PLR, ER); summarization and code generation are broken down into three metrics each.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": false,
    227           "justification": "Figure 1 illustrates failures from general LLMs to motivate the work, but systematic failure cases or error analysis for GeoCode-GPT-7B's own outputs are not presented.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "The paper explicitly reports that GeoCode-GPT-7B scores below GPT-4 in Platform or Toolkits Knowledge (0.752 vs 0.784), Entity Recognition (0.746 vs 0.852), and overall code generation (0.636 vs 0.710).",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "GPT-4 and GPT-3.5 are referenced without snapshot dates or API version identifiers; only Code Llama-7B has a clear version specification.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "Figures 6 and 7 show schematic prompt templates for the GPT-4 judge, but the actual prompt text with scoring criteria is not provided verbatim.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Sections 4.2 and 4.3 report learning rates, batch sizes, gradient accumulation steps, LoRA rank, dropout, quantization precision, and sequence length for both training stages.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": false,
    258           "answer": false,
    259           "justification": "This is a fine-tuning paper with no agentic scaffolding; GPT-4 is used as a judge but without multi-step agentic orchestration.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Data sources (GitHub, Stack Overflow, Hugging Face, official documentation), screening for syntax accuracy, comment preservation, and attribute tables are described in Section 3.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The paper states all corpora (GeoCode-PT, GeoCode-SFT, GeoCode-Eval) are open-sourced via the provided GitHub repository.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 3 details data sources by platform, quantity, format, and attribute schema for all four data categories; Tables 1 and 2 summarize these inventories.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": true,
    284           "answer": false,
    285           "justification": "Expert evaluators are used for executability testing and readability ranking, but the number of experts, their qualifications, and recruitment process are not described.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "Figure 2 shows the overall pipeline from corpus construction through pretraining and fine-tuning to evaluation; Sections 3 and 4 elaborate each step.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The training data cutoff for Code Llama-7B (the base model) is not stated, nor is there any discussion of when the commercial comparison models (GPT-4, GPT-3.5) were trained relative to the evaluation benchmark's source data.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "GeoCode-Eval code summarization and generation tasks were 'constructed similarly to GeoCode-SFT, using different valid code snippets' from the same source pool; potential overlap is not quantified or discussed.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "Commercial models (GPT-4, GPT-3.5) may have ingested geospatial code from GEE, ArcPy, and other sources used in GeoCode-Eval during pretraining; this possibility is not discussed.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No formal human subjects study; expert evaluation is an informal peer assessment, not a pre-registerable study.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human subjects participation warranting IRB review.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "Expert evaluators are not described in terms of number, background, or demographics.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participant study requiring inclusion/exclusion criteria.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No randomization relevant to human subjects.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": true,
    348           "answer": true,
    349           "justification": "Section 5.2.2 explicitly states that 'experts rank the generated code from different models through a blind selection process.'",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No longitudinal human study with attrition.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "GPT-4 was used for large-scale automated scoring of 1,500 subjective items, but API costs are not reported; GeoCode-GPT-7B inference latency is also not reported.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "Hardware (2× A100 40GB) and number of training epochs are mentioned, but total GPU-hours or training wall-clock time is not reported.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "GeoCode-GPT-7B outperforms all compared models in multiple-choice accuracy, achieving 0.848 average vs. 0.757 for GPT-4 (the next best).",
    378       "evidence": "Table 5 shows GeoCode-GPT-7B at 0.848 vs. GPT-4 at 0.757; however GeoCode scores lower than GPT-4 in Platform/Toolkit Knowledge (0.752 vs. 0.784) and Entity Recognition (0.746 vs. 0.852).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "GeoCode-GPT-7B outperforms other models in code summarization by 1.7%–25.4%, achieving 0.914 overall.",
    383       "evidence": "Table 6 supports the range: +1.7pp vs. GPT-4 (0.897), +25.4pp vs. CodeGemma-7B (0.660); GPT-4 outperforms on Completeness and Accuracy individually.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "GeoCode-GPT-7B outperforms other models in code generation by 1.2%–25.1%.",
    388       "evidence": "Table 7 shows GeoCode-GPT-7B at 0.636, but GPT-4 scores 0.710 and GPT-3.5 scores 0.644, both higher; the claimed range excludes these failures.",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "QLoRA pretraining combined with LoRA fine-tuning achieves optimal balance between resource efficiency and model performance.",
    393       "evidence": "The paper reports memory and convergence benefits of each method in Section 4 but provides no comparison to alternative PEFT methods or full fine-tuning on the same evaluation.",
    394       "supported": "unsupported"
    395     },
    396     {
    397       "claim": "Domain-specific fine-tuning reduces geospatial coding hallucinations, evidenced by improved executability.",
    398       "evidence": "GeoCode-GPT achieves 0.504 executability vs. Code Llama-7B's 0.302, but absolute executability remains low (only about 50%) and the improvement is not causally isolated.",
    399       "supported": "weak"
    400     },
    401     {
    402       "claim": "GeoCode-GPT-7B approaches commercial model performance despite significantly fewer parameters.",
    403       "evidence": "GeoCode-GPT-7B matches or exceeds ERNIE 4.0 across all metrics but falls behind GPT-4 in code generation (0.636 vs. 0.710) and several summarization sub-metrics.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval"
    409   ],
    410   "key_findings": "GeoCode-GPT-7B, fine-tuned from Code Llama-7B using QLoRA and LoRA on a purpose-built geospatial code corpus, substantially outperforms same-scale open-source code models on the authors' custom GeoCode-Eval benchmark across multiple-choice knowledge, code summarization, and code generation tasks. It exceeds GPT-3.5 and ERNIE 4.0 in most metrics but fails to match GPT-4 in code generation (0.636 vs. 0.710) and select knowledge dimensions. Key methodological weaknesses are that GeoCode-Eval was constructed from the same source pools as the training data (potential contamination undisclosed), GPT-4 serves simultaneously as a strong baseline and as the automated judge, and no ablation isolates what drives the gains.",
    411   "red_flags": [
    412     {
    413       "flag": "Train-test contamination unaddressed",
    414       "detail": "GeoCode-Eval code generation/summarization tasks were built from the same geospatial code sources as GeoCode-SFT; the paper acknowledges using 'different snippets' but does not quantify or formally exclude overlap."
    415     },
    416     {
    417       "flag": "GPT-4 as both baseline and judge",
    418       "detail": "GPT-4 is used to score all 1,500 subjective evaluation items via prompt engineering, while simultaneously being the strongest baseline comparison — creating circular bias in favor of outputs that resemble GPT-4's own generations."
    419     },
    420     {
    421       "flag": "Abstract overclaims code generation superiority",
    422       "detail": "The abstract states GeoCode-GPT 'outperforms other models' in code generation, but Table 7 shows both GPT-4 (0.710) and GPT-3.5 (0.644) outperform GeoCode-GPT-7B (0.636); the body text omits GPT-3.5's advantage."
    423     },
    424     {
    425       "flag": "No ablation of training stages",
    426       "detail": "The two-stage training (QLoRA pretraining + LoRA fine-tuning) is presented as a contribution, but no ablation tests QLoRA-only or LoRA-only, making it impossible to attribute gains to the proposed strategy."
    427     },
    428     {
    429       "flag": "No statistical tests or confidence intervals",
    430       "detail": "All comparative claims across 10 models rest on point estimates with no significance testing, CIs, or variance across evaluation runs."
    431     },
    432     {
    433       "flag": "Expert evaluators not described",
    434       "detail": "Executability and readability rankings rely on unspecified experts with no reported count, domain qualifications, inter-rater agreement, or recruitment procedure."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Code Llama: Open Foundation Models for Code",
    440       "relevance": "Base model used for GeoCode-GPT fine-tuning; key baseline in evaluation"
    441     },
    442     {
    443       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    444       "relevance": "Contemporary domain-specific code LLM; contextualizes GeoCode-GPT's positioning"
    445     },
    446     {
    447       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    448       "relevance": "Core PEFT method used in GeoCode-GPT fine-tuning stage"
    449     },
    450     {
    451       "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
    452       "relevance": "Framework used to construct GeoCode-SFT instruction tuning data"
    453     },
    454     {
    455       "title": "Evaluating Large Language Models Trained on Code (HumanEval / Codex)",
    456       "relevance": "Establishes pass@k evaluation methodology for code generation; foundational benchmark"
    457     },
    458     {
    459       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    460       "relevance": "LLM-as-judge approach for code evaluation; informs GeoCode's evaluation framework"
    461     },
    462     {
    463       "title": "A Survey on Large Language Models for Code Generation",
    464       "relevance": "Frames the broader NL2Code landscape in which GeoCode-GPT is positioned"
    465     },
    466     {
    467       "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct",
    468       "relevance": "Comparable code-generation LLM; related fine-tuning strategy using synthetic instruction data"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 2,
    474       "justification": "Geospatial code generation addresses a real productivity bottleneck in GIS workflows; model is open-sourced on GitHub for practitioners to use."
    475     },
    476     "surprise_contrarian": {
    477       "score": 1,
    478       "justification": "Finding that domain-specific fine-tuning helps is expected; no surprising or counterintuitive result is presented."
    479     },
    480     "fear_safety": {
    481       "score": 0,
    482       "justification": "No safety, alignment, or misuse concerns are raised."
    483     },
    484     "drama_conflict": {
    485       "score": 0,
    486       "justification": "No controversy; straightforward model fine-tuning paper."
    487     },
    488     "demo_ability": {
    489       "score": 2,
    490       "justification": "Model and corpora are open-sourced on GitHub; practitioners can download and run GeoCode-GPT-7B for geospatial code tasks."
    491     },
    492     "brand_recognition": {
    493       "score": 0,
    494       "justification": "Authors are from Wuhan University; no famous lab, company, or widely-known research group is involved."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [],
    499     "top_points": 0,
    500     "total_points": 0,
    501     "total_comments": 0
    502   }
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs