ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29322B)


      1 {
      2   "paper": {
      3     "title": "GeoCode-GPT: A Large Language Model for Geospatial Code Generation Tasks",
      4     "authors": [
      5       "Shuyang Hou",
      6       "Zhangxiao Shen",
      7       "Anqi Zhao",
      8       "Jianyuan Liang",
      9       "Zhipeng Gui",
     10       "Xuefeng Guan",
     11       "Rui Li",
     12       "Huayi Wu"
     13     ],
     14     "year": 2024,
     15     "venue": "International Journal of Applied Earth Observation and Geoinformation",
     16     "arxiv_id": "2410.17031",
     17     "doi": "10.1016/j.jag.2025.104456"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "GeoCode-GPT-7B, fine-tuned from Code Llama-7B on domain-specific geospatial corpora using QLoRA and LoRA, outperforms the base model by an average of 27.6 percentage points on multiple-choice accuracy, 21.4 points on code summarization, and 25.1 points on code generation on the GeoCode-Eval benchmark. The model approaches GPT-4 performance on most metrics despite having far fewer parameters, though it lags behind GPT-4 on entity recognition, platform knowledge, and code readability. The open-sourced GeoCode-PT (275K code snippets), GeoCode-SFT (502K instructions), and GeoCode-Eval (4,000 items) represent the first dedicated geospatial code generation corpus and benchmark.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "GitHub repository URL is provided: https://github.com/whuhsy/GeoCode-GPT. The paper states the corpora and model are open-sourced at this link."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper states GeoCode-PT, GeoCode-SFT, and GeoCode-Eval are open-sourced at the same GitHub repository. The contributions section explicitly states 'We developed and open-sourced the GeoCode-PT and GeoCode-SFT corpora, as well as the GeoCode-Eval evaluation set.'"
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions 'two NVIDIA A100 40GB GPUs' and specific hyperparameters, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions. No software dependency versions are listed."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided in the paper. The training process is described at a high level (Sections 4.2-4.3) with hyperparameters, but there are no specific commands or scripts to reproduce the experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 5, 6, and 7 report only point estimates (e.g., 0.848 average accuracy). No confidence intervals, error bars, or uncertainty measures are reported for any result."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims GeoCode-GPT 'outperforms' other models across all metrics based solely on comparing raw numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used for any comparison."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports improvements with baseline context throughout — e.g., 'outperforms other models in multiple-choice accuracy by 9.1% to 32.1%' and tables show both absolute scores and differences from GeoCode-GPT for each model, allowing readers to assess magnitude."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The evaluation set sizes (3,000 MC questions, 500 code generation, 500 summarization) are stated but no justification is given for why these sizes are sufficient. No power analysis or sample size rationale is provided."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or any spread measure is reported. All results appear to be from single runs with no indication of stability across runs or seeds."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Comprehensive baselines are included (Section 5): GPT-4, GPT-3.5, ERNIE 4.0, LLaMA 2-7B, LLaMA 3-8B, CodeGemma-7B, StarCoder 2-7B, CodeGeeX 2-6B, Code Llama-13B, and the base model Code Llama-7B."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include GPT-4, LLaMA 3-8B, CodeGemma-7B, and StarCoder 2-7B, which are contemporary models for 2024. The selection spans commercial and open-source models of various sizes."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The system has multiple components (QLoRA pretraining on GeoCode-PT, LoRA fine-tuning on GeoCode-SFT, different corpus components) but no ablation study isolates the contribution of each component. There is no experiment showing pretraining-only vs fine-tuning-only, or different corpus subsets."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Multiple metrics are used: 6 accuracy dimensions for MC questions, 3 metrics for code summarization (Completeness, Accuracy, Readability), and 3 metrics for code generation (Accuracy, Readability, Executability)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Human evaluation is included: experts run generated code on different platforms to assess executability, and experts perform blind ranking of code readability across models (Section 5.2.2)."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "GeoCode-Eval is a separate evaluation set constructed to prevent knowledge leakage: 'all multiple-choice questions in GeoCode-Eval were sourced from platforms such as Baidu Wenku, Niuke, and Daoke Wenku' and the code tasks used 'different valid code snippets' than those in GeoCode-SFT."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Detailed per-category breakdowns are provided: Table 5 shows 6 MC dimensions individually, Table 6 breaks down 3 summarization metrics, and Table 7 breaks down 3 code generation metrics."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "The paper notes where GeoCode-GPT underperforms GPT-4 (e.g., entity recognition, platform knowledge) but does not show specific failure examples, error analysis, or qualitative examination of where the model produces incorrect code."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "No negative results are reported. The paper does not describe any approaches tried and abandoned, configurations that failed, or design decisions that hurt performance. All reported experiments show improvement over the baseline."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims of '9.1% to 32.1%' MC improvement, '1.7% to 25.4%' summarization improvement, and '1.2% to 25.1%' code generation improvement are supported by Tables 5, 6, and 7 respectively."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims the QLoRA+LoRA training strategy 'achieves significant improvements' and attributes gains to their corpus and fine-tuning approach. While the before/after comparison with Code Llama-7B supports that some change occurred, the lack of ablation means the causal contribution of each component (QLoRA pretraining, LoRA fine-tuning, specific corpus elements) is not isolated."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Claims are consistently bounded to 'geospatial code generation tasks' throughout the paper. The title, abstract, and conclusion frame results specifically within the geospatial domain, and the authors do not claim general code generation superiority."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as evaluation data overlap with training sources, GPT-4 judge bias, or whether memorization of platform-specific syntax (rather than genuine understanding) drives improvements."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper measures MC accuracy, GPT-4-scored summarization, and expert-evaluated code generation, framing these as demonstrating capability in 'geospatial code generation.' However, it does not discuss whether these proxies (especially MC questions and GPT-4 scores) actually capture real-world geospatial coding ability or productivity."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Models are referenced by general names: 'GPT-4', 'GPT-3.5', 'Code Llama-7B', 'LLaMA 2-7B', etc. No specific version identifiers, snapshot dates, or API versions are provided for any model."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Prompt templates for GPT-4 evaluation scoring are provided in Fig. 6 (summarization) and Fig. 7 (code generation). The GeoCode-SFT instruction format (Instruct-Input-Output) is described in Section 3.2."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Detailed hyperparameters are reported in Sections 4.2 and 4.3: batch size 64, int4 quantization, learning rate 0.0002, cosine decay, weight decay 0.1, gradient accumulation 4, sequence length 4096, LoRA rank 64, NF4, scaling factor 128, dropout 0.05."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The models are prompted directly for evaluation tasks without any agent workflow, tool use, or iterative scaffolding."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3 provides detailed documentation of data construction: code collection from specific platforms (GitHub, Stack Overflow, Hugging Face), syntax screening, structured extraction algorithms (Fig. 3), Self-Instruct generation process (Fig. 4), and quality checking. Tables 1-4 enumerate all data components."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6.1 'Limitations' is a dedicated subsection discussing the gap with GPT-4 and room for improvement in executability."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Section 6.1 discusses only future improvements ('expanding the scale of instruction data', 'leveraging more powerful hardware') rather than specific threats to the validity of current results. No discussion of evaluation bias, benchmark specificity, or GPT-4-as-judge reliability."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of which geospatial platforms are not covered, what types of geospatial tasks are excluded, or limitations of evaluating only on GeoCode-Eval."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The GeoCode-PT, GeoCode-SFT, and GeoCode-Eval datasets are stated to be open-sourced at the GitHub repository, making the underlying data available for independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3 provides detailed descriptions of data collection: code from GitHub/Stack Overflow/Hugging Face, operator knowledge from official documentation, dataset knowledge from platform pages, encyclopedic documents from Wikipedia. Table 1 gives complete inventory with quantities and sources."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "Expert evaluators are used for executability testing and readability ranking but no details are provided about who these experts are, how many were involved, how they were selected, or their qualifications."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The full pipeline from raw data collection through structured extraction (Fig. 3), Self-Instruct generation (Fig. 4), to final corpus assembly is documented with quantities at each stage (Tables 1-4)."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: State Key Laboratory of Information Engineering in Surveying, Mapping and Remote Sensing, Wuhan University; and School of Remote Sensing and Information Engineering, Wuhan University."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding source is disclosed, so independence of the funder cannot be assessed. The work appears to be academic but funding is not mentioned."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No training data cutoff dates are stated for any model evaluated. Code Llama's training cutoff is not mentioned, nor are the cutoffs for GPT-4, GPT-3.5, or other baselines."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The paper mentions sourcing MC questions from external platforms 'to ensure fairness in evaluation and prevent knowledge leakage,' but does not analyze whether the baseline models (GPT-4, GPT-3.5, etc.) may have seen GeoCode-Eval source data during their training. The overlap prevention is only for GeoCode-GPT's own training."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "GeoCode-Eval sources questions from public platforms (Baidu Wenku, Niuke, Daoke Wenku) that could be in commercial models' training data. The paper does not discuss this contamination risk for baseline models."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "The paper has no human subjects study. Expert evaluators serve as judges, not experimental participants."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human subjects study — expert judges are not experimental participants requiring IRB approval."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects study. Expert judges are used but this is not a human participants study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human subjects study. The paper does mention 'blind selection process' for readability ranking, but this is a judge evaluation method, not an experimental design element for human participants."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human subjects study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference cost, latency, or per-example cost is reported for any model. The token costs of using GPT-4 as evaluator are mentioned in passing ('Given the token cost of LLMs') but not quantified."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Hardware is mentioned ('two NVIDIA A100 40GB GPUs') and training hyperparameters are detailed, but total training time, GPU hours, or total compute budget is not stated."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of random seeds or seed sensitivity analysis. All results appear to be single-run."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is not stated. Results are presented as point estimates with no indication of how many runs produced them."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Hyperparameters are reported in detail but no search budget, search method, or number of configurations tried is described. The choices appear to be presented without justification of the search process."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The base model selection (Code Llama-7B) is justified as balancing 'performance, flexibility, and training costs' (Section 4.1), but the specific hyperparameter configuration is presented without justification for why these values were selected over alternatives."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Many comparisons are made across 11 models and multiple dimensions without any correction for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors evaluate their own model on their own benchmark (GeoCode-Eval) constructed by the same team. This self-evaluation bias is not acknowledged or discussed."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper compares a 7B model against GPT-4 and notes the parameter gap ('despite having significantly fewer parameters than commercial models') but does not report performance as a function of compute budget or compare at matched compute levels."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "GeoCode-Eval is framed using Bloom's taxonomy (Fig. 5) as a conceptual basis, but there is no empirical validation that the benchmark actually measures real-world geospatial code generation ability. No comparison with alternative benchmarks or external validity analysis."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved in the model comparisons. Models are prompted directly."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of temporal leakage. GeoCode-Eval MC questions from public platforms (Baidu Wenku, Niuke) could exist in commercial models' training data from before their training cutoffs, but this is not addressed."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of feature leakage. The MC question format could give models hints through option structure, and the code tasks include metadata that could leak information, but this is not analyzed."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "The paper states that GeoCode-Eval uses 'different valid code snippets' than those in GeoCode-SFT, but does not formally verify independence or measure similarity between training and evaluation data."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No concrete leakage detection method is applied. The paper mentions sourcing evaluation questions from external platforms to 'prevent knowledge leakage' but does not use canary strings, membership inference, n-gram overlap analysis, or other detection techniques."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GeoCode-GPT-7B outperforms Code Llama-7B by an average of 27.6% on multiple-choice accuracy across six geospatial knowledge dimensions.",
    374       "evidence": "Table 5 shows GeoCode-GPT-7B achieving 0.848 average accuracy vs Code Llama-7B at 0.572, with improvements ranging from 0.08 (PLR) to 0.508 (DK). Section 5.1.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "GeoCode-GPT-7B improves code summarization by 21.4% overall compared to Code Llama-7B, approaching GPT-4 performance.",
    379       "evidence": "Table 6 shows GeoCode-GPT-7B at 0.914 overall vs Code Llama-7B at 0.700. GPT-4 scores 0.897, which GeoCode-GPT surpasses slightly. Based on GPT-4-as-judge scoring. Section 5.2.1.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "GeoCode-GPT-7B improves code generation by 25.1% overall compared to Code Llama-7B.",
    384       "evidence": "Table 7 shows GeoCode-GPT-7B at 0.636 overall vs Code Llama-7B at 0.385. Includes expert-assessed executability (0.504 vs 0.302). Section 5.2.2.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "GeoCode-GPT-7B is the first LLM specifically designed for geospatial code generation tasks.",
    389       "evidence": "Stated in the abstract and Section 1 contributions. The related work section (Section 2) surveys prior work and identifies no earlier domain-specific LLM for geospatial code; the novelty claim rests on the authors' own survey.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The QLoRA+LoRA training strategy effectively balances computational resources and training efficiency for domain adaptation.",
    394       "evidence": "The paper describes the strategy (Sections 4.2-4.3) and shows improved results over the base model, but provides no ablation isolating QLoRA pretraining vs LoRA fine-tuning contributions, and no comparison with alternative training strategies.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Self-evaluation on own benchmark",
    401       "detail": "GeoCode-GPT is trained on GeoCode-PT/SFT and evaluated on GeoCode-Eval, all constructed by the same team using similar sources and methods. The proximity between training and evaluation data raises contamination risk. The paper does not acknowledge this self-evaluation bias."
    402     },
    403     {
    404       "flag": "No statistical tests or variance reporting",
    405       "detail": "All claims of outperformance are based on comparing raw point estimates without significance tests, confidence intervals, or variance across runs. Results appear to be single-run. Differences as small as 0.008 (GPT-3.5 code generation accuracy) are presented as improvements."
    406     },
    407     {
    408       "flag": "GPT-4 as judge while GPT-4 is a baseline",
    409       "detail": "GPT-4 is used as the automatic evaluator for code summarization scoring while simultaneously being one of the baseline models. This introduces potential bias — GPT-4 may systematically favor certain output styles, including its own."
    410     },
    411     {
    412       "flag": "No ablation study for multi-component system",
    413       "detail": "The system involves QLoRA pretraining, LoRA fine-tuning, and multiple corpus components (code docs, operator knowledge, dataset knowledge, encyclopedic docs, Alpaca data) but no ablation isolates individual contributions."
    414     },
    415     {
    416       "flag": "Evaluation data sourced from public platforms",
    417       "detail": "MC questions are sourced from Baidu Wenku, Niuke, and Daoke Wenku — public educational platforms whose content may be in commercial models' training data. This could systematically disadvantage or advantage certain models."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Code llama: Open foundation models for code",
    423       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    424       "year": 2023,
    425       "arxiv_id": "2308.12950",
    426       "relevance": "Foundation model for code generation; base model for GeoCode-GPT fine-tuning."
    427     },
    428     {
    429       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming--The Rise of Code Intelligence",
    430       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    431       "year": 2024,
    432       "arxiv_id": "2401.14196",
    433       "relevance": "Open-source code generation LLM used as baseline context for code generation capabilities."
    434     },
    435     {
    436       "title": "Wizardcoder: Empowering code large language models with evol-instruct",
    437       "authors": ["Z. Luo", "C. Xu", "P. Zhao"],
    438       "year": 2023,
    439       "arxiv_id": "2306.08568",
    440       "relevance": "Code generation LLM using instruction evolution for training, relevant to code LLM fine-tuning approaches."
    441     },
    442     {
    443       "title": "Lora: Low-rank adaptation of large language models",
    444       "authors": ["E.J. Hu", "Y. Shen", "P. Wallis"],
    445       "year": 2021,
    446       "arxiv_id": "2106.09685",
    447       "relevance": "Foundational PEFT method widely used for LLM fine-tuning."
    448     },
    449     {
    450       "title": "Self-instruct: Aligning language models with self-generated instructions",
    451       "authors": ["Y. Wang", "Y. Kordi", "S. Mishra"],
    452       "year": 2022,
    453       "arxiv_id": "2212.10560",
    454       "relevance": "Framework for synthetic instruction data generation used in GeoCode-SFT construction."
    455     },
    456     {
    457       "title": "Evaluating large language models trained on code",
    458       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    459       "year": 2021,
    460       "arxiv_id": "2107.03374",
    461       "relevance": "Introduced HumanEval and pass@k metric for code generation evaluation."
    462     },
    463     {
    464       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    465       "authors": ["T.Y. Zhuo"],
    466       "year": 2023,
    467       "arxiv_id": "2304.14317",
    468       "relevance": "LLM-as-judge approach for code evaluation; relevant to automated code quality assessment."
    469     },
    470     {
    471       "title": "A survey on evaluating large language models in code generation tasks",
    472       "authors": ["L. Chen", "Q. Guo", "H. Jia"],
    473       "year": 2024,
    474       "arxiv_id": "2408.16498",
    475       "relevance": "Survey of evaluation methods for LLM code generation, directly relevant to code evaluation methodology."
    476     },
    477     {
    478       "title": "LIMA: Less Is More for Alignment",
    479       "authors": ["C. Zhou", "P. Liu", "P. Xu"],
    480       "year": 2023,
    481       "arxiv_id": "2305.11206",
    482       "relevance": "Demonstrates effectiveness of small high-quality instruction datasets for LLM alignment."
    483     },
    484     {
    485       "title": "Phi-3 technical report: A highly capable language model locally on your phone",
    486       "authors": ["M. Abdin", "S.A. Jacobs", "A.A. Awan"],
    487       "year": 2024,
    488       "arxiv_id": "2404.14219",
    489       "relevance": "Demonstrates effectiveness of synthetic training data for small LLMs."
    490     },
    491     {
    492       "title": "Systematic Evaluation of LLM-as-a-Judge in LLM Alignment Tasks: Explainable Metrics and Diverse Prompt Templates",
    493       "authors": ["H. Wei", "S. He", "T. Xia"],
    494       "year": 2024,
    495       "arxiv_id": "2408.13006",
    496       "relevance": "Evaluates LLM-as-a-Judge methodology, relevant to understanding reliability of GPT-4-based code evaluation."
    497     },
    498     {
    499       "title": "A Survey on Large Language Models for Code Generation",
    500       "authors": ["J. Jiang", "F. Wang", "J. Shen"],
    501       "year": 2024,
    502       "arxiv_id": "2406.00515",
    503       "relevance": "Comprehensive survey of LLMs for code generation."
    504     }
    505   ]
    506 }

Impressum · Datenschutz