scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27236B)
      1 {
      2   "paper": {
      3     "title": "Evaluation of Code LLMs on Geospatial Code Generation",
      4     "authors": [
      5       "Piotr Gramacki",
      6       "Bruno Martins",
      7       "Piotr Szymański"
      8     ],
      9     "year": 2024,
     10     "venue": "GeoAI@SIGSPATIAL",
     11     "arxiv_id": "2410.04617",
     12     "doi": "10.1145/3687123.3698286"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "StarCoder2-7b achieves the best geospatial code generation performance (32.47% pass@1) among seven 7B/8B-parameter LLMs, despite ranking 4th on HumanEval, showing that generic code benchmarks do not predict domain-specific performance. Multi-step geospatial tasks are substantially harder than single-step tasks across all models. Models fail almost completely on OSMNX and MovingPandas libraries while performing reasonably on Shapely and H3, revealing significant gaps in domain-specific tool knowledge. Task framing (operation vs. semantic) has a small mixed effect, suggesting weak geospatial reasoning in current models.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository provided: https://github.com/kraina-ai/geospatial-code-llms-dataset (footnote 1). The abstract states 'We share our dataset and reproducible evaluation code on a public GitHub repository.'"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The benchmark dataset of 77 samples is shared on the same GitHub repository. The abstract confirms the dataset is publicly released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using the transformers library, bitsandbytes for 4-bit quantization, and specific GPU hardware, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper describes the evaluation pipeline methodology (Section 4.1) including code trimming, import handling, and testing procedure, but does not provide step-by-step commands or a reproduction guide. The GitHub repo is referenced but the paper itself lacks concrete instructions."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 2-7 are reported as single point estimates (e.g., '32.47%') with no confidence intervals, error bars, or uncertainty measures."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes comparative claims (e.g., 'best results have been achieved by starcoder2-7b', 'models work better with geodataframes') based solely on comparing raw percentages without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Results are reported as raw percentages in tables. While HumanEval scores are shown alongside benchmark scores for context, no formal effect sizes (Cohen's d, relative improvement, etc.) are computed or discussed."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The benchmark has 77 samples from 20 unique tasks. No justification is given for why this sample size is sufficient, and no power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "All experiments use greedy decoding producing a single deterministic output per sample. No variance, standard deviation, or spread measures are reported across any experimental condition."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Seven models are compared against each other, and HumanEval scores from external leaderboards are included as reference baselines (Table 2)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Models include StarCoder2, CodeLlama, Llama 3, Mistral 7B, Gemma, and CodeGemma — all released in 2023-2024 and representing contemporary open-source code LLMs at the 7B/8B scale."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "The contribution is a benchmark dataset, not a system with components to ablate. The multi-dimensional analysis (complexity, framing, input format, tools) serves a similar diagnostic purpose but is not an ablation study."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three metrics are used: Accuracy (percentage of passed test cases), Pass@1 (percentage of fully correct solutions), and Pass_any@1 (percentage of solutions passing at least one test case). Described in Section 4.1."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Evaluation is entirely automated via test case pass/fail. No human evaluation of code quality, readability, or correctness is performed. The paper explicitly chose automated testing over reference-solution comparison (Section 3.4)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The entire benchmark was hand-crafted specifically for this evaluation: 'The prompts are human-written to ensure that they were not present in any training data for existing models' (Section 3.2). No model tuning was done on this data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive breakdowns are provided across five dimensions: task complexity (Table 3), task framing (Table 4), input format (Table 5), tool usage (Table 6), and point format (Table 7)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses gemma/codegemma models hallucinating and generating repetitive code blocks, models generating placeholder functions with NotImplementedError for unknown libraries (Listing 4), and near-complete failure on OSMNX and MovingPandas."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Multiple negative results reported: all models fail on OSMNX and MovingPandas (Table 6), gemma models perform very poorly overall, models struggle with GeoJSON and Shapefile formats compared to GeoDataFrames, and HumanEval performance does not predict geospatial performance."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims are modest and supported: they propose a benchmark (described in Section 3), test existing models (Section 4), and share code/data (GitHub link). No overclaiming is evident."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes implicit causal claims about why models fail (e.g., 'models that are worse at geospatial code generation find it especially difficult to solve multi-step tasks,' suggesting task complexity causes performance drops) without controlling for confounds like task-specific difficulty or sample size per category."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper explicitly bounds its scope: 'Due to computational constraints we limited ourselves to 7B/8B scale LLMs, which we also aim to extend in the future' (Section 5). The title and abstract reference 'a selection of existing code generation LLMs' rather than making broad claims."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are discussed for the observed results. For example, the poor performance on OSMNX/MovingPandas could be due to library documentation quality, training data volume, or task difficulty rather than just model capability, but this is not explored."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures functional correctness via test cases and frames its claims in terms of code correctness and task-solving capability. The claims match the measurement granularity — they don't overclaim 'productivity' or 'developer effectiveness' from pass@1 scores."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Exact HuggingFace model identifiers are provided for all 7 models (e.g., 'bigcode/starcoder2-7b', 'meta-llama/CodeLlama-7b-hf', 'meta-llama/Meta-Llama-3-8B'). These are specific, versioned model checkpoints."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The exact prompt format is provided in Figure 1 and Listings 1-4, including function signatures, type hints, and docstrings. The format 'remains unchanged between tested models' (Section 3.2). All 77 prompts follow this template and are available in the GitHub repository."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Key hyperparameters are stated: greedy decoding (temperature=0), max_length=200, 4-bit quantization via bitsandbytes. Reported in Section 4.1 under 'Hyperparameters.'"
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. Models generate code completions directly from function signatures via single-pass inference."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.1 describes the evaluation pipeline: trimming responses to a single function by searching for the second 'def' string, searching generated code for library imports, installing them in a virtual environment, and importing before testing."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 contains a dedicated 'Limitations' subsection discussing the benchmark's current scope and the computational constraints on model selection."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper identifies specific limitations: 'the current version of the benchmark should be expanded to cover more typical tasks and tools,' and 'Due to computational constraints we limited ourselves to 7B/8B scale LLMs' (Section 5). These are specific to this study."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper explicitly states what is not covered: no polygon inputs, no larger models, no fine-tuning experiments, no incorrect input handling, no code infilling tasks, limited to 4 tool libraries and 5 input formats. Stated in Sections 3 and 5."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The complete benchmark dataset (77 samples with prompts and test cases) is available on the public GitHub repository. Generated outputs could be verified against this data."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.3 describes the sample creation process: starting with task definition on the complexity dimension, then augmenting via input format and framing changes to produce variants. The categorization scheme is detailed in Section 3.1."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. The data is a manually created benchmark dataset, and the evaluated systems are publicly available LLMs."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline from task conception to final dataset is documented: define complexity dimension → select tools/input types → write task → augment across dimensions → create test cases (Section 3). The evaluation pipeline is documented in Section 4.1."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All author affiliations are clearly stated: Wrocław University of Science and Technology (Gramacki, Szymański) and INESC-ID/Instituto Superior Técnico, University of Lisbon (Martins). Kraina.AI affiliation is also listed."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding information is disclosed. Without knowing the funding source, independence cannot be verified."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper. The Kraina.AI affiliation is listed but no conflict-of-interest declaration is made."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "Training data cutoff dates are not stated for any of the 7 evaluated models. The paper relies on hand-crafted tasks as contamination mitigation but does not state when each model's training data ends."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "The paper states: 'The prompts are human-written to ensure that they were not present in any training data for existing models' (Section 3.2). This directly addresses train/test overlap through benchmark design."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Contamination is addressed by design: the benchmark was manually created after the models' training periods, with human-written prompts to ensure novelty. This is stated in Section 3.2."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The evaluation is entirely automated benchmark testing of LLMs."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study evaluates LLMs on a code generation benchmark."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or tokens consumed are reported. The paper mentions hardware used but not per-sample or total inference costs."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is mentioned (GTX 1080 8GB, A100 80GB) but no total GPU hours, wall-clock time, or compute budget is quantified. Section 4.1 describes the machines but not how long experiments took."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Greedy decoding produces deterministic outputs, so there is no seed sensitivity analysis. The paper does not explore whether results change with non-greedy sampling strategies."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "The paper states they use greedy decoding to produce 'a single output' per sample (Section 4.1 under 'Hyperparameters'), making the number of runs (1) explicit."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search is reported. The max_length=200 was 'verified to be enough' but no search budget or alternative configurations are discussed."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "A uniform configuration is used across all models (greedy decoding, max_length=200, 4-bit quantization), eliminating configuration cherry-picking. The max_length was verified to be sufficient for all models and tasks."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors created the benchmark and evaluate models on it. No discussion of whether benchmark design choices might favor or disadvantage particular models, or acknowledgment of self-comparison bias."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "All models are 7B/8B parameters with the same 4-bit quantization, making compute differences negligible across the comparison."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether 77 samples across 20 tasks adequately measure geospatial code generation capability, nor whether the four-dimensional categorization captures the full range of geospatial coding challenges."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. Models perform single-pass code completion from function signatures."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The benchmark tasks were manually created after the models' training periods. 'The prompts are human-written to ensure that they were not present in any training data for existing models' (Section 3.2). This is a temporal leakage prevention strategy."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the prompt format, function signatures, or docstring style could leak information about expected solutions. The evaluation pipeline also adds all needed imports (Section 4.1), which could make the task easier than real-world usage."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The 77 samples are augmented from 20 unique tasks (Section 3.3), meaning many samples share the same underlying logic. This non-independence is not discussed as a potential source of inflated evaluation."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No formal leakage detection method (canary strings, membership inference, n-gram overlap analysis) is employed. The paper relies solely on manual creation as a prevention measure."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Generic HumanEval performance does not predict geospatial code generation performance — StarCoder2-7b ranks 4th on HumanEval but 1st on the geospatial benchmark.",
    369       "evidence": "Table 2 shows StarCoder2-7b at 34.09% on HumanEval yet with the best geospatial pass@1 (32.47%), while CodeLlama-Python reaches 40.48% on HumanEval but only 24.68% geospatial pass@1.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Multi-step geospatial tasks are substantially harder than single-step tasks for all evaluated code generation models.",
    374       "evidence": "Table 3 shows consistent drops across all models: StarCoder2 drops from 45.45% to 15.15%, CodeLlama from 29.55% to 12.12%, Gemma from 13.64% to 3.03%.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Current code LLMs fail almost completely on OSMNX and MovingPandas libraries.",
    379       "evidence": "Table 6: all models score 0% on MovingPandas, and 6 of 7 models score 0% on OSMNX (only StarCoder2 at 16.67%).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Models perform better with GeoDataFrame inputs than Shapefile or GeoJSON formats.",
    384       "evidence": "Table 5 shows higher pass@1 for GeoDataFrame across most models (e.g., CodeLlama-hf: 27.27% gdf vs 18.75% shp vs 4.55% geojson).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Task framing (operation vs. semantic) has a small and inconsistent effect on model performance.",
    389       "evidence": "Table 4: most models perform slightly better with operation framing, but two models (Mistral, CodeLlama-hf) perform better with semantic framing. Differences are small (3-9 percentage points).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Models perform better with Shapely Point objects than raw latitude/longitude coordinates.",
    394       "evidence": "Table 7 shows a consistent advantage for the Shapely format: StarCoder2 33.33% (lat/lon) vs 66.67% (Shapely), CodeLlama 4.76% (lat/lon) vs 33.33% (Shapely).",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No statistical tests or error bars",
    401       "detail": "All comparative claims (e.g., 'best model', 'models work better with') are based solely on comparing raw percentages without any statistical significance testing, confidence intervals, or error bars. With only 77 samples (many categories having 3-7 samples), observed differences could easily be due to chance."
    402     },
    403     {
    404       "flag": "Very small benchmark size",
    405       "detail": "The benchmark has only 77 samples from 20 unique tasks. Per-category breakdowns sometimes have as few as 3-4 samples (e.g., H3 with 3 tasks, OSMNX with 4 tasks), making percentage differences unreliable. The 77 samples are augmented from 20 unique tasks, so many share identical logic."
    406     },
    407     {
    408       "flag": "Single-run deterministic evaluation only",
    409       "detail": "Only greedy decoding (temperature=0) is used, producing a single output per sample. No exploration of sampling strategies (temperature, top-p) or pass@k with k>1, which are standard in code generation evaluation."
    410     },
    411     {
    412       "flag": "Non-independence of augmented samples",
    413       "detail": "The 77 samples are augmented from 20 unique tasks by varying input format and framing. Samples sharing the same underlying task are not independent, potentially inflating apparent benchmark coverage."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating Large Language Models Trained on Code",
    419       "authors": ["Mark Chen", "Jerry Tworek"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduced HumanEval benchmark for code generation evaluation, the primary reference baseline used in this paper."
    423     },
    424     {
    425       "title": "DS-1000: a natural and reliable benchmark for data science code generation",
    426       "authors": ["Yuhang Lai", "Chengxi Li"],
    427       "year": 2023,
    428       "relevance": "Domain-specific code generation benchmark for data science — directly comparable approach to the geospatial domain specialization in this paper."
    429     },
    430     {
    431       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    432       "authors": ["Anton Lozhkov", "Raymond Li"],
    433       "year": 2024,
    434       "arxiv_id": "2402.19173",
    435       "relevance": "StarCoder2-7b was the best-performing model on the geospatial benchmark despite not being best on HumanEval."
    436     },
    437     {
    438       "title": "Code Llama: Open Foundation Models for Code",
    439       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    440       "year": 2024,
    441       "arxiv_id": "2308.12950",
    442       "relevance": "CodeLlama models evaluated in this study; foundational open-source code generation models."
    443     },
    444     {
    445       "title": "Large Language Models Meet NL2Code: A Survey",
    446       "authors": ["Daoguang Zan", "Bei Chen"],
    447       "year": 2023,
    448       "doi": "10.18653/v1/2023.acl-long.411",
    449       "relevance": "Survey of LLM code generation approaches, providing context for the evaluation methodology used."
    450     },
    451     {
    452       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    453       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    454       "year": 2023,
    455       "relevance": "Rigorous code generation evaluation methodology; the paper considers using LLMs for test case generation as future work based on this approach."
    456     },
    457     {
    458       "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence",
    459       "authors": ["Mayank Mishra", "Matt Stallone"],
    460       "year": 2024,
    461       "arxiv_id": "2405.04324",
    462       "relevance": "Family of open code generation models relevant to the broader code LLM landscape evaluated."
    463     },
    464     {
    465       "title": "Program Synthesis with Large Language Models",
    466       "authors": ["Jacob Austin", "Augustus Odena"],
    467       "year": 2021,
    468       "arxiv_id": "2108.07732",
    469       "relevance": "Early work on LLM-based program synthesis with evaluation dataset design relevant to benchmark construction."
    470     },
    471     {
    472       "title": "Measuring Coding Challenge Competence With APPS",
    473       "authors": ["Dan Hendrycks", "Steven Basart"],
    474       "year": 2021,
    475       "relevance": "Code generation benchmark providing principles for evaluation dataset design referenced by this work."
    476     }
    477   ]
    478 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs