ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27085B)


      1 {
      2   "paper": {
      3     "title": "Syntactic Robustness for LLM-based Code Generation",
      4     "authors": [
      5       "Laboni Sarker",
      6       "Mara Downing",
      7       "Achintya Desai",
      8       "Tevfik Bultan"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2404.01535",
     13     "doi": "10.48550/arXiv.2404.01535"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "GPT-3.5 and GPT-4 are not syntactically robust when mathematical formulas in code generation prompts are mutated to semantically equivalent forms. Syntactic robustness degree decreases as syntactic distance (number of mutations) increases, from ~85% at distance 1 to ~36% at distance 5 for GPT-4. GPT-4 is 1.55x more robust than GPT-3.5 on average. A pre-processing step that reduces mutated formulas to simplified forms achieves 100% syntactic robustness on all tested equations.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Section VII mentions 'The working pipeline is available at Link' but the actual URL is a placeholder — no working repository link is provided in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The 627 equation variations and 3135 generated code responses are not released. The equations are described formally but the actual dataset is not available."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Section VIII-A mentions hardware (i9-13900K, 192 GB RAM, Ubuntu 22.04.4) and Section VII mentions Python with sympy, but the paper provides no requirements.txt, dependency versions, or environment specification sufficient to recreate the setup."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The pipeline is described conceptually in Section VII and Figure 7, but no step-by-step reproduction instructions (commands to run, configuration steps) are provided. The repository link is a placeholder."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results are reported as point estimates (e.g., '85.05%, 63%, 54%, 51.37%, and 36%' for distances 1-5). No confidence intervals or error bars are provided despite querying each prompt 5 times."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims GPT-4 is more robust than GPT-3.5 and that robustness decreases with distance, but uses no statistical tests. Comparisons are based solely on comparing raw percentages."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Results are reported with baseline context: '85.05%, 63%, 54%, 51.37%, and 36% respectively' for GPT-4 across distances, and 'GPT-4 has performed 1.55 times better than GPT-3.5' (Section VIII-B, RQ1-RQ2). The magnitude of effects is clear from the percentage breakdowns."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "20 variations per distance, 627 total variations, 5 queries per variation (3135 total), and 1000 differential testing inputs are all used without justification for these specific numbers."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Despite querying each prompt 5 times to address non-determinism (Section VII), no standard deviation, variance, or spread across the 5 runs is reported. Only aggregate robustness degrees are shown."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "GPT-3.5 and GPT-4 are compared against each other, and the pre-processing reduction approach is compared against the no-pre-processing baseline (Figures 8-12)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "GPT-3.5-Turbo and GPT-4 were the leading commercial LLMs at the time of writing (early 2024). These are appropriate and contemporary models."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper systematically varies mutation types (5 types, Figure 9), equation types (7 types, Figures 10-12), and syntactic distances (1-5, Figure 8), effectively ablating along multiple dimensions."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The only metric used is syntactic robustness degree (Definition 7). While broken down by equation type, distance, and mutation type, it is fundamentally a single metric throughout."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Evaluation is entirely automated via differential testing with 1000 random inputs per generated code (Section VII). No human evaluation of generated code quality or correctness is included."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Section VII states they 'tested different setups with temperature, top p, and seed values and chose the one that provides the most deterministic results' — this hyperparameter selection was done on some data, but there is no description of whether this was separated from the final evaluation data."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by equation type (Figures 10-12), mutation type (Figure 9), and syntactic distance (Figure 8), providing detailed per-category analysis."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The motivating example (Figures 1-2) shows a specific failure. RQ2 discusses a specific quadratic equation failure where 'GPT-4 calculates the discriminant as b²−4×a×(−c) instead of b²−4×a×c.' Rounding issues in logarithmic equations are also discussed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The main finding is negative — both GPTs fail at syntactic robustness. They also report the surprising case where higher syntactic distance sometimes yields better results for quadratic equations in GPT-4 (Section VIII-B, RQ2)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims GPT-3.5 and GPT-4 are not syntactically robust (supported by Figures 8-12) and that pre-processing improves robustness (supported by Figure 8 showing 100% after reduction). All abstract claims are backed by results."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims ('reduction improves robustness', 'more mutations create more confusion') are supported by controlled single-variable manipulation — same prompts with/without reduction, same equations at different distances. The experimental design supports causal inference for these specific claims."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Syntactic Robustness for LLM-based Code Generation' is much broader than the study scope: only GPT-3.5 and GPT-4 tested, only mathematical equation prompts in C code, only 7 equation types. The threats section acknowledges limited equations but not the gap between title/framing and scope."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The threats section discusses methodological concerns (API learning, differential testing limitations) but does not discuss alternative explanations for why models fail at syntactic robustness (e.g., tokenization effects, training data distribution of formula representations, attention mechanisms)."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper clearly defines what it measures (syntactic robustness degree via differential testing with 1000 inputs) and acknowledges the proxy limitation: 'we calculate the syntactic robustness degree with respect to a subset of the input domain' (Section VII)."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper uses 'GPT-3.5-turbo' and 'GPT-4' without specifying version snapshots (e.g., gpt-3.5-turbo-0613, gpt-4-0613). Model behavior changes across versions and no API version or snapshot date is given."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV provides the full text of all 5 prompt templates. The equation slot is filled by well-defined mutation rules (Section V, Figure 4) and the base equations are listed explicitly. A reader can reconstruct every prompt."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section VII states 'we used temperature and seed values of 0' and mentions testing different setups with temperature, top_p, and seed values."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The system makes direct API queries to GPT models."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section VII documents the full pipeline: mutation generation via sympy, GPT response post-processing (extracting code, removing text), GCC compilation, output post-processing for format normalization, and differential testing with epsilon-based comparison."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section VIII-C 'Threats to Validity' contains substantive discussion with both internal and external threat subsections."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The threats section discusses specific issues: API learning from prior requests with their mitigation (switching APIs), differential testing's inability to prove equivalence, epsilon value limitations for float comparison, and the limited set of equations being non-representative."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "External threats section states: 'We analyze a small set of equations with a limited number of possible mutations, which we recognize may not be representative of all possible equations and mutations given to LLMs for code generation.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The 627 equation variations, 3135 generated code responses, and differential testing results are not released. The repository link is a placeholder."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section VII describes data collection in detail: 20 variations per distance for distances 1-5, 627 total variations for 7 equation types, 5 queries per variation via GPT API, 1000 random test inputs per generated code for differential testing."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Test data is deterministically generated from formal grammar rules and mutation operations, not sampled from an external source."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section VII and Figure 7 document the full pipeline: prompt generation → GPT API query → response post-processing → GCC compilation → differential testing → equivalence analysis. Each step is described with implementation details."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Footnote on page 1 discloses NSF funding: Award #2124039, Award #2008660, Award #1901098."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors are from University of California, Santa Barbara. They are not affiliated with OpenAI or any company whose products they evaluate."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "NSF (National Science Foundation) is a government funding agency with no financial interest in the performance of GPT-3.5 or GPT-4."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for GPT-3.5-Turbo or GPT-4 despite evaluating these models' code generation capabilities."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "While the prompts are generated programmatically (reducing contamination risk for the mutations), the base equations (e.g., a*x+b=0) and their solutions almost certainly appear in GPT training data. This is not discussed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether the base mathematical equations and their solutions appeared in GPT training data. The models likely saw canonical forms of these equations many times during training, which could affect the baseline robustness measurement."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "3135 API queries to GPT-3.5 and GPT-4 were made. No API costs, token counts, or per-query latency are reported."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Hardware is specified (Section VIII-A) but total compute budget — API spend, total runtime, or GPU/CPU hours — is not stated."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "A fixed seed of 0 was used for determinism. The paper mentions testing 'different setups with temperature, top p, and seed values' but does not report sensitivity to seed choice or variation across seeds."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section VII explicitly states: 'we have asked GPT to provide code five times for the same prompt' yielding 3135 queries (627 × 5)."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Section VII mentions testing 'different setups with temperature, top p, and seed values and chose the one that provides the most deterministic results' but does not report how many configurations were tried or the search method."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The selection criterion is vaguely stated as 'the one that provides the most deterministic results' without specifying what 'most deterministic' means, how it was measured, or on what data the selection was made."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper evaluates its own reduction pre-processing method and compares it against the no-reduction baseline. No discussion of author-evaluation bias or independent evaluation."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "GPT-3.5 and GPT-4 differ substantially in compute cost, but the comparison is made without discussing compute differences. The pre-processing step's computational cost relative to its benefit is also not reported."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether syntactic robustness on mathematical equations generalizes to other forms of code generation robustness. The benchmark is limited to 7 equation types with algebraic mutations, but there is no discussion of whether this measures robustness more broadly."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used — direct API calls to GPT models."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The base mathematical equations (e.g., a*x+b=0, quadratic formula) and their solutions are ubiquitous in training data. The paper does not discuss whether the models' familiarity with canonical forms of these equations affects baseline robustness or the observed robustness decay for mutated forms."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. The prompts explicitly state the equation type and ask for a solver, which provides substantial hints about the expected solution approach."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "All 627 variations are derived from only 7 base equations via mutations. These are structurally non-independent (mutations of the same equation share most of their structure). This is not discussed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is used."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "GPT-3.5 and GPT-4 are not syntactically robust for code generation with mathematical formulas.",
    370       "evidence": "Figures 8, 10, 11 show syntactic robustness degree below 100% for all syntactic distances. GPT-4 ranges from 85% (distance 1) to 36% (distance 5); GPT-3.5 from 70% to 16% (Section VIII-B, RQ1).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Syntactic robustness degree decreases as the number of mutations (syntactic distance) increases.",
    375       "evidence": "Figure 8 shows consistent negative correlation for both models. GPT-4: 85%, 63%, 54%, 51%, 36% for distances 1-5. GPT-3.5: 70%, 51%, 33%, 16%, 16% (Section VIII-B, RQ2). One exception noted for quadratic equations in GPT-4.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Pre-processing with formula reduction achieves 100% syntactic robustness.",
    380       "evidence": "Figure 8 shows that the reduced form achieves 100% syntactic robustness for all equation types and both GPT models (Section VIII-B, RQ4).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "GPT-4 has 1.55x higher syntactic robustness than GPT-3.5.",
    385       "evidence": "Averaged across all distances and equations: GPT-4 outperforms GPT-3.5 by 1.55x (Section VIII-B, RQ1). Individual breakdowns shown in Figures 8-12.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Subtraction mutations cause the most confusion, swap mutations the least.",
    390       "evidence": "Figure 9 shows swap has highest robustness (34%/58% for GPT-3.5/4) and subtraction has lowest (22%/41%) (Section VIII-B, RQ3).",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Very narrow evaluation domain",
    397       "detail": "Only 7 equation types with 5 mutation rules in C code prompts, yet the paper title and framing suggest broad conclusions about 'LLM-based Code Generation.' The mathematical equation domain is a small slice of code generation use cases."
    398     },
    399     {
    400       "flag": "No variance reported despite multiple runs",
    401       "detail": "Each prompt was queried 5 times to address non-determinism, but no standard deviation, variance, or spread across runs is reported. Only aggregate robustness degrees are shown, hiding any run-to-run variation."
    402     },
    403     {
    404       "flag": "100% claim is circular by design",
    405       "detail": "The reduction rules are the inverse of the mutation rules (explicitly noted in Section VI). Reducing a mutated formula recovers the original, so achieving 100% robustness is expected by construction and does not demonstrate generalizability to arbitrary formula complexity."
    406     },
    407     {
    408       "flag": "Broken repository link",
    409       "detail": "Section VII states 'The working pipeline is available at Link' — this is a placeholder URL. The code and data cannot be independently verified."
    410     },
    411     {
    412       "flag": "No model version pinning",
    413       "detail": "GPT-3.5-Turbo and GPT-4 are used without specifying API version or snapshot date. Results may not be reproducible as OpenAI updates these models."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduced HumanEval benchmark for LLM code generation evaluation, which is the standard benchmark in this domain."
    423     },
    424     {
    425       "title": "An empirical study of the code generation of safety-critical software using LLMs",
    426       "authors": ["M. Liu", "J. Wang", "T. Lin"],
    427       "year": 2024,
    428       "relevance": "Studies LLM code generation for safety-critical software, arguing for the need for robustness evaluation."
    429     },
    430     {
    431       "title": "Large language models for software engineering: Survey and open problems",
    432       "authors": ["A. Fan", "B. Gokkaya", "M. Harman"],
    433       "year": 2023,
    434       "arxiv_id": "2310.03533",
    435       "relevance": "Comprehensive survey of LLMs for software engineering that defined robustness for code generation, which this paper extends."
    436     },
    437     {
    438       "title": "LLM is like a box of chocolates: the non-determinism of ChatGPT in code generation",
    439       "authors": ["S. Ouyang", "J. Zhang", "M. Harman"],
    440       "year": 2023,
    441       "relevance": "Empirical study of non-determinism in LLM code generation, directly relevant to this paper's handling of non-deterministic outputs."
    442     },
    443     {
    444       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    445       "authors": ["J. Liu", "C. Xia", "Y. Wang"],
    446       "year": 2023,
    447       "relevance": "Framework for rigorous evaluation of LLM code generation correctness, relevant methodology for code generation assessment."
    448     },
    449     {
    450       "title": "Coco: Testing code generation systems via concretized instructions",
    451       "authors": ["M. Yan", "J. Chen", "J. M. Zhang"],
    452       "year": 2023,
    453       "arxiv_id": "2308.13319",
    454       "relevance": "Tests code generation robustness via concretized instructions — a different robustness definition that this paper contrasts against."
    455     },
    456     {
    457       "title": "On the robustness of code generation techniques: An empirical study on GitHub Copilot",
    458       "authors": ["A. Mastropaolo", "L. Pascarella", "E. Guglielmi"],
    459       "year": 2023,
    460       "relevance": "Empirical study of code generation robustness using semantically equivalent natural language descriptors — complementary approach to this paper's formula mutations."
    461     },
    462     {
    463       "title": "GPT-4 technical report",
    464       "authors": ["OpenAI", "J. Achiam", "S. Adler"],
    465       "year": 2023,
    466       "arxiv_id": "2303.08774",
    467       "relevance": "Technical report for GPT-4, one of the two models evaluated in this study."
    468     },
    469     {
    470       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    471       "authors": ["D. Huang", "Q. Bu", "J. M. Zhang"],
    472       "year": 2023,
    473       "arxiv_id": "2312.13010",
    474       "relevance": "Multi-agent code generation approach achieving 96% on HumanEval, showing the state-of-the-art in LLM code generation."
    475     },
    476     {
    477       "title": "A systematic evaluation of large language models of code",
    478       "authors": ["F. F. Xu", "U. Alon", "G. Neubig"],
    479       "year": 2022,
    480       "relevance": "Systematic evaluation of code LLMs providing broader context for evaluating code generation capabilities."
    481     },
    482     {
    483       "title": "Competition-level code generation with AlphaCode",
    484       "authors": ["Y. Li", "D. Choi", "J. Chung"],
    485       "year": 2022,
    486       "relevance": "Demonstrates competition-level code generation capability, relevant to understanding the state of LLM code generation."
    487     }
    488   ]
    489 }

Impressum · Datenschutz