ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28490B)


      1 {
      2   "paper": {
      3     "title": "Interactive Code Generation via Test-Driven User-Intent Formalization",
      4     "authors": [
      5       "Shuvendu K. Lahiri",
      6       "Sarah Fakhoury",
      7       "Aaditya Naik",
      8       "Georgios Sakkas",
      9       "Saikat Chakraborty",
     10       "Madanlal Musuvathi",
     11       "Jeevana Priya Inala",
     12       "Piali Choudhury",
     13       "Curtis von Veh",
     14       "Chenglong Wang",
     15       "Jianfeng Gao"
     16     ],
     17     "year": 2022,
     18     "venue": "arXiv",
     19     "arxiv_id": "2208.05950",
     20     "doi": "10.48550/arXiv.2208.05950"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval"],
     25   "key_findings": "TiCoder demonstrates that interactive test-driven code generation using LLM-generated tests can substantially improve pass@1 accuracy — by 22.49% on MBPP (48.24% → 70.73%) and 24.79% on HumanEval (30.49% → 55.28%) with a single simulated user query. Discriminative test ranking and dynamic test mutation are the most impactful components in ablation. The approach generates a user-accepted test within an average of 1.5–1.7 queries for 87–96% of benchmark examples.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No code repository URL is provided in the paper. Section VI mentions 'We aim to mitigate this by releasing model generated output in the near future' — a promise of future release, not actual release."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "MBPP (sanitized) and HumanEval are public benchmarks, but the authors modified HumanEval by removing non-hidden input-output examples from docstrings (Section IV-B). The modified dataset and the cached Codex outputs are not released (only promised for future release in Section VI)."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment specifications, dependency lists, or software versions are provided beyond naming the Codex model."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided. The algorithmic description is detailed but there are no runnable scripts or reproduction guides."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results are reported as point estimates (e.g., 70.73%, 55.28%) with no confidence intervals or error bars on any figure or table."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper claims TiCoder 'outperforms' baselines based solely on comparing raw percentages (e.g., 70.73% vs 63.70% for CodeT on MBPP) without any statistical significance tests."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper reports absolute percentage improvements with baseline context throughout. For example, Section V-A reports TiCoder pass@1@1 of 70.73% vs Codex baseline 48.24%, and the abstract quantifies improvements as '22.49% to 37.71%' absolute percentage points."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "MBPP has 427 examples and HumanEval has 164. These are standard benchmarks adopted without any discussion of whether these sample sizes are adequate for the claims made."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Section IV-C explicitly states: 'we only query Codex once to generate the initial code and test suggestions into a cache of Codex responses and refer to the same cache for all experiments.' Single-run results with no variance, standard deviation, or spread measures reported."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Section IV-C and V-A compare against: Codex (t=0.8), Codex (t=0), a Baseline TiCoder variant (no mutation/ranking), CodeT, IdealRanking, and IdealTests upper bounds."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "CodeT (Chen et al., 2022) and AlphaCode (Li et al., 2022) are contemporary works. Codex code-davinci-002 was state-of-the-art at the time of evaluation."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section V-C presents detailed ablation studies (Tables II and III) removing each component individually: code prompt, single assert, dynamic mutation, test ranking, and code ranking."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper uses pass@k@m for various k∈{1,2,5,10} and m∈{0,1,2,3,4,5}, plus accept@m (Section II-C, Table I). These measure different aspects: code correctness and test quality respectively."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "All evaluation is automated using oracle simulation (reference implementations as proxy for user responses, Definition IV.1). Section VI acknowledges the lack of user study: 'We plan to conduct a user study to evaluate such metrics in practice.'"
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are evaluated against hidden test sets from MBPP and HumanEval benchmarks (the ground truth tests Tp), which are separate from the LLM-generated tests used in the approach."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Results are shown per benchmark (MBPP vs HumanEval) but there is no breakdown by problem difficulty, problem type, or category within each benchmark."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "No failure case analysis is presented. The paper does not discuss specific examples where TiCoder fails or where the approach breaks down, beyond the running example used for illustration."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Tables II-III) shows cases where removing components improves performance. For example, on HumanEval, removing the code prompt improves pass@1@1 from 55.27% to 59.62%. Section III-B notes that single-assert 'may also adversely impact the performance.'"
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims of 22.49%–37.71% improvement on MBPP and 24.79%–53.98% on HumanEval are directly supported by Figures 5 and 6 in Section V."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Causal claims are primarily in the ablation study (Section V-C), which uses controlled single-variable manipulation — removing one component at a time from the default configuration. This is adequate for the causal claims made."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title 'Interactive Code Generation via Test-Driven User-Intent Formalization' is broad, while results are limited to Python on two benchmarks using a single LLM (Codex). Section VI acknowledges that the findings 'may not generalize to a different set of programs across different languages and problem domains,' but the title and framing extend well beyond the tested setting."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The threats section (Section VI) discusses three threats but does not consider alternative explanations for the results — e.g., whether improvements come primarily from test-based filtering rather than the interactive component, or whether the oracle simulation inflates the benefit."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures pass@k on hidden test suites and claims improvement in 'code generation accuracy.' This is a reasonably tight measurement-to-claim mapping. Section VI explicitly acknowledges the proxy gap of simulated vs. real user interaction: 'our automated evaluation assumes the user is able to answer the generated tests, and cannot account for the cognitive effort.'"
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Section IV-C specifies 'OpenAI's Codex code-davinci-002 model' — a specific model version identifier."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Figure 3 shows the full code and test prompt structures with concrete examples. Section III-A describes the prompt construction options in detail (pass vs choose(G) for test prompts). The prompts are template-based with clearly specified fill values."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section IV-C reports: 'temperature of 0.8 and a top p of 0.95,' maximum generation length of 300 tokens, 100 code suggestions, and 50 test suggestions."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. TiCoder is a pipeline-based algorithm (generate → mutate → rank → query) without autonomous agent behavior."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section IV-B describes using 'the sanitized version of the MBPP dataset' and explicitly describes the HumanEval modification: 'We modify the original HumanEval dataset to remove any (non-hidden) input-output examples that are included in the docstring.'"
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VI 'Threats' is a dedicated section with substantive discussion of three specific threats to validity."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section VI discusses three specific threats: (1) benchmarks may not represent real-world development, (2) Codex model stochasticity and endpoint discontinuation threatening replicability, (3) oracle simulation cannot account for user cognitive effort."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section VI explicitly states: 'Our findings may not generalize to a different set of programs across different languages and problem domains.' It also notes the limitation to simulated rather than real user interaction."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The cached Codex outputs that all experiments build on are not released. Section VI states 'We aim to mitigate this by releasing model generated output in the near future' — future promise, not current availability."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section IV-C describes data collection: querying Codex with specific parameters (temperature 0.8, top-p 0.95, 100 code suggestions, 50 test suggestions per example), caching results, and using standard benchmark datasets."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. The study uses standard code generation benchmarks (MBPP and HumanEval) with simulated oracle interaction."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Algorithm 1 and Section III detail the full pipeline: LLM query for codes (line 1) → LLM query for tests (line 2) → static mutation (line 3) → dynamic mutation (line 4) → interactive ranking loop (lines 6-18). The oracle simulation is defined precisely in Definition IV.1."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding sources or acknowledgments section is present in the paper."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly listed: Microsoft Research (majority), University of Pennsylvania, and University of California San Diego. Internship relationships are noted."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The majority of authors are from Microsoft Research, which has a major investment in OpenAI (whose Codex model is evaluated). Microsoft has a financial interest in positive results for LLM-based code generation. This conflict is not acknowledged."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No mention of Codex code-davinci-002's training data cutoff date. The paper does not state when the model's training data was collected."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of whether MBPP or HumanEval problems appeared in Codex's training data. MBPP was publicly available before Codex training."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "MBPP was published in 2021 and HumanEval was introduced alongside Codex. No discussion of whether benchmark solutions were in the training data, despite both benchmarks being publicly available."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. All user interaction is simulated via oracle (reference implementation)."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. Evaluation is fully automated using oracle simulation."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in the study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The approach queries Codex for 100 code suggestions and 50 test suggestions per example, plus user interaction rounds. No API costs, latency, or per-example cost is reported."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No total computational budget, API spend, or hardware details are provided despite substantial LLM API usage across 427 + 164 benchmark examples."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Section IV-C states: 'we only query Codex once to generate the initial code and test suggestions into a cache.' Single-run results with no seed sensitivity analysis."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "Section IV-C explicitly states they query Codex once and cache the results: 'we only query Codex once to generate the initial code and test suggestions into a cache of Codex responses and refer to the same cache for all experiments.'"
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Section IV-C states 'We chose this configuration as default empirically as it performs the best on the pass@1@1 metric on the MBPP dataset,' but does not report how many configurations were searched or the computational cost of this search."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Section IV-C justifies the default configuration: 'We chose this configuration as default empirically as it performs the best on the pass@1@1 metric on the MBPP dataset.' The ablation study (Tables II-III) shows all alternative configurations tested."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Many pairwise comparisons are made across configurations, metrics, and benchmarks with no statistical significance tests and therefore no multiple comparison correction."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors compare TiCoder against their own implementation of CodeT and other baselines without acknowledging the potential bias of evaluating their own system against self-implemented competitors."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "TiCoder requires substantially more computation than baseline Codex (an additional LLM query for test generation, plus test mutation and potentially multiple user-interaction rounds), but performance is never compared at matched compute budgets."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether MBPP and HumanEval actually measure real-world code generation capability. The paper uses these benchmarks without questioning construct validity."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No scaffolding is involved. TiCoder is an algorithmic pipeline, not an agentic scaffold. Comparisons are between algorithmic variants, not scaffold configurations."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "MBPP (2021) and HumanEval (2021) were both published before Codex's training data was collected. No discussion of temporal leakage or whether solutions to these benchmarks were in the training data."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the evaluation setup leaks information. The oracle simulation provides perfect answers, which may inflate the perceived benefit compared to real user interaction."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether MBPP or HumanEval problems share structural similarities with Codex's training data or with each other."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "TiCoder improves pass@1 from 48.24% to 70.73% on MBPP and from 30.49% to 55.28% on HumanEval with a single simulated user query.",
    377       "evidence": "Section V-A, Figures 5a and 5b show pass@k@1 results across all baselines. The numbers are clearly presented in both figures and text.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "With 5 user queries, TiCoder achieves pass@1 of 85.95% on MBPP and 84.47% on HumanEval.",
    382       "evidence": "Section V-B, Figures 6a and 6b show pass@1@m for m=0 through 5 with clear progression.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "TiCoder generates a user-accepted test within an average of 1.7 queries for 87.12% of MBPP and 1.5 queries for 95.73% of HumanEval examples.",
    387       "evidence": "Section V-B, Figure 7 shows cumulative accept@m curves for both benchmarks.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Each component (test ranking, dynamic mutation, code ranking, static mutation, prompt design) contributes to TiCoder's effectiveness.",
    392       "evidence": "Section V-C, Tables II and III show ablation results. Test ranking and dynamic mutation are most impactful. However, the default configuration is not uniformly best across all metrics and both benchmarks.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "TiCoder outperforms CodeT, which uses no user interaction, on MBPP with a single query and on HumanEval with 2 queries.",
    397       "evidence": "Section V-A shows TiCoder pass@1@1=70.73% vs CodeT 63.70% on MBPP. On HumanEval, TiCoder pass@1@1=55.28% is below CodeT's 58.54%, but with 2 queries TiCoder reaches 68.94% (Figure 6b).",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Test mutation techniques improve the pool of tests over purely LLM-generated tests, shown by 10.72% (MBPP) and 13.74% (HumanEval) accuracy gain after 5 interactions.",
    402       "evidence": "Section V-B, Figures 6a and 6b compare IdealRanking (with mutations) vs BaselineIdeal (LLM-only tests with ideal ranking).",
    403       "supported": "strong"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "Simulated user evaluation only",
    409       "detail": "All experiments use an oracle (reference implementation) instead of real users. The cognitive cost and accuracy of real user responses are unknown. The paper acknowledges this but provides no user study, making the claimed 'interactive' benefit unvalidated with actual users."
    410     },
    411     {
    412       "flag": "No error bars or variance on any result",
    413       "detail": "All results come from a single cached Codex query. No repeated runs, no confidence intervals, no statistical tests. The stochasticity of Codex (acknowledged in Section VI) means results could vary substantially across runs."
    414     },
    415     {
    416       "flag": "No contamination analysis",
    417       "detail": "Both MBPP and HumanEval were publicly available before Codex training. The high pass@100 values (89.70% MBPP, 90.85% HumanEval) could partly reflect memorization. No contamination check is performed."
    418     },
    419     {
    420       "flag": "Microsoft-OpenAI financial relationship",
    421       "detail": "The majority of authors are Microsoft Research employees evaluating OpenAI's Codex model. Microsoft is a major investor in OpenAI. This conflict of interest is not disclosed or acknowledged."
    422     },
    423     {
    424       "flag": "Discontinued model",
    425       "detail": "Codex code-davinci-002 was discontinued by OpenAI in March 2023. Results cannot be verified or replicated with the same model, and the cached outputs are not released."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Evaluating large language models trained on code",
    431       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    432       "year": 2021,
    433       "arxiv_id": "2107.03374",
    434       "relevance": "Introduced Codex and HumanEval benchmark, the primary model and one of two benchmarks used in this study."
    435     },
    436     {
    437       "title": "Program synthesis with large language models",
    438       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    439       "year": 2021,
    440       "arxiv_id": "2108.07732",
    441       "relevance": "Introduced the MBPP benchmark used in this study for evaluating code generation."
    442     },
    443     {
    444       "title": "CodeT: Code generation with generated tests",
    445       "authors": ["Bei Chen", "Fengji Zhang", "Anh Nguyen"],
    446       "year": 2022,
    447       "relevance": "Primary baseline that uses LLM-generated tests to improve code ranking without user interaction."
    448     },
    449     {
    450       "title": "Competition-level code generation with AlphaCode",
    451       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    452       "year": 2022,
    453       "relevance": "Uses test generation and code clustering for competitive programming — related approach to test-based code filtering."
    454     },
    455     {
    456       "title": "Productivity assessment of neural code completion",
    457       "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X. Alice Li"],
    458       "year": 2022,
    459       "relevance": "Evaluates Copilot productivity in real-world developer scenarios, directly relevant to AI-assisted coding evaluation."
    460     },
    461     {
    462       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    463       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"],
    464       "year": 2022,
    465       "relevance": "User study showing developers struggle to understand LLM code suggestions, motivating the interactive approach."
    466     },
    467     {
    468       "title": "Do users write more insecure code with AI assistants?",
    469       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    470       "year": 2022,
    471       "arxiv_id": "2211.03622",
    472       "relevance": "Studies security implications of AI-assisted coding, relevant to LLM code generation safety."
    473     },
    474     {
    475       "title": "CodaMosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    476       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
    477       "year": 2023,
    478       "relevance": "Uses LLMs to improve automated test generation, directly relevant to LLM-based testing capabilities."
    479     },
    480     {
    481       "title": "TOGA: A neural method for test oracle generation",
    482       "authors": ["Elizabeth Dinella", "Gabriel Ryan", "Todd Mytkowicz", "Shuvendu Lahiri"],
    483       "year": 2022,
    484       "relevance": "Neural approach to generating test oracles, relevant to LLM-based test generation for code verification."
    485     },
    486     {
    487       "title": "Adaptive test generation using a large language model",
    488       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    489       "year": 2023,
    490       "arxiv_id": "2302.06527",
    491       "relevance": "LLM-based adaptive test generation, directly relevant to using LLMs for software testing."
    492     },
    493     {
    494       "title": "PaLM: Scaling language modeling with pathways",
    495       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    496       "year": 2022,
    497       "relevance": "Major LLM with code generation capabilities, relevant to the landscape of code-generation models."
    498     },
    499     {
    500       "title": "A systematic evaluation of large language models of code",
    501       "authors": ["Frank F. Xu", "Uri Alon", "Graham Neubig", "Vincent J. Hellendoorn"],
    502       "year": 2022,
    503       "relevance": "Systematic evaluation of code LLMs including PolyCoder, relevant to benchmarking code generation models."
    504     },
    505     {
    506       "title": "Jigsaw: Large language models meet program synthesis",
    507       "authors": ["Naman Jain", "Skanda Vaidyanath", "Arun Iyer", "Nagarajan Natarajan"],
    508       "year": 2022,
    509       "relevance": "Uses user-provided tests to improve LLM code generation quality, closely related approach."
    510     }
    511   ]
    512 }

Impressum · Datenschutz