ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28327B)


      1 {
      2   "paper": {
      3     "title": "Can Vibe Coding Beat Graduate CS Students? An LLM vs. Human Coding Tournament on Market-driven Strategic Planning",
      4     "authors": [
      5       "Panayiotis Danassis",
      6       "Naman Goel"
      7     ],
      8     "year": 2025,
      9     "venue": "arXiv",
     10     "arxiv_id": "2511.20613"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides a code URL: 'Code available at: https://panayiotisd.github.io/apdp_bench/' (page 1, Section 1). The benchmark is described as open-sourced."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The benchmark platform and all agent code (including human-coded agents) are released at the same URL. The tournament configurations (network topologies of Switzerland, France, Great Britain, Netherlands) and task generation parameters are described. The benchmark itself is the data."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is mentioned. The paper states 'All code is implemented in Java' (Section 4.1) but provides no version details, dependency specifications, or environment configuration."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. While the code is released at a URL, there is no description of how to run the tournaments, set up the environment, or replicate the specific experimental configurations used."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper reports standard deviations for wins/losses per tournament in Table 1, but does not report confidence intervals or error bars for the main comparison claims (e.g., that human agents dominate). The SD values are descriptive variance across tournaments, not inferential uncertainty measures."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims 'a clear superiority of human-coded agents' and that the LLM 'makes the solution significantly worse' (Section 5.3), but no statistical significance tests are reported. Comparisons are made by rank ordering win rates without any formal test."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper provides win rates, total wins/losses, and average wins per tournament with baseline context. For example, Student 1 has 96.58% win rate vs. LLM(O,IR,1) at 85.19%. The Section 5.3 improvement experiment reports a drop from 1st to 10th place (89.667 average wins vs. 108.167). These provide magnitude of differences with clear baselines."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for why 40 LLM agents and 17 human agents were chosen. The 12 student agents were selected from a single year's course (2020), with no discussion of whether this sample size is sufficient for the claims being made."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Table 1 reports standard deviation of wins and losses per tournament for all 57 agents across 12 tournaments. For example, 'SD #Wins / Tour' and 'SD #Losses / Tour' columns are provided."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Five baseline agents are included: Naive, ExpCostFixedBid, Honest, ModelOpponent, and RiskSeeking (Section 4.1). These range from very simple to moderately sophisticated strategies."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper uses state-of-the-art LLMs as of 2025 (GPT-5 Thinking, Gemini 2.5 Pro, Claude Opus 4.1, DeepThink R1) and compares against pre-LLM-era human agents. The baselines are appropriate because the research question is whether LLMs can match human performance on this task, and the human agents serve as the benchmark."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "While not a traditional ablation, the paper systematically varies prompting strategies (A1, A2, IR, CR, GEN) across all four LLMs, effectively ablating the prompt component. Additionally, preliminary experiments (Section 5.1) test simpler variants (Reactive, Deliberative, Centralized) to isolate where LLMs fail."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper reports multiple metrics in Table 1: average wins per tournament, standard deviation, total wins, total losses, and win rate. The preliminary evaluation also uses profit comparison (Section 5.1, Centralized variant: '19% higher to 194% lower')."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "There is no human evaluation of the quality of the generated code beyond the automated tournament results. The paper does not have humans inspect or rate the LLM-generated agents' code quality, strategic sophistication, or design decisions. The tournament is an automated competitive evaluation."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Tasks are drawn randomly at tournament time, but there is no distinction between development and test configurations. The same tournament setup was used for debugging and final evaluation. The paper notes LLMs were 'tested/debugged in self-play and tournament settings' (Section 5.2), meaning the evaluation environment was also used for development."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Table 1 provides per-agent breakdowns for all 57 agents. The paper also breaks down by LLM model and prompting strategy (using the naming scheme explained in Table 1). Results across 4 network topologies are aggregated but per-topology results are not shown separately."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.2 (Debugging) discusses specific failure cases: time-out violations, not delivering won tasks, capacity constraint violations, and LLMs failing to fix persistent bugs. Section 5.1 discusses inadmissible heuristics and suboptimal design decisions. Section 5.3 discusses the degradation of the winning solution."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The entire paper is effectively a negative result for LLMs. Additionally, the iterative refinement and critic strategies often did not improve performance, and providing the best human solution to GPT-5 made it worse (Section 5.3). The paper is transparent about LLM failures."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract's three claims — (i) the top 5 spots are held by human-coded agents, (ii) 33 of 40 LLM agents are beaten by simple baselines, and (iii) the LLM degrades the winning solution — are all directly supported by Table 1 and Section 5.3 results."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper implies causal claims about LLMs' 'limited reasoning and planning capabilities' (Section 6) causing poor performance, but the study design cannot distinguish whether poor results are due to reasoning limitations, prompt quality, or the Java-specific implementation requirements. The paper does not systematically control for these confounds."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 6 explicitly bounds the conclusions: 'Our conclusions are bounded by a single, albeit rich, domain (logistics PDP). We also do not claim that LLM performance observed in this paper is the optimal performance the best LLM can ever achieve in this setting with any prompt.' The paper acknowledges dedicated coding models and better prompts could yield different results."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 6 discusses multiple alternative explanations: the results may not generalize beyond the logistics domain, dedicated fine-tuned coding models might perform better, optimal prompting strategies were not explored, and professional engineers might outperform students. The paper also acknowledges that student agents were not debugged while LLM agents were, which could bias results in favor of LLMs."
    129       }
    130     },
    131     "setup_transparency": {
    132       "model_versions_specified": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses marketing names only: 'GPT-5 Thinking', 'Gemini 2.5 Pro', 'Claude Opus 4.1', and 'DeepThink R1' (Section 4.2). No API version numbers, snapshot dates, or specific model IDs are provided."
    136       },
    137       "prompts_provided": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper states 'The supplement contains the complete prompts' (Section 4.2) but only describes the prompts in natural language within the paper itself. The supplement is referenced but not included in the paper text. The description of Author Prompt #1 is a summary, not the actual prompt text."
    141       },
    142       "hyperparameters_reported": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "No hyperparameters are reported: no temperature, top-p, max tokens, or other API settings are mentioned for any of the LLM calls. The paper does not state what sampling parameters were used."
    146       },
    147       "scaffolding_described": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "The paper uses LLMs for one-shot code generation with iterative bug-fixing prompts, not agentic scaffolding with tools, memory, or feedback loops. The approach is direct prompting, not scaffolded agent architecture."
    151       },
    152       "data_preprocessing_documented": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The paper documents how student agents were selected: 'top 8 agents from the 2020 tournament' plus '4 additional student agents that performed well against baseline agents' (Section 4.1). The LLM agent generation process is described: 4 LLMs x 5 prompting strategies x 2 runs = 40 agents (Section 4.2). Tournament parameters (4 topologies, 3 tournaments per topology, 50 tasks per match) are specified."
    156       }
    157     },
    158     "limitations_and_scope": {
    159       "limitations_section_present": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 6 is titled 'IMPLICATIONS, LIMITATIONS, AND FUTURE WORK' and contains substantive discussion of limitations spanning multiple paragraphs."
    163       },
    164       "threats_to_validity_specific": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6 discusses specific threats: 'Our conclusions are bounded by a single, albeit rich, domain', 'We also do not claim that LLM performance observed in this paper is the optimal performance', acknowledges 'dedicated models, specifically fine-tuned for coding' might differ, and notes that 'professional software engineers could presumably be better than the students in this study'."
    168       },
    169       "scope_boundaries_stated": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6 explicitly states: 'Our work is not concerned with finding the optimal prompting strategy, the best LLM, or to create a leader-board of LLMs.' It also states the scope is bounded to 'a single, albeit rich, domain (logistics PDP)' and explains why model names are not highlighted in the ranking."
    173       }
    174     },
    175     "data_integrity": {
    176       "raw_data_available": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "While the benchmark code is released, the raw match results (individual match outcomes for all ~40k matches), LLM conversation logs, generated code, and debugging iterations are not described as being available for verification."
    180       },
    181       "data_collection_described": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper describes how data was collected: student agents from the 2020 EPFL Intelligent Agents course, baseline agents from the AI Lab, LLM agents generated via 5 prompting strategies with 4 LLMs, each run twice. Tournament setup (12 double all-play-all tournaments, 4 topologies, ~40k matches) is fully described."
    185       },
    186       "recruitment_methods_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Student agent recruitment is described: agents from the 2020 postgraduate Intelligent Agents course at EPFL, selected as top 8 from the single-elimination tournament plus 4 additional based on wins against baselines (Section 4.1). The paper acknowledges selection may not capture the true top human agents due to single-elimination format."
    190       },
    191       "data_pipeline_documented": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The pipeline from agent generation to tournament results is documented: LLM code generation → debugging → self-play testing → tournament evaluation. The paper specifies how many agents were generated vs. evaluated and notes significant manual effort was required to get bug-free agents."
    195       }
    196     },
    197     "conflicts_of_interest": {
    198       "funding_disclosed": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No funding source is mentioned. The Acknowledgments section thanks Professor Faltings and the AI Lab at EPFL, but does not disclose funding. The paper notes the work was partially conducted while P.D. was at Telenor Research, but no explicit funding disclosure is provided."
    202       },
    203       "affiliations_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Author affiliations are clearly stated: Panayiotis Danassis (University of Southampton), Naman Goel (University of Oxford and Alan Turing Institute). The paper also notes P.D. was previously at Telenor Research. Neither author is affiliated with the LLM companies being evaluated."
    207       },
    208       "funder_independent_of_outcome": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source is disclosed, so it is impossible to assess funder independence. The absence of a funding disclosure does not establish that the work was unfunded."
    212       },
    213       "financial_interests_declared": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No competing interests or financial interests statement is present in the paper."
    217       }
    218     },
    219     "contamination": {
    220       "training_cutoff_stated": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "The paper does not state the training data cutoff dates for any of the four LLMs used (GPT-5 Thinking, Gemini 2.5 Pro, Claude Opus 4.1, DeepThink R1). This is relevant because the student code was developed in 2020 and is being released publicly."
    224       },
    225       "train_test_overlap_discussed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Section 5.3 explicitly discusses contamination: 'Many benchmarks are often accompanied with data contamination concerns (i.e., the correct answers were already in the training data). On the other hand, this experiment shows that, in our benchmark, even when we expose a good solution in-context, the LLM is still unable to utilize it.' The paper argues contamination is less of a concern because LLMs fail even with the solution provided."
    229       },
    230       "benchmark_contamination_addressed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The paper addresses contamination by design: the APDP benchmark is novel and was not previously published as a standard benchmark. Additionally, the in-context experiment (Section 5.3) demonstrates that even with direct access to a winning solution, the LLM cannot improve it, arguing contamination is not a confound."
    234       }
    235     },
    236     "human_studies": {
    237       "pre_registered": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper uses pre-existing student code from a 2020 course assignment, not a new human subjects study. No new human participants were recruited for this research."
    241       },
    242       "irb_or_ethics_approval": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The student agents were course submissions from 2020, used as benchmark data. No new human participants were involved in the study, so IRB approval is not applicable."
    246       },
    247       "demographics_reported": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No new human participants were recruited. The student agents are from a postgraduate course, but demographic reporting is not applicable since this is not a human subjects study."
    251       },
    252       "inclusion_exclusion_criteria": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants were recruited for the study. Student agent selection criteria are described under data collection, not human subjects methodology."
    256       },
    257       "randomization_described": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects experiment was conducted. The tournament uses random task generation and company swapping for fairness, but this is experimental design, not human subjects randomization."
    261       },
    262       "blinding_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects experiment was conducted requiring blinding."
    266       },
    267       "attrition_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants were involved in the study. Agent crash rates are reported (Section 5.2) but this is system reliability, not participant attrition."
    271       }
    272     },
    273     "cost_and_practicality": {
    274       "inference_cost_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No API costs, token counts, or wall-clock times are reported for the LLM code generation process. The paper does not mention how much it cost to generate and debug the 40 LLM agents."
    278       },
    279       "compute_budget_stated": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No computational budget is stated. The paper does not report the compute resources used for running ~40k tournament matches or the cost of LLM API calls during agent generation and debugging."
    283       }
    284     }
    285   },
    286   "claims": [
    287     {
    288       "claim": "The top 5 spots in the tournament are consistently won by human (student)-coded agents.",
    289       "evidence": "Table 1 shows Students 1-5 occupying ranks 1-5 across all 12 double all-play-all tournaments, with win rates from 96.58% to 86.16%.",
    290       "supported": "strong"
    291     },
    292     {
    293       "claim": "The majority of LLM-coded agents (33 out of 40) are beaten by very simple baseline agents.",
    294       "evidence": "Table 1 shows ExpCostFixedBid (a simple baseline) at rank 18 with 68.9% win rate. Only 7 LLM agents rank above it; 33 rank below. Of the 7 that rank above it, all are GPT-5 variants except one DeepSeek and one Gemini.",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "Given the best human solution as input and prompted to improve upon it, the best performing LLM makes the solution significantly worse.",
    299       "evidence": "Section 5.3 describes providing Student 1's code to GPT-5 Pro and asking for improvements. The resulting agent dropped from 1st to 10th place, with average wins falling from 108.167 to 89.667.",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "LLMs can generate syntactically correct code but struggle with reasoning-driven tasks requiring planning, optimization, and strategic interaction.",
    304       "evidence": "Section 5.1 preliminary results show LLMs produced running code but with semantic bugs (inadmissible heuristics, ignoring constraints). Section 4.2 notes 'significant manual effort' to achieve bug-free code. Tournament results (Table 1) show poor competitive performance.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "This is the first comparison of LLMs vs. humans in the domain of real-world optimization code generation.",
    309       "evidence": "Section 1 states 'This is the first work, to the best of our knowledge, that answers two crucial questions.' Related work (Section 2) surveys existing benchmarks and shows none involve direct LLM vs. human competition on multi-agent optimization problems.",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "Even with in-context access to a good solution, LLMs cannot utilize it effectively, mitigating data contamination concerns.",
    314       "evidence": "Section 5.3 describes the experiment where GPT-5 was given the winning student solution and asked to improve it, but the result was significantly worse. The paper argues this shows contamination is not a confound for their benchmark.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "benchmark-eval"
    320   ],
    321   "key_findings": "This paper introduces the Auction, Pickup, and Delivery Problem (APDP) benchmark, a multi-agent competitive logistics optimization problem, and evaluates 40 LLM-coded agents against 17 human-coded agents across ~40k tournament matches. Human (graduate student)-coded agents consistently dominated the top 5 positions, while 33 of 40 LLM agents lost to simple baseline strategies. Even when the best-performing LLM (GPT-5) was given the winning human solution and asked to improve it, the result dropped from 1st to 10th place, highlighting a significant gap between LLMs' syntactic code generation ability and their capacity for reasoning-driven optimization in competitive settings.",
    322   "red_flags": [
    323     {
    324       "flag": "No statistical significance tests",
    325       "detail": "Despite claiming 'clear superiority' and that LLMs make solutions 'significantly worse,' no formal statistical tests are conducted. With 12 tournaments and known variance, significance tests would strengthen the claims considerably."
    326     },
    327     {
    328       "flag": "No hyperparameters or model versions reported",
    329       "detail": "The paper uses only marketing names for LLMs (GPT-5 Thinking, Gemini 2.5 Pro, etc.) without API versions, snapshot dates, or temperature/sampling settings. These can substantially affect code generation quality."
    330     },
    331     {
    332       "flag": "Asymmetric debugging treatment",
    333       "detail": "LLM agents were thoroughly debugged while student agents were not debugged at all (crashes counted as LLM wins). The paper acknowledges this favors LLMs, but the asymmetry complicates fair comparison. Student agents with crashes (e.g., Student 9 with 184 crashes) might rank much higher if debugged."
    334     },
    335     {
    336       "flag": "Student agent selection may not be representative",
    337       "detail": "Student agents were selected from a single year (2020) of a single course at EPFL. The top 8 were chosen via single-elimination (which the paper acknowledges may not select the best agents), and 4 more were added by a different criterion. This selection process is not guaranteed to be representative of graduate student capability."
    338     },
    339     {
    340       "flag": "Prompts described but not shown",
    341       "detail": "The actual prompts are in a supplement, but the paper provides only a natural language description of Prompt A1. Without seeing the exact prompts, it's impossible to assess whether prompt quality explains the results."
    342     },
    343     {
    344       "flag": "No cost reporting",
    345       "detail": "Generating 40+ LLM agents with iterative debugging across 4 frontier models likely involved significant API costs. This is not reported, limiting reproducibility and practical assessment."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    351       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    352       "year": 2025,
    353       "arxiv_id": "2507.09089",
    354       "relevance": "RCT finding developers were 19% slower with AI tools, directly relevant to AI coding productivity claims."
    355     },
    356     {
    357       "title": "Evaluating large language models trained on code",
    358       "authors": ["Mark Chen", "Jerry Tworek"],
    359       "year": 2021,
    360       "arxiv_id": "2107.03374",
    361       "relevance": "Introduced HumanEval benchmark and pass@k metric, foundational to LLM code generation evaluation."
    362     },
    363     {
    364       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    365       "authors": ["Carlos E Jimenez", "John Yang"],
    366       "year": 2024,
    367       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks."
    368     },
    369     {
    370       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    371       "authors": ["Naman Jain", "King Han"],
    372       "year": 2025,
    373       "relevance": "Contamination-free code evaluation benchmark, relevant to evaluation methodology."
    374     },
    375     {
    376       "title": "Large Language Models for Code Generation: A Comprehensive Survey of Challenges, Techniques, Evaluation, and Applications",
    377       "authors": ["Nam Huynh", "Beiyu Lin"],
    378       "year": 2025,
    379       "arxiv_id": "2503.01245",
    380       "relevance": "Survey of LLM code generation covering challenges, techniques, and evaluation methods."
    381     },
    382     {
    383       "title": "Can large language models reason and plan?",
    384       "authors": ["Subbarao Kambhampati"],
    385       "year": 2024,
    386       "relevance": "Directly relevant to the reasoning and planning limitations of LLMs discussed in this paper."
    387     },
    388     {
    389       "title": "Competition-level code generation with alphacode",
    390       "authors": ["Yujia Li", "David Choi"],
    391       "year": 2022,
    392       "relevance": "Demonstrated LLM competitive coding performance, claims this paper challenges."
    393     },
    394     {
    395       "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers",
    396       "authors": ["Zheyuan Kevin Cui"],
    397       "year": 2025,
    398       "relevance": "Field experiments on AI impact on developer productivity, relevant to AI-assisted coding evaluation."
    399     },
    400     {
    401       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    402       "authors": ["Terry Yue Zhuo"],
    403       "year": 2025,
    404       "relevance": "Code generation benchmark with complex instructions, relevant to evaluation methodology."
    405     },
    406     {
    407       "title": "SWE-bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?",
    408       "authors": ["Xiang Deng", "Jeff Da"],
    409       "year": 2025,
    410       "arxiv_id": "2509.16941",
    411       "relevance": "Evaluates AI agents on long-horizon software engineering tasks, directly relevant to agentic coding capabilities."
    412     },
    413     {
    414       "title": "To the Cutoff... and Beyond? A Longitudinal Perspective on LLM Data Contamination",
    415       "authors": ["Manley Roberts", "Himanshu Thakur"],
    416       "year": 2024,
    417       "relevance": "Addresses data contamination in LLM benchmarking, a key concern this paper discusses."
    418     },
    419     {
    420       "title": "AlphaEvolve: A Gemini-powered coding agent for designing advanced algorithms",
    421       "authors": ["Alexander Novikov"],
    422       "year": 2025,
    423       "relevance": "LLM-powered system for algorithm design, relevant to LLM code generation capabilities beyond autocomplete."
    424     }
    425   ]
    426 }

Impressum · Datenschutz