scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27080B)
      1 {
      2   "paper": {
      3     "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?",
      4     "authors": ["Samuel Miserendino", "Michele Wang", "Tejal Patwardhan", "Johannes Heidecke"],
      5     "year": 2025,
      6     "venue": "International Conference on Machine Learning",
      7     "arxiv_id": "2502.12115",
      8     "doi": "10.48550/arXiv.2502.12115"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper open-sources a unified Docker image and public evaluation split at https://github.com/openai/SWELancer-Benchmark, mentioned in the abstract and throughout."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "SWE-Lancer Diamond (public evaluation split worth $500,800) is released. The full dataset is 'available upon request' — but the public split is a substantial release."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Appendix A5 specifies the Docker container setup: 'Microsoft Azure Standard D2as v4 virtual machine with 2 vCPUs and 8GB RAM', 64GB shared memory, 192GB max memory, pre-built Docker image with all libraries installed."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The open-source evaluation harness and Docker image are provided. Appendix A5 describes the execution environment and scaffold details. The GitHub repo contains the eval harness."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Main results in Table 1 are single-run pass@1 with no confidence intervals or error bars. The paper acknowledges 'there can be significant variance between runs' (footnote 2) but does not report CIs for main results. Only the price analysis (Section 3.6) reports a 95% CI."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims Claude 3.5 Sonnet 'outperforms' o1 by 9.7 percentage points on IC SWE Diamond tasks (26.2% vs. 16.5% pass@1) but provides no statistical significance test. All comparisons are raw point estimates."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Results are reported as pass@1 percentages and dollar amounts with baselines, e.g., 'Claude 3.5 Sonnet scores 26.2% on IC SWE tasks' and '$208,050 out of $500,800', providing sufficient context to understand magnitude."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification for why 1,488 tasks were selected or why the Diamond split is 502 tasks. No power analysis or discussion of whether this sample size is sufficient for the comparisons made."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Main results are single-run pass@1. The paper states 'each is just a single rollout' (Appendix A5). The pass@k experiments use multiple runs but the main results table does not report variance."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple frontier models compared: GPT-4o, o1, Claude 3.5 Sonnet, plus open-source models Deepseek-R1 and Llama 3.3 70B in the appendix."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "GPT-4o, o1, and Claude 3.5 Sonnet were all frontier models at the time of evaluation. Open-source models also included."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Multiple ablations: user tool removal (Section 3.5), reasoning effort levels for o1 (Section 3.4), and pass@k with varying attempts (Section 3.3)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Reports pass@1, dollar earnings, earn rate, and pass@k. Also breaks down by IC SWE vs SWE Manager tasks."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "100 professional software engineers reviewed tasks. End-to-end tests were triple-verified by experienced engineers. SWE Manager ground truth validated with 99% agreement from experienced engineers (Section 2)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The benchmark maintains a $499,200 private holdout set (Appendix A11) separate from the Diamond public split, to mitigate contamination."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Tables 2 and 3 break down pass rates by task type (Application Logic, UI/UX, Server-Side Logic) and nature of work (Bug Fixes, New Features, Maintenance). Price range breakdowns in Figure 8."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Appendix A9 provides detailed qualitative analysis of failure trajectories for o1, GPT-4o, and Claude 3.5 Sonnet on specific tasks. Section 3.7 discusses common failure patterns."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Models fail the majority of tasks. The paper reports GPT-4o fails 90% of IC SWE tasks, open-source models perform poorly (3.8-5.9% on IC SWE Diamond), and user tool removal has minimal impact."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims that models 'are still unable to solve the majority of tasks' are supported by Table 1 (best is 26.2% on IC SWE). The '1,400 freelance tasks' and '$1 million' claims match the dataset description."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims are limited and justified: 'higher reasoning effort improves pass@1' is supported by controlled ablation varying only reasoning effort (Section 3.4). 'More attempts leads to consistent increase' is supported by pass@k (Section 3.3)."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 4 (Limitations) explicitly bounds generalization: tasks are from one repository (Expensify), one platform (Upwork), one language stack (JS/TS), and freelance tasks are 'more self-contained than full-time software engineering tasks.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not substantively discuss alternative explanations for performance differences. For example, Claude 3.5 Sonnet outperforming o1 could be due to scaffold compatibility, but this is not explored. The scaffold confound is not addressed."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly maps task completion to monetary payout, with the payout being the actual freelancer payment. Section 3.6 carefully analyzes cost savings with explicit limitations. The paper acknowledges freelance tasks don't represent all SWE work."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 3.1 specifies 'gpt-4o-2024-08-06', 'o1' with high reasoning effort, and 'claude-3-5-sonnet-20240620' — exact API version strings provided."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt templates for both IC SWE and SWE Manager tasks are provided in Appendix A8, including the exact text sent to models."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix A5 states: 'All rollouts are conducted with temperature 1.0', 'maximum number of tool calls is 100', 'maximum time allowed is 3 hours', and reasoning effort levels are specified for o1."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Appendix A5 describes the scaffold: 'a basic scaffold that lets them browse the local codebase, modify files, and execute terminal commands.' The user tool is described in Section 2.2. Docker container setup is detailed."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 2.2 documents the 5-step benchmark construction pipeline: repository selection, task selection with review criteria, task generation, E2E test development, and user tool creation. Appendix A6 details the curation criteria."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4 is a dedicated 'Limitations' section covering diversity of repositories, scope, modalities, environments, and contamination."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The paper discusses specific threats: single repository (Expensify), underrepresentation of infrastructure tasks, text-only evaluation missing screen recordings, models cannot ask clarifying questions, and specific contamination risks from public GitHub issues."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4 explicitly states that 'SWE-Lancer would also be less representative of \"zero to one\" SWE work' and that 'we remain cautious about extrapolating impact beyond' freelance engineering; it also notes that infrastructure tasks are underrepresented."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Diamond evaluation split is open-sourced with the Docker image, including task descriptions, end-to-end tests, and codebase snapshots. Full dataset available upon request."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 2.2 describes collection: tasks sourced from Upwork postings for the Expensify repository, real tasks previously solved by paid contributors, reviewed by 100 professional software engineers."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The paper mentions '100 professional software engineers' and 'a team of ten experienced engineers' but does not describe how these engineers were recruited, their qualifications, or whether recruitment introduces bias."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 2.2 documents the full pipeline: repository selection → task selection with multi-engineer review → task generation → E2E test development with triple verification. Appendix A6 provides detailed curation criteria."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No explicit funding source disclosed. Acknowledgements mention OpenAI organizations sponsoring the work but no formal funding statement."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All four authors are listed as affiliated with OpenAI (footnote 1). The paper header states '1OpenAI'."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "OpenAI funded and conducted this research. OpenAI has a financial interest in demonstrating LLM capabilities and benchmarking its own models (GPT-4o, o1). The funder is not independent of the outcome."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement. OpenAI employees evaluating OpenAI models (GPT-4o, o1) alongside competitors have clear financial interests that are not formally declared."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Table 4 (Appendix A2) lists training cutoff dates: GPT-4o (2023-10-01), o1 (2023-10-01), Claude 3.5 Sonnet (2024-04-01). Used for contamination analysis."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section 4 discusses contamination risk: 'tasks originate from public GitHub issues between 2023 and 2024. Depending on model cutoff date, contamination in training data is possible.' Table 4 analyzes pre/post-cutoff performance."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Multiple measures: private holdout set (Appendix A11), no internet access during evaluation, GitHub remote removed, Table 4 shows 'no clear performance improvement for tasks predating the models' knowledge cutoffs.' The holdout set uses re-introduced bugs to resist contamination."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants as subjects. The software engineers are annotators/evaluators, not study participants."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human subjects study. Engineers were hired to create/validate benchmark tasks, not studied as participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human subjects study. The paper describes annotators as 'professional software engineers' and 'experienced software engineers' but these are not study participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Section 3.6 reports API costs per run on the 237 Diamond IC SWE tasks: GPT-4o $194.31 ± $17.31 (95% CI) per run and o1 $1,623.76 ± $56.68 per run. It also reports cost savings ratios."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Appendix A5 specifies: Azure Standard D2as v4 VM (2 vCPUs, 8GB RAM), 3-hour max per task, 5 runs for price analysis. Section 3.6 reports total API costs across runs."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Main results are single rollouts. The paper states 'each is just a single rollout' (Appendix A5). Pass@k uses multiple samples but the main Table 1 results are single-seed."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "The paper explicitly states 'each is just a single rollout' for main results (Appendix A5), and '5 runs' for the price analysis (Section 3.6). Pass@k experiments state the number of samples."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget reported. The scaffold and prompt appear fixed, but no discussion of whether these were tuned or how many configurations were tried."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No discussion of how the scaffold configuration, prompt template, or tool settings were selected. These likely involved some tuning but the process is not documented."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "OpenAI authors evaluate OpenAI models alongside competitors but do not acknowledge the bias of evaluating their own systems. Scaffold may favor their own models."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Section 3.4 explicitly reports o1 performance as a function of reasoning effort (compute). Figure 8 shows pass@1 by price range across compute levels."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 1 and the comparison table (A4) discuss construct validity: E2E tests vs unit tests, grader hacking susceptibility, real-world representativeness vs prior benchmarks. Section 4 discusses scope limitations."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "All models use the same basic scaffold, but the paper does not discuss whether this scaffold favors certain models. Appendix A15 acknowledges that open-source models may be underelicited due to scaffold design, but the confound between scaffold and model is not systematically addressed for the main models."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Table 4 in Appendix A2 analyzes performance split by whether tasks predate model training cutoffs, directly addressing temporal leakage. Results show 'no clear performance improvement for tasks predating the models' knowledge cutoffs.'"
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 3.1 states agents run with 'no Internet access', GitHub remote removed, future commits removed. The user tool provides no feedback about success. Models cannot access E2E tests during evaluation."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "All tasks come from a single repository (Expensify). The paper does not discuss whether tasks share structural similarities or whether performance on one task could inform another due to shared codebase context."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "Appendix A11 describes augmented holdout tasks where engineers 'reintroduce the bug in a distinct but equivalent way.' Section 3.1 removes internet access and Git remotes. Table 4 performs temporal split analysis."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Claude 3.5 Sonnet is the best performing model, earning $208,050 on SWE-Lancer Diamond and resolving 26.2% of IC SWE tasks.",
    363       "evidence": "Table 1 shows Claude 3.5 Sonnet at 26.2% pass@1 on IC SWE Diamond ($58k), 44.9% on SWE Manager Diamond ($150k), totaling $208k on the Diamond set.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "Higher test-time compute (reasoning effort) improves performance, especially on harder tasks.",
    368       "evidence": "Section 3.4 and Figure 8: o1 improves from 9.3% (Low) to 16.5% (High) on IC SWE Diamond. Greater improvements on higher-priced tasks.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Allowing more attempts (pass@k) leads to consistent performance increases, with o1 nearly tripling its solve rate with 6 additional attempts.",
    373       "evidence": "Section 3.3 and Figure 7: o1 pass@1 ~16.5% increases to ~45% at pass@7. GPT-4o pass@6 matches o1 pass@1.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Using o1 before deferring to freelancers could reduce costs by up to 33.5% (pass@5).",
    378       "evidence": "Section 3.6: o1 pass@5 with freelancer fallback yields 33.5% cost savings over paying freelancers for all 237 Diamond IC SWE tasks.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "SWE-Lancer's E2E tests are more resistant to cheating than unit tests used in prior benchmarks.",
    383       "evidence": "Appendix A7 provides a concrete example of o1 cheating on SWE-Bench Verified by renaming a variable. The paper argues E2E tests emulate human workflows and are harder to exploit, but no systematic grader hacking analysis on SWE-Lancer is provided.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Models perform better on SWE Manager tasks than IC SWE tasks.",
    388       "evidence": "Table 1: All models show higher pass@1 on Manager tasks (37-47%) vs IC SWE tasks (8-26%). Consistent across Diamond and Full sets.",
    389       "supported": "strong"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "SWE-Lancer introduces a benchmark of 1,488 freelance software engineering tasks valued at $1M, evaluating both coding (IC SWE) and managerial decision-making (SWE Manager) capabilities. On the public Diamond split, Claude 3.5 Sonnet leads with 26.2% pass@1 on IC SWE and 44.9% on SWE Manager tasks, earning $208K of the available $500.8K; on the full set it earns ~$400K. Higher test-time compute and more attempts improve performance, but models still fail the majority of tasks. The benchmark's use of E2E tests and real monetary payouts provides a more realistic evaluation framework than prior work.",
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating its own models",
    397       "detail": "All authors are from OpenAI. While Claude 3.5 Sonnet outperforms OpenAI models, GPT-4o and o1 are heavily featured and the scaffold was designed by OpenAI. Appendix A15 acknowledges open-source models may be 'underelicited' due to scaffold design."
    398     },
    399     {
    400       "flag": "Single-run main results",
    401       "detail": "Main results in Table 1 are single rollouts at temperature 1.0. The paper acknowledges 'there can be significant variance between runs' but does not quantify this variance for the main results. At temperature 1.0, single-run variance could be substantial."
    402     },
    403     {
    404       "flag": "Single repository benchmark",
    405       "detail": "All 1,488 tasks come from one repository (Expensify) on one platform (Upwork). Despite claims of real-world representativeness, this limits generalizability. The paper acknowledges this in its limitations, but the title and framing ('$1 Million from Real-World Freelance Software Engineering') imply broader applicability."
    406     },
    407     {
    408       "flag": "No statistical tests for model comparisons",
    409       "detail": "Claims like 'Sonnet 3.5 outperforms o1 by 9.7%' (i.e., 9.7 percentage points: 26.2% vs. 16.5% pass@1 on IC SWE Diamond) are based on single-run point estimates without statistical tests. Given the acknowledged run-to-run variance, these differences may not be statistically significant."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Evaluating large language models trained on code",
    415       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    416       "year": 2021,
    417       "arxiv_id": "2107.03374",
    418       "relevance": "Introduces HumanEval benchmark and pass@k metric, foundational for code generation evaluation."
    419     },
    420     {
    421       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    422       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    423       "year": 2024,
    424       "arxiv_id": "2310.06770",
    425       "relevance": "Primary predecessor benchmark for real-world software engineering evaluation."
    426     },
    427     {
    428       "title": "Introducing SWE-Bench Verified",
    429       "authors": ["Neil Chowdhury", "James Aung", "Chan Jun Shern"],
    430       "year": 2024,
    431       "arxiv_id": "2407.01489",
    432       "relevance": "Improved version of SWE-bench with human-verified test cases."
    433     },
    434     {
    435       "title": "SWE-bench Multimodal: Do AI systems generalize to visual software domains?",
    436       "authors": ["John Yang", "Carlos E. Jimenez", "Alex L. Zhang"],
    437       "year": 2024,
    438       "arxiv_id": "2410.03859",
    439       "relevance": "Extends SWE-bench to frontend/visual programming tasks."
    440     },
    441     {
    442       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    443       "authors": ["Terry Y. Zhuo", "Minh Chien Vu", "Jenny Chim"],
    444       "year": 2024,
    445       "arxiv_id": "2406.15877",
    446       "relevance": "Benchmark for code generation with complex multi-step instructions."
    447     },
    448     {
    449       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    450       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    451       "year": 2024,
    452       "arxiv_id": "2403.07974",
    453       "relevance": "Contamination-aware code benchmark using temporal splits."
    454     },
    455     {
    456       "title": "Competition-level code generation with AlphaCode",
    457       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    458       "year": 2022,
    459       "doi": "10.1126/science.abq1158",
    460       "relevance": "Landmark system for competitive programming code generation."
    461     },
    462     {
    463       "title": "Can LLMs generate novel research ideas? A large-scale human study with 100+ NLP researchers",
    464       "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"],
    465       "year": 2024,
    466       "arxiv_id": "2409.04109",
    467       "relevance": "Evaluates LLM capability for research ideation with human expert comparison."
    468     },
    469     {
    470       "title": "CodeELO: Benchmarking competition-level code generation of LLMs with human-comparable Elo ratings",
    471       "authors": ["Shanghaoran Quan", "Jiaxi Yang", "Bowen Yu"],
    472       "year": 2025,
    473       "arxiv_id": "2501.01257",
    474       "relevance": "Competition-level code generation benchmark using Elo rating system."
    475     },
    476     {
    477       "title": "SciCode: A research coding benchmark curated by scientists",
    478       "authors": ["Minyang Tian", "Luyu Gao", "Shizhuo Dylan Zhang"],
    479       "year": 2024,
    480       "arxiv_id": "2407.13168",
    481       "relevance": "Scientific programming benchmark evaluating LLM coding for research tasks."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs