scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28078B)
      1 {
      2   "paper": {
      3     "title": "NL2Repo-Bench: Towards Long-Horizon Repository Generation Evaluation of Coding Agents",
      4     "authors": ["Jingzhe Ding", "Shengda Long", "Changxin Pu", "Ge Zhang", "Huan Zhou"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.12730"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "NL2Repo-Bench evaluates coding agents on full repository generation from natural language, finding that even the best model (Claude-Sonnet-4.5) achieves only about a 40% test pass rate (40.2% in its best configuration; every other model is below 40%). The paper identifies systematic failure modes including premature termination, loss of global coherence, and fragile cross-file dependencies. Context window size correlates with performance, but is not sufficient alone. Among tools used by nearly all models, task planning tool usage shows the strongest correlation with model performance (0.711); the 'think' tool correlates more strongly (0.816) but is unused by 4 models.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "Project page with GitHub link provided: https://github.com/multimodal-art-projection/NL2RepoBench. The paper states they 'release the NL2Repo benchmark, including the dataset, docker environments, and evaluation toolkit.'"
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The benchmark dataset of 104 tasks is released via the GitHub project page, including task specifications and Docker evaluation environments."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 3.1.3 describes Docker-based execution environments for each task with dependency version pinning. The evaluation toolkit includes containerized environments."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the paper releases code and environments, the paper itself does not include step-by-step reproduction instructions for replicating the experimental results (e.g., exact commands to run each model evaluation)."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables 3, 4, and 8 report only point estimates (pass rates). No confidence intervals or error bars are provided for any results."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims performance differences between models (e.g., 'Claude outperforms other models') but provides no statistical significance tests. Comparisons are based solely on raw score differences."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Performance differences are reported with baseline context, e.g., 'pass rate increases from 40.2% to 59.4%' (Table 8), and percentage improvements across difficulty levels. The magnitude of differences is clear from the tables."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The benchmark contains 104 tasks but no justification is provided for why 104 tasks is sufficient for the claims being made. No power analysis or sample size rationale is given."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Table 6 reports standard deviation of interaction turns across tasks (e.g., Claude-Sonnet-4.5: 181.6 ± 64.1). However, variance in pass rates across tasks or runs is not reported."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper evaluates 12 model/framework combinations including open-source (DeepSeek, Qwen, GLM) and closed-source (Claude, GPT-5, Gemini) models as baselines against each other."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "All evaluated models are contemporary SOTA models from 2024-2025: Claude-Sonnet-4.5, GPT-5, DeepSeek-V3.2, Gemini-3-pro, Qwen3, Kimi-k2, GLM-4.6."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 4.4 presents two ablation studies: impact of interaction round limits (Fig. 9) and impact of revealing test cases (Table 8), isolating specific evaluation conditions."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports overall pass rate, Pass@1 count, per-difficulty breakdown, per-category breakdown, interaction turns, tool usage statistics, and turns-per-score efficiency ratio."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The benchmark evaluates code generation via automated pytest suites. Human evaluation of generated code quality is not relevant — the correctness criterion is binary (tests pass or fail)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Test suites are strictly hidden from agents during development (Section 4.1: 'no test cases are revealed during development'). The ablation in Section 4.4.2 explicitly tests the impact of revealing tests."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 4 provides per-category breakdowns across 9 task categories. Table 3 provides per-difficulty breakdowns (Easy/Medium/Hard)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Sections 4.3.3 (early termination), 4.3.6 (workflow patterns including 'Navigation Trap' and 'Blind Editing'), and 4.3.7 (failure taxonomy including ImportError, dependency issues, test suite alignment) discuss failure modes in detail."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper's main finding is essentially negative: all models fail to reliably generate repositories (pass rates of at most 40.2%). GPT-5's underperformance relative to expectations is specifically discussed. The 'thinking model paradox' (Qwen3-Thinking's 49% early termination) is another negative finding."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims agents achieve 'below 40% average test pass rates' — approximately supported by Table 3: the best configuration reaches 40.2%, marginally above the stated threshold, while all other models are below 40%. Claims about failure modes (premature termination, loss of coherence) are supported by Sections 4.3.3 and 4.3.7."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims like 'context window capacity emerges as a crucial factor' (Section 4.3.5) and attributes Claude's advantage to its 1M token context, but this is confounded by other model differences (training data, architecture, RLHF). The ablation on test case visibility is a proper controlled experiment, but most causal claims are observational."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims to evaluate 'long-horizon repository generation' broadly but tests only Python libraries (104 tasks). The title and abstract frame results as general statements about 'coding agents' without bounding to Python. Section 3.1.1 restricts to Python but claims in Section 5 generalize beyond this."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "When attributing Claude's performance to context window size, the paper does not adequately discuss alternative explanations (model quality, training data, RLHF alignment). The 'thinking model paradox' hypothesis is speculative without controlled evidence."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper is explicit about what it measures (pytest pass rates) and what it claims this indicates (repository generation capability). The distinction between functional correctness (measured) and broader software quality (not measured) is maintained throughout."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are specified by marketing names only: 'Claude-Sonnet-4.5', 'GPT-5', 'DeepSeek-V3.2', 'Gemini-3-pro'. No API version strings, snapshot dates, or model IDs are provided. Footnote 1 mentions Gemini-3-pro was used with Cursor-CLI but no version specifics."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Section 4.1 mentions 'a single initialization instruction from the user' but does not provide the actual prompt text used. The exact instruction given to agents is not disclosed."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or other sampling settings for any of the models evaluated."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The OpenHands CodeAct framework tools are described in Appendix B (8 tools listed with descriptions). Cursor-CLI and Claude Code are acknowledged as commercial agents. The paper describes available tools and their functions."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.1 documents the full benchmark construction pipeline: repository selection criteria (Section 3.1.1), document writing process (Section 3.1.2), environment building (Section 3.1.3), and verification/refinement (Section 3.1.4). Appendix C provides detailed annotator tutorials."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion briefly mentions 'limitations' in passing but does not substantively discuss them."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. Issues like the Python-only restriction, potential annotator bias in specification writing, or the representativeness of the 104 selected repositories are not addressed."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to Python, to the specific frameworks tested, or acknowledge that repository generation from specifications differs from real-world development."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The benchmark release includes task documents, Docker environments, and evaluation toolkit. Generated workspaces could potentially be shared, though it's unclear if raw agent outputs/logs are released."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.1.1 describes repository selection criteria (complexity, maturity, completeness, recency). Section 3.1.2 describes the reverse-engineering process for document creation. Appendix C provides annotator guidelines."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants in the study. The benchmark evaluates AI models, not humans. Data sources are public GitHub repositories."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline from repository selection → document writing → environment building → verification is documented in Section 3.1, with filtering criteria at each stage. Figure 2 visualizes the construction pipeline."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: ByteDance Seed, M-A-P, 2077AI, Humanlaya Data, Nanjing University, Peking University, Beijing University of Posts and Telecommunications, Beihang University."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Lead authors are from ByteDance Seed, which develops AI models. ByteDance has a vested interest in benchmarking outcomes. The paper references a ByteDance model (Seed-Thinking) while primarily evaluating competitor models. No discussion of this potential conflict."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the evaluated models. This is relevant since the benchmark uses existing open-source repositories that could appear in training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The benchmark tasks are derived from existing GitHub repositories. No discussion of whether these repositories appeared in the training data of the evaluated models. The 'recency' criterion (3 years) is mentioned but not connected to model training cutoffs."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The source repositories are public GitHub projects that likely appeared in LLM training data. No contamination analysis is performed. While the task requires generating from NL specifications (not recalling code), the models may have memorized repository structures and implementations."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, token consumption, or wall-clock time are reported for running the benchmark despite evaluating 12 model/framework combinations across 104 tasks each."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget is stated. The paper does not report GPU hours, total API spend, or hardware used for the experiments."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "All results appear to be single-run (Pass@1 is explicitly reported). No seed sensitivity analysis or multiple-run results are provided for pass rates."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not explicitly state whether results are from single runs or averaged. Pass@1 counts suggest single runs, but this is not clearly stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "The paper evaluates existing models via APIs without hyperparameter tuning, so no search budget applies."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection was performed — models were evaluated with their default settings."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is moot."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "ByteDance authors evaluate models from multiple companies but do not acknowledge potential bias in benchmark design that could favor certain model capabilities. The Seed-Thinking model (from ByteDance) is referenced in the paper, and the benchmark construction process could embed implicit preferences."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 7 plots total tool invocations vs. task pass rate, showing efficiency trade-offs. Table 6 reports turns/score ratios. Section 4.3.5 discusses context window size vs. performance."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 4.4.2 ablates the benchmark by revealing test cases, testing whether the bottleneck is requirement inference vs. implementation. The paper discusses what the benchmark measures (long-horizon generation) vs. what existing benchmarks measure. Section 4.3.5 analyzes whether context size or model capability drives results."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Table 3 explicitly evaluates Claude-Sonnet-4.5 across three frameworks (OpenHands, Claude Code, Cursor) showing about 1 percentage point of variation (39.2%–40.2%), directly addressing the scaffold confound. The paper concludes the benchmark is 'model-centric' rather than scaffold-dependent."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The benchmark uses existing GitHub repositories. The 'recency' criterion requires repos updated within 3 years, but no analysis connects this to model training cutoffs. Models may have seen source code of benchmark repositories during training."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The paper explicitly separates development (NL specification only) from evaluation (hidden test suite). Section 4.4.2 ablates this by revealing tests, showing the design prevents feature leakage from test suites to agents."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the 104 source repositories share structural similarities, use similar libraries, or whether training data of evaluated models includes code from these repositories."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection method is applied. No canary strings, membership inference, or decontamination analysis is performed despite the benchmark being derived from public GitHub repositories."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Even the strongest coding agents achieve below 40% average test pass rates on NL2Repo-Bench",
    364       "evidence": "Table 3: Claude-Sonnet-4.5 (Claude Code) achieves 40.2% — marginally above the claimed 40% threshold — while all other models are below 40%. Best Pass@1 is 5 repositories out of 104.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Context window size is a crucial factor in NL2Repo-Bench performance, with 1M-context models substantially outperforming others",
    369       "evidence": "Section 4.3.5 and Figure 8: 1M context models (Claude) average 38.5% vs 256K models averaging 21.7%. However, Kimi-k2 with long context achieves only 22.7%.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "The benchmark is model-centric rather than framework-dependent, as the same model shows <1% variation across frameworks",
    374       "evidence": "Table 3: Claude-Sonnet-4.5 scores 40.2% (Claude Code), 39.9% (OpenHands), 39.2% (Cursor).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Task planning (task_tracker tool) has the strongest correlation with model performance",
    379       "evidence": "Table 5: task_tracker correlation = 0.711, with only 1 model not using it. The 'think' tool has higher correlation (0.816) but 4 models don't use it.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "GPT-5 is aligned more towards human-in-the-loop collaboration than autonomous execution, with 84.5% non-finish rate",
    384       "evidence": "Figure 5 and Section 4.3.3: GPT-5 has 84.5% non-finish rate and only 78.4 average turns (lowest among all models). Case analysis shows it halts for user confirmation.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Even with all test cases revealed ('cheating' scenario), the best model achieves only 59.4% pass rate",
    389       "evidence": "Table 8: Claude-Sonnet-4.5 (Claude Code) improves from 40.2% to 59.4% when test cases are provided, with Pass@1 jumping from 3 to 18.",
    390       "supported": "strong"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "ByteDance conflict of interest",
    396       "detail": "Lead authors are from ByteDance Seed. ByteDance develops AI models including Seed-Thinking (referenced in the paper). While the benchmark evaluates mainly competitor models, the benchmark design could embed implicit biases. No conflict of interest statement is provided."
    397     },
    398     {
    399       "flag": "No contamination analysis despite using public repositories",
    400       "detail": "The 104 benchmark tasks are derived from public GitHub repositories that likely appeared in the training data of evaluated models. The models may have memorized implementations, inflating or deflating scores unpredictably. No decontamination analysis is performed."
    401     },
    402     {
    403       "flag": "Single-run evaluation",
    404       "detail": "Results appear to be from single runs with no variance or confidence intervals reported for pass rates. LLM outputs are stochastic, and single-run results may not be representative."
    405     },
    406     {
    407       "flag": "No limitations section",
    408       "detail": "The paper lacks a dedicated limitations section despite making broad claims about 'autonomous software engineering capability.' Python-only restriction, potential annotation bias, and contamination risks are not acknowledged."
    409     },
    410     {
    411       "flag": "Causal claims from observational comparisons",
    412       "detail": "Claims about context window size being a 'crucial factor' are drawn from comparing different models that differ in many ways beyond context size. The observed correlation does not establish causation."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "SWE-bench: Can language models resolve real-world github issues?",
    418       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    419       "year": 2024,
    420       "relevance": "Foundational repair-focused benchmark for coding agents that NL2Repo-Bench positions against."
    421     },
    422     {
    423       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    424       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    425       "year": 2024,
    426       "relevance": "Primary agent framework used for evaluation in NL2Repo-Bench experiments."
    427     },
    428     {
    429       "title": "Evaluating large language models trained on code",
    430       "authors": ["Mark Chen", "Jerry Tworek"],
    431       "year": 2021,
    432       "arxiv_id": "2107.03374",
    433       "relevance": "HumanEval benchmark — foundational function-level code generation evaluation that NL2Repo-Bench argues is insufficient."
    434     },
    435     {
    436       "title": "PaperBench: Evaluating AI's ability to replicate AI research",
    437       "authors": ["Giulio Starace", "Oliver Jaffe"],
    438       "year": 2025,
    439       "relevance": "Repository-level benchmark using LLM-based evaluation that NL2Repo-Bench contrasts with its test-based evaluation."
    440     },
    441     {
    442       "title": "Commit0: Library generation from scratch",
    443       "authors": ["Wenting Zhao", "Nan Jiang"],
    444       "year": 2024,
    445       "relevance": "Most directly comparable benchmark — also targets from-scratch library generation but provides scaffolding/signatures as priors."
    446     },
    447     {
    448       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    449       "authors": ["John Yang", "Carlos E Jimenez"],
    450       "year": 2024,
    451       "relevance": "Introduces Agent-Computer Interface abstraction for repository-level coding agent tasks."
    452     },
    453     {
    454       "title": "RepoBench: Benchmarking repository-level code auto-completion systems",
    455       "authors": ["Tianyang Liu", "Canwen Xu"],
    456       "year": 2023,
    457       "arxiv_id": "2306.03091",
    458       "relevance": "Repository-level code completion benchmark that NL2Repo-Bench positions as addressing only partial completion."
    459     },
    460     {
    461       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    462       "authors": ["Xueying Du", "Mingwei Liu"],
    463       "year": 2023,
    464       "arxiv_id": "2308.01861",
    465       "relevance": "Class-level code generation benchmark bridging function-level and repository-level evaluation."
    466     },
    467     {
    468       "title": "Code Llama: Open foundation models for code",
    469       "authors": ["Baptiste Roziere"],
    470       "year": 2023,
    471       "arxiv_id": "2308.12950",
    472       "relevance": "Key open-source code LLM with long-context support relevant to repository-level generation."
    473     },
    474     {
    475       "title": "DeepSeek-V3.2: Pushing the frontier of open large language models",
    476       "authors": ["Aixin Liu"],
    477       "year": 2025,
    478       "arxiv_id": "2512.02556",
    479       "relevance": "One of the evaluated open-source models in NL2Repo-Bench experiments."
    480     },
    481     {
    482       "title": "AutoCodeBench: Large language models are automatic code benchmark generators",
    483       "authors": ["Jason Chou", "Ao Liu"],
    484       "year": 2025,
    485       "arxiv_id": "2508.09101",
    486       "relevance": "Automated benchmark generation for code LLMs, contrasted with NL2Repo-Bench's human-curated approach."
    487     },
    488     {
    489       "title": "SWE-bench Pro: Can AI agents solve long-horizon software engineering tasks?",
    490       "authors": ["Xiang Deng", "Jeff Da"],
    491       "year": 2025,
    492       "arxiv_id": "2509.16941",
    493       "relevance": "Extended SWE-bench targeting long-horizon tasks, directly comparable in scope to NL2Repo-Bench."
    494     }
    495   ]
    496 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs