scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26115B)
      1 {
      2   "paper": {
      3     "title": "SWE-MERA: A Dynamic Benchmark for Agentically Evaluating Large Language Models on Software Engineering Tasks",
      4     "authors": [
      5       "Pavel Adamenko",
      6       "Mikhail Ivanov",
      7       "Aidar Valeev",
      8       "Rodion Levichev",
      9       "Pavel Zadorozhny",
     10       "Ivan Lopatin",
     11       "Dmitry Babayev",
     12       "Alena Fenogenova",
     13       "Valentin Malykh"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv",
     17     "arxiv_id": "2507.11059"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "SWE-MERA introduces a dynamic, continuously updated benchmark for evaluating LLMs on software engineering tasks; its 7-stage automated pipeline over GitHub issues yields ~10,000 potential tasks, of which 300 are currently available. DeepSeek-R1-0528 achieves the highest pass@6 (40.2%), and the benchmark shows strong discriminative power across model sizes. The paper finds that DeepSeek-R1 shows a larger year-over-year performance drop (50% to 40.2% pass@6) than other models, suggesting potential contamination effects on older tasks.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Pipeline released as a Python package on PyPI (https://pypi.org/project/repositorytest) with source code linked. Section 3.2 states 'The entire pipeline is implemented as a Python package and can be executed for any GitHub repository.'"
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Dataset available on HuggingFace (MERA-evaluation/SWE-MERA) as stated in Section 7 submission workflow."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Docker-based execution environment specified with a standardized base image (python:3.11 on Docker Hub). Appendix A shows pip install commands. Conda alternative also mentioned."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Section 7 provides a submission workflow with steps for dataset acquisition, agent execution, and submission. Appendix A documents the build validation procedure with specific commands."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Figure 1 shows error bars computed using the binomial distribution (5% two-sided quantile), as stated in the figure caption."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims DeepSeek-R1 shows a year-over-year performance difference and ranks models, but no formal significance tests are reported. Confidence intervals are shown visually but no p-values or tests are applied."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results tables report absolute pass rates with baseline context (e.g., DeepSeek-R1 27.8% pass@1, 40.2% pass@6), and year-over-year changes are quantified (e.g., 'decreases from 50% to 40.2%'). This provides enough context to assess magnitude."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The benchmark uses 300 tasks but no justification is given for why 300 is sufficient for discriminating between models. No power analysis or sample size discussion."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Results are single-run evaluations per model. While confidence intervals based on binomial distribution are shown, no variance across multiple independent runs is reported. The '3±1 hour' is about execution time, not result variance."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Ten models evaluated as baselines, including models of varying sizes and types (Table 2), with SWE-bench discussed as a baseline benchmark."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Models evaluated include DeepSeek-R1-0528 (May 2025), Devstral-Small-2505 (May 2025), Qwen3-32B (April 2025) — all contemporary at time of writing."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No ablation study of the pipeline components (e.g., effect of LLM-based filtering, effect of different quality thresholds). The pipeline has 7 stages but their individual contributions are not evaluated."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Reports pass@1, pass@6, localize files, generate patch, regression tests, and token limit hit (Table 2)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of task quality or model outputs. Task quality filtering uses LLM-based evaluation (Qwen3-32B) rather than human judgment."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The dynamic nature ensures tasks are temporally separated from model training data. Tasks are collected monthly from recent GitHub issues (post-September 2024)."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results broken down by year (2024 vs 2025 tasks in Tables 2 and 4), by model size (Figures 3, 4), and by difficulty decile (Figure 5)."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Discussion section addresses malicious software found in repositories, and the Limitations section discusses failure modes of automated collection (trivial tasks, overly complex tasks, incomplete specifications)."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports DeepSeek-R1's unexpected performance degradation on 2025 vs 2024 tasks (Section 5), and discusses cases where automated collection yields unusable tasks (Section 6)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims about contamination in SWE-bench (32.67% solution leakage, 31.08% inadequate tests) are attributed to prior work. Claims about 10,000 potential tasks and 300 available are supported by Table 1. 'Strong discriminative power' is supported by the spread of results in Table 2."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper implies DeepSeek-R1's year-over-year performance drop is due to contamination ('performs better on 2024 tasks'), but does not provide causal evidence — the difference could be due to task difficulty differences between years, task distribution shifts, or other confounds."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims evaluation of 'Software Engineering Tasks' but the benchmark is Python-only. The paper acknowledges future expansion to other languages in the conclusion but does not bound current claims to Python."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The DeepSeek-R1 year-over-year performance difference is presented without exploring alternative explanations (task difficulty differences, distribution shifts). The Limitations section discusses general issues but not alternative interpretations of specific results."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper uses pass@1/pass@6 on GitHub issues as a proxy for 'software engineering capability' but does not discuss the gap between resolving curated GitHub issues and actual software engineering ability. The Limitations section acknowledges missing aspects (readability, maintainability, security) but does not frame this as a proxy-outcome distinction."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 3 lists specific model versions with release dates (e.g., 'DeepSeek-R1-0528', 'Codestral-22B-v0.1', 'Qwen2.5-Coder-32B-Instruct'). Specific git commits for Aider are provided (4f4b10fd, 6e98cd6)."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix D provides the full prompt used for LLM-based task evaluation. The Aider framework handles model prompting, and specific Aider versions/commits are cited."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No temperature, top-p, or sampling parameters reported for any of the evaluated models. The paper mentions 6 attempts with 4 reflections per attempt (Aider config) but not model-level generation parameters."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4 describes the Aider framework: 6 independent attempts, up to 4 reflections per attempt for lint/test output, 32k token context limit. Specific Aider commits are referenced."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Table 1 documents the full pipeline funnel with counts at each stage (e.g., 255M repos → 300 final tasks), and Section 3.1 describes the criteria at each filtering step."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "A dedicated 'Limitations' section with six substantive paragraphs covering task quality, grading challenges, bias, contamination residual risk, infrastructure complexity, and scope limitations."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The Limitations section raises specific concerns: 'Automatically constructed problems may inadvertently result in unnaturally phrased prompts', 'automated correctness checks may yield false negatives', 'overrepresenting certain programming paradigms.' These are specific to this study."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The Limitations section explicitly states: 'our current benchmark focuses primarily on programming correctness. Other crucial aspects of software engineering — such as code readability, maintainability, efficiency, security, and teamwork — are not evaluated.'"
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Dataset available on HuggingFace (MERA-evaluation/SWE-MERA), and the pipeline code is released as a Python package, enabling independent verification of the collection process."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3.1 describes the 7-step collection pipeline in detail, including repository selection criteria (10+ stars, 10+ forks, Python, open-source license, recent activity), PR-issue mapping rules, and filtering criteria."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data source is GitHub repositories with well-documented selection criteria."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Table 1 documents each pipeline stage with exact counts (repositories and issues) and time estimates. The funnel from 255M repos to 300 final tasks is fully traced."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding or acknowledgments section. Authors are from SberAI, ITMO University, and MWS AI, but no funding sources are disclosed."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are listed: SberAI, ITMO University, MWS AI. These are clearly stated on the first page."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding disclosed. Authors are affiliated with SberAI (Sber is a major Russian tech company with AI products). The paper does not evaluate Sber models, but the lack of funding disclosure prevents assessment of independence."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper does not state training data cutoff dates for any of the evaluated models. It relies on temporal separation (tasks from Sept 2024–June 2025) but does not verify this against model training cutoffs."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "The entire paper is motivated by contamination concerns. Section 1 discusses data leakage in SWE-bench. The dynamic collection approach is designed to minimize overlap. The DeepSeek-R1 year-over-year analysis (Section 5) is an indirect test of contamination."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "The core contribution is addressing contamination through dynamic updates. Tasks are collected monthly from recent GitHub activity. Section 7 describes the temporal slider for inspecting contamination events. Contamination mitigation is the paper's primary raison d'être."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 4 reports: '60-140M prompt tokens and 3-20M completion tokens' per model evaluation, '14-20K prompt and 1-4K completion tokens per request', and '3±1 hour' per model evaluation."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 4 states '8 NVIDIA H100 80 GB' for most models, '16 GPUs' for DeepSeek-R1, and '4 GPUs' for 7B models. Table 1 provides time estimates for each pipeline stage."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No seed sensitivity analysis. Each model is evaluated once on the benchmark with no reporting of variance across seeds or runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 4 states Aider gives 6 attempts per task, and reports both pass@1 (first attempt) and pass@6 (any of 6 attempts). However, there is no indication of multiple independent evaluation runs."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "The paper evaluates pre-trained models via API/inference without hyperparameter tuning. Aider configuration is fixed."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No configuration selection — models are evaluated with fixed Aider settings."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Multiple models compared but no correction for multiple comparisons applied. Confidence intervals shown but no formal family-wise error rate control."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors created the benchmark and evaluate models on it without discussing self-comparison bias or having independent evaluation."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Figures 3 and 4 plot pass@1 and pass@6 vs model size, enabling compute/performance comparison. Table 3 lists model sizes alongside results."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether resolving GitHub issues actually measures 'software engineering capability' as claimed. The LLM-based quality filter (Qwen3-32B) is used without validating whether its judgments correlate with human assessments of task quality."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "All models evaluated through a single scaffold (Aider). While this controls the scaffold variable across models, the paper does not discuss how Aider-specific design choices might advantage or disadvantage particular models, nor does it evaluate with alternative scaffolds."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "The core design uses temporal separation — tasks are collected from September 2024 to June 2025, a window intended to postdate models' training cutoffs (the paper does not state the cutoffs themselves). The year-over-year comparison (Section 5) is an implicit temporal leakage analysis."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 3.1 step 3 filters metadata, and SWE-bench+ contamination concerns (solution leakage in issue descriptions) are discussed in Section 2. The LLM-based evaluation step explicitly checks task correctness."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether tasks drawn from the same repositories share structural similarities that could inflate performance. Multiple tasks from the same repo could create non-independence."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "The temporal slider in the web interface (Section 7, Figure 2) enables users to 'inspect potential contamination events.' The year-over-year comparison serves as an empirical leakage detection method, revealing DeepSeek-R1's anomalous performance drop."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "SWE-bench has 32.67% of successful patches involving direct solution leakage and 31.08% passing due to inadequate test cases.",
    374       "evidence": "Abstract, citing SWE-bench+ (Aleithan et al., 2024) findings.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The pipeline can collect approximately 10,000 potential tasks, with 300 samples currently available.",
    379       "evidence": "Table 1 shows the full funnel from 255M repositories to 300 final tasks. The 10,000 estimate is for extending beyond the 6-month window.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "DeepSeek-R1-0528 achieves the highest performance with 27.8% pass@1 and 40.2% pass@6 on 2025 tasks.",
    384       "evidence": "Table 2 results.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "DeepSeek-R1 shows a larger year-over-year performance decrease (50% to 40.2% pass@6) than other models, suggesting potential contamination.",
    389       "evidence": "Comparison of Tables 2 and 4, discussed in Section 5 and Appendix C. Figure 5 shows the effect is more subtle on top-decile tasks.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "SWE-MERA demonstrates strong discriminative power across state-of-the-art models.",
    394       "evidence": "Table 2 shows a clear spread from 5.5% (Qwen2.5-Coder-7B) to 40.2% (DeepSeek-R1) pass@6.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "LLM-based quality filtering without human validation",
    401       "detail": "The final quality gate uses Qwen3-32B to score tasks on correctness, test correctness, and completeness. No human validation of these LLM judgments is reported, creating a circular dependency where an LLM evaluates tasks used to evaluate LLMs."
    402     },
    403     {
    404       "flag": "Single scaffold confound",
    405       "detail": "All models evaluated exclusively through Aider. Since scaffold choice can dramatically affect results (paper's own motivation cites SWE-bench scaffold variance), using only one scaffold limits the generalizability of the model rankings."
    406     },
    407     {
    408       "flag": "Causal contamination claim without adequate evidence",
    409       "detail": "The paper implies DeepSeek-R1's year-over-year performance drop indicates contamination, but does not control for task difficulty differences between 2024 and 2025 cohorts, distribution shifts, or other confounds."
    410     },
    411     {
    412       "flag": "Python-only benchmark with broad claims",
    413       "detail": "The benchmark covers only Python repositories but the title and abstract frame it as evaluating 'Software Engineering Tasks' generally."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    419       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    420       "year": 2023,
    421       "arxiv_id": "2310.06770",
    422       "relevance": "The foundational SWE-bench benchmark that SWE-MERA aims to improve upon with dynamic collection."
    423     },
    424     {
    425       "title": "SWE-bench+: Enhanced Coding Benchmark for LLMs",
    426       "authors": ["Reem Aleithan", "Haoran Xue", "Mohammad Mahdi Mohajer", "Elijah Nnorom", "Gias Uddin", "Song Wang"],
    427       "year": 2024,
    428       "arxiv_id": "2410.06992",
    429       "relevance": "Identified solution leakage and inadequate test cases in SWE-bench, motivating SWE-MERA's design."
    430     },
    431     {
    432       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    433       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    434       "year": 2024,
    435       "arxiv_id": "2403.07974",
    436       "relevance": "Pioneered dynamic, frequently updated evaluation to address contamination, but focused on algorithmic problems rather than repository-level tasks."
    437     },
    438     {
    439       "title": "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving",
    440       "authors": ["Daoguang Zan", "Zhirong Huang", "Wei Liu"],
    441       "year": 2025,
    442       "arxiv_id": "2504.02605",
    443       "relevance": "Extended SWE-bench to multiple programming languages, addressing language diversity limitations."
    444     },
    445     {
    446       "title": "Training Software Engineering Agents and Verifiers with SWE-Gym",
    447       "authors": ["Jiayi Pan", "Xingyao Wang", "Graham Neubig"],
    448       "year": 2024,
    449       "arxiv_id": "2412.21139",
    450       "relevance": "Automatic task generation for training SE agents, complementary approach to SWE-MERA's evaluation focus."
    451     },
    452     {
    453       "title": "SWE-smith: Scaling Data for Software Engineering Agents",
    454       "authors": ["John Yang", "Kilian Leret", "Carlos E Jimenez"],
    455       "year": 2025,
    456       "arxiv_id": "2504.21798",
    457       "relevance": "Scalable synthetic data creation for SE benchmarks, addressing size and diversity limitations."
    458     },
    459     {
    460       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    461       "authors": ["DeepSeek-AI"],
    462       "year": 2025,
    463       "arxiv_id": "2501.12948",
    464       "relevance": "Top-performing model on SWE-MERA; its year-over-year performance drop is a key finding."
    465     },
    466     {
    467       "title": "Qwen2.5-Coder Technical Report",
    468       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    469       "year": 2024,
    470       "arxiv_id": "2409.12186",
    471       "relevance": "Code-specialized LLM family evaluated across multiple sizes, demonstrating scaling behavior on SWE-MERA."
    472     },
    473     {
    474       "title": "Qwen3 Technical Report",
    475       "authors": ["An Yang", "Anfeng Li", "Baosong Yang"],
    476       "year": 2025,
    477       "arxiv_id": "2505.09388",
    478       "relevance": "Used both as evaluated model and as the LLM quality filter (Qwen3-32B) in the pipeline."
    479     }
    480   ]
    481 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs