scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26373B)
      1 {
      2   "paper": {
      3     "title": "SWEnergy: An Empirical Study on Energy Efficiency in Agentic Issue Resolution Frameworks with SLMs",
      4     "authors": ["Arihant Tripathy", "Ch Pavan Harshit", "Karthik Vaidhyanathan"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2512.09543",
      8     "doi": "10.48550/arXiv.2512.09543"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Current agentic issue resolution frameworks (SWE-Agent, OpenHands, AutoCodeRover, Mini SWE Agent) fail when paired with SLMs, achieving near-zero resolution rates (4% max for AutoCodeRover+Qwen, 0% for all others) on SWE-bench Verified Mini. Framework architecture is the primary driver of energy consumption, with AutoCodeRover consuming 9.4x more energy than OpenHands. Energy correlates strongly with runtime (R=0.89) and output tokens (R=0.88), with 'chatty' ReAct-style agents entering unproductive loops that waste energy without progress.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Replication package with source code, logs, and measurements provided at https://github.com/sa4s-serc/swenergy (footnote 1, Section 3)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "SWE-bench Verified Mini is publicly available. The replication package includes raw logs and computed metrics in structured JSON format (Section 4.3)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Hardware is specified (Intel Xeon w3-2435, 32GB RAM, RTX A2000 16GB) and FP16/32K context window stated, but no software dependency specifications (requirements.txt, Dockerfile, library versions) are described for the monitoring/measurement infrastructure."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "A replication package is mentioned but the paper does not include step-by-step reproduction instructions. No README with commands or 'Reproducing Results' section is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Table 1 reports Min, P50, Mean, Max, and Std for all metrics across repetitions. Figure 2 shows box plots with interquartile ranges and whiskers."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used despite claims of differences between frameworks. Comparisons rely on descriptive statistics and correlation coefficients (R values in Section 5.2) without formal hypothesis testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported with context: '9.4x more energy' (abstract), specific kJ values with baselines (e.g., 216.21 kJ vs 23.05 kJ), and correlation coefficients (R=0.89, R=0.88) in Section 5."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The choice of 50 tasks is motivated by SWE-bench Verified Mini's representativeness (Section 3.1) and 3 repetitions are performed, but no power analysis or formal sample size justification is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviation is reported for all metrics in Table 1 across 3 repetitions. Box plots in Figure 2 show full distributions."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Four frameworks are compared against each other (SWE-Agent, OpenHands, AutoCodeRover, Mini SWE Agent), each serving as a baseline for the others (Section 3.1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Three of the four frameworks ranked among the top three on SWE-bench Full leaderboard as of October 2025 (Section 3.1). Models selected from Berkeley Function-Calling leaderboard."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The paper evaluates existing third-party frameworks comparatively, not a single system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Eight metrics across three dimensions: resolution status, failure mode (effectiveness); wall-clock duration, token usage, LLM call count (efficiency); total energy, peak memory, cost (resource utilization). Section 3.3."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "The study measures energy consumption and automated resolution rates of software agents. Human evaluation is irrelevant to these claims."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "SWE-bench Verified Mini is used as-is as the evaluation benchmark. No tuning or selection is performed on this dataset (Section 3.1)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides per-framework-per-model breakdowns for all metrics. Figure 5 shows per-framework failure mode breakdowns. Figure 4 shows per-framework token distributions."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive failure analysis in Sections 5.2 and 6.1: failure mode taxonomy (Figure 5), qualitative log analysis identifying Step Repetition, Context Loss, Follow Task Requirements failures, and 'false positive' completions (Lesson 2)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The entire paper is essentially a negative result: SLMs achieve near-zero resolution rates in all frameworks. The paper explicitly frames this as a finding rather than hiding it."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims (near-zero resolution, 9.4x energy difference, framework architecture as primary driver, 4% for AutoCodeRover) are all supported by Table 1 and the analysis in Section 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The claim 'framework architecture is the primary driver of energy consumption' is supported by controlling for model and hardware across 4 frameworks. Section 5.2 provides correlation analysis and failure mode analysis linking architectural choices to energy patterns."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 (Threats to Validity) explicitly bounds findings: limited to 2 SLMs, single hardware configuration, 50-task subset. External validity subsection notes findings 'may not generalize to different types or scales of issues.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6 discusses alternative explanations: low energy could mask premature termination or false positives (Lesson 2), framework design vs model capability as competing explanations (Lesson 3). Section 7 considers OS background activity as a confound."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures energy (kJ), resolution rate, and tokens directly. Claims about 'energy efficiency' match the granularity of measurements. No proxy gap exists between what is measured and what is claimed."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are identified as 'Gemma-3 4B' and 'Qwen-3 1.7B Instruct' with parameter counts but no specific checkpoint IDs, version hashes, or snapshot dates. These are marketing names without precise version identification."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper uses existing frameworks' default prompts but does not reproduce them. The replication package may contain them via the framework code, but the paper itself does not provide the actual prompt text used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "32K context window and FP16 precision are stated (Section 4.1), along with a 30-minute timeout. But LLM sampling parameters (temperature, top-p, etc.) are not reported."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Each framework's architecture is described in Section 3.1: SWE-Agent's ACI with ReAct prompting, OpenHands' sandboxed Docker with multi-agent support, AutoCodeRover's three-phase pipeline (fault localization → context retrieval → patch generation), Mini SWE Agent's minimal bash-only design."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4 documents the full experimental procedure: idle baseline measurement and subtraction (51.69W CPU, 2.70W GPU), monitoring script instrumentation, SWE-bench evaluation harness verification, and JSON storage format."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Threats to Validity' provides substantive discussion across internal, construct, external, and conclusion validity categories."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats: '50 tasks are a small subset' (external), 'limited to two small, open-weight models' (external), 'single hardware configuration' (external), 'Gemini 2.5 Flash may influence pattern identification' (construct), 'dataset heavily skewed toward failure' (conclusion)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 explicitly states what results do NOT show: 'findings may not generalize to different types or scales of issues', 'trade-offs could differ substantially with larger models', 'energy profiles may vary on different CPU/GPU architectures.' Conclusion validity notes inability 'to draw firm conclusions about the energy cost of a successfully resolved issue.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 4.3: 'All raw logs and computed metrics were stored in a structured JSON format to ensure transparency and facilitate downstream analysis.' Replication package provided at GitHub."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 describes the experimental procedure in detail: isolated machine, RAPL/NVML instrumentation, idle baseline subtraction, 30-min timeout, 3 repetitions, SWE-bench evaluation harness for verification."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data source is SWE-bench Verified Mini, a standard public benchmark."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from execution to analysis is documented: framework execution with monitoring → idle baseline subtraction → SWE-bench evaluation → structured JSON storage → statistical analysis (Section 4). Idle power values (51.69W CPU, 2.70W GPU) are explicitly stated."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section: 'This research was funded by the ANRF Prime Minister Early Career Research Grant (ANRF/ECRG/2024/003379/ENS).'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors affiliated with SERC, IIIT-Hyderabad. They evaluate third-party frameworks (SWE-Agent, OpenHands, AutoCodeRover), not their own products."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funder is ANRF (Indian government research grant), which has no financial stake in the performance of any evaluated framework or model."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No mention of training data cutoff dates for Gemma-3 4B or Qwen-3 1.7B Instruct. The models could have been trained on SWE-bench task repositories."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether SWE-bench Verified Mini tasks or their solutions appeared in the training data of either model."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "SWE-bench tasks derive from public GitHub issues that predate both models' training. No contamination analysis or discussion is provided."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 1 reports Cost (USD) per run for each configuration. Token usage and energy consumption (kJ) also reported. E.g., AutoCodeRover+Qwen mean cost $0.1245, OpenHands+Gemma $0.0043."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "1,200 total experimental runs on specified hardware (Section 4.2). Energy consumption per run and total duration are reported. Hardware fully specified (Intel Xeon w3-2435, 32GB RAM, RTX A2000 16GB)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Three full repetitions per configuration with standard deviation reported across runs in Table 1 (Section 4.3). This captures run-to-run variability including stochastic model behavior."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.3: 'three full repetitions for each framework and model configuration. This yielded 150 data points' per condition, '1,200 experimental runs' total."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Frameworks are used with fixed settings (32K context, FP16, 30-min timeout) but no discussion of whether these were tuned or how they were selected. No search budget reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "All configurations are reported in Table 1 with no selective presentation. The paper evaluates and reports all framework-model combinations rather than selecting a best one."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate third-party frameworks using original implementations, which mitigates this bias, but they do not explicitly discuss author-evaluation bias or the Lucic et al. concern."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "The entire paper analyzes performance as a function of compute/energy budget. Table 1 jointly reports energy, time, and resolution. The energy-vs-effectiveness trade-off is the central research question."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "SWE-bench Verified Mini is used without questioning whether it measures real-world issue resolution capability. Section 3.1 notes representativeness of the subset but does not discuss construct validity of SWE-bench itself."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The scaffold confound IS the central research question. The paper explicitly compares frameworks (scaffolds) as the primary variable while controlling for model and hardware, directly measuring the scaffold's impact on energy and performance."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether SWE-bench tasks predate the models' training data or whether solutions were available during training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides hints not available in real usage scenarios."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training data and SWE-bench tasks share structural similarities or overlap from same repositories."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Framework architecture is the primary driver of energy consumption, with AutoCodeRover consuming 9.4x more energy than OpenHands when using the same SLM (Gemma).",
    365       "evidence": "Table 1: AutoCodeRover+Gemma mean 216.21 kJ vs OpenHands+Gemma mean 23.05 kJ. Correlation analysis (Section 5.2): energy correlates with duration (R=0.89) and output tokens (R=0.88).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Task resolution rates are near-zero for all SLM-framework configurations, with only AutoCodeRover+Qwen achieving 4% (2/50 tasks).",
    370       "evidence": "Table 1 shows 0% resolution for all configurations except AutoCodeRover+Qwen (mean 2 resolved). Verified by SWE-bench evaluation harness (Section 4.3).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Low energy consumption does not indicate efficiency — it can mask premature termination or false positive completions.",
    375       "evidence": "Section 6.1, Lesson 2: Mini SWE Agent showed low energy but many runs terminated prematurely due to context overflow, and some 'successful' runs produced destructive patches (e.g., replacing 2,000-line file with a single invalid line).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Current agentic frameworks are fundamentally mismatched with SLM capabilities, acting as passive orchestrators that assume competent reasoning engines.",
    380       "evidence": "Section 6.1, Lesson 3, and qualitative log analysis (Section 4.4): frameworks lack mechanisms to guide, scaffold, or correct struggling SLMs. Supported by failure mode analysis (Figure 5) showing Step Repetition and Context Loss dominate.",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "Near-zero resolution limits conclusions",
    387       "detail": "With only AutoCodeRover+Qwen achieving any success (4%), conclusions about energy cost of successful resolution are statistically weak. The authors acknowledge this in Section 7 (Conclusion Validity)."
    388     },
    389     {
    390       "flag": "AI-assisted qualitative analysis",
    391       "detail": "Gemini 2.5 Flash was used to process execution logs for failure mode identification (Section 4.4). While authors manually verified a sample, this introduces potential bias in pattern interpretation."
    392     },
    393     {
    394       "flag": "No contamination analysis",
    395       "detail": "SWE-bench tasks come from public GitHub issues. Models may have seen these during training, potentially affecting even the near-zero results. No training cutoffs or contamination checks are discussed."
    396     },
    397     {
    398       "flag": "Very narrow model selection",
    399       "detail": "Only 2 SLMs tested (1.7B and 4B parameters). Results may not generalize to the broader SLM space (e.g., 7B-8B models commonly considered 'small')."
    400     }
    401   ],
    402   "cited_papers": [
    403     {
    404       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    405       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    406       "year": 2024,
    407       "relevance": "The benchmark used in this study; foundational evaluation framework for autonomous issue resolution agents."
    408     },
    409     {
    410       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    411       "authors": ["John Yang", "Carlos E. Jimenez"],
    412       "year": 2024,
    413       "relevance": "One of the four evaluated agentic frameworks; introduces the ACI concept for agent-driven code editing."
    414     },
    415     {
    416       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    417       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    418       "year": 2025,
    419       "relevance": "One of the four evaluated frameworks; general-purpose agentic platform for software development."
    420     },
    421     {
    422       "title": "AutoCodeRover: Autonomous Program Improvement",
    423       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    424       "year": 2024,
    425       "doi": "10.1145/3650212.3680384",
    426       "relevance": "Only framework achieving non-zero resolution in this study; structured three-phase pipeline for issue resolution."
    427     },
    428     {
    429       "title": "Small Language Models are the Future of Agentic AI",
    430       "authors": ["Peter Belcak", "Greg Heinrich", "Shizhe Diao"],
    431       "year": 2025,
    432       "arxiv_id": "2506.02153",
    433       "relevance": "Argues for SLMs as sustainable alternatives to LLMs for agentic tasks; motivates this study's focus."
    434     },
    435     {
    436       "title": "Why Do Multi-Agent LLM Systems Fail?",
    437       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
    438       "year": 2025,
    439       "arxiv_id": "2503.13657",
    440       "relevance": "Provides the MAST failure taxonomy used to classify failure modes in this study."
    441     },
    442     {
    443       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    444       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    445       "year": 2023,
    446       "relevance": "The prompting paradigm used by SWE-Agent and OpenHands; relevant to understanding agentic reasoning loops."
    447     },
    448     {
    449       "title": "Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model",
    450       "authors": ["Alexandra Sasha Luccioni", "Sylvain Viguier", "Anne-Laure Ligozat"],
    451       "year": 2023,
    452       "relevance": "Foundational work on environmental costs of large language models; motivates the energy efficiency focus."
    453     },
    454     {
    455       "title": "Greening AI-enabled Systems with Software Engineering: A Research Agenda for Environmentally Sustainable AI Practices",
    456       "authors": ["Luís Cruz", "João Paulo Fernandes", "Maja H. Kirkeby"],
    457       "year": 2025,
    458       "doi": "10.1145/3743095.3743099",
    459       "relevance": "Research agenda for sustainable AI practices that motivates and contextualizes this study's green SE focus."
    460     },
    461     {
    462       "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision, and the Road Ahead",
    463       "authors": ["Junda He", "Christoph Treude", "David Lo"],
    464       "year": 2025,
    465       "doi": "10.1145/3712003",
    466       "relevance": "Survey of LLM-based multi-agent systems for SE; provides context on the agentic SE landscape."
    467     },
    468     {
    469       "title": "Small Models, Big Tasks: An Exploratory Empirical Study on Small Language Models for Function Calling",
    470       "authors": ["Ishan Kavathekar", "Raghav Donakanti", "Ponnurangam Kumaraguru", "Karthik Vaidhyanathan"],
    471       "year": 2025,
    472       "arxiv_id": "2504.19277",
    473       "relevance": "Related work by same group on SLM capabilities for function calling; directly relevant to SLM viability in agentic tasks."
    474     }
    475   ]
    476 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs