scan.json (28821B)
1 { 2 "paper": { 3 "title": "SweRank+: Multilingual, Multi-Turn Code Ranking for Software Issue Localization", 4 "authors": [ 5 "Revanth Gangi Reddy", 6 "Ye Liu", 7 "Wenting Zhao", 8 "JaeHyeok Doo", 9 "Tarun Suresh", 10 "Daniel Lee", 11 "Caiming Xiong", 12 "Yingbo Zhou", 13 "Semih Yavuz", 14 "Shafiq Joty" 15 ], 16 "year": 2025, 17 "venue": "arXiv", 18 "arxiv_id": "2512.20482", 19 "doi": "10.48550/arXiv.2512.20482" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval"], 24 "key_findings": "SweRankEmbedMulti, trained on a new multilingual dataset (SweLocMulti) spanning 10 programming languages, achieves state-of-the-art function localization performance on multilingual benchmarks and even improves over Python-only training on Python-specific benchmarks. SweRankAgent, a multi-turn agentic framework using iterative search and reasoning, consistently outperforms single-pass ranking by roughly 1.5-6.4 points on Acc@10, with particular gains on low-overlap and multi-function localization tasks. A multi-query reformulation baseline without agentic reasoning actually hurts performance on most benchmarks, suggesting that iterative reasoning—not just multiple queries—is the key ingredient.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper states 'Code and models will be released here: https://github.com/SalesforceAIResearch/SweRank' — this is a promise of future release, not an actual release." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": false, 35 "justification": "The primary data contribution SweLocMulti is not released; no download link is provided. Evaluation benchmarks (SWE-Bench, SWE-PolyBench, etc.) are publicly available standard benchmarks, but the paper's own training dataset is not released." 
36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specification is provided in the paper." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are provided. The paper promises future code release but includes no instructions for replicating experiments." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Tables 2, 3, and 4 report only point estimates (Acc@5, Acc@10) with no confidence intervals, error bars, or ± notation." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper claims improvements ('new state-of-the-art', 'consistently improves') based solely on comparing point estimates across benchmarks. No statistical significance tests (t-tests, bootstrap, etc.) are reported." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Table 4 reports differences with baseline context (e.g., 66.67 → 73.08, +6.41 points). Tables 2 and 3 provide full baseline and proposed system numbers, allowing readers to compute effect sizes. This matches the schema pattern of improvement with from/to context." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "No justification for dataset sizes or power analysis. The number of test instances in each benchmark is not discussed in terms of statistical adequacy." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "All results are reported as single numbers with no standard deviation, variance across runs, or other spread measures. It is unclear whether results are from single or multiple runs." 
73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Extensive baselines across all evaluations: Tables 2 and 3 include CodeRankEmbed, Qwen3-Embedding, GTE-Qwen2, Gemini-Embedding, SweRankEmbed, GPT-4.1, OpenHands, and LocAgent. Table 4 compares against single-query and reformulation baselines." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "Baselines include very recent models: GPT-4.1, GPT-5, Qwen3-Embedding (2025), Gemini-Embedding (2025), Qwen3-Instruct-8B (2025). These represent current state-of-the-art." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table 2 provides ablation-style comparisons: SweRankEmbed (old base) vs SweRankEmbedPython (new base, Python data) vs SweRankEmbedMulti (new base, multilingual data), isolating the effect of base model and training data. Table 4 ablates single-query vs multi-query reformulation vs multi-turn agent." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are reported using both Acc@5 and Acc@10 across all benchmarks." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": false, 99 "justification": "No human evaluation is included. All evaluation is automated via Accuracy@k metrics on benchmark datasets. Human evaluation of localization quality could be relevant to the claims." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "Models are trained on SweLocMulti and evaluated on separate, independently curated benchmarks: SWE-PolyBench, SWE-Bench-Multilingual, Multi-SWE-Bench, SWE-Bench-Lite, LocBench, and SWE-Bench-Verified. Clear separation of training and evaluation data." 
105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Figure 3 provides language-wise performance breakdowns for SWE-PolyBench and SWE-Bench-Multilingual. Figure 4 breaks down by lexical and semantic overlap. Figure 5 breaks down by number of target functions." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": false, 114 "justification": "The qualitative example in Figure 2 shows only a success case. While Figures 4 and 5 show aggregate performance variation by difficulty, no specific failure cases are analyzed or shown." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Table 4 shows that the Reformulate baseline hurts performance compared to single-query (-1.46 on SWE-PolyBench, -1.71 on SWE-Bench-Multilingual), with the paper noting 'directly using reformulated queries introduces additional noise.'" 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims 'new state-of-the-art performance' — supported by Tables 2 and 3 showing SweRankMulti outperforms all baselines. The claim that SweRankAgent 'further improves localization over single-pass ranking' is supported by Table 4." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper makes causal claims (multilingual training improves performance, agentic search improves localization). These are supported by controlled ablations: SweRankEmbedPython vs SweRankEmbedMulti isolates the training data variable; Table 4 isolates single-query vs agent reasoning. The ablation design is adequate for these claims." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": true, 136 "justification": "Claims are appropriately bounded. The abstract says 'on issue localization benchmarks spanning various languages.' 
The paper evaluates on specific named benchmarks and languages. The title 'Multilingual' is justified by testing on 10 languages." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "No discussion of alternative explanations or confounds. For example, the paper does not discuss whether improvements come from more training data rather than multilingual data specifically, or whether the agent's gains are due to GPT-5's reasoning vs. simply seeing more candidates." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper measures function localization Acc@k and frames results in terms of function localization accuracy. The measurements match the claims at the same granularity — no proxy gap exists between what is measured and what is claimed." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "Base models are identified by family and size (Qwen3-Embedding 0.6B/8B, CodeRankLLM 7B, Qwen-2.5-32B-Instruct) but API models lack specific version identifiers: 'GPT-4.1', 'GPT-5', 'Claude-3.5' have no snapshot dates or API versions. Gemini-Embedding is listed as '(unknown)' size." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Full prompt text for both SWERANKLLMMULTI (system + user prompt with exact format) and SWERANKAGENT (system + user prompt with tool definitions, rules, and expected response format) are provided in the appendix." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": false, 163 "justification": "No training hyperparameters (learning rate, batch size, epochs, warmup) or inference parameters (temperature, top-p, max tokens) are reported. Only model sizes and retrieval top-k values (top-100, top-10) are mentioned." 
164 }, 165 "scaffolding_described": { 166 "applies": true, 167 "answer": true, 168 "justification": "SweRankAgent's scaffolding is described in detail in Section 4 with Figure 2: ReAct-style loop with four steps (Search, Reasoning, Reformulation, Aggregate), memory buffer, stopping conditions, and the full agent prompt. The tool interface and information flow are well-documented." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 3.1 describes the data pipeline: repository selection criteria (40% target language, 1000+ stars, recent commits), PR extraction linked to issues with test modifications, consistency filtering (top-40 ranking), and hard-negative mining. Table 1 provides instance counts per language." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "There is no dedicated limitations or threats-to-validity section. The paper goes directly from experiments (Section 5) to conclusion (Section 6) without discussing limitations." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "No specific threats to validity are discussed anywhere in the paper." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "No explicit statements about what the results do not show. The conclusion mentions future work (end-to-end repair pipelines) but does not state specific boundaries of the current results." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "The SweLocMulti training dataset is not released. While evaluation benchmarks are public, the paper's primary data contribution is not available for independent verification." 
198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 3.1 describes data collection in detail: GitHub repository selection criteria, PR extraction linked to issues, function extraction via Tree-sitter, consistency filtering, and hard-negative mining procedure." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. Data is collected from public GitHub repositories and evaluation uses standard benchmarks." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline from repository selection through PR extraction, function extraction, consistency filtering, and hard-negative mining is documented. Table 1 provides final counts per language (155,663 total instances from 4,060 repos)." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding or acknowledgments section is present in the paper. No grants or sponsors are mentioned." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are clearly listed: University of Illinois at Urbana-Champaign, Salesforce AI Research, and KAIST AI. The Salesforce affiliation of 7 of 10 authors is prominently displayed." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "Salesforce AI Research is the primary institutional affiliation (7 of 10 authors). Salesforce has commercial interest in code intelligence tools, making the funder non-independent of the outcome. This conflict is not acknowledged." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests statement or financial disclosure is included in the paper." 
235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "No training data cutoff dates are stated for the base models used (Qwen3-Embedding, GPT-4.1, GPT-5, Claude-3.5). The SweLocMulti dataset collection period is also not specified." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of potential overlap between SweLocMulti training data and evaluation benchmarks, despite both being sourced from GitHub repositories. The risk that training and test PRs come from overlapping repositories is not addressed." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "No discussion of whether evaluation benchmark data (SWE-Bench, etc.) could be in the pre-training data of the base models (Qwen3-Embedding, GPT-4.1, GPT-5). SWE-Bench was published in 2024 and models trained after that date may have seen it." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 
284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "The paper claims efficiency advantages ('reducing computational cost', 'lightweight') but reports no actual cost metrics: no API costs, no tokens consumed, no wall-clock time, no latency measurements." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "No GPU hours, training time, API spend, or hardware specifications are reported for either training the models or running experiments." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "No results across multiple random seeds. All tables report single-point results with no indication of seed variation." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": false, 312 "justification": "The number of experimental runs is not stated anywhere. It is unclear whether results are from single runs or averaged over multiple runs." 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "No hyperparameter search budget is reported. The paper does not describe how model hyperparameters were selected or how many configurations were tried." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": false, 322 "justification": "No explanation of how the best configuration was selected. Model sizes (small/large) are compared but the selection of other design choices (top-k values, number of agent turns, etc.) is not justified." 323 }, 324 "multiple_comparison_correction": { 325 "applies": true, 326 "answer": false, 327 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons. 
The paper makes many comparisons across 5 benchmarks and multiple models without any correction." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors evaluate their own SweRank+ system against their previous SweRank system and re-implementations of baselines without acknowledging potential self-comparison bias." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "No analysis of performance as a function of compute budget. The paper compares small (0.6B) vs large (8B) retrievers but does not quantify or control for compute differences between approaches." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether the benchmarks (SWE-Bench, SWE-PolyBench, etc.) actually measure the claimed capability of issue localization accuracy. The benchmarks are used without questioning their construct validity." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Table 2 includes OpenHands (Claude-3.5) and LocAgent (Claude-3.5) baselines which use entirely different scaffolds and models, but the paper does not discuss the scaffold confound. In Table 4, SweRankAgent adds GPT-5 reasoning on top of the same retriever, but the contribution of GPT-5 vs the iterative design is not separated." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of temporal leakage. The paper does not state when SweLocMulti training data was collected relative to when the evaluation benchmarks were created." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of feature leakage. 
The paper does not analyze whether the evaluation setup (e.g., issue descriptions containing traceback information) provides hints not available in real-world usage." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No discussion of non-independence. Both SweLocMulti (training) and the benchmarks (SWE-Bench, etc.) source data from GitHub repositories. The paper does not verify that training and test repositories are disjoint." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No leakage detection or prevention method is used. No decontamination pipeline, temporal splits, or overlap analysis is described." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "SweRankEmbedMulti achieves state-of-the-art retriever performance across multilingual and Python-specific benchmarks.", 376 "evidence": "Table 2 shows SweRankEmbedMulti-Large (8B) achieving the highest scores on all benchmarks: 53.73 Acc@10 on SWE-PolyBench, 62.39 on SWE-Bench-Multilingual, 86.86 on SWE-Bench-Lite, 65.36 on LocBench, and 76.37 on SWE-Bench-Verified, outperforming all baselines including Gemini-Embedding.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Multilingual training improves performance even on Python-specific benchmarks compared to Python-only training.", 381 "evidence": "Table 2 compares SweRankEmbedPython vs SweRankEmbedMulti at both small and large sizes. 
The Multi variant outperforms the Python variant on Python benchmarks: e.g., SWE-Bench-Lite Acc@10 76.28 vs 75.18 (small), 86.86 vs 83.94 (large).", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "SweRankLLMMulti reranker outperforms GPT-4.1 for code reranking.", 386 "evidence": "Table 3 shows SweRankLLMMulti-Large (32B) outperforming GPT-4.1 on 4 of 5 benchmarks, e.g., SWE-PolyBench 63.21 vs 61.76, SWE-Bench-Multilingual 71.37 vs 70.94, and SWE-Bench-Lite 89.78 vs 88.69; the exception is SWE-Bench-Verified, where GPT-4.1 wins (81.18 vs 81.62).", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "SweRankAgent's multi-turn reasoning consistently outperforms single-pass ranking across all benchmarks.", 391 "evidence": "Table 4 shows SweRankAgent improving Acc@10 by +3.09 on SWE-PolyBench, +6.41 on SWE-Bench-Multilingual, +2.55 on SWE-Bench-Lite, +1.45 on LocBench, and +2.31 on SWE-Bench-Verified over single-query SweRankMulti.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Multi-query reformulation without agentic reasoning hurts localization performance.", 396 "evidence": "Table 4 shows the Reformulate baseline degrades Acc@10 on 4 of 5 benchmarks (e.g., -1.46 on SWE-PolyBench, -1.71 on SWE-Bench-Multilingual, -0.44 on SWE-Bench-Verified), with only a minor +0.40 gain on LocBench.", 397 "supported": "strong" 398 }, 399 { 400 "claim": "SweRankAgent is particularly beneficial for harder cases with low lexical/semantic overlap and multi-function targets.", 401 "evidence": "Figures 4a and 4b show larger improvements in low-overlap buckets. Figure 5 shows the agent's advantage grows with more target functions. Analysis is presented in Section 5.3.3.", 402 "supported": "moderate" 403 } 404 ], 405 "red_flags": [ 406 { 407 "flag": "Salesforce evaluating Salesforce models", 408 "detail": "7 of 10 authors are from Salesforce AI Research, evaluating SweRank+, a Salesforce product. The conflict of interest is not acknowledged anywhere in the paper, and no independent evaluation is included." 
409 }, 410 { 411 "flag": "No statistical rigor for claimed improvements", 412 "detail": "All claims of improvement are based on comparing point estimates with no significance tests, confidence intervals, or variance across runs. Differences of 1-6 percentage points could be within noise margins." 413 }, 414 { 415 "flag": "Train/test overlap risk unaddressed", 416 "detail": "Both the SweLocMulti training data and evaluation benchmarks are sourced from GitHub repositories. The paper does not verify repository disjointness or discuss any decontamination procedure." 417 }, 418 { 419 "flag": "Efficiency claimed but not quantified", 420 "detail": "The paper positions its approach as more efficient than agentic baselines ('reducing computational cost', 'lightweight'), but reports zero cost, latency, or throughput numbers to substantiate this claim." 421 }, 422 { 423 "flag": "No limitations section", 424 "detail": "The paper has no limitations, threats to validity, or scope boundary discussion despite making broad claims about multilingual issue localization." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "SWE-bench: Can language models resolve real-world github issues?", 430 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 431 "year": 2024, 432 "relevance": "Foundational benchmark for evaluating LLM-based software engineering capabilities on real GitHub issues." 433 }, 434 { 435 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 436 "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 437 "year": 2024, 438 "arxiv_id": "2405.15793", 439 "relevance": "Key agentic framework for issue resolution that SweRank+ compares against and builds upon." 
440 }, 441 { 442 "title": "LocAgent: Graph-guided LLM agents for code localization", 443 "authors": ["Zhaoling Chen", "Xiangru Tang", "Gangda Deng"], 444 "year": 2025, 445 "arxiv_id": "2503.09089", 446 "relevance": "Graph-guided agentic approach to code localization; key baseline in SweRank+'s evaluation." 447 }, 448 { 449 "title": "SweRank: Software issue localization with code ranking", 450 "authors": ["Revanth Gangi Reddy", "Tarun Suresh", "JaeHyeok Doo"], 451 "year": 2025, 452 "arxiv_id": "2505.07849", 453 "relevance": "Predecessor system that introduced retrieve-and-rerank for issue localization; SweRank+ extends it to multilingual and multi-turn settings." 454 }, 455 { 456 "title": "CornStack: High-quality contrastive data for better code ranking", 457 "authors": ["Tarun Suresh", "Revanth Gangi Reddy"], 458 "year": 2024, 459 "arxiv_id": "2412.01007", 460 "relevance": "Provides the contrastive training methodology and CodeRankLLM reranker used as the basis for SweRank+." 461 }, 462 { 463 "title": "ReAct: Synergizing reasoning and acting in language models", 464 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 465 "year": 2023, 466 "relevance": "Foundational agentic reasoning framework that SweRankAgent's iterative search loop is based on." 467 }, 468 { 469 "title": "OrcaLoca: An LLM agent framework for software issue localization", 470 "authors": ["Zhongming Yu", "Hejia Zhang", "Yujie Zhao"], 471 "year": 2025, 472 "arxiv_id": "2502.00350", 473 "relevance": "LLM agent framework specialized for bug localization using search and read actions." 474 }, 475 { 476 "title": "SWE-PolyBench: A multi-language benchmark for repository level evaluation of coding agents", 477 "authors": ["Muhammad Shihab Rashid", "Christian Bock"], 478 "year": 2025, 479 "arxiv_id": "2504.08703", 480 "relevance": "Key multilingual benchmark used to evaluate SweRank+'s cross-language localization performance." 
481 }, 482 { 483 "title": "Multi-SWE-Bench: A multilingual benchmark for issue resolving", 484 "authors": ["Daoguang Zan", "Zhirong Huang", "Wei Liu"], 485 "year": 2025, 486 "arxiv_id": "2504.02605", 487 "relevance": "Multilingual issue-resolving benchmark used in SweRank+ evaluation." 488 }, 489 { 490 "title": "SWE-Smith: Scaling data for software engineering agents", 491 "authors": ["John Yang", "Kilian Lieret", "Carlos E. Jimenez"], 492 "year": 2025, 493 "arxiv_id": "2504.21798", 494 "relevance": "Data-scaling work for software engineering agents; cited as the source of SWE-Bench-Multilingual, the multilingual SWE-Bench variant used in SweRank+ evaluation." 495 }, 496 { 497 "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead", 498 "authors": ["Junda He", "Christoph Treude", "David Lo"], 499 "year": 2025, 500 "relevance": "Survey of LLM-based agentic frameworks for software engineering tasks." 501 }, 502 { 503 "title": "Qwen3 technical report", 504 "authors": ["An Yang"], 505 "year": 2025, 506 "arxiv_id": "2505.09388", 507 "relevance": "Technical report for the Qwen3 model family used as base models in SweRank+." 508 } 509 ] 510 }