scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26926B)
      1 {
      2   "paper": {
      3     "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents",
      4     "authors": ["Niels Mündler", "Mark Niklas Müller", "Jingxuan He", "Martin Vechev"],
      5     "year": 2024,
      6     "venue": "Neural Information Processing Systems",
      7     "arxiv_id": "2406.12952",
      8     "doi": "10.52202/079017-2601"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The abstract states 'We release all data and code at github.com/logic-star-ai/SWT-Bench.'"
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The benchmark dataset is released alongside the code at the provided GitHub URL. The underlying data is based on publicly available SWE-Bench data."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is described in the paper itself. Evaluation uses Docker containers but environment specifications are not detailed in the text."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper does not include step-by-step reproduction instructions. While code is released, no README-level reproduction guide is provided in the paper text."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Main results in Tables 2-6 report point estimates only with no confidence intervals or error bars. Table 8 (temperature ablation) shows 95% CIs for n=25, but this is an ablation, not the main results."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper reports statistical significance: 'This stronger performance is significant at p < 0.1%' for SWE-AGENT+ vs LIBRO (§5.2), and p-values for contamination analysis (p≈37%, Table 7) and overlap analysis (Table 6)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Results are reported as absolute percentages with baselines for context (e.g., 'almost 3x increase in success rate to 9.4%', 'more than doubles the precision of SWE-AGENT to 47.8%'). Tables provide full context for comparison."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The choice of SWT-BENCH-LITE (276 instances) over full SWT-BENCH is stated as 'due to budget constraints' but no power analysis or sample size justification is provided."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Main results (Tables 2-4) are single-run results at temperature 0. No variance across runs is reported for the primary experiments. Table 8 shows variance for the temperature ablation only."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines are compared: ZEROSHOT, ZEROSHOTPLUS, LIBRO, PASS@5, AUTOCODEROVER, AIDER, SWE-AGENT, and SWE-AGENT+ (Table 2)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include contemporary Code Agents (SWE-Agent, AutoCodeRover, Aider) and the state-of-the-art test generation method LIBRO, all from 2023-2024."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Multiple ablations are provided: number of LIBRO samples (Fig. 8a), interaction rounds for agents (Fig. 8b), temperature (Table 8), effect of providing code patches/test files (Table 5), and SWE-AGENT vs SWE-AGENT+ (test execution ablation)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are defined and reported: Success rate (S), Change Coverage (ΔC), Well-formedness (W), F→P rate, F→× rate, P→P rate (§3.3, Tables 2-3)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of generated tests is performed. All evaluation is automated via test execution against golden patches."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "SWT-BENCH-LITE is used for evaluation. The golden patches serve as held-out ground truth not provided to the methods being evaluated. No tuning is performed on the test set."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Per-repository breakdowns are provided (Fig. 9), per-issue-length analysis (Fig. 5), successful vs non-successful coverage breakdown (Table 3), and per-model comparisons (Table 4)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 6 discusses common failure modes: 'Adding passing tests that do not reproduce the given issue, getting stuck in loops, failing to execute the test environment correctly and adding tests with syntax errors.' Appendix D analyzes per-repo failures."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative findings are reported: AUTOCODEROVER underperforms simpler methods, LIBRO's heuristics only recover half the gap to PASS@5, providing code patches has smaller impact than providing test files (Table 5), and overlap between fix and test generation is low (Table 6)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims are supported: Code Agents outperform test-specific methods (Table 2), generated tests double SWE-Agent precision (§5.3), and LLMs 'perform surprisingly well' is supported by the range of success rates reported."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper's causal claims are modest and supported by controlled ablations. E.g., the custom diff format's effect is isolated by comparing ZEROSHOT vs ZEROSHOTPLUS (same model, same prompts, different format). SWE-AGENT vs SWE-AGENT+ isolates the test execution instruction."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper acknowledges Python limitation in §6 but the title 'Testing and Validating Real-World Bug-Fixes' and abstract claims like 'LLMs generally perform surprisingly well at generating relevant test cases' are broader than the 12 Python repos tested."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper discusses data contamination as an alternative explanation (§5.4, Table 7), acknowledges that popular repos may not represent common development (§6), and considers that overlap between tasks could explain correlation (Table 6)."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper clearly defines what it measures (F→P rate, change coverage) and does not overclaim these as measures of general test quality. The metrics are well-defined in §3.3 with explicit mathematical formulations."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Exact model versions are specified in §5.1: 'gpt-4-1106-preview', 'gpt-4o-mini-2024-07-18', 'Claude 3.0 Haiku', 'Claude 3.5 Sonnet', 'Mistral Large 2', 'Mixtral 8x22b'."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompts are provided in the appendix: Figures 10-16 contain complete prompt text for ZEROSHOT, ZEROSHOTPLUS, SWE-AGENT, AUTOCODEROVER, and AIDER."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "§5.1 reports: 'temperature t = 0 for all zero-shot methods and agents and at t = 0.7 for LIBRO and PASS@5', API calls limited to 20, reflection steps to 3, interaction rounds to 10. LIBRO samples 5 tests."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Each agent's scaffolding is described: SWE-AGENT's command-line tools and ACI (§4.4), AIDER's repository indexing and validation (§4.4), AUTOCODEROVER's two-stage context collection and generation (§4.4). The custom diff format is formalized in §4.2 and Appendix A."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The three-stage benchmark construction process is documented in §3.2: scraping ~90K PRs, filtering for merged PRs resolving issues with test changes, filtering for F→P tests. Further filtering of 311 problematic instances is described with specific reasons."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6 'Limitations and Future Work' provides substantive discussion of multiple limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6 discusses specific threats: Python-only limitation, popular repos not representative of common development, limited to bug reproduction (not edge case detection), and data contamination risk from historic GitHub issues."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "§6 explicitly states what is not shown: 'limited to Python', 'does not measure edge case detection or global coverage increase', 'based on popular GitHub repositories, which may not be representative.' §5.4 acknowledges contamination risk limitations."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The full dataset is released publicly at github.com/logic-star-ai/SWT-Bench, built on top of the publicly available SWE-Bench dataset."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "§3.2 describes the three-stage data collection from SWE-Bench: scraping PRs from 12 GitHub repos, filtering for merged PRs resolving issues, filtering for F→P tests. Table 1 characterizes the resulting dataset."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. The data source is public GitHub repositories, a standard benchmark construction approach."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: ~90K PRs → filtering for merged+issue-resolving+test-changing → filtering for F→P → 2,294 instances → removing 311 problematic instances → 1,983 SWT-BENCH instances → 276 SWT-BENCH-LITE subset."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding or acknowledgments section is present in the paper text provided."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly stated: ETH Zurich (Department of Computer Science) and LogicStar.ai. Mark Niklas Müller has dual affiliation."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Co-author Mark Niklas Müller is affiliated with LogicStar.ai, which appears to be a company related to code agents. No funding disclosure means independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present. One author is affiliated with LogicStar.ai but no declaration of financial interests is made."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "§5.4 states 'the Knowledge Cutoff (KC) of GPT-4 (April 2023)' and analyzes contamination risk relative to this date."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "§5.4 explicitly investigates contamination by comparing performance on issues created before vs after GPT-4's knowledge cutoff (Table 7), finding no statistically significant difference."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "§5.4 and §6 discuss that 'most issues in SWT-BENCH have been created before the knowledge cutoff of state-of-the-art models, posing a risk for data contamination.' Table 7 provides empirical analysis. The paper also notes all methods use the same LLM so contamination affects them equally."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Appendix G reports detailed costs: Tables 9 and 10 list USD costs for each model and method on SWT-BENCH-LITE (e.g., GPT-4 SWE-AGENT: $290.71, SWE-AGENT+: $478.21)."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Tables 9-10 report total API costs. Table 11 reports execution times per instance. The paper states evaluation 'can be performed on a consumer grade machine in reasonable time.'"
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Main experiments use temperature 0 (deterministic) with single runs. Table 8 shows some variance analysis for temperature ablation (n=25) but not for the primary results."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "The paper states temperature t=0 for main experiments (implying single deterministic run) and t=0.7 for LIBRO/PASS@5. Table 8 explicitly states n=25 for the temperature ablation."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Default settings are used for agents (API calls=20, reflections=3, rounds=10) with no discussion of how these defaults were chosen or whether alternatives were tried beyond the ablation in Appendix C."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Appendix C provides ablation studies justifying the choice of hyperparameters (number of LIBRO samples, interaction rounds, temperature). Default agent settings are used without cherry-picking."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Multiple comparisons are made across 8+ methods without correction for multiple testing. The p<0.1% claim for SWE-AGENT+ vs LIBRO is not corrected for the number of pairwise comparisons possible."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors propose SWT-Bench and SWE-AGENT+ (their adaptation) which outperforms other methods. No discussion of author-evaluation bias or independent verification."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Tables 9-10 report costs for all methods. Appendix C.2 (Fig. 8b) shows performance as a function of API calls. The paper enables cost-performance comparison across methods."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "§3.1-3.3 formally define what the benchmark measures (F→P as issue reproduction), §4.1 discusses how test generation differs from code repair, and §6 acknowledges construct limitations (only bug reproduction, not edge case detection)."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Different agents use fundamentally different scaffolding (SWE-Agent ACI vs AIDER repository indexing vs AutoCodeRover two-stage), yet performance differences are attributed to the methods without isolating scaffold effects from LLM effects. Table 4 varies LLMs within SWE-Agent only."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "§5.4 directly addresses temporal leakage by comparing performance on PRs before vs after GPT-4's knowledge cutoff (Table 7). §6 proposes a rolling benchmark version as mitigation."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. For instance, BM-25 retrieval over the codebase could surface code near the fix location, providing hints not available in real usage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Multiple instances come from the same repositories (e.g., sympy, django are heavily represented in Fig. 2) but non-independence between instances from the same repo is not discussed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "Table 7 uses a temporal split (before vs after knowledge cutoff) as a concrete detection method, comparing performance on n=83 instances in each group."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Code Agents designed for code repair outperform systems designed specifically for test generation, with SWE-AGENT+ achieving 18.5% success rate vs LIBRO's 14.1%.",
    363       "evidence": "Table 2 (§5.2) shows success rates across all methods. Statistical significance at p < 0.1% is reported for SWE-AGENT+ vs LIBRO.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "The custom diff format (ZEROSHOTPLUS) yields almost 3x increase in success rate over standard unified diff (ZEROSHOT), from 3.6% to 9.4%.",
    368       "evidence": "Table 2 shows W increases from 48.6% to 89.5% and S from 3.6% to 9.4% when switching from unified diff to custom format.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Generated tests double the precision of SWE-AGENT for code repair from ~20% to 47.8%.",
    373       "evidence": "§5.3 reports filtering code fixes using self-generated F→P tests achieves 47.8% precision at 20% recall.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Test generation and code repair are distinct tasks — overlap between solved instances is small with no statistical evidence of correlation.",
    378       "evidence": "Table 6 shows low overlap (7/50 for SWE-AGENT) with p-values of 72.8% and 80.4% under independence.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "An ideal ensemble of the best four methods solves 71% more samples than the best single method (87 vs 51).",
    383       "evidence": "Figure 6 shows the Venn diagram of solved instances across the four best methods.",
    384       "supported": "strong"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "SWT-Bench is a novel benchmark for LLM-based test generation from GitHub issues, containing 1,983 instances from 12 Python repositories. Code Agents (especially SWE-AGENT+) outperform specialized test generation methods like LIBRO at issue reproduction, achieving 18.5% vs 14.1% success rate. Generated tests serve as an effective filter for code fixes, doubling SWE-AGENT's precision to 47.8%. Test generation and code repair show low per-instance correlation despite similar aggregate difficulty, indicating they are complementary tasks.",
    389   "red_flags": [
    390     {
    391       "flag": "Self-evaluation bias",
    392       "detail": "The authors propose both the benchmark (SWT-Bench) and the best-performing method variant (SWE-AGENT+). Their adaptations to existing agents are evaluated on their own benchmark without independent verification."
    393     },
    394     {
    395       "flag": "No variance on main results",
    396       "detail": "Primary results (Tables 2-6) are single-run: temperature 0 for zero-shot methods and agents, 0.7 for LIBRO and PASS@5. Even where decoding is nominally deterministic, this means no assessment of sensitivity to non-deterministic factors (API non-determinism, evaluation environment variance). Table 8 shows some variance exists even at T=0."
    397     },
    398     {
    399       "flag": "Concentrated repository distribution",
    400       "detail": "Figure 2 shows sympy and django dominate the dataset. Figure 9 reveals five repositories where test generation fails entirely, meaning aggregate metrics are heavily influenced by a few repos."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    406       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    407       "year": 2023,
    408       "arxiv_id": "2310.06770",
    409       "relevance": "The foundational benchmark that SWT-Bench builds upon, providing the underlying dataset of GitHub issues and code patches."
    410     },
    411     {
    412       "title": "SWE-agent: Agent Computer Interfaces Enable Software Engineering Language Models",
    413       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    414       "year": 2024,
    415       "relevance": "Key Code Agent baseline adapted for test generation, introducing the ACI concept for LLM-based code interaction."
    416     },
    417     {
    418       "title": "AutoCodeRover: Autonomous Program Improvement",
    419       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    420       "year": 2024,
    421       "arxiv_id": "2404.05427",
    422       "relevance": "Two-stage Code Agent evaluated as a baseline for test generation, demonstrating context-gathering then patch-generation approach."
    423     },
    424     {
    425       "title": "Large Language Models are Few-Shot Testers: Exploring LLM-Based General Bug Reproduction",
    426       "authors": ["Sungmin Kang", "Juyeon Yoon", "Shin Yoo"],
    427       "year": 2023,
    428       "doi": "10.1109/ICSE48619.2023.00194",
    429       "relevance": "LIBRO — state-of-the-art LLM-based test generation method used as primary baseline."
    430     },
    431     {
    432       "title": "Evaluating Large Language Models Trained on Code",
    433       "authors": ["Mark Chen", "Jerry Tworek"],
    434       "year": 2021,
    435       "relevance": "HumanEval/Codex benchmark paper foundational to LLM code generation evaluation."
    436     },
    437     {
    438       "title": "CodeT: Code Generation with Generated Tests",
    439       "authors": ["Bei Chen", "Fengji Zhang", "Anh Nguyen"],
    440       "year": 2023,
    441       "relevance": "Uses generated tests to validate code generation, directly related to the test-as-filter approach in SWT-Bench."
    442     },
    443     {
    444       "title": "ChatUniTest: A Framework for LLM-Based Test Generation",
    445       "authors": ["Yinghao Chen", "Zehao Hu", "Chen Zhi"],
    446       "year": 2023,
    447       "relevance": "LLM-based test generation framework relevant to the automated testing survey scope."
    448     },
    449     {
    450       "title": "Automated Unit Test Improvement Using Large Language Models at Meta",
    451       "authors": ["Nadia Alshahwan", "Jubin Chheda"],
    452       "year": 2024,
    453       "arxiv_id": "2402.09171",
    454       "relevance": "Industry-scale LLM test generation at Meta, demonstrating practical application of automated test improvement."
    455     },
    456     {
    457       "title": "MAGIS: LLM-Based Multi-Agent Framework for GitHub Issue Resolution",
    458       "authors": ["Wei Tao", "Yucheng Zhou", "Wenqiang Zhang", "Yu Cheng"],
    459       "year": 2024,
    460       "arxiv_id": "2403.17927",
    461       "relevance": "Multi-agent framework for code repair, part of the Code Agent landscape being evaluated."
    462     },
    463     {
    464       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    465       "authors": ["Islem Bouzenia", "Premkumar T. Devanbu", "Michael Pradel"],
    466       "year": 2024,
    467       "arxiv_id": "2403.17134",
    468       "relevance": "Autonomous LLM agent for program repair, relevant to the agentic code generation survey scope."
    469     },
    470     {
    471       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    472       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    473       "year": 2024,
    474       "doi": "10.1109/TSE.2023.3334955",
    475       "relevance": "Empirical evaluation of LLMs for test generation, directly relevant baseline work."
    476     },
    477     {
    478       "title": "Pynguin: Automated Unit Test Generation for Python",
    479       "authors": ["Stephan Lukasczyk", "Gordon Fraser"],
    480       "year": 2022,
    481       "doi": "10.1145/3510454.3516829",
    482       "relevance": "Search-based Python test generation tool, representing non-LLM approaches to test generation."
    483     }
    484   ]
    485 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs