scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27879B)
      1 {
      2   "paper": {
      3     "title": "Automatically Generating Web Applications from Requirements Via Multi-Agent Test-Driven Development",
      4     "authors": [
      5       "Yuxuan Wan",
      6       "Tingshuo Liang",
      7       "Jiakai Xu",
      8       "Jingyu Xiao",
      9       "Yintong Huo",
     10       "Michael Lyu"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv preprint (submitted to ACM conference)",
     14     "arxiv_id": "2509.25297"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper states 'The code of TDDev is available at https://github.com/yxwan123/TDDev' in both the abstract and Section 8 (Data Availability)."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper describes the Req-to-App-MM dataset (an extension of WebGen-Bench with Gemini-generated images) but does not provide a download link or release the augmented dataset. WebGen-Bench itself is referenced but the multimodal extension is not explicitly released."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions 'iMac with 10 Core Intel Core i9 processor, 32GB RAM' and that 'All LLM models are accessed through the official API services' (Section 4.4), but no requirements.txt, Dockerfile, or detailed dependency/library version listing is provided."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper provides a GitHub link but does not include step-by-step reproduction instructions in the paper itself. There is no 'Reproducing Results' section or specific commands to replicate experiments."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "All results (Tables 6-9) are reported as point estimates (e.g., '78.2% accuracy') with no confidence intervals, error bars, or uncertainty quantification."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper claims TDDev outperforms baselines (e.g., '14.4% improvement') based solely on comparing numbers in tables without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper provides percentage improvements with baseline context throughout. For example, 'TDDev achieves the highest accuracy (78.2%), a relative improvement of +30% over Cursor (60.2%) and more than triple the performance of Bolt.diy (25.6%)' (Section 5.1.1). This provides enough context for the reader to assess magnitude."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The evaluation uses only 10 websites sampled from WebGen-Bench ('we randomly select 10 websites while preserving the category distribution', Section 4.6.1). No justification for this small sample size is provided, and no power analysis is discussed."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs. Results appear to be single-run numbers with no indication of result stability."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper compares TDDev against two baselines: Bolt.diy (open-source) and Cursor (proprietary), both described as 'widely used open-sourced and proprietary industry-level code-agent frameworks' (Section 4.2)."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Bolt.diy (17.7k GitHub stars) and Cursor (360k active users by 2024) are contemporary, actively used tools. Both are described with current metrics and are state-of-the-art in this domain."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "RQ2 (Section 5.1.3) ablates the multi-step test case generation agent vs. a straightforward single-step variant. RQ3 (Section 5.2) ablates the testing agent feedback across 0, 1, 2, and 3 rounds. Both ablations isolate specific components."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses multiple metrics: accuracy (weighted Yes/Partial/No), fail-to-start rate, appearance score (1-5), visual similarity (1-5), and per-category breakdowns for functionality, data display, and design validation (Tables 6-7)."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 6.1 describes manual verification of UI agent testing results on 5 generated web applications (28 test cases). Two annotators independently labeled each case with a third resolving disagreements. RQ4 (Section 5.3) includes a user study with 3 developers."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The evaluation uses the test set of WebGen-Bench (Section 4.6.1), which is a separately defined benchmark. The 10 websites are randomly selected from this test set, not from any data used during development."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table 7 provides per-category breakdowns across both instruction categories (Content, User Interaction, Data Management) and test case categories (Functionality, Data Display, Design Validation)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 6.2 ('Failure Analysis and Future Works') discusses failures including scaling feedback rounds introducing variance, open-source LLM formatting failures, and coding ability limitations of specific models."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 9 shows that 1-2 rounds of feedback actually decrease accuracy from 58.3% to 25.3%, which is openly reported and discussed: 'accuracy drops to 25.3%, suggesting that 1-2 rounds may be insufficient' (Section 5.2). Open-source model failures are also reported in Section 6.2.2."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims '14.4% improvement on overall accuracy compared to state-of-the-art baselines.' Table 6 shows TDDev with Claude-4-Sonnet at 70.2% vs Cursor at 63.8%, which is approximately a 6.4 percentage point (or ~10% relative) improvement. With GPT-4.1 the improvement is larger (78.2% vs 60.2%, i.e. 18.0 percentage points or ~30% relative). The 14.4% figure is somewhat ambiguous (the paper does not make clear which comparison it refers to), but the general direction of improvement is supported."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims through ablation studies (RQ2, RQ3) that use controlled single-variable manipulation: removing the multi-step test generation (Table 8) or varying feedback rounds (Table 9). These ablation designs adequately support the causal claims about component contributions."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper title and abstract claim effectiveness for 'full-stack web application generation' generally, but all results come from only 10 websites from WebGen-Bench with two backbone LLMs. The paper does not explicitly bound its claims to this narrow test setting. The broad title 'Automatically Generating Web Applications' overstates the tested scope."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The threats to validity section (Section 7) only discusses two generic threats (agentic system design effectiveness and UI agent testing reliability) without considering alternative explanations for the observed improvements, such as whether the improvements come from additional compute/tokens rather than the TDD approach itself, or whether the choice of templates creates an unfair advantage."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper uses 'GPT-4.1' and 'Claude-4-Sonnet' without specifying snapshot dates or API versions. For open-source models it mentions 'Qwen-2.5-VL-72B' (which includes size) and 'DeepSeek-V3.1'. The main models lack version specificity beyond marketing names."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper provides several actual prompts: the context selection prompt (Section 3.2), the development prompt (Section 3.2), the visual similarity evaluation prompt (Section 4.6.3), and the straightforward test case generation prompt (Section 5.1.3). While some use placeholders (e.g., '[Available Files]', '[Context Buffer]'), the core system instructions and evaluation prompts are provided in full."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 4.4 states 'Temperatures are set to 0 for all models. Max tokens are set to the maximum allowable value for each model.' While minimal, this covers the key LLM API hyperparameters."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The agentic scaffolding is described in detail across Section 3: the three-agent architecture (test generation, development, testing), workflow orchestration, template fetching, context selection, file editing mechanisms, retry logic (bounded retry mechanism in Section 3.3), and feedback loops."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.6.1 describes the dataset construction: starting from WebGen-Bench test set, augmenting with Gemini-2.5-Flash-Image generated design images, selecting 10 websites while preserving category distribution. Table 5 provides category statistics."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 7 ('Threat to Validity') provides a dedicated threats-to-validity discussion with two identified threats."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The two threats discussed in Section 7 are fairly generic: (1) 'performance may be constrained by the effectiveness of individual components' and (2) 'evaluation relies on UI agent testing, which requires high levels of reliability and accuracy.' These are not specific to this study's particular design choices or observed patterns."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the scope to specific types of web applications, specific model families, or specific complexity levels. The discussion of open-source model limitations (Section 6.2.2) is the closest, but this is framed as 'future work' rather than explicit scope boundaries."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The raw test results, generated applications, BrowserUse logs, and human annotation data are not available for independent verification. Only aggregate metrics in tables are provided."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.6.1 describes the dataset construction from WebGen-Bench, the image augmentation process using Gemini-2.5-Flash-Image, the sampling strategy (10 websites preserving category distribution), and the evaluation process using BrowserUse."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "For the user study (RQ4, Section 5.3), the paper states 'we recruit three developers (two research staff who have previously developed at least two web applications, and one front-end developer from a startup company)' but does not describe how they were recruited, whether they volunteered, or whether there is selection bias in using researchers from the authors' institution."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The evaluation pipeline is documented: dataset construction (Section 4.6.1) → functionality evaluation via BrowserUse (Section 4.6.2) → visual evaluation via Claude-4-Sonnet (Section 4.6.3). The scoring formula is explicitly stated. The test case execution process is described."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: Chinese University of Hong Kong, Columbia University, and Singapore Management University. The paper evaluates third-party tools (GPT-4.1, Claude-4-Sonnet, Cursor, Bolt.diy) and the authors are not affiliated with those companies."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Since funding is not disclosed, it is impossible to assess whether the funder is independent of the outcome. The absence of a funding disclosure means this criterion cannot be satisfied."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "There is no competing interests statement or financial interest declaration in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper uses GPT-4.1 and Claude-4-Sonnet to generate web applications and evaluate them, but does not state training data cutoff dates for any model used. This matters because WebGen-Bench or similar web development benchmarks could be in the training data."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No discussion of whether WebGen-Bench data or similar web application generation tasks appeared in the training data of GPT-4.1 or Claude-4-Sonnet. Given that the models may have seen web development code and specifications, this is a relevant concern."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "WebGen-Bench was published in 2025 (arXiv:2505.03733), so depending on model training cutoffs it may or may not appear in the backbone models' training data. The paper does not address this. The Req-to-App-MM extension uses generated images from Gemini, which adds novelty, but the text requirements from WebGen-Bench could still have been seen during training."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The user study in RQ4 involves 3 human developers but no pre-registration is mentioned."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No IRB or ethics board approval is mentioned for the user study involving 3 developers."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Section 5.3 describes participants: 'two research staff who have previously developed at least two web applications, and one front-end developer from a startup company.' While minimal, this characterizes their experience level and role."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No explicit inclusion/exclusion criteria are stated for participant selection. The paper states they were 'recruited' following the methodology of Chen et al. [11] but does not specify screening criteria."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "The user study (RQ4) is not a comparative experimental study with randomized assignment. All participants use both tools sequentially (TDDev first, then Bolt.diy), making this a within-subjects design without randomization."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "No blinding is described. Participants know which tool they are using (TDDev vs. Bolt.diy), and the order is fixed (TDDev first, then Bolt.diy), which could introduce order effects."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "With only 3 participants and results reported for all 3 (Table 10 shows averages), there is implicitly no attrition. All participants completed both conditions."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Table 4 reports per-round costs: TDDev costs ~0.36 USD/round (split between ~10K tokens for development and ~10K for testing, ~4 min each). Bolt.diy costs ~0.18 USD/round. Cursor costs 20 USD/month subscription."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Section 4.4 states hardware used ('iMac with 10 Core Intel Core i9 processor, 32GB RAM'). Table 4 provides time per round (~4 min development + ~4 min testing). Section 4.5 discusses cost structures. Total wall-clock time for the user study is 18.7 minutes for TDDev."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "TDDev achieves a 14.4% improvement on overall accuracy compared to state-of-the-art baselines.",
    293       "evidence": "Table 6 shows TDDev with Claude-4-Sonnet at 70.2% accuracy vs. Bolt.diy at 44.5% and Cursor at 63.8%. With GPT-4.1, TDDev achieves 78.2% vs. Bolt.diy at 25.6% and Cursor at 60.2%. The 14.4% figure is not clearly derived from any single comparison in the paper.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "TDDev has zero fail-to-start rate, while baselines have 20-80% fail-to-start rates.",
    298       "evidence": "Table 6 shows TDDev has 0.0% fail-to-start in both GPT-4.1 and Claude-4-Sonnet configurations, while Bolt.diy has 80.0%/20.0% and Cursor has 40.0%/20.0% fail-to-start rates.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "The multi-step test case generation agent improves functionality accuracy from 33.3% to 61.1% over single-step generation.",
    303       "evidence": "Table 8 (Section 5.1.3, RQ2) shows straightforward test generation achieves 33.3% functionality accuracy vs. 61.1% for multi-step, with overall accuracy improving from 59.1% to 70.2%.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Three rounds of testing feedback improve accuracy from 58.3% (no feedback) to 70.2%.",
    308       "evidence": "Table 9 (Section 5.2, RQ3) shows accuracy progression: no feedback 58.3%, 1 round 25.3%, 2 rounds 25.2%, 3 rounds 70.2%. Notably, intermediate rounds decrease performance.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "TDDev eliminates the need for manual intervention, reducing manual effort to zero.",
    313       "evidence": "Table 10 (Section 5.3, RQ4) shows TDDev requires 0.0 minutes of manual intervention vs. 4.7 minutes for Bolt.diy, with 0 intervention frequency and 0 additional prompt words. Based on user study with 3 developers.",
    314       "supported": "weak"
    315     },
    316     {
    317       "claim": "BrowserUse achieves 82.8% alignment rate with manual testing, and 100% alignment on Yes/No cases.",
    318       "evidence": "Table 11 (Section 6.1) reports 82.8% overall alignment and 100% Yes/No alignment for BrowserUse vs. 46.4% and 66.7% for WebVoyager, based on manual verification of 28 test cases across 5 applications.",
    319       "supported": "moderate"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "benchmark-eval",
    324     "case-study"
    325   ],
    326   "key_findings": "TDDev, a multi-agent TDD framework for full-stack web application generation, outperforms Bolt.diy and Cursor on WebGen-Bench with 10 test websites, achieving 70-78% accuracy depending on backbone LLM compared to 25-64% for baselines. Ablation studies show that multi-step test case generation and 3 rounds of iterative feedback both contribute meaningfully to performance, though 1-2 rounds of feedback actually decrease performance. A small user study with 3 developers found that TDDev eliminated all manual intervention compared to Bolt.diy. The framework achieves zero fail-to-start rate while baselines fail to launch 20-80% of the time.",
    327   "red_flags": [
    328     {
    329       "flag": "Very small evaluation sample",
    330       "detail": "Only 10 websites are used for evaluation from WebGen-Bench. This is acknowledged in the paper but not justified. With N=10, individual website results dominate aggregate metrics, and category-level breakdowns (Table 7) are based on as few as 3-4 websites per category."
    331     },
    332     {
    333       "flag": "No statistical significance testing",
    334       "detail": "All comparative claims (14.4% improvement, triple performance) are based on raw percentage comparisons without any significance tests. With N=10, observed differences could easily be due to chance."
    335     },
    336     {
    337       "flag": "Tiny user study (N=3)",
    338       "detail": "RQ4 claims practical advantages based on only 3 developers. No statistical inference is possible with N=3. The order is not counterbalanced (all use TDDev first, then Bolt.diy), introducing potential order effects."
    339     },
    340     {
    341       "flag": "Non-monotonic feedback results",
    342       "detail": "Table 9 shows that 1-2 rounds of feedback dramatically decrease accuracy (from 58.3% to 25.3%), while 3 rounds recover to 70.2%. This non-monotonic pattern on N=10 could indicate high variance and a cherry-picked iteration count rather than a robust finding."
    343     },
    344     {
    345       "flag": "Self-evaluation using own UI agent",
    346       "detail": "TDDev uses BrowserUse for both internal testing (as part of the framework) and external evaluation. The same tool is used to drive the system and to evaluate it, creating a potential alignment bias where the evaluation favors the system's own testing approach."
    347     },
    348     {
    349       "flag": "Cost comparison favors TDDev unfairly",
    350       "detail": "TDDev's total development time is longer (18.7 min vs. 15.2 min for Bolt.diy per Table 10), and 3 rounds of TDD add significant token cost. The paper emphasizes 'zero manual intervention' but the higher total time and compute cost are downplayed."
    351     },
    352     {
    353       "flag": "14.4% improvement claim unclear",
    354       "detail": "The abstract's headline claim of '14.4% improvement' does not cleanly match any single comparison in the results tables. The derivation of this specific figure is not explained in the paper."
    355     }
    356   ],
    357   "cited_papers": [
    358     {
    359       "title": "WebGen-Bench: Evaluating LLMs on Generating Interactive and Functional Websites from Scratch",
    360       "authors": ["Zimu Lu", "Yunqiao Yang", "Houxing Ren", "Haotian Hou", "Han Xiao", "Ke Wang", "Weikang Shi", "Aojun Zhou", "Mingjie Zhan", "Hongsheng Li"],
    361       "year": 2025,
    362       "arxiv_id": "2505.03733",
    363       "relevance": "Primary benchmark used for evaluation; empirical study on LLM agents' ability to generate full-stack websites."
    364     },
    365     {
    366       "title": "Demystifying llm-based software engineering agents",
    367       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    368       "year": 2025,
    369       "relevance": "Study on LLM-based SE agents, relevant to understanding agentic AI capabilities in software engineering."
    370     },
    371     {
    372       "title": "AutoCodeRover: Autonomous Program Improvement",
    373       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    374       "year": 2024,
    375       "arxiv_id": "2404.05427",
    376       "relevance": "Applies test cases for fault localization in autonomous program repair, related agentic approach to code improvement."
    377     },
    378     {
    379       "title": "Test-Driven Development and LLM-based Code Generation",
    380       "authors": ["Noble Saji Mathews", "Meiyappan Nagappan"],
    381       "year": 2024,
    382       "doi": "10.1145/3691620.3695527",
    383       "relevance": "Empirically demonstrates TDD principles benefit LLM-based code generation using human-written tests."
    384     },
    385     {
    386       "title": "Evaluating large language models trained on code",
    387       "authors": ["Mark Chen", "Jerry Tworek"],
    388       "year": 2021,
    389       "arxiv_id": "2107.03374",
    390       "relevance": "Codex paper highlighting limits of single-sample code generation, motivating iterative testing approaches."
    391     },
    392     {
    393       "title": "Design2Code: How Far Are We From Automating Front-End Engineering?",
    394       "authors": ["Chenglei Si", "Yanzhe Zhang", "Zhengyuan Yang", "Ruibo Liu", "Diyi Yang"],
    395       "year": 2024,
    396       "arxiv_id": "2403.03163",
    397       "relevance": "Benchmark and evaluation of MLLM capabilities for front-end code generation from screenshots."
    398     },
    399     {
    400       "title": "A Survey on Code Generation with LLM-based Agents",
    401       "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian", "Tian Wang", "Kechi Zhang", "Zhi Jin", "Ge Li"],
    402       "year": 2025,
    403       "arxiv_id": "2508.00083",
    404       "relevance": "Survey covering LLM-based agent approaches to code generation, directly relevant to the survey scope."
    405     },
    406     {
    407       "title": "Repairagent: An autonomous, llm-based agent for program repair",
    408       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    409       "year": 2024,
    410       "arxiv_id": "2403.17134",
    411       "relevance": "Autonomous LLM-based agent for program repair, related agentic software engineering approach."
    412     },
    413     {
    414       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    415       "authors": ["Joel Becker", "Nate Rush", "Beth Barnes", "David Rein"],
    416       "year": 2025,
    417       "arxiv_id": "2507.09089",
    418       "relevance": "Measures AI impact on developer productivity, relevant to understanding practical benefits of AI coding tools."
    419     },
    420     {
    421       "title": "Towards autonomous testing agents via conversational large language models",
    422       "authors": ["Robert Feldt", "Sungmin Kang", "Juyeon Yoon", "Shin Yoo"],
    423       "year": 2023,
    424       "relevance": "Early work on LLM-based autonomous testing agents, foundational to the testing agent design in this paper."
    425     },
    426     {
    427       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    428       "authors": ["Hongliang He", "Wenlin Yao", "Kaixin Ma", "Wenhao Yu"],
    429       "year": 2024,
    430       "relevance": "Web navigation agent using multimodal LLMs, used as comparison for UI agent evaluation accuracy."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs