scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26439B)
      1 {
      2   "paper": {
      3     "title": "ATLAS: Artifact Generation Through Layered Constraints and LLM × MDE Synergy",
      4     "authors": [
      5       "Tong Ma",
      6       "Hui Lai",
      7       "Hui Wang",
      8       "Zhenhu Tian",
      9       "Jizhou Wang",
     10       "Haichao Wu",
     11       "Yongfan Gao",
     12       "Chaochao Li",
     13       "Fengjie Xu",
     14       "Ling Fang"
     15     ],
     16     "year": 2025,
     17     "arxiv_id": "2510.25890"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, GitHub link, or archive is provided in the paper. The paper describes a complex multi-component framework (UMM, ICM, CVG, AGR) but does not release source code."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The evaluation datasets (60 AUTOSAR components for RQ1, 20 AUTOSAR systems for RQ2, 60 Brussels I bis cases for RQ3) are not released. No download links or data repositories are mentioned."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions using DeepSeek-R1-Distill-Qwen-32B served through vLLM and GPT-5 via API, but provides no requirements.txt, Dockerfile, or detailed environment setup section listing library versions."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology sections describe the framework architecture in detail but do not provide instructions for replicating the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results in Tables 2-7 report only point estimates (e.g., XSD pass rates, latency values, legal correctness scores). No confidence intervals or error bars are provided."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims ATLAS outperforms baselines across multiple metrics (e.g., Legal-Correct@1: 0.467 vs 0.300 vs 0.200) but provides no statistical significance tests to support these comparative claims."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports results with baseline context throughout. For example, RQ1: 0% vs 100% XSD pass rate; RQ3: Legal Correctness 0.200 (Baseline) vs 0.300 (RAG) vs 0.467 (ATLAS); Citation Precision 0.250 vs 0.428 vs 0.778. These provide enough context to assess magnitude."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for sample sizes: 60 AUTOSAR components (RQ1), 20 systems (RQ2), or 60 legal cases (RQ3). No power analysis or explanation of why these numbers are sufficient."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "RQ1 uses three fixed random seeds (42, 1001, 20250701) but does not report variance or standard deviation across seeds. RQ2 and RQ3 use single-shot generation with no repeated runs. No spread measures are reported anywhere."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ1 includes four pipeline configurations (vLLM, vLLM+RAG, vLLM+RAG+GBNF, vLLM+RAG+JSON Schema) plus an LLM-API baseline (GPT-5). RQ3 compares Baseline (LLM-only), RAG, and ATLAS."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include contemporary approaches: vLLM with RAG, grammar-constrained decoding (GBNF), JSON Schema constrained decoding, and GPT-5 as a frontier model baseline. These represent current state-of-the-art approaches in constrained LLM generation."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The pipeline configurations in RQ1 serve as an ablation study: vLLM (no constraints), +RAG (retrieval), +GBNF (grammar constraints), +JSON Schema (schema constraints). Each adds a component incrementally. RQ3 similarly ablates LLM-only vs RAG vs full ATLAS."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics are used across all RQs. RQ1: XSD pass rate, evidence coverage, latency, token counts. RQ2: file completeness, XSD pass rate, reference resolution rate, SHACL/SMT pass rates, expert review scores. RQ3: Legal Correctness, Citation Precision, Abstention Quality, Promotion Accuracy, Schema Compliance, Rule Satisfaction."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "RQ2 includes expert review across four dimensions (requirements alignment, architectural quality, engineering quality, toolchain integration) with structured 5-point scale scoring and qualitative findings. RQ3 legal cases were drafted and validated by a subject-matter expert."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "There is no mention of train/test split or held-out data. The 60 AUTOSAR components, 20 systems, and 60 legal cases are all used directly for evaluation without any separation of development and test sets."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "RQ2 provides breakdowns by complexity tier (Simple/Middle/Complex) in Tables 5 and 6. RQ1 provides breakdowns by prompt regime (Min/Std/Full) in Tables 2 and 3."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Failure cases are extensively discussed. RQ2 describes specific failure patterns: client-server role confusion in Middle-tier systems, broken SecOC authentication chains and inverted health monitoring patterns in Complex-tier systems (Section 4.4.2, Table 6). The 0% SMT pass rate is analyzed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports multiple negative results: 0% SMT pass rate across all tiers in RQ2, low SHACL pass rates (3.5%-14.2%), architectural quality degrading to 1/5 for Complex systems. The paper frames these honestly as demonstrating the limits of current LLMs."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims ATLAS produces 'structurally valid, auditable artifacts' and 'substantially reduce manual remediation effort, validating a graduated automation paradigm.' Results support structural validity (100% XSD pass rates) and the graduated automation claim is supported by the expert review findings showing semantic failures require human intervention."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims like 'Layer-1 enforcement reliably solves the syntax problem' backed by controlled ablation: comparing the same base model with and without constraints. The ablation design (adding one component at a time to the same vLLM backend) constitutes controlled single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper is careful about generalization boundaries. RQ3's legal evaluation is explicitly described as 'a feasibility probe testing architectural transfer, not claiming exhaustive legal coverage or deployment readiness.' The Limitations section states 'does not claim unsupervised certification' and identifies specific scalability limitations."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper discusses distribution distortion as an alternative explanation for constrained decoding behavior (Section 3.4), considers whether stronger base models alone could solve the problem (Section 4.4.3), and discusses the gap between reference existence and semantic correctness as a hallucination mode rather than simple structural failure."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "RQ1 specifies 'DeepSeek-R1-Distill-Qwen-32B served through vLLM' which is a specific model name/size. However, RQ2 and RQ3 say only 'GPT-5' without a version, snapshot date, or API version identifier. Marketing names without snapshot dates do not count as specified versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Prompts are described functionally (e.g., 'buildPrompt(s, ctx, S_c)' in Algorithm 2, 'BuildRepairPrompt' in Algorithm 4) but the actual prompt text is never provided. The paper describes prompt construction procedures without revealing the actual prompts used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Temperature and sampling parameters are specified: RQ1 uses temperature 0.7, top-p 0.9, fixed seeds (42, 1001, 20250701). RQ2 uses temperature 0.7 for Phase 1, 0.3 for Phase 2, 0.2 for interface files. Algorithm 2 uses T=0.3, Algorithm 3 uses T=0.2."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The agentic scaffolding is described in extensive detail across Sections 3.1-3.6: the UMM-ICM-CVG pipeline, dual-channel constraint extraction, prefix-safe automaton execution, two-layer enforcement, and audit-guided repair with two routes (automated and human-in-the-loop). Multiple algorithms (1-4) and figures (1-10) document the workflow."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.2 documents AUTOSAR domain preparation in detail: UMM construction from XSD/XMI (4.2.1), ICM extraction yielding 1,161 constraints with 90% entity linking precision and 1,045 anchored to UMM entities (4.2.2), and KG materialization with 1,000+ entities and 3,049 edges (4.2.3)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 (Conclusion and Future Work) contains an explicit 'Limitations' subsection listing four specific limitations: incremental L2 validation, system-level semantics at scale, formal-to-programmatic scaling, and the continued need for human review."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations are specific to this study: 'In RQ2, architectural drift and dangling cross-file references appear in more complex systems'; 'ATLAS can surface these violations and drive AGR, but does not yet guarantee global semantic invariants for arbitrarily dense dependency graphs'; 'scaling [SHACL and SMT] to thousands of routine business rules introduces significant maintenance overhead.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope boundaries: RQ3 is 'a feasibility probe testing architectural transfer, not claiming exhaustive legal coverage'; 'does not claim unsupervised certification. Certified deployment still requires domain experts and regulatory sign-off'; and 'direct structured-to-structured transformations that demand perfect consistency may not benefit from LLM-based generation.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (generated ARXML files, legal case inputs/outputs, audit trails, expert review forms) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data collection is described: RQ1 uses '60 representative AUTOSAR components under three prompt regimes.' RQ2 curates '20 AUTOSAR systems across three complexity tiers' with detailed descriptions of each tier. RQ3 uses '60 expert-authored Brussels I bis jurisdiction cases spanning the full hierarchy of bases of jurisdiction.'"
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are recruited. The expert review in RQ2 is part of the evaluation methodology but the experts are not research participants. The evaluation datasets are synthetic/curated."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The data pipeline is documented across Section 4.2 (AUTOSAR domain preparation), with specific counts at each stage: 1,161 constraints extracted, 1,045 anchored to UMM entities (90% precision), 794 SHACL shapes, 1,005 SMT assertions compiled. The generation pipeline is documented in Sections 4.3-4.5."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Acknowledgments section states: 'This work was supported in part by the National Natural Science Foundation of China (NSFC) under Grant 32427801 (National Major Scientific Research Instrument Development Project).'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All author affiliations are listed: University of Science and Technology of China, Anhui University, and Hefei Institutes of Physical Science, Chinese Academy of Sciences. No authors are affiliated with commercial LLM or AUTOSAR companies."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The funder is NSFC (National Natural Science Foundation of China), a government research funding agency with no financial stake in the outcomes of this framework evaluation."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses DeepSeek-R1-Distill-Qwen-32B and GPT-5 but does not state the training data cutoff date for either model. This is relevant because AUTOSAR specifications and Brussels I bis regulations could be in the training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether AUTOSAR specifications or Brussels I bis legal content appeared in the training data of the LLMs used. The models may have been trained on these publicly available standards."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "While the evaluation tasks are somewhat custom (specific AUTOSAR components and legal cases), the underlying domain knowledge (AUTOSAR specifications, Brussels I bis regulation) is publicly available and likely in training data. This potential contamination is not addressed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study. Expert review is part of the evaluation methodology, not a human subjects study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 2 reports detailed latency (seconds) and token counts (input/output) for all pipeline configurations across prompt regimes. RQ2 reports token counts and wall-clock time broken down by Phase 1 (blueprint) and Phase 2 (component assembly) across complexity tiers."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated. While per-run latency and token counts are reported, there is no information on hardware used (GPU type, memory), total API spend, or aggregate compute hours for the full evaluation."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Layer-1 constrained decoding achieves 100% XSD structural validity for single-file AUTOSAR generation, while unconstrained LLM generation achieves 0% (vLLM) or 50% (GPT-5).",
    296       "evidence": "Table 4 (Section 4.3.2) shows 0% for vLLM, 50% for LLM-API, and 100% for vLLM+RAG, vLLM+RAG+GBNF, and vLLM+RAG+JSON Schema.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "ATLAS achieves 100% XSD structural validity across all complexity tiers in multi-file system generation (20 systems, 284 files).",
    301       "evidence": "Table 5 (Section 4.4.2) shows 100% XSD pass rate and file completeness across Simple, Middle, and Complex tiers.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Despite 100% structural validity, semantic (SHACL) and logical (SMT) validation reveals severe failure modes at system scale, with 0% SMT pass rate across all tiers.",
    306       "evidence": "Section 4.4.2 reports SHACL pass rates of 3.5% (Simple), 14.2% (Middle), 9.5% (Complex), and 0% SMT pass rate system-wide.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "ATLAS improves Legal Correctness to 0.467 vs 0.300 (RAG) and 0.200 (Baseline) on 60 Brussels I bis jurisdiction cases.",
    311       "evidence": "Table 7 (Section 4.5.2) reports these values along with other metrics showing consistent ATLAS improvement.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "The dual-channel ICM extraction process yields 1,161 constraints from AUTOSAR specifications with 90% entity linking precision.",
    316       "evidence": "Section 4.2.2 states: 'yielded a total of 1,161 normative constraints. The entity linking mechanism successfully anchored 1,045 of these to verified UMM entities (90% precision).'",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "ATLAS validates a graduated automation paradigm: automates routine 80% structural construction while surfacing 20% semantic ambiguities for expert review.",
    321       "evidence": "Section 4.4.3: 100% structural validity (automated) combined with expert review findings showing semantic failures require moderate-to-extensive human repair in Middle/Complex tiers.",
    322       "supported": "moderate"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "benchmark-eval",
    327     "case-study"
    328   ],
    329   "key_findings": "ATLAS combines LLMs with model-driven engineering through layered constraint enforcement: Layer-1 prefix-safe automaton decoding achieves 100% structural validity for generated AUTOSAR artifacts (vs. 0-50% for unconstrained generation), while Layer-2 semantic/logic validation reveals that semantic correctness degrades sharply at system scale (0% SMT pass rate for multi-file systems). Cross-domain transfer to legal reasoning (Brussels I bis) improves legal correctness from 0.200 (baseline) to 0.467 with the full pipeline. The results validate a graduated automation paradigm where LLMs reliably handle structural construction but require human-in-the-loop repair for complex semantic violations.",
    330   "red_flags": [
    331     {
    332       "flag": "No code or data release",
    333       "detail": "A complex multi-component framework with multiple algorithms, yet no source code, generated artifacts, evaluation datasets, or audit trail data is released. This makes independent verification impossible."
    334     },
    335     {
    336       "flag": "No variance or significance testing",
    337       "detail": "RQ1 mentions three random seeds but does not report variance across them. RQ2 and RQ3 use single-shot generation. No statistical significance tests support the comparative claims in RQ3 (e.g., 0.467 vs 0.300 legal correctness on only 60 cases)."
    338     },
    339     {
    340       "flag": "Expert review details lacking",
    341       "detail": "The expert review in RQ2 is described as following a 'structured protocol', but the number of experts, their qualifications, inter-rater reliability, and the actual review instrument are not specified. It appears to be a single expert's assessment."
    342     },
    343     {
    344       "flag": "GPT-5 used without version specification",
    345       "detail": "RQ2 and RQ3 use 'GPT-5' as the model but provide no version identifier, snapshot date, or API version. Given that model behavior changes across versions, this undermines reproducibility."
    346     },
    347     {
    348       "flag": "Potential contamination of LLMs on AUTOSAR/legal standards",
    349       "detail": "Both AUTOSAR specifications and Brussels I bis regulations are publicly available and likely in the training data of DeepSeek and GPT-5. The paper does not discuss whether the models' familiarity with these standards could inflate or deflate results."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation",
    355       "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"],
    356       "year": 2024,
    357       "arxiv_id": "2403.06988",
    358       "relevance": "DOMINO constrained decoding algorithm directly used as baseline approach for grammar-constrained LLM generation."
    359     },
    360     {
    361       "title": "Generating Structured Outputs from Language Models: Benchmark and Studies",
    362       "authors": ["Saibo Geng", "Hudson Cooper", "Michał Moskal"],
    363       "year": 2025,
    364       "arxiv_id": "2501.10868",
    365       "relevance": "SchemaBench/JSONSchemaBench benchmarks that characterize failure modes in structured output generation from LLMs."
    366     },
    367     {
    368       "title": "Grammar-Aligned Decoding",
    369       "authors": ["Kanghee Park", "Jiayu Wang", "Taylor Berg-Kirkpatrick", "Nadia Polikarpova", "Loris D'Antoni"],
    370       "year": 2025,
    371       "relevance": "Grammar-aligned decoding addresses distribution distortion in constrained generation, a key challenge discussed in ATLAS."
    372     },
    373     {
    374       "title": "Reflexion: language agents with verbal reinforcement learning",
    375       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    376       "year": 2023,
    377       "relevance": "Verbal reinforcement learning paradigm for iterative LLM refinement, against which ATLAS's audit-guided repair is compared."
    378     },
    379     {
    380       "title": "Teaching Large Language Models to Self-Debug",
    381       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schaerli", "Denny Zhou"],
    382       "year": 2024,
    383       "relevance": "Self-debugging approach related to ATLAS's audit-guided repair loop for LLM-generated artifacts."
    384     },
    385     {
    386       "title": "Hallucination-Free? Assessing the Reliability of Leading AI Legal Research Tools",
    387       "authors": ["Varun Magesh", "Faiz Surani", "Matthew Dahl", "Mirac Suzgun", "Christopher D. Manning", "Daniel E. Ho"],
    388       "year": 2025,
    389       "relevance": "Directly motivates ATLAS's legal domain evaluation (RQ3) and the need for constraint enforcement to prevent hallucinations in legal AI."
    390     },
    391     {
    392       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    393       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    394       "year": 2024,
    395       "relevance": "Comprehensive survey of LLMs in software engineering that provides context for ATLAS's MDE+LLM integration approach."
    396     },
    397     {
    398       "title": "SpecGen: Automated Generation of Formal Program Specifications via Large Language Models",
    399       "authors": ["Lezhi Ma", "Shangqing Liu", "Yi Li", "Xiaofei Xie", "Lei Bu"],
    400       "year": 2025,
    401       "arxiv_id": "2401.08807",
    402       "relevance": "Related work on LLM-based formal specification generation with verification, similar to ATLAS's constraint-guided generation."
    403     },
    404     {
    405       "title": "A Dual-Stage Framework for Behavior-Enhanced Automated Code Generation in Industrial-Scale Meta-Models",
    406       "authors": ["Tong Ma", "Shenlong Dai", "Yongfan Gao", "Fengjie Xu", "Ling Fang"],
    407       "year": 2025,
    408       "doi": "10.1109/ACCESS.2025.3614174",
    409       "relevance": "S2D2 predecessor framework by the same first author; ATLAS extends its dual-path fusion to dynamic constraint enforcement for LLMs."
    410     },
    411     {
    412       "title": "No Need to Lift a Finger Anymore? Assessing the Quality of Code Generation by ChatGPT",
    413       "authors": ["Zhijie Liu", "Yutian Tang", "Xiapu Luo", "Yuming Zhou", "Liang Feng Zhang"],
    414       "year": 2024,
    415       "relevance": "Evaluation of LLM code generation quality relevant to understanding the baseline capabilities ATLAS extends."
    416     },
    417     {
    418       "title": "Code Repair with LLMs Gives an Exploration-Exploitation Tradeoff",
    419       "authors": ["Hao Tang", "Keya Hu", "Jin Peng Zhou"],
    420       "year": 2024,
    421       "arxiv_id": "2405.17503",
    422       "relevance": "LLM-based code repair approach related to ATLAS's audit-guided repair mechanism."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs