scan.json (29100B)
1 { 2 "paper": { 3 "title": "Multi-Agent Code-Orchestrated Generation for Reliable Infrastructure-as-Code", 4 "authors": [ 5 "Rana Nameer Hussain Khan", 6 "Dawood Wasif", 7 "Jin-Hee Cho", 8 "Ali Butt" 9 ], 10 "year": 2025, 11 "venue": "arXiv.org", 12 "arxiv_id": "2510.03902", 13 "doi": "10.48550/arXiv.2510.03902" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "MACOG, a multi-agent orchestration with typed intermediate representation, constrained decoding, and validator-guided repair, consistently outperforms few-shot, chain-of-thought, multi-turn, and RAG baselines across 10 models on IaC-Eval. GPT-5 improves from 54.90 (RAG) to 74.02 (MACOG) and Gemini-2.5 Pro from 43.56 to 60.13. Ablations show the DevOps sandbox (-17.09), Security Prover (-12.57), and constrained decoding (-9.13) are the most critical components. Chain-of-thought provides no systematic gain over few-shot in this domain.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Section 7 states 'All source code and evaluation scripts are archived on Zenodo at https://zenodo.org/records/17117489' and they reference the public IaC-Eval dataset at HuggingFace." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "They use the public IaC-Eval dataset (Section 7): 'We use the public IaC-Eval dataset, available at https://huggingface.co/datasets/autoiac-project/iac-eval.' Standard public benchmark used without modification." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or library version specifications anywhere in the paper." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions provided in the paper. 
The Zenodo archive is referenced but no README or reproduction guide is described." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 1-4 are point estimates only. The paper explicitly acknowledges this: 'we refrain from claiming statistical significance in the absence of per-item distributions in the tables' (Section 5.1). They mention performing internal bootstrapping but do not report CIs." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "Section 5.1 mentions 'in our internal runs we bootstrap task-level scores (1,000 resamples) to derive 95% confidence intervals and apply paired tests (randomization tests for BLEU/CodeBERTScore and Wilcoxon signed-rank for IaC-Eval)' but explicitly states they 'refrain from claiming statistical significance' in the paper. No test results are reported." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Absolute and relative improvements reported throughout: 'GPT-5 improves from 54.90 (RAG) to 74.02 (MACOG, +19.12)', 'relative improvement of roughly +35%', and per-ablation deltas in Section 5.2.3. Cohen's d mentioned as reported 'where appropriate' but actual values not shown." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification for why IaC-Eval's size is sufficient. No power analysis. The benchmark is adopted as-is without discussing whether the number of tasks provides adequate statistical power for the claimed comparisons." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No variance, standard deviation, or spread measure reported in any table. All results are single point estimates. The paper acknowledges doing bootstrapping internally but does not report variance in the published results." 
67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Five enhancement strategies compared: Few-shot, Chain-of-Thought, Multi-turn, RAG, and MACOG (Section 5.1). Each applied consistently across 10 models." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Models include GPT-5, GPT-4, Gemini-2.5 Pro, Gemini 2.0 Flash — contemporary frontier models. Enhancement strategies include RAG and multi-turn repair, which are current approaches." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 4 presents a comprehensive ablation removing each of the 7 major MACOG components individually (Provider Harmonizer, Engineer, Reviewer, Security Prover, Cost & Capacity Planner, DevOps, Memory Curator) and measuring impact across all 4 metrics." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Four metrics used: BLEU, CodeBERTScore, LLM-judge, and IaC-Eval (Section 5.1). Equations 18-22 define each metric formally." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "No human evaluation of generated IaC configurations. The LLM-judge metric uses a 'held-out judge' model for binary adequacy checks, but this is an automated LLM-based evaluation, not human evaluation." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "While no fine-tuning is performed, the paper does not discuss whether prompt engineering, system instructions, or orchestration logic were developed independently of the IaC-Eval test tasks. No held-out validation split is mentioned." 
99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": false, 103 "justification": "Section 5.1 mentions grouping 'tasks by coarse functional families (Networking, Compute, Storage, Identity and Access, Managed Services) and by approximate graph difficulty' but no per-category breakdown is shown in any table or figure. Only aggregate results reported." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.2.2 includes a 'qualitative audit of Gemini-2.5 Pro's baseline errors shows frequent security-group laxity and occasional missing encryption flags.' Section 5.2.1 discusses where strategies fail (CoT underperformance, RAG's schema drift issues)." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "CoT consistently underperforming or matching Few-shot is explicitly discussed (Section 5.2.1): 'CoT does not provide systematic gains over Few-shot in this domain; for many models the two are statistically similar or CoT is slightly worse.' Ablation failures also reported." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims ('GPT-5 improves from 54.90 (RAG) to 74.02', 'Gemini-2.5 Pro from 43.56 to 60.13', ablation drops to 64.89 and 56.93) are all directly supported by Tables 1 and 4. The claim 'MACOG is the top enhancement across models' is confirmed in Table 1." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims ('removing them drops IaC-Eval to 64.89 and 56.93') are supported by single-variable ablation in Table 4. The ablation design removes one component at a time while holding all else constant, which is an adequate causal identification strategy for component contributions." 
126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims 'Reliable Infrastructure-as-Code' broadly but evaluation is limited to Terraform on a single benchmark (IaC-Eval). No discussion of generalization to Pulumi, CloudFormation, Ansible, or other IaC tools mentioned in the introduction. The conclusion acknowledges 'broaden cross-provider coverage' as future work but the paper's claims are not bounded to Terraform." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No discussion of alternative explanations for the results. MACOG uses substantially more compute (multiple agent calls, validator executions, repair loops) than baselines. This compute confound is not discussed. No consideration of whether simpler interventions could achieve similar gains." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "IaC-Eval benchmark success is framed as 'deployable correctness' (Section 5.1) without discussing the gap between passing a benchmark harness and actual production deployment reliability. The paper does not acknowledge that IaC-Eval is a proxy for real-world infrastructure correctness." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models listed as 'GPT-5', 'GPT-4', 'Gemini-2.5 Pro', 'GPT-3.5-turbo' etc. — marketing names without specific version identifiers, snapshot dates, or API version strings. No model version pinning mentioned." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section 4.10 states 'The Architect and Engineer receive I-IR schemas and HCL grammars in system prompts and are steered by exemplars' but no actual prompt text is provided. 
Prompts are described functionally ('carefully engineered prompts, structured tool calling') but not reproduced." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 5.1 states 'nucleus sampling p ∈ [0.7, 0.9] per model family, temperature ∈ [0.2, 0.5], and a maximum output budget sufficient to emit a self-contained Terraform module.' Ranges given per model family rather than exact values, but hyperparameters are explicitly reported." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "Extensive scaffolding description across Sections 4.2-4.11: 8 agent roles described in detail, blackboard architecture, finite-state orchestrator (Eq. 14), I-IR typed representation, constrained decoding pipeline, counterexample-guided repair loop with Algorithm 1, and the Error-to-Edit mapping." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 5.1 describes using 'canonical task prompts provided by the benchmark' without modification. Task grouping by functional families described. The evaluation protocol is clearly documented." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "No dedicated limitations or threats-to-validity section. The conclusion mentions future work directions ('broaden cross-provider coverage', 'larger, more diverse corpora') but does not discuss limitations of the current evaluation or approach." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No threats to validity discussed anywhere in the paper. No consideration of specific limitations like single-benchmark evaluation, compute cost differences between strategies, or potential overfitting of the orchestration to IaC-Eval." 
180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "No explicit scope boundaries stated. The conclusion mentions future extensions ('extend beyond IaC-Eval', 'broaden cross-provider coverage') which implicitly acknowledge current limitations, but there is no explicit statement of what the results do NOT show." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Only aggregate results reported in Tables 1-4. No per-task results available. The Zenodo archive contains 'source code and evaluation scripts' but the paper does not state that raw per-task outputs or scores are included." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "IaC-Eval benchmark clearly cited and described (Section 5.1): 'a benchmark comprising natural-language infrastructure intents and associated verification procedures tailored to cloud provisioning.' Tasks grouped by functional families." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Uses a standard benchmark dataset (IaC-Eval)." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The evaluation pipeline is described: benchmark inputs are canonical task prompts, outputs are complete Terraform programs, acceptance is determined by the IaC-Eval harness. Section 5.1 documents inference configuration, orchestration controls, and metric computation." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding or acknowledgments section present in the paper. No mention of any funding source." 
214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All four authors listed with Virginia Tech affiliation. They evaluate commercial models (GPT-5, Gemini) but are not affiliated with those companies." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding disclosed, so independence cannot be assessed. Absence of funding disclosure prevents verification." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial disclosure statement present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates stated for any of the 10 models evaluated. Models like GPT-5 and Gemini-2.5 Pro are used without specifying when their training data was collected." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether IaC-Eval tasks or similar Terraform configurations appeared in the training data of any evaluated model." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "IaC-Eval was published in 2024 (NeurIPS). Models trained after 2024 (GPT-5, Gemini-2.5 Pro) may have seen its tasks. No contamination discussion provided." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. Pure benchmark evaluation." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference cost, API cost, latency, or per-task cost reported despite MACOG involving multiple LLM agent calls, validator executions, and repair loops per task." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total compute budget, GPU hours, or total API spend reported. The compute cost difference between strategies (Few-shot uses one call; MACOG uses many) is never quantified." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs producing the main results is never stated. Internal bootstrapping is mentioned but the primary run count is not specified." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No mention of how many configurations were tried for the orchestration parameters, prompt designs, or decoding settings. Temperature and top-p ranges are given but the search process is not documented." 
312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": false, 316 "justification": "No explanation of how the final configuration was selected. The paper presents results for a single MACOG configuration without discussing how it was chosen from possible alternatives." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "50 comparisons across 10 models × 5 strategies. No multiple comparison correction mentioned or applied. The paper explicitly states it refrains from claiming statistical significance." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement MACOG and all baseline strategies. No acknowledgment of the bias of evaluating their own system against their own implementations of competing approaches." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "MACOG uses multiple agent calls, constrained decoding, validator executions, and repair loops — substantially more compute than Few-shot or CoT. The paper claims 'each retry consumes identical compute budget for equity' only vs Multi-turn, not vs simpler baselines. No performance-per-compute analysis." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "IaC-Eval is adopted without any discussion of whether its tasks adequately represent real-world IaC challenges, whether its evaluation harness captures meaningful correctness, or how it compares to production IaC complexity." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": true, 341 "justification": "The paper holds the scaffold constant when comparing models (Table 1) and varies the scaffold systematically when comparing strategies. 
Section 5.1 states 'the same enhancement strategies and orchestration logic are applied to isolate the effect of the strategy rather than model-specific prompt engineering.'" 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of temporal leakage. IaC-Eval was published in 2024; models trained after 2024 may have seen its tasks or solutions. Not addressed." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup (e.g., RAG retrieval of similar tasks, prompt examples) leaks information that wouldn't be available in real usage." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether IaC-Eval tasks share structural similarities that could inflate performance estimates or whether independence between tasks holds." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method applied (no canary strings, membership inference, temporal splits, or decontamination)." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "MACOG is the top enhancement across models, with GPT-5 improving from 54.90 (RAG) to 74.02 and Gemini-2.5 Pro from 43.56 to 60.13 on IaC-Eval.", 370 "evidence": "Table 1 (Section 5.2.1) shows consistent MACOG > RAG > Multi-turn > CoT ≈ Few-shot ordering across all 10 models. Average uplift of MACOG over RAG is +7.3 absolute points (~35% relative).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Ablations show constrained decoding and deploy feedback are critical: removing them drops IaC-Eval to 64.89 and 56.93 respectively.", 375 "evidence": "Table 4 (Section 5.2.3) presents single-variable ablation on GPT-5. 
DevOps sandbox removal causes largest drop (-17.09), followed by Security Prover (-12.57) and constrained decoding (-9.13).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Chain-of-thought does not provide systematic gains over few-shot in the IaC domain.", 380 "evidence": "Table 1 shows CoT ≈ Few-shot or slightly worse for most models. Tables 2-3 show CoT underperforming Few-shot on IaC-Eval for both GPT-5 (10.19 vs 12.53) and Gemini-2.5 Pro (10.49 vs 12.18).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "MACOG produces proof-carrying bundles for offline verification including policy proofs, cost sheets, deploy logs, and compiler provenance.", 385 "evidence": "Section 4.9 describes the evidence bundle conceptually but no empirical validation of bundle completeness or usefulness is provided.", 386 "supported": "weak" 387 }, 388 { 389 "claim": "The MACOG ranking largely mirrors the base-model ranking, implying the orchestration is complementary to raw model capability rather than a substitute.", 390 "evidence": "Table 1 shows that the model ordering (GPT-5 > Gemini-2.5 Pro > GPT-4 > ...) is preserved across strategies including MACOG (Section 5.2.1).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "MACOG improves LLM-judge from 69.72 to 94.10 for GPT-5, indicating higher alignment with intent and best practices.", 395 "evidence": "Table 2 (Section 5.2.2). The large LLM-judge jump is attributed to the reviewer's interface checks and security prover's targeted patches.", 396 "supported": "weak" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No error bars or uncertainty quantification", 402 "detail": "All results in Tables 1-4 are point estimates. The authors acknowledge performing internal bootstrapping (1,000 resamples) but explicitly 'refrain from claiming statistical significance' and do not report any uncertainty measures. This makes it impossible to assess whether observed differences are meaningful." 
403 }, 404 { 405 "flag": "Compute cost confound not addressed", 406 "detail": "MACOG uses multiple LLM agent calls, constrained decoding, validator executions, and iterative repair loops per task — likely orders of magnitude more compute than Few-shot or CoT. The paper never quantifies this cost difference. Improvements may simply reflect more compute spent, not architectural innovation." 407 }, 408 { 409 "flag": "LLM-judge metric without validation", 410 "detail": "The LLM-judge metric uses an unnamed LLM to assess output adequacy. No inter-rater agreement with human judges is reported. No analysis of judge model bias. The 94.10% LLM-judge score for MACOG-GPT-5 is remarkably high and may reflect LLM-judge tendency to prefer structured, verbose outputs." 411 }, 412 { 413 "flag": "No limitations section", 414 "detail": "The paper has no limitations, threats to validity, or scope boundaries section. This is a significant omission for a systems paper making broad claims about 'reliable Infrastructure-as-Code.'" 415 }, 416 { 417 "flag": "Uniform strategy ordering across all models", 418 "detail": "Table 1 shows the same strategy ordering (Few-shot ≈ CoT < Multi-turn < RAG < MACOG) for every single model (10/10); only the Few-shot/CoT pair is not strictly ordered, with CoT matching or slightly trailing Few-shot. This clean ordering across such diverse model families and capability levels is unusual and warrants scrutiny." 419 }, 420 { 421 "flag": "Per-category breakdown promised but not delivered", 422 "detail": "Section 5.1 describes grouping tasks by functional families (Networking, Compute, Storage, etc.) and graph difficulty, but no per-category results are ever shown. Aggregate results could mask important variation." 
423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "Iac-eval: A code generation benchmark for cloud infrastructure-as-code programs", 428 "authors": ["Patrick T Kon", "Jiachen Liu", "Yiming Qiu", "Weijun Fan", "Ting He"], 429 "year": 2024, 430 "relevance": "Primary benchmark used for evaluation; demonstrates that LLMs solve only 19% of IaC tasks on first try." 431 }, 432 { 433 "title": "Chatdev: Communicative agents for software development", 434 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 435 "year": 2023, 436 "arxiv_id": "2307.07924", 437 "relevance": "Foundational multi-agent code generation framework with specialized roles that MACOG builds upon." 438 }, 439 { 440 "title": "Self-collaboration code generation via chatgpt", 441 "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"], 442 "year": 2024, 443 "relevance": "Demonstrates single-model multi-role code generation through iterative self-collaboration." 444 }, 445 { 446 "title": "Competition-level code generation with alphacode", 447 "authors": ["Yujia Li", "David Choi", "Junyoung Chung"], 448 "year": 2022, 449 "relevance": "Demonstrates coupling code generation with validation/filtering for competitive programming." 450 }, 451 { 452 "title": "Repairagent: An autonomous, llm-based agent for program repair", 453 "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"], 454 "year": 2024, 455 "arxiv_id": "2403.17134", 456 "relevance": "Autonomous LLM-based program repair agent with finite-state tool controller." 457 }, 458 { 459 "title": "Magis: Llm-based multi-agent framework for github issue resolution", 460 "authors": ["Wei Tao", "Yucheng Zhou", "Yanlin Wang"], 461 "year": 2024, 462 "relevance": "Multi-agent framework for automated GitHub issue resolution with coordinated planning and QA." 
463 }, 464 { 465 "title": "SynCode: LLM generation with grammar augmentation", 466 "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang"], 467 "year": 2024, 468 "relevance": "Grammar-constrained LLM decoding using precomputed DFA-based masks for syntactic validity." 469 }, 470 { 471 "title": "PICARD: Parsing incrementally for constrained auto-regressive decoding from language models", 472 "authors": ["Torsten Scholak", "Nathan Schucher", "Dzmitry Bahdanau"], 473 "year": 2021, 474 "arxiv_id": "2109.05093", 475 "relevance": "Constrained decoding via incremental parsing for structured code generation." 476 }, 477 { 478 "title": "Grammar-aligned decoding", 479 "authors": ["Kanghee Park", "Jiayu Wang", "Taylor Berg-Kirkpatrick"], 480 "year": 2024, 481 "relevance": "Addresses distributional bias from hard grammar constraints in LLM decoding." 482 }, 483 { 484 "title": "Code repair with llms gives an exploration-exploitation tradeoff", 485 "authors": ["Hao Tang", "Keya Hu", "Jin Zhou"], 486 "year": 2024, 487 "relevance": "Formalizes LLM code repair as exploration-exploitation, relevant to MACOG's repair loop." 488 }, 489 { 490 "title": "A Survey on Code Generation with LLM-based Agents", 491 "authors": ["Yihong Dong", "Xue Jiang", "Jiaru Qian"], 492 "year": 2025, 493 "arxiv_id": "2508.00083", 494 "relevance": "Recent survey on LLM-based agent code generation approaches." 495 } 496 ] 497 }