scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32200B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "Monitor-Guided Decoding of Code LMs with Static Analysis of Repository Context",
      6     "authors": [
      7       "Lakshya A Agrawal",
      8       "Aditya Kanade",
      9       "Navin Goyal",
     10       "Shuvendu K. Lahiri",
     11       "Sriram K. Rajamani"
     12     ],
     13     "year": 2023,
     14     "venue": "NeurIPS 2023",
     15     "arxiv_id": "2306.10763"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Code and data released at https://github.com/microsoft/monitors4codegen. The abstract states 'Our data and implementation are available at' this URL, and Section 1 adds 'We open source our implementation and provide an extensible Python library called multilspy.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "PRAGMATICCODE dataset is released at the same GitHub repository. Section 3 describes it as 'a publicly-released dataset of Java code repositories complete with their development environments and dependencies.'"
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Appendix B specifies Oracle JDK 17.0.6, Apache Ant 1.10.13, Apache Maven 3.8.7, Gradle 7.3.3, CodeQL CLI 2.12.5. Section 6 specifies hardware: AMD Epyc 24-core with 220GB RAM and A100 80GB, and Intel Xeon Platinum 8168 with 290GB RAM and V100 16GB."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The GitHub repository is provided with the multilspy library. The paper describes the full evaluation pipeline in Section 3 and Appendices B-C with sufficient detail (model configs, hyperparameters, dataset construction). The open-source library is designed for reproducibility."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Table 1 reports score@k values with relative improvements but no confidence intervals or error bars. Figures 2-3 show score@k curves without error bars. All results are point estimates."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes numerous comparative claims ('MGD outperforms X by Y%') based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported anywhere."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Table 1 reports relative improvements in parentheses for every comparison (e.g., '24.69%' for CG-350M-MGD over CG-350M on CR). Absolute baseline numbers are also provided, giving full context for the magnitude of improvements."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The dataset has 100 repos, 1420 methods, 10538 dereference prompts. The selection process is described but there is no justification for why these numbers are sufficient, no power analysis, and no discussion of whether the sample is large enough for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Results are computed via the score@k estimator (Appendix D) which aggregates across 6 samples, but no standard deviation, variance, or spread measure is reported. The paper presents single aggregate values with no indication of result stability."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Each model is compared with and without MGD. Multiple models serve as baselines: CodeGen-{350M, 2B, 6B}, SantaCoder, text-davinci-003. Different prompting strategies (Standard, classExprTypes, RLPG) and decoding strategies (autoregressive, FIM) also serve as comparisons."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include SantaCoder (2023), CodeGen (2023), and text-davinci-003 (2022), all contemporary at the time of writing. RLPG (Shrivastava et al., 2022) is a recent prompt augmentation technique."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Sections 4.1-4.3 systematically ablate components: MGD alone, prompt augmentation alone, FIM alone, and their combinations. Section 4.4 analyzes by identifier complexity. Section 5 ablates across languages and coding scenarios. The microbenchmark also tests individual vs joint monitors."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Four metrics are used: Compilation Rate (CR), Next Identifier Match (NIM), Identifier Sequence Match (ISM), and Prefix Match (PM), as defined in Section 3."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "All evaluation is fully automated: compilation checks, identifier matching, and prefix matching. No human evaluation of generated code quality is included. The Discussion acknowledges that 'additional steps such as testing and human inspection are needed to guarantee correctness.'"
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "PRAGMATICCODE uses repositories released after the training cutoff (March 31, 2022), creating a temporal held-out set. Since MGD requires no training, there is no risk of test-set leakage through model selection on the evaluation data."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by model size (Section 4.1), prompting strategy (Section 4.2), decoding strategy (Section 4.3), identifier complexity (Section 4.4/Appendix F), and coding scenario (Section 5/Appendix H). Table 1 provides per-configuration breakdowns."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 6 discusses limitations of static analysis (imprecise/incomplete results). The microbenchmark shows partial failures: SE2 generates an invalid dereference, VD1 generates wrong number of arguments. Section 2 notes 'If Aφ returns an empty set to start with, we abandon the current run.'"
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Appendix G reports an 83.16% mean inference slowdown with MGD. The microbenchmark reports incomplete solutions (SE2, VD1) that require joint monitoring to fix. Section 4.1 shows that smaller models with MGD still underperform larger models on method-level metrics (ISM, PM)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims: (1) MGD 'consistently improves compilation rates and agreement with ground truth' — supported by Table 1 showing relative CR improvements of roughly 19-25% across all models (from 18.52% for text-davinci-003 up to 24.69% for CodeGen-350M). (2) 'SantaCoder-1.1B achieves better compilation rate and next-identifier match than text-davinci-003' — supported by Figures 2a-b. Claims are appropriately hedged."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper claims 'MGD improves compilation rates' — a causal claim. The study design is adequate: same model with vs without MGD on the same data constitutes controlled single-variable manipulation. Sections 4.1-4.3 systematically isolate MGD's contribution from prompting and decoding strategies."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The main evaluation is explicitly on Java (PRAGMATICCODE). The generalizability study (Section 5) is framed as a 'microbenchmark' with only 10 examples, appropriately labeled as demonstrating feasibility rather than proving broad generalization. The title references 'Code LMs' broadly but the paper is clear that main results are Java-specific."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not consider alternative explanations for the improvements. For instance, it does not discuss whether the compilation rate improvement could be partially due to the monitor forcing shorter/simpler completions, or whether the identifier-level guidance has downstream effects that inflate other metrics."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The metrics (compilation rate, identifier match, prefix match) are well-aligned with the claimed contributions (type-consistent identifier generation). The paper does not overclaim — it measures type consistency and claims improvements in type consistency. Section 6 explicitly notes that 'additional steps such as testing and human inspection are needed to guarantee correctness.'"
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Models are specified with parameter counts and origins: CodeGen-{350M, 2B, 6B}-Multi (Nijkamp et al., 2023), SantaCoder-1.1B (Allal et al., 2023), text-davinci-003 (Ouyang et al., 2022). HuggingFace Transformers implementations are used for open models, Azure API for text-davinci-003."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Prompts are deterministically constructed from code context. Section 3 and Appendix C fully describe the construction: Standard (local file content up to dereference, left-truncated), classExprTypes (20% budget for type definitions), FIM format. Appendix I provides the exact CodeQL query for identifying target methods. The prompt IS the code — no natural language instructions are involved."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Appendix C reports: nucleus sampling with top-p=0.95, temperatures {0.2, 0.4, 0.6, 0.8}, prompt budget 1536 tokens, generation budget 512 tokens, context window 2048 tokens. Budget allocation for classExprTypes (20%) and FIM suffix (50%) also specified."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. MGD is a decoding-time intervention that reshapes logits using static analysis results — there are no agents, tool use, retry logic, or feedback loops."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Appendix B documents the full pipeline: GitHub API query (March 25, 2023) for top 1000 starred Java repos created after March 31, 2022 → 731 downloaded → filtered for permissive licenses and successful CodeQL builds → 302 repos → 100 after JDT.LS initialization. Method selection uses CodeQL query (Appendix I) with complexity filters."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6 'Discussion' contains a dedicated 'Limitations' subsection discussing that static analysis of partial programs is difficult, analyses can be imprecise and incomplete, and functional correctness is beyond scope."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The limitations discuss threats specific to this work: 'analyzing partial and incomplete programs is still a difficult problem,' static analysis heuristics 'can be both imprecise (they can give incorrect suggestions) and incomplete (they can leave out correct suggestions),' and 'Satisfying functional-correctness specifications like pre/post-conditions and invariants is beyond the scope.'"
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 6 states: 'Satisfying functional-correctness specifications like pre/post-conditions and invariants is beyond the scope of this work.' Section 8 (Conclusions) states plans to 'expand the scope of MGD to more languages and deeper semantic analyses,' implicitly bounding current scope to type-consistency monitoring."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "PRAGMATICCODE dataset is released at the GitHub repo (https://github.com/microsoft/monitors4codegen), including 100 repositories with their development environments. Raw data (repos, CodeQL databases, DOTPROMPTS) is available for verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Appendix B describes the full collection procedure: GitHub API query on March 25, 2023, for top 1000 starred Java repos created after March 31, 2022. Build environment details, filtering criteria, and the CodeQL-based method identification process are all documented."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. The data source is public GitHub repositories with a well-defined selection process described in Appendix B."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Full pipeline with counts: 1000 queried → 731 downloaded → 302 after build/license filtering → 100 after JDT.LS initialization. From these: 1420 methods identified via CodeQL, up to 20 per repo, up to 10 dereference locations per method → 10538 DOTPROMPTS instances (Appendix B)."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No explicit funding statement or acknowledgments section listing funding sources. The authors are all from Microsoft Research, implying corporate funding, but this is not explicitly disclosed."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All five authors clearly list their Microsoft Research affiliations with locations (Bangalore, India and Redmond, United States) in the author block."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "All authors are from Microsoft Research. Microsoft has a direct commercial interest in code generation tools (GitHub Copilot). The research demonstrating that small models with MGD can outperform large models directly benefits Microsoft's product strategy. The funder is not independent of the outcome."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper. Microsoft has commercial interests in code generation, and some authors may have related patents or equity, but no disclosure is provided."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section 3 states 'the determined training dataset cutoff date (31 March 2022) of the models which we use to evaluate MGD.' Appendix B reiterates this cutoff."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "The paper explicitly addresses this: 'We ensure that these repositories were released publicly only after the determined training dataset cutoff date (31 March 2022) for the CodeGen, SantaCoder, and text-davinci-003 family of models' (Appendix B). This temporal filtering prevents train-test overlap."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "PRAGMATICCODE was created from repos released after the training cutoff and the benchmark itself was constructed by the authors in 2023. No prior benchmark that could appear in training data is used. The contamination issue is addressed by construction."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study. All evaluation is automated on the DOTPROMPTS benchmark."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study evaluates language models on code repositories."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Appendix G reports inference time comparison: CG-6B mean 22.57s vs CG-6B-MGD mean 41.34s, with 83.16% mean slowdown. Table 2 provides full statistics (mean, std, min, quartiles, max)."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Section 6 describes hardware configurations: (1) AMD Epyc 24-core with 220GB RAM and Nvidia A100 80GB, (2) Intel Xeon Platinum 8168 with 290GB RAM and Nvidia Tesla V100 16GB. Azure API used for text-davinci-003. The paper explicitly states 'Our experiments do not involve any training, and we only perform inferences.'"
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Results are computed from 6 samples with different temperatures (0.2, 0.4, 0.6, 0.8) but no seed sensitivity analysis is reported. No standard deviation across seeds or seed-specific results are provided."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Section 3 clearly states: 'we use nucleus sampling with a top-p value of 0.95 to generate n = 6 independent samples' with specified temperatures. Appendix C adds: '1 each with temperature 0.2 and 0.4, and 2 each with temperature 0.6 and 0.8.'"
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Hyperparameters (temperatures, top-p, prompt budget allocations) are stated but no search budget is reported. No explanation of how these specific values were selected or how many configurations were tried."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "All model configurations and their results are reported in Table 1. No cherry-picking — results are shown for every combination of model, prompting strategy, and MGD status. The paper does not select a single 'best' configuration to highlight."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No statistical significance tests are performed at all. Since no hypothesis tests are conducted, multiple comparison correction is not applicable."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors implement MGD and evaluate it against base models. While they use original model implementations and RLPG's released code (reducing re-implementation bias), they do not acknowledge or discuss self-comparison bias. The DOTPROMPTS benchmark was also designed by the authors."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Appendix G reports the 83.16% inference slowdown separately. The paper does not analyze performance as a function of compute budget, nor compare MGD vs non-MGD at matched compute budgets (e.g., could the extra time be spent generating more samples instead?)."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "DOTPROMPTS evaluates type-consistent dereferences in Java. The paper does not discuss whether dereference-location code completion is a valid proxy for real-world code generation quality, or whether focusing exclusively on identifier correctness at dereference points captures meaningful code generation ability."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No scaffolding is involved. MGD is a decoding-time logit masking approach, not an agentic scaffold."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Repos in PRAGMATICCODE were released after March 31, 2022, the training cutoff for all evaluated models. Appendix B: 'We ensure that these repositories were released publicly only after the determined training dataset cutoff date.'"
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "For classExprTypes, target methods are masked out to prevent leakage: 'identify the type of all expressions occurring in C (after masking out the target method to prevent leakage)' (Section 3). The prompt construction avoids including ground truth in the input."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Multiple methods are sampled from the same repository (up to 20 per repo), and multiple dereference points per method (up to 10). The paper does not discuss whether these create non-independent test examples that could inflate result counts."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "The temporal split (repos created after training cutoff) is a concrete leakage prevention method. Additionally, the paper caps methods per repo at 20 and masks target methods in prompt augmentation. These are proactive prevention measures."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "MGD consistently improves compilation rates across model sizes, with relative improvements of 21.77%-24.69% for the CodeGen and SantaCoder models and 18.52% for text-davinci-003.",
    368       "evidence": "Table 1 shows CR improvements: CG-350M 52.43→65.37 (24.69%), CG-2B 57.01→70.91 (24.38%), CG-6B 58.64→72.28 (23.25%), SC 59.97→73.03 (21.77%), TD-3 62.66→74.26 (18.52%).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "SantaCoder-1.1B with MGD outperforms the much larger text-davinci-003 (175B) on compilation rate and next-identifier match.",
    373       "evidence": "Figure 2a shows SC-MGD achieving ~73% CR vs TD-3's ~63% at score@6. Figure 2b shows SC-MGD achieving ~88.4% NIM vs TD-3's ~86.2% at score@6 (Section 4.1).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "MGD is complementary to prompt augmentation — combining both yields further improvements over either alone.",
    378       "evidence": "Table 1 and Section 4.2: SC-RLPG-MGD achieves 78.14% CR vs SC-RLPG 66.39% and SC-MGD 73.03%. Similar additive improvements across all metrics.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "MGD generalizes to multiple programming languages (Java, C#, Rust), coding scenarios, and richer semantic properties (typestates, session types).",
    383       "evidence": "Section 5 and Appendix H: MGDMICROBENCH with 10 examples spanning 3 languages, 4 coding scenarios, typestate and session type analyses. SC-MGD generates correct code in all scenarios.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "MGD improves identifier prediction most dramatically for complex (multi-subtoken) identifiers, with 21-28% relative improvement.",
    388       "evidence": "Appendix F and Section 4.4: For identifier complexity [4,18), CG-350M-MGD achieves parity with TD-3 and outperforms CG-6B by 11.95%. SC-MGD outperforms TD-3 by 11.53%.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "MGD introduces an 83% mean inference slowdown.",
    393       "evidence": "Appendix G, Table 2: CG-6B mean inference 22.57s vs CG-6B-MGD 41.34s across 161 matched instances with 500 prompts.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": ["benchmark-eval"],
    398   "key_findings": "Monitor-Guided Decoding (MGD) uses static analysis through the Language Server Protocol to reshape LLM logits at decode time, enforcing type-consistent identifier generation. On the PRAGMATICCODE Java benchmark (100 repos, 10538 prompts), MGD improves compilation rates by a relative 21-25% for the CodeGen and SantaCoder models (about 19% for text-davinci-003) and enables the 1.1B-parameter SantaCoder to outperform the 175B-parameter text-davinci-003 on compilation rate and identifier match. MGD is complementary to prompt augmentation and fill-in-the-middle decoding, with gains stacking. The approach generalizes to Java, C#, and Rust on a small microbenchmark, and can enforce richer semantic constraints like typestate protocols and session types.",
    399   "red_flags": [
    400     {
    401       "flag": "No statistical significance testing",
    402       "detail": "All comparative claims ('X outperforms Y by Z%') are based on point estimates without any significance tests. With 10538 data points, many comparisons, and no uncertainty quantification, it is unclear which improvements are statistically meaningful."
    403     },
    404     {
    405       "flag": "No variance or error bars reported",
    406       "detail": "Despite generating 6 samples per prompt, no standard deviation, confidence intervals, or error bars are reported on any metric. The score@k estimator aggregates results but the reader cannot assess result stability."
    407     },
    408     {
    409       "flag": "Tiny generalizability study",
    410       "detail": "The generalizability to C#, Rust, and richer semantic properties is demonstrated on only 10 hand-crafted micro-benchmark examples (MGDMICROBENCH). This is insufficient to support generalization claims beyond proof-of-concept."
    411     },
    412     {
    413       "flag": "Inference slowdown buried in appendix",
    414       "detail": "An 83% mean inference slowdown (Table 2, Appendix G) is a significant practical limitation but is not discussed in the main paper. The slowdown is only reported for CG-6B; other model configurations are not benchmarked for latency."
    415     },
    416     {
    417       "flag": "Corporate conflict of interest undisclosed",
    418       "detail": "All authors are from Microsoft Research, which has direct commercial interest in code generation tools (GitHub Copilot). The paper demonstrates that small models with static analysis can rival large models — directly relevant to Microsoft's product strategy. No conflict of interest statement is provided."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Evaluating large language models trained on code",
    424       "authors": ["Mark Chen", "Jerry Tworek"],
    425       "year": 2021,
    426       "arxiv_id": "2107.03374",
    427       "relevance": "Introduces Codex and HumanEval benchmark, foundational work on LLM code generation evaluation."
    428     },
    429     {
    430       "title": "SantaCoder: don't reach for the stars!",
    431       "authors": ["Loubna Ben Allal", "Raymond Li"],
    432       "year": 2023,
    433       "arxiv_id": "2301.03988",
    434       "relevance": "1.1B parameter code model used as primary evaluation target in this paper, demonstrates small model code generation."
    435     },
    436     {
    437       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    438       "authors": ["Erik Nijkamp", "Bo Pang"],
    439       "year": 2023,
    440       "relevance": "Open code generation model family (350M-6B) used as baselines for evaluating MGD across parameter scales."
    441     },
    442     {
    443       "title": "Repository-level prompt generation for large language models of code",
    444       "authors": ["Disha Shrivastava", "Hugo Larochelle", "Daniel Tarlow"],
    445       "year": 2022,
    446       "arxiv_id": "2206.12839",
    447       "relevance": "RLPG prompt augmentation technique used as a baseline and shown to be complementary with MGD."
    448     },
    449     {
    450       "title": "Competition-level code generation with AlphaCode",
    451       "authors": ["Yujia Li", "David Choi"],
    452       "year": 2022,
    453       "doi": "10.1126/science.abq1158",
    454       "relevance": "Large-scale code generation system, represents encoder-decoder approach to code generation."
    455     },
    456     {
    457       "title": "CoCoMIC: Code Completion By Jointly Modeling In-file and Cross-file Context",
    458       "authors": ["Yangruibo Ding", "Zijian Wang"],
    459       "year": 2022,
    460       "arxiv_id": "2212.10007",
    461       "relevance": "Cross-file context for code completion using architecture modifications and additional training — contrasts with MGD's training-free approach."
    462     },
    463     {
    464       "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding from Language Models",
    465       "authors": ["Torsten Scholak", "Nathan Schucher", "Dzmitry Bahdanau"],
    466       "year": 2021,
    467       "relevance": "Constrained decoding approach for SQL that is conceptually similar to MGD, using incremental parsing for validity."
    468     },
    469     {
    470       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    471       "authors": ["Gabriel Poesia", "Alex Polozov"],
    472       "year": 2022,
    473       "relevance": "Constrained decoding for code generation on SQL and DSLs; closely related approach using semantic checks during decoding."
    474     },
    475     {
    476       "title": "StarCoder: may the source be with you!",
    477       "authors": ["Raymond Li", "Loubna Ben Allal"],
    478       "year": 2023,
    479       "arxiv_id": "2305.06161",
    480       "relevance": "Large-scale open code generation model, successor to SantaCoder, represents state of the art in open code LMs."
    481     },
    482     {
    483       "title": "Better context makes better code language models: A case study on function call argument completion",
    484       "authors": ["Hengzhi Pei", "Jinman Zhao"],
    485       "year": 2023,
    486       "relevance": "Uses static analysis context for function call argument completion; related approach that requires finetuning unlike MGD."
    487     },
    488     {
    489       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    490       "authors": ["Fengji Zhang", "Bei Chen"],
    491       "year": 2023,
    492       "arxiv_id": "2303.12570",
    493       "relevance": "Repository-level code completion using retrieval, complementary approach that modifies input rather than output."
    494     },
    495     {
    496       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    497       "authors": ["Hao Yu", "Bo Shen"],
    498       "year": 2023,
    499       "arxiv_id": "2302.00288",
    500       "relevance": "Pragmatic code generation benchmark evaluating repository-level context, motivates PRAGMATICCODE dataset creation."
    501     },
    502     {
    503       "title": "InCoder: A generative model for code infilling and synthesis",
    504       "authors": ["Daniel Fried", "Armen Aghajanyan"],
    505       "year": 2022,
    506       "arxiv_id": "2204.05999",
    507       "relevance": "Code infilling model supporting fill-in-the-middle, relevant to FIM decoding strategy evaluated with MGD."
    508     }
    509   ]
    510 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs