scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19172B)
      1 {
      2   "paper": {
      3     "title": "Constrained Decoding for Fill-in-the-Middle Code Language Models via Efficient Left and Right Quotienting of Context-Sensitive Grammars",
      4     "authors": ["Daniel Melcer", "Nathan Fulton", "Sanjay Krishna Gouda", "Haifeng Qian"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2402.17988"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "Footnote 5 states 'Link omitted for review; see supplemental material.' No working URL is provided in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses the publicly available The Stack dataset ('the-stack-smol-xl') and states 'We include the code and random seeds necessary to exactly reproduce both datasets in our supplemental material' (Section VII-A)."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions implementation is 'largely in Python, with selected subroutines written in Rust' but gives no version details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While random seeds and dataset construction details are described, there are no step-by-step reproduction instructions, README, or scripts to replicate the experiments. The code link is omitted."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Tables I and II are reported as raw counts with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims constrained generation 'performs significantly better' but provides no statistical significance tests — only raw count comparisons."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Tables I and II provide absolute counts and the full confusion matrix, allowing effect sizes to be computed (e.g., constrained succeeds on 90665/95390 vs unconstrained on 65353/95390 for STACK-BOUNDARY)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample sizes (95390 experiments from 9539 files, 10 per file) are described but not justified via power analysis or other rationale."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run experiments with greedy sampling."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against unconstrained generation and checked unconstrained generation as baselines (Tables I and II)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not compare against other constrained decoding methods for code (e.g., Synchromesh, grammar-aligned decoding [31], or Outlines [11/18]). Only unconstrained generation baselines are used."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is provided to isolate the contribution of individual components (e.g., lexer branching, indentation handling, parentheses handling)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The only metric is syntactic correctness (whether ast.parse succeeds). No functional correctness, code quality, or other metrics are reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated code quality is included. Evaluation is entirely automated via ast.parse."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
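     88         "justification": "The evaluation uses a subset of The Stack ('the-stack-smol-xl'), and both evaluation datasets are constructed from this subset. Note that SantaCoder was also trained on The Stack, and the paper does not establish that this subset is disjoint from the model's training data (see the contamination items)."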
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by dataset (STACK-BOUNDARY vs STACK-RANDSPAN) and failure cases are categorized (29 parser issues vs 4696 model failures in Section VII-C1)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section VII-C1 provides detailed failure analysis distinguishing between parser-related failures (29 cases) and model failures to connect to right context (4696 cases)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports 4725 failure cases for STACK-BOUNDARY and discusses the soundness-completeness tradeoff (Section VI-B) and cases where the method accepts invalid programs (Figure 10)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims constrained generation 'can significantly reduce the incidence of syntax errors,' which is supported by Tables I and II showing large reductions (e.g., from 30037 unconstrained failures to 4725 constrained failures out of 95390 on STACK-BOUNDARY)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The causal claim is that constrained decoding reduces syntax errors. The experimental design (same model, same inputs, constrained vs unconstrained) is a controlled comparison adequate for this claim."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds claims to Python 3 and SantaCoder, describing the implementation as a 'proof-of-concept' and noting it evaluates 'the particularly difficult case of FIM completion for Python 3.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section VII-C discusses why STACK-RANDSPAN performs better (less text removed despite harder contexts), and the failure analysis considers multiple causes for failures (parser limitations vs model limitations)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'SantaCoder' [30] which is a specific, versioned open-source model with a clear reference."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The FIM prompt format is fully specified in Section I (FIM-PREFIX, FIM-SUFFIX, FIM-MIDDLE tokens with exact concatenation order)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section VII-B reports greedy sampling, 500 token limit, and top-50 candidate fallback strategy."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a single-pass constrained decoding system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section VII-A documents preprocessing: 459 files excluded for ast.parse errors, 2 files excluded for unimplemented features, leaving 9539 files. Dataset construction procedures are detailed for both STACK-BOUNDARY and STACK-RANDSPAN."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VIII (Future Work) and Section VI-B (Completeness-Soundness-Complexity Tradeoff) substantively discuss limitations. Section VII-D acknowledges the prototype nature of the implementation."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The paper discusses specific threats: the parser is not sound (Figure 10 shows concrete invalid programs accepted), implementation is a research prototype with Python performance limitations, and Python's grammar is specified as PEG so exact CFG conversion may be impossible (footnote 6)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope boundaries: 'more complete code generation systems, and evaluations for systems that include metrics of context escape, are out of scope for this paper' (Section VIII-A). It also notes this is a proof-of-concept for Python 3 only."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The base dataset (The Stack) is publicly available, and the paper states random seeds for dataset construction are included in supplemental material."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VII-A describes the data source (the-stack-smol-xl, 10000 Python files from GitHub), filtering criteria, and how both synthetic datasets were constructed."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is from a standard public benchmark (The Stack)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: 10000 files → exclude 459 parse errors → exclude 2 unimplemented features → 9539 files → 10 experiments per file → 95390 experiments. Both dataset construction methods are specified."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Northeastern University, MIT-IBM AI Lab, and AWS AI Labs. Two authors are from AWS, which produces Amazon Q Developer (mentioned in the introduction)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Two authors are from AWS AI Labs, and the paper's techniques are directly relevant to Amazon Q Developer (cited in the introduction). AWS has a financial interest in improved code completion."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state SantaCoder's training data cutoff date."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the-stack-smol-xl files overlap with SantaCoder's training data, despite both being sourced from The Stack."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "SantaCoder was trained on The Stack, and the evaluation uses a subset of The Stack. This potential contamination is not addressed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section VII-D reports per-token overhead timing (with regression equations) and one-time overhead for constrained generation, comparing against checked unconstrained generation."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total compute budget, GPU hours, or hardware specifications are stated for running the experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Constrained generation significantly reduces syntax errors compared to unconstrained generation for FIM tasks.",
    286       "evidence": "Table I: constrained succeeds on 90665/95390 (95.0%) vs unconstrained on 65353/95390 (68.5%) for STACK-BOUNDARY. Table II: 92085/95390 (96.5%) vs 68590/95390 (71.9%) for STACK-RANDSPAN.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The constrained generation method has near-constant per-token overhead independent of context size.",
    291       "evidence": "Figure 11 (top) shows R² = 4.22×10⁻³ for constrained generation overhead vs context size, compared to R² = 0.544 for checked unconstrained generation.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The method handles context-sensitive features of real programming languages including whitespace sensitivity and leftmost-longest lexing.",
    296       "evidence": "Sections V and VI detail the algorithms. The evaluation on Python 3 (which has whitespace sensitivity) demonstrates practical handling, though Section VI-B acknowledges soundness tradeoffs.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Checked unconstrained generation fixes some cases but constrained generation still outperforms it.",
    301       "evidence": "Table I: checked unconstrained recovers 5110 additional cases beyond unconstrained, but constrained succeeds on 490 cases where checked unconstrained fails.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "The paper extends the Earley parsing algorithm to support left and right quotienting for context-sensitive grammars, enabling constrained fill-in-the-middle code generation. On two synthetic Python datasets derived from The Stack, constrained generation with SantaCoder achieves 95-96.5% syntactic correctness compared to 68.5-71.9% for unconstrained generation. The method introduces near-constant per-token overhead independent of context size, unlike checked unconstrained generation which scales linearly. Most failures (4696/4725 on STACK-BOUNDARY) are due to the model failing to connect to the right context rather than parser limitations.",
    307   "red_flags": [
    308     {
    309       "flag": "Potential train-test contamination",
    310       "detail": "SantaCoder was trained on The Stack, and the evaluation uses a subset of The Stack (the-stack-smol-xl). The paper does not discuss whether evaluation files appeared in training data."
    311     },
    312     {
    313       "flag": "No comparison with other constrained decoding methods",
    314       "detail": "The paper only compares against unconstrained baselines. Other constrained decoding approaches for code (grammar-aligned decoding, Outlines/Synchromesh) are not compared against, making it unclear if the improvement comes from the quotienting approach specifically or constrained decoding generally."
    315     },
    316     {
    317       "flag": "AWS conflict of interest undisclosed",
    318       "detail": "Two authors are from AWS AI Labs, and the paper's techniques are directly applicable to Amazon Q Developer (cited in the introduction). No conflict of interest statement is provided."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "StarCoder: May the source be with you!",
    324       "authors": ["R. Li", "L. B. Allal"],
    325       "year": 2023,
    326       "relevance": "Major open-source code LLM used widely in code generation benchmarks."
    327     },
    328     {
    329       "title": "Evaluating Large Language Models Trained on Code",
    330       "authors": ["M. Chen", "J. Tworek"],
    331       "year": 2021,
    332       "relevance": "Introduced Codex and HumanEval benchmark, foundational to LLM code generation evaluation."
    333     },
    334     {
    335       "title": "Code Llama: Open Foundation Models for Code",
    336       "authors": ["B. Rozière", "J. Gehring"],
    337       "year": 2024,
    338       "arxiv_id": "2308.12950",
    339       "relevance": "Major open-source code LLM family with FIM capabilities."
    340     },
    341     {
    342       "title": "Efficient Guided Generation for Large Language Models",
    343       "authors": ["B. T. Willard", "R. Louf"],
    344       "year": 2023,
    345       "relevance": "Key prior work on constrained decoding for LLMs using grammar-based sampling."
    346     },
    347     {
    348       "title": "Syntax-Aware On-the-Fly Code Completion",
    349       "authors": ["W. Takerngsaksiri", "C. Tantithamthavorn", "Y.-F. Li"],
    350       "year": 2023,
    351       "relevance": "Prior work on syntax-aware constrained code completion."
    352     },
    353     {
    354       "title": "SantaCoder: Don't reach for the stars!",
    355       "authors": ["L. B. Allal", "R. Li"],
    356       "year": 2023,
    357       "relevance": "The code LLM used in this paper's experiments for FIM evaluation."
    358     },
    359     {
    360       "title": "Grammar-aligned decoding",
    361       "authors": ["K. Park", "J. Wang", "T. Berg-Kirkpatrick", "N. Polikarpova", "L. D'Antoni"],
    362       "year": 2024,
    363       "relevance": "Complementary constrained decoding technique that could be combined with this work."
    364     },
    365     {
    366       "title": "Efficient Training of Language Models to Fill in the Middle",
    367       "authors": ["M. Bavarian", "H. Jun"],
    368       "year": 2022,
    369       "relevance": "Foundational work on FIM training for code LLMs, defining the FIM task format used in this paper."
    370     },
    371     {
    372       "title": "Deepseek-coder-v2: Breaking the barrier of closed-source models in code intelligence",
    373       "year": 2024,
    374       "arxiv_id": "2406.11931",
    375       "relevance": "Major code LLM relevant to code generation capability evaluation."
    376     }
    377   ]
    378 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs