scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17298B)
      1 {
      2   "paper": {
      3     "title": "Compilable Neural Code Generation with Compiler Feedback",
      4     "authors": ["Xin Wang", "Yasheng Wang", "Yao Wan", "Fei Mi", "Yitong Li", "Pingyi Zhou", "Jin Liu", "Hao Wu", "Xin Jiang", "Qun Liu"],
      5     "year": 2022,
      6     "venue": "ACL 2022 (Findings)",
      7     "arxiv_id": "2203.05132"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets: CodeSearchNet-Python and AdvTest from CodeXGLUE. The filtering criteria to extract subsets are described."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions Python 3 and NVIDIA Tesla V100 GPUs but does not provide a requirements.txt, Dockerfile, or detailed dependency list."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as single point estimates without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims COMPCODER outperforms baselines but provides no statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Improvements are reported with baseline context, e.g., 'improving the success rate of compilation from 44.18 to 89.18' and absolute point differences are stated."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the choice of 50k/45k/5k split for code completion or 41k/40k/1k for text-to-code."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or multiple-run results are reported. All numbers appear to be single-run."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are compared: BiLSTM, Transformer, GPT-2, CodeGPT, PLBART, CodeT5 (Tables 1 and 2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "CodeGPT, PLBART, and CodeT5 were all recent state-of-the-art models at the time of writing (2021-2022)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 3 presents an ablation study examining RL, Dtrain, and Dtest components individually and in combination."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two metrics are used: Edit Similarity (ES) and Compilation Rate (CR)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is conducted. All evaluation is automated via compiler checks and edit similarity."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Separate test sets are used: 5k for code completion, 1k for text-to-code generation (Section 4.1)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Code completion results are broken down by token count (25, 30, 35, 40, 45 tokens) in Table 1 and Figure 4."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 3 shows failure cases where candidates fail to compile, and Figure 5 shows case studies comparing CodeGPT failures with COMPCODER successes."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results or failed approaches are reported. Every experiment shows improvement."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of improving compilation rate from 44.18 to 89.18 (code completion) and 70.3 to 96.2 (text-to-code) are supported by Tables 1 and 2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by the ablation study (Table 3), which uses controlled single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Neural Code Generation' broadly but experiments are only on Python. No explicit statement bounding generalization to Python."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the results. The weak constraint issue (whitespace strings compiling) is noted but no broader alternatives are considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The specific CodeGPT checkpoint is identified via HuggingFace URL: 'microsoft/CodeGPT-small-py-adaptedGPT2' (Section 4.4)."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper fine-tunes models rather than using prompting. No prompts are used."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.4 reports learning rate (1.5e-5), batch size (32/16), max epochs (20), sequence lengths, beam size (5), and RL data sampling (5%)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a training pipeline, not an agent system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 describes filtering: extracting compilable Python 3 methods with token lengths 64-96 from CodeSearchNet, and filtering AdvTest for code lengths 128-170 and text lengths >5."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section. The conclusion briefly mentions compilability doesn't guarantee correctness."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statement of what the results do NOT show. The paper does not bound its claims to Python or the specific datasets tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The underlying datasets (CodeSearchNet, AdvTest/CodeXGLUE) are publicly available, though the specific filtered subsets are not released."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes how data was collected: extracting from CodeSearchNet with length filters, and from AdvTest with length and semantic filters."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data comes from standard benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 documents the pipeline: source dataset → language/version filter → length filter → compilability filter → train/test split, with counts at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgements section lists NSFC grants No. 61972290, 62102157, and 61962061."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed, including Huawei Noah's Ark Lab and Huawei Technologies Co., Ltd."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is from the NSFC (National Natural Science Foundation of China), which has no stake in the specific outcomes of this code generation research."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present. Several authors are from Huawei, which has commercial interest in code generation, but this is not explicitly acknowledged as a conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The CodeGPT model's training data cutoff is not stated. The paper uses pre-trained CodeGPT on benchmarks without discussing when its training data was collected."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether CodeSearchNet or AdvTest data appeared in CodeGPT's pre-training corpus."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "CodeSearchNet was published in 2019 and CodeGPT was pre-trained on code corpora that likely included it. This contamination risk is not addressed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost or latency is reported despite the multi-stage pipeline involving beam search and discriminator evaluation."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (2 NVIDIA Tesla V100 32GB) but total training time and compute budget are not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "COMPCODER improves average compilation rate from 44.18 to 89.18 in code completion compared to CodeGPT",
    286       "evidence": "Table 1 and Figure 4 show compilation rates across different token completion lengths (25-45 tokens). Average improvement is 45.00 points (89.18 - 44.18).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "COMPCODER improves compilation rate from 70.3 to 96.2 in text-to-code generation compared to CodeGPT",
    291       "evidence": "Table 2 shows CR of 96.2 for COMPCODER vs 70.3 for CodeGPT on the AdvTest dataset.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "COMPCODER does not sacrifice code fluency (Edit Similarity) while improving compilation rate",
    296       "evidence": "Tables 1 and 2 show ES scores are comparable or slightly better (64.53 vs 64.47 in completion, 62.74 vs 61.82 in text-to-code).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "All three components (RL, Dtrain, Dtest) contribute to compilation rate improvement",
    301       "evidence": "Table 3 ablation study shows incremental improvements: CodeGPT 46.84 → +Dtrain 64.88 → +RL 76.48 → +RL+Dtrain 83.14 → full model 94.48.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "COMPCODER, a three-stage pipeline using compiler feedback (fine-tuning, RL-based compilability reinforcement, and compilability discrimination), dramatically improves compilation rates of generated code. On Python code completion, it raises compilation rate from 44.18% to 89.18% vs CodeGPT, and on text-to-code generation from 70.3% to 96.2%, without degrading edit similarity. An ablation study confirms all three components contribute, with the discriminator at inference time providing the largest single boost.",
    307   "red_flags": [
    308     {
    309       "flag": "No variance or multiple runs",
    310       "detail": "All results appear to be single-run with no error bars, standard deviations, or confidence intervals, making it impossible to assess result stability."
    311     },
    312     {
    313       "flag": "No limitations section",
    314       "detail": "The paper has no dedicated limitations or threats-to-validity section. Only a brief mention in the conclusion that compilability does not equal correctness."
    315     },
    316     {
    317       "flag": "Potential contamination risk",
    318       "detail": "CodeGPT was pre-trained on code corpora that likely include CodeSearchNet data. The paper does not discuss whether training/test overlap could inflate results."
    319     },
    320     {
    321       "flag": "Python-only evaluation with broad claims",
    322       "detail": "The title and framing suggest general 'neural code generation' but all experiments are on Python only. No discussion of generalizability to other languages."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    328       "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"],
    329       "year": 2021,
    330       "relevance": "Major code generation benchmark used in this paper's evaluation, widely used in LLM code generation research."
    331     },
    332     {
    333       "title": "Unified Pre-training for Program Understanding and Generation",
    334       "authors": ["Wasi Uddin Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"],
    335       "year": 2021,
    336       "relevance": "PLBART, a pre-trained code model used as a baseline, relevant to understanding code generation model evolution."
    337     },
    338     {
    339       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    340       "authors": ["Yue Wang"],
    341       "year": 2021,
    342       "relevance": "CodeT5 baseline model, influential in pre-trained code generation models."
    343     },
    344     {
    345       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    346       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano"],
    347       "year": 2021,
    348       "relevance": "Program repair using deep learning, relevant to automated code generation and repair quality."
    349     },
    350     {
    351       "title": "Energy-based Models for Code Generation under Compilability Constraints",
    352       "authors": ["Tomasz Korbak", "Hady ElSahar", "Marc Dymetman", "German Kruszewski"],
    353       "year": 2021,
    354       "arxiv_id": "2106.04985",
    355       "relevance": "Directly related work on compilability-constrained code generation using energy-based models."
    356     },
    357     {
    358       "title": "SPoC: Search-based Pseudocode to Code",
    359       "authors": ["Sumith Kulal", "Panupong Pasupat", "Kartik Chandra"],
    360       "year": 2019,
    361       "relevance": "Introduced compilation rate as an evaluation metric for code generation."
    362     },
    363     {
    364       "title": "Retrieval Augmented Code Generation and Summarization",
    365       "authors": ["Md. Rizwan Parvez", "Wasi Uddin Ahmad"],
    366       "year": 2021,
    367       "relevance": "Retrieval-augmented approach to code generation, relevant to AI-assisted programming."
    368     },
    369     {
    370       "title": "CodeSearchNet Challenge: Evaluating the State of Semantic Code Search",
    371       "authors": ["Hamel Husain", "Ho-Hsiang Wu"],
    372       "year": 2019,
    373       "relevance": "Source dataset used in this paper's code completion experiments, widely used code benchmark."
    374     }
    375   ]
    376 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs