scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18703B)
      1 {
      2   "paper": {
      3     "title": "ComBack: A Versatile Dataset for Enhancing Compiler Backend Development Efficiency",
      4     "authors": ["Ming Zhong", "Fang Lyu", "Lulin Wang", "Hongna Geng", "Lei Qiu", "Huimin Cui", "Xiaobing Feng"],
      5     "year": 2024,
      6     "venue": "NeurIPS 2024 Track on Datasets and Benchmarks"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": true,
     13         "justification": "Fine-tuned models and code are available at https://huggingface.co/docz1105/ComBack_Models (Sec. 4.1)."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "Dataset is publicly available at https://huggingface.co/datasets/docz1105/ComBack (abstract)."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Hardware is described (64-core Intel Xeon Gold CPU, 8 NVIDIA Tesla V100 GPUs) but no requirements.txt, Dockerfile, or detailed software environment specification is provided."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are described in the paper."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "All results are reported as point estimates (e.g., EM percentages, ED scores) with no confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims CodeT5+ 'outperformed' and 'surpassed' baselines based solely on comparing numbers without any statistical significance tests."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Absolute differences are reported with baseline context, e.g., 'CodeT5+ surpasses 37.10%-40.82% for EM compared with ChatGPT' and improvements of '41.64-77.21 of ED across three tasks' (Sec. 4.2, 4.3)."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification for why 6 models were chosen as the sample, or why 3 targets were selected for new-target experiments. No power analysis."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No standard deviations, variance across runs, or any spread measures are reported. Results appear to be single-run numbers."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Baselines include Fork-Flow (conventional method), ChatGPT-3.5-Turbo, and Code-LLaMA-34B-Instruct (Sec. 4.1, Table 3)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "ChatGPT-3.5-Turbo and Code-LLaMA-34B-Instruct were contemporary large language models at the time of submission."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Sec. 4.3.2 ablates by removing GPU/MPU data (training on CPU only) and Sec. 4.4 ablates by adding/removing RISC-V data, showing component contributions."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple metrics used: Exact Match (EM), Edit Distance Similarity (ED), and BLEU-4 across different tasks (Sec. 4.1)."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No human evaluation of generated code quality. All evaluation is automated via EM, ED, and BLEU-4 metrics. For a code generation dataset, human evaluation of code correctness/usefulness would be relevant."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Data is split into train/validation/test sets (80%:10%:10% in Sec. 4.2, 85%:15% train/val with separate test targets in Sec. 4.3)."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down per target (RISC-V, ARC, NVPTX) in Tables 3-5, per compiler (GCC vs LLVM), and per task."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Sec. 4.3.2 and Appendix H discuss accuracy decreases for new target types and analyze why the model struggles with certain functions."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 4 shows accuracy decreases when training without GPU/MPU data, and the paper acknowledges limitations in handling customized functions (Sec. 6 Discussion)."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Abstract claims about fine-tuned CodeT5+ outperforming Fork-Flow, ChatGPT, and Code-LLaMA are supported by Tables 2-3 and Figure 6."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Causal claims ('ComBack effectively improves...') are supported by before/after fine-tuning comparisons (Table 2) and controlled ablations (Tables 4-5), which constitute adequate single-variable manipulation."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Claims are bounded to GCC and LLVM compiler backends. The paper specifies exact targets tested and acknowledges limitations for new target types (Sec. 4.3.2)."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No discussion of alternative explanations for the results. For example, whether the improvements stem from domain-specific vocabulary rather than structural understanding, or whether the comparison with ChatGPT/Code-LLaMA is unfair due to different input formats."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "ChatGPT-3.5-Turbo is mentioned without a snapshot or API version date. Code-LLaMA-34B-Instruct is specified by size, and CodeT5+-220M is specified."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Appendix F provides the prompts used for ChatGPT and Code-LLaMA experiments (referenced in Sec. 4.3.1)."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Table 8 in Appendix C reports hyperparameters: training batch size 32, evaluation batch size 16, beam size 4, learning rate 5e-5, max optimization steps 3, and sequence length settings."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used. Models are fine-tuned and used for direct inference."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Sec. 3.2 documents four preprocessing steps in detail: code collection, function description collection, code extraction with tree-sitter, and target-specific value extraction with intermediate representations."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Sec. 6 (Discussion) contains a 'Limitation' subsection noting the absence of function descriptions for highly-customized functions."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The limitation mentioned is specific but very narrow (missing function descriptions). No discussion of threats like data leakage between train/test splits sharing the same compiler infrastructure, or whether EM/BLEU adequately measure practical usefulness."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No explicit statements about what the results do NOT show. The paper does not clarify, for example, that matching code tokens does not mean the generated code compiles or is semantically correct."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The dataset is publicly available on HuggingFace (https://huggingface.co/datasets/docz1105/ComBack), enabling independent verification."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Sec. 3.2 describes crawling GitHub with specific keywords, collecting from official GCC/LLVM websites, version ranges (GCC 3.0-13.0, LLVM 2.0.1-17.0.1), and filtering steps."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. Data source is open-source compiler code from GitHub and official websites."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Sec. 3.2 documents the full pipeline: collection → duplicate removal → function extraction via tree-sitter → target-specific value extraction. Sample counts are given for each task (161,124; 216,315; 45,296)."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Acknowledgement section lists National Key R&D Program of China, Strategic Priority Research Program of CAS, National Natural Science Foundation of China, and ICT Innovation Funding with grant numbers."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: SKLP, Institute of Computing Technology, CAS, and UCAS, Beijing, China."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Funders are Chinese government research agencies (NSFC, CAS) with no apparent financial stake in compiler backend tools."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper compares against ChatGPT-3.5-Turbo and Code-LLaMA but does not state their training data cutoff dates. It's unclear whether these models may have seen compiler backend code in their training data."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No discussion of whether ChatGPT or Code-LLaMA may have been trained on the same GCC/LLVM source code used in the test sets."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "GCC and LLVM source code is widely available online. The paper does not discuss whether baseline LLMs may have seen this code during pre-training, which could affect the comparison."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No inference cost, latency, or tokens consumed are reported for any of the models despite comparing a 220M parameter model against much larger ones."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "Hardware is mentioned (8 NVIDIA Tesla V100 GPUs) but no training time, GPU hours, or total compute budget is stated."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "ComBack effectively improves backend development capabilities of six language models across three tasks, with ED improvements of 41.64-77.21 after fine-tuning.",
    285       "evidence": "Table 2 shows before/after fine-tuning accuracy for all six models across three tasks (Sec. 4.2).",
    286       "supported": "strong"
    287     },
    288     {
    289       "claim": "Fine-tuned CodeT5+ (220M) outperforms ChatGPT-3.5-Turbo and Code-LLaMA-34B-Instruct on compiler backend tasks.",
    290       "evidence": "Table 3 shows CodeT5+ surpasses ChatGPT by 37-41% EM and Code-LLaMA by 50-55% EM on statement-level completion (Sec. 4.3.1).",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "Fine-tuned CodeT5+ achieves higher accuracy than Fork-Flow conventional development methods for code generation.",
    295       "evidence": "Figure 6 compares CodeT5+ against Fork-Flow average and maximum edit distance/BLEU-4 scores (Sec. 4.3.1).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "ComBack supports iterative expansion: adding RISC-V data improves accuracy for RI5CY by +7.90% EM on statement completion and +25.05 BLEU-4 on code generation.",
    300       "evidence": "Table 5 shows accuracy improvements across all three tasks after adding RISC-V to training data (Sec. 4.4).",
    301       "supported": "strong"
    302     }
    303   ],
    304   "methodology_tags": ["benchmark-eval"],
    305   "key_findings": "ComBack is the first public dataset for compiler backend development, covering 178 backends across GCC and LLVM with three tasks (statement completion, next-statement suggestion, code generation). Fine-tuning a small 220M-parameter CodeT5+ model on ComBack substantially outperforms ChatGPT-3.5-Turbo and Code-LLaMA-34B-Instruct on all three tasks. The dataset demonstrates cross-target generalization: models trained on CPU backends can generate code for MPU and GPU backends, and iterative data expansion improves accuracy for customized targets.",
    306   "red_flags": [
    307     {
    308       "flag": "No variance or multiple-run reporting",
    309       "detail": "All results are single-run point estimates with no standard deviations, confidence intervals, or indication of result stability across random seeds."
    310     },
    311     {
    312       "flag": "No functional correctness evaluation",
    313       "detail": "Evaluation uses token-level metrics (EM, ED, BLEU-4) only. No assessment of whether generated code compiles, passes tests, or is semantically correct. High BLEU-4 does not guarantee functional code."
    314     },
    315     {
    316       "flag": "Potentially unfair baseline comparison",
    317       "detail": "ChatGPT and Code-LLaMA are used zero-shot on a highly specialized domain (compiler backends), while CodeT5+ is fine-tuned on domain-specific data. The comparison demonstrates the value of fine-tuning but may overstate CodeT5+'s relative capability."
    318     },
    319     {
    320       "flag": "Contamination risk unaddressed",
    321       "detail": "GCC and LLVM source code is widely available on GitHub. The baseline LLMs (ChatGPT, Code-LLaMA) may have seen this code during pre-training, making the comparison results harder to interpret."
    322     }
    323   ],
    324   "cited_papers": [
    325     {
    326       "title": "Code Llama: Open Foundation Models for Code",
    327       "authors": ["Baptiste Rozière"],
    328       "year": 2024,
    329       "relevance": "Major open-source code LLM used as a baseline, directly relevant to LLM code generation capability evaluation."
    330     },
    331     {
    332       "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation",
    333       "authors": ["Yue Wang"],
    334       "year": 2023,
    335       "relevance": "Best-performing model in the study, representative of encoder-decoder code LLMs."
    336     },
    337     {
    338       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    339       "authors": ["Zhangyin Feng"],
    340       "year": 2020,
    341       "relevance": "Foundational pre-trained code model used as one of six fine-tuned models in the evaluation."
    342     },
    343     {
    344       "title": "GraphCodeBERT: Pre-training code representations with data flow",
    345       "authors": ["Daya Guo"],
    346       "year": 2021,
    347       "relevance": "Code representation model incorporating data flow, evaluated as a baseline in the study."
    348     },
    349     {
    350       "title": "UniXcoder: Unified cross-modal pre-training for code representation",
    351       "authors": ["Daya Guo"],
    352       "year": 2022,
    353       "relevance": "Cross-modal code model evaluated in the study, relevant to code understanding and generation benchmarks."
    354     },
    355     {
    356       "title": "CodeXGLUE: A machine learning benchmark dataset for code understanding and generation",
    357       "authors": ["Shuai Lu"],
    358       "year": 2021,
    359       "relevance": "Influential code benchmark dataset whose methodology ComBack follows for data extraction."
    360     },
    361     {
    362       "title": "NatGen: Generative pre-training by 'naturalizing' source code",
    363       "authors": ["Saikat Chakraborty"],
    364       "year": 2022,
    365       "relevance": "Code generation pre-training approach evaluated as one of six models in the study."
    366     },
    367     {
    368       "title": "Large Language Models for Compiler Optimization",
    369       "authors": ["Chris Cummins"],
    370       "year": 2023,
    371       "arxiv_id": "2309.07062",
    372       "relevance": "Directly relevant work applying LLMs to compiler tasks, showing AI potential for compilation."
    373     }
    374   ]
    375 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs