scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24537B)
      1 {
      2   "paper": {
      3     "title": "CODEPROMPTZIP: Code-specific Prompt Compression for Retrieval-Augmented Generation in Coding Tasks with LMs",
      4     "authors": ["Pengfei He", "Shaowei Wang", "Tse-Hsun (Peter) Chen"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2502.14925"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides an anonymous repository link (https://anonymous.4open.science/r/CodePromptZip-6B2B) in Appendix A ('Data Availability') and the Ethical Considerations section states 'Both the source code and data are released as free and open-source software.' Because the link is anonymized for review, it is uncertain whether it still resolves, but it does count as a provided link."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states in the Ethical Considerations section that 'Both the source code and data are released as free and open-source software and are made available in the public domain.' The datasets used (Assertion Generation, Bugs2Fix/CodexGLUE, Code Suggestion) are also based on publicly available benchmarks."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper specifies the base model (CodeT5, 775M parameters) and mentions using AdamW optimizer with specific hyperparameters, but does not provide a requirements.txt, Dockerfile, or detailed environment setup section listing library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper provides an anonymous repository link but does not include step-by-step reproduction instructions in the paper itself. No README with commands or 'Reproducing Results' section is present in the paper text."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 2, 3, 5, and 6 are reported as point estimates without confidence intervals, error bars, or any uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims improvements of 23.4%, 28.7%, and 8.7% over baselines but provides no statistical significance tests (no p-values, t-tests, or other tests). Differences are compared by raw number comparison only."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements over baselines with baseline context. For example, '23.4%, 28.7%, and 8.7% over the best baseline' and Table 2 shows the actual metric values (e.g., CODEPROMPTZIP: 42.1% Exact Match vs. LongLLMLingua: 34.1%), providing enough context to assess magnitude."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper randomly samples 2,000 instances from validation and test sets 'due to computational resource constraints' (Sec. 3.2) but does not justify why 2,000 is sufficient or discuss whether this sample is representative."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs (temperature set to 0 for BLMs, but no mention of multiple training runs for the compressor model)."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against four baselines: LLMLingua, LongLLMLingua, LLMLingua-2, and RECOMP, as well as w/o retrieval and Oracle conditions (Table 2, Sec. 5.2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines are contemporary: LLMLingua (2023), LongLLMLingua (2023), LLMLingua-2 (2024), and RECOMP (2024). These represent the state of the art in prompt compression at the time of writing."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper includes an ablation study comparing CODEPROMPTZIP with and without the copy mechanism (Table 2: 'CODEPROMPTZIP w/o Copy' vs. 'CODEPROMPTZIP'), demonstrating the contribution of the copy mechanism across all three tasks."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Different metrics are used for different tasks (Exact Match for Assertion Generation, CodeBleu for Bugs2Fix and Code Suggestion), and token count/compression ratio are reported alongside. However, each individual task uses only one quality metric."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. All evaluation is automated using Exact Match and CodeBleu metrics. For code compression quality, human judgment of compressed code readability or usefulness would be informative but is absent."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses original train/validation/test splits from each dataset (Sec. 3.2: 'we follow the original split of the dataset into Train, Validation, and Test partitions'). The validation set is used for ablation analysis and the test set for evaluation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per task (Assertion Generation, Bugs2Fix, Code Suggestion) in all tables. Per-model breakdowns are provided in RQ4 (Figure 6). Varying compression ratios are shown in Table 6."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No qualitative failure cases or error analysis is provided. The paper does not show examples of where compression produced poor results or where the compressor failed to meet the desired ratio."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the w/o Copy variant struggles with compression ratio control (Sec. 6.3, Table 6), and that cross-task compressors show degraded performance (Appendix D, Table 5). It also notes performance degradation vs. uncompressed prompts."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims improvements of 23.4%, 28.7%, and 8.7% over the best baseline for the three tasks. These numbers are confirmed in Table 2 (comparing CODEPROMPTZIP to the best baseline in each task: 42.1 vs 34.1, 61.9 vs 48.1, 23.7 vs ~21.8)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims through ablation (removing the copy mechanism degrades performance). The ablation design is adequate: controlled single-variable manipulation comparing w/ and w/o copy on the same datasets and settings."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper title and abstract claim compression 'for Retrieval-Augmented Generation in Coding Tasks' broadly, but experiments are only on Java, method-level tasks, and 3 specific benchmarks. The Limitations section acknowledges this (Java only, method-level), but the title and abstract do not qualify the scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the results. For example, it does not consider whether the improvements are due to the type-aware priority ranking vs. the copy mechanism architecture vs. the training data construction, beyond the basic ablation."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT-3.5-turbo' and 'Gemini-1.0-pro' without specifying snapshot dates or API versions. It also uses 'CodeLlama-13B' which is more specific but still lacks version details. The compressor is 'CodeT5 (Wang et al., 2021), 775M' which is reasonably specified."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 7 (Appendix) shows the prompt templates for all three coding tasks with the actual structure including placeholders (e.g., {header}, {body}, {method under test}). The fill values are the actual code examples from the datasets, which are publicly available."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Training hyperparameters are reported: 'AdamW optimizer with a batch size of 16, a learning rate of 5e-5, and 1,000 warmup steps for 10 epochs' (Sec. 4.2). BLM temperature is set to 0 (Sec. 3.2). Compression ratios are explicitly stated."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The system is a straightforward pipeline: retrieve examples, compress with trained model, pass to BLM for generation."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Data preprocessing is documented: code examples are categorized into parsable/unparsable using JavaParser (Table 1 shows counts), 2,000 instances randomly sampled from validation/test sets (Sec. 3.2), token types categorized via AST construction, and the compression dataset construction is detailed in Algorithm 1 and Appendix C."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 8 is titled 'Limitations' and contains substantive discussion of two specific limitations: the need for extra training for new coding tasks and the limited generalizability to Java/method-level tasks only."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section discusses specific threats: the approach requires retraining for different coding tasks because removal priorities are task-dependent, and experiments are limited to Java and method-level tasks. These are specific to this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The Limitations section explicitly states: 'This study focuses exclusively on Java and method-level tasks' and notes that applying to repository-level tasks would require additional training. It encourages 'further studies to explore additional base LMs and a broader range of programming languages.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The replication package is stated to be available (Appendix A) and the underlying datasets (CodexGLUE/Bugs2Fix, Assertion Generation, Code Suggestion) are public benchmarks. The paper states 'Both the source code and data are released.'"
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data sources are clearly described: Assertion Generation from Nashid et al. 2023, Bugs2Fix from CodexGLUE (Lu et al. 2021), Code Suggestion from Chen et al. 2024. Table 1 provides dataset statistics with knowledge base, test, and validation sizes. The compression dataset construction is described in detail."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data sources are standard public benchmarks, making this criterion not applicable."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full data pipeline is documented: code examples parsed with JavaParser, tokens categorized by type, ablation analysis to determine removal priorities, Algorithm 1 for constructing compressed training examples, and the training/validation/test split (8:1:1 ratio, Appendix C Table 4)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section disclosing grants or sponsors."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: University of Manitoba (He, Wang) and Concordia University (Chen). These are academic institutions, not companies whose products are being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Funding is not disclosed, so independence of funder cannot be assessed. The absence of funding disclosure makes this a NO."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "There is no competing interests or financial interests statement in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses GPT-3.5-turbo, Gemini-1.0-pro, and CodeLlama-13B as base LMs for evaluation but does not state their training data cutoff dates. This is relevant because the benchmarks used could have been in the training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of potential overlap between the LLMs' training data and the benchmark test sets. CodexGLUE/Bugs2Fix was published in 2021, well before GPT-3.5-turbo's training, raising contamination concerns that are not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The benchmarks (CodexGLUE, etc.) were published before the models' training cutoffs, making contamination a real concern, but the paper does not discuss this risk at all."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper motivates the work partly by referencing GPT-4 API costs ($2.50 per million tokens) in the introduction, and reports token counts and compression ratios, but does not report the actual inference cost or latency of running the compressor itself or the end-to-end pipeline."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No information is provided about GPU hours, total API spend, or hardware used for training the compressor or running the BLM experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CODEPROMPTZIP improves by 23.4%, 28.7%, and 8.7% over the best baseline for Assertion Generation, Bugs2Fix, and Code Suggestion respectively.",
    286       "evidence": "Table 2 shows CODEPROMPTZIP achieves 42.1% Exact Match vs 34.1% (LongLLMLingua) for Assertion Generation, 61.9% CodeBleu vs 48.1% (LLMLingua-2) for Bugs2Fix, and 23.7% CodeBleu vs ~21.8% (LLMLingua) for Code Suggestion.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The copy mechanism consistently contributes to performance enhancement across all tasks.",
    291       "evidence": "Table 2 ablation shows w/o Copy vs w/ Copy: 40.9% vs 42.1% Exact Match (Assertion Generation), 56.7% vs 61.9% CodeBleu (Bugs2Fix), 20.5% vs 23.7% CodeBleu (Code Suggestion). Consistent improvement across all tasks.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Token type removal priorities are task-specific yet model-agnostic, applicable across different LMs.",
    296       "evidence": "Figure 1 shows priority rankings are consistent across GPT-3.5-turbo and Gemini-1.0-pro for all three tasks, but the rankings differ between tasks (e.g., Invocation highest priority for Assertion Generation, Signature highest for Code Suggestion).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Fewer, less-compressed examples generally outperform more, highly-compressed examples within a fixed token budget.",
    301       "evidence": "Figure 4 (RQ2) shows that 1-shot with lower compression ratio generally achieves better performance than 3 or 5 shots with higher compression, across all three tasks.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CODEPROMPTZIP generalizes across different base LMs (CodeLlama-13B, Gemini-1.0-pro).",
    306       "evidence": "Figure 6 (RQ4) shows CODEPROMPTZIP consistently outperforms baselines across CodeLlama-13B and Gemini-1.0-pro on all three tasks.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "CODEPROMPTZIP handles unparsable code effectively with minimal performance degradation.",
    311       "evidence": "Table 3 (RQ5) shows only slight decreases when code is artificially made unparsable: 42.1% to 42.0% (1% removed) and 42.1% to 41.7% (3% removed) for Assertion Generation. Oracle is N/A for unparsable code.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "CODEPROMPTZIP introduces a code-specific prompt compression framework for RAG-based coding tasks that uses type-aware token priority ranking and a copy-enhanced CodeT5 compressor. The framework outperforms existing NL-focused compression methods (LLMLingua, LongLLMLingua, LLMLingua-2, RECOMP) by 8.7-28.7% across three Java coding tasks. The copy mechanism is critical for both compression quality and ratio control. The approach generalizes across different base LMs and handles unparsable code, though it is limited to Java method-level tasks and requires task-specific retraining.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical significance testing",
    320       "detail": "All claimed improvements are based on raw number comparisons with no significance tests, confidence intervals, or variance reporting. The improvements could be within noise, especially for Code Suggestion (an 8.7% relative gain over the best baseline, i.e. 23.7 vs ~21.8 CodeBleu, or about 1.9 points)."
    321     },
    322     {
    323       "flag": "Benchmark contamination risk unaddressed",
    324       "detail": "CodexGLUE (Bugs2Fix) was published in 2021 and could be in the training data of GPT-3.5-turbo and Gemini-1.0-pro. This is never discussed, yet the absolute performance numbers (which determine whether compression 'maintains' quality) depend on this."
    325     },
    326     {
    327       "flag": "No inference cost analysis despite cost motivation",
    328       "detail": "The paper motivates the work by citing GPT-4 costs ($2.50/M tokens) but never reports the actual cost of running the compressor, the end-to-end latency, or the GPU resources needed to train/deploy the 775M parameter compressor. The cost savings from compression could be offset by compressor costs."
    329     },
    330     {
    331       "flag": "Subsampled test sets without justification",
    332       "detail": "Only 2,000 instances are randomly sampled from test sets (originally 6,545-18,027) 'due to computational resource constraints' with no justification that this sample is representative or large enough for reliable conclusions."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models",
    338       "authors": ["Huiqiang Jiang", "Qianhui Wu", "Chin-Yew Lin", "Yuqing Yang", "Lili Qiu"],
    339       "year": 2023,
    340       "relevance": "Key baseline for prompt compression; entropy-based approach for reducing prompt length while preserving LM performance."
    341     },
    342     {
    343       "title": "LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression",
    344       "authors": ["Zhuoshi Pan", "Qianhui Wu", "Huiqiang Jiang"],
    345       "year": 2024,
    346       "relevance": "Knowledge distillation baseline for prompt compression using transformer encoder for token classification."
    347     },
    348     {
    349       "title": "RECOMP: Improving Retrieval-Augmented LMs with Context Compression and Selective Augmentation",
    350       "authors": ["Fangyuan Xu", "Weijia Shi", "Eunsol Choi"],
    351       "year": 2024,
    352       "relevance": "Distillation-based baseline that trains T5 on GPT-3.5-turbo summaries for retrieval context compression."
    353     },
    354     {
    355       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    356       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven CH Hoi"],
    357       "year": 2021,
    358       "relevance": "Base model architecture used for the code compressor; foundational pre-trained model for code tasks."
    359     },
    360     {
    361       "title": "Retrieval-based Prompt Selection for Code-related Few-shot Learning",
    362       "authors": ["Noor Nashid", "Mifta Sintaha", "Ali Mesbah"],
    363       "year": 2023,
    364       "relevance": "Provides the Assertion Generation task and RAG prompt template used in evaluation."
    365     },
    366     {
    367       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    368       "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"],
    369       "year": 2021,
    370       "relevance": "Provides the Bugs2Fix benchmark dataset used in evaluation; foundational code benchmark."
    371     },
    372     {
    373       "title": "Code Search is All You Need? Improving Code Suggestions with Code Search",
    374       "authors": ["Junkai Chen", "Xing Hu", "Zhenhao Li", "Cuiyun Gao", "Xin Xia", "David Lo"],
    375       "year": 2024,
    376       "relevance": "Provides the Code Suggestion task and dataset used in evaluation."
    377     },
    378     {
    379       "title": "Code Llama: Open Foundation Models for Code",
    380       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    381       "year": 2023,
    382       "arxiv_id": "2308.12950",
    383       "relevance": "Open-source code LLM used as one of the base LMs for evaluating transferability of compressed prompts."
    384     },
    385     {
    386       "title": "Less is More: Docstring Compression in Code Generation",
    387       "authors": ["Guang Yang", "Yu Zhou", "Wei Cheng"],
    388       "year": 2024,
    389       "arxiv_id": "2410.22793",
    390       "relevance": "Related work on compressing natural language parts (docstrings) in coding task prompts, complementary to code compression."
    391     },
    392     {
    393       "title": "Natural is the Best: Model-Agnostic Code Simplification for Pre-trained Large Language Models",
    394       "authors": ["Yan Wang", "Xiaoning Li", "Tien N Nguyen", "Shaohua Wang"],
    395       "year": 2024,
    396       "relevance": "Provides the token type taxonomy (Symbol, Signature, Invocation, Identifier, Structure) used to categorize code tokens."
    397     }
    398   ]
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs