scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21583B)
      1 {
      2   "paper": {
      3     "title": "CoTran: An LLM-based Code Translator using Reinforcement Learning with Feedback from Compiler and Symbolic Execution",
      4     "authors": ["Prithwish Jana", "Piyush Jha", "Haoyang Ju", "Gautham Kishore", "Aryan Mahajan", "Vijay Ganesh"],
      5     "year": 2023,
      6     "venue": "ECAI-2024 (27th European Conference on Artificial Intelligence)",
      7     "arxiv_id": "2306.06755"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository URL provided: https://github.com/PrithwishJana/CoTran. Appendix A.1 states code and dataset are available there."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The AVATAR-TC dataset is released at the same GitHub repository. The paper states 'The AVATAR-TC dataset and all our code can be accessed at https://github.com/PrithwishJana/CoTran.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Appendix A.1 states the repository includes 'library dependencies' and Appendix A.2 specifies hardware (four NVIDIA V100 GPUs, 32GB memory, six CPU cores per GPU) and mentions PyTorch. The README reportedly outlines library dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Appendix A.1 states 'We have also made it user-friendly with a README file in the root folder that outlines the folder structure, library dependencies, and instructions for running the code.'"
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Main results in Tables 2 and 3 report only point estimates. One instance of mean ± stdev is given for EpC in Finding 5 (e.g., '10.98±9.67'), but no confidence intervals or error bars for the main metrics (FEqAcc, CompAcc)."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CoTran outperforms baselines based solely on comparing numbers in Tables 2/3. No statistical significance tests (p-values, t-tests, etc.) are reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Percentage improvements with baseline context are consistently reported, e.g., '+14.89% FEqAcc and +8.14% CompAcc for Python-to-Java' with absolute values in tables showing from/to context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The test set size of 1,746 programs is stated but not justified. No discussion of whether this is sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results appear to be from single runs. No standard deviation across seeds or multiple runs reported for the main metrics. The EpC mean±stdev in Finding 5 is for error counts, not across experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Extensive comparison against 14 tools including 3 human-written transpilers, 3 unsupervised LLM tools, ChatGPT, and 7 supervised LLM tools (Table 2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include PPOCoder (2023), TransCoder-ST (2022), ChatGPT (gpt-3.5-turbo-0301, 2023), CodeT5 (2021), PLBART (2021). These are contemporary and competitive for the task."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Thorough ablation: CoTran baseline vs. +CF vs. +CF+SF, RL-only vs. RL+SFT interleaved, non-RL alternatives (CoTran+, CoTran×), and kw-Tok contribution. Tables 2 and 3 and Appendix A.3."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Six metrics are used: FEqAcc, CompAcc, errPos1st, CodeBLEU, BLEU, and EM, plus the proposed EpC and f/c-rate metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of translation quality. All evaluation is automated (compilation checks, test-case-based equivalence, BLEU scores). Human judgment of code quality, readability, or correctness of translations is absent."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4 describes an explicit train/validation/test split with 'no problem overlaps across splits', which ensures out-of-distribution testing. Results are reported on the 1,746 test programs."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Figure 4 provides per-sub-dataset breakdown across 7 coding platforms (Aizu, AtCoder, Codeforces, G-CodeJam, LeetCode, GFG, ProjEuler)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Finding 5 discusses compilation error counts. Finding 7 discusses why P2J is harder. Finding 8 discusses failure of function-level tools on whole-program tasks. Appendix A.4 discusses transpiler failures."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Finding 1 and Appendix A.3 report that non-RL schemes (CoTran+, CoTran×) were tried and underperformed. ChatGPT outperforms CoTran on J2P FEqAcc, which is acknowledged."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 48.68% FEqAcc and 76.98% CompAcc for P2J, and improvements over PLBART-base and CodeT5, are supported by Tables 2/3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about CF and SF improving performance are supported by controlled ablation studies (single-variable manipulation: baseline → +CF → +CF+SF, RL-only vs. RL+SFT). The ablation design is adequate."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Code Translator' generally, but results are reported only for the Java-Python pair. Although Section 1 states 'It can be effortlessly adapted for other language pairs,' no evidence supports this claim. The paper does not bound its generalization to the tested Java-Python setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the results. For example, improvements could partly be due to kw-Tok rather than the feedback mechanism, but this interaction is not explored. No threats-to-validity section exists."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "CodeT5-base from HuggingFace is specified. The ChatGPT version is given as 'gpt-3.5-turbo-0301', along with its knowledge cutoff date. Other baselines reference specific model architectures."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The ChatGPT prompt is provided verbatim: 'Translate [S] to [T]:[scode]\\n Do not return anything other than the translated code.' For CoTran itself, prompting is not applicable as it is fine-tuned end-to-end."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.2 reports: LoRA rank r=16, scaling factor α=32, Adam lr=10^-4, PPO lr=1.41×10^-5, max sequence length 512, ChatGPT temperature=0, top_p=0."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. CoTran is a fine-tuned model, not an agent with tools or feedback loops at inference time."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4 describes data cleaning: parsing with javalang and tokenize modules, manual correction of minor faults, discarding pairs with major issues, output matching criteria (case insensitivity, whitespace removal, etc.). Statistics in Table 1."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section in the paper. The conclusion mentions future work (extending to legacy code) but does not discuss limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the Java-Python pair or the competitive programming domain of AVATAR-TC."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The AVATAR-TC dataset and all translations generated by tools are available at the GitHub repository, enabling independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4 describes web-crawling 7 competitive coding platforms, collecting human-written test cases, and the relationship to the AVATAR dataset."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is from public competitive programming platforms (standard benchmark source)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 documents the pipeline: scraping from 7 platforms, parsing with javalang/tokenize, manual correction, discarding faulty pairs, resulting in 57,368 pairs with train/validation/test splits shown in Table 1."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Georgia Institute of Technology, University of Toronto, UC San Diego, Columbia University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement found. The paper uses Symflower, a commercial tool, but no disclosure about the relationship with Symflower is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "For ChatGPT, the paper states 'gpt-3.5-turbo-0301' with 'knowledge cutoff of March 1, 2023.' CodeT5-base is a pre-trained model with known pre-training data that is then fine-tuned on AVATAR-TC."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 4 states 'no problem overlaps across splits.' For ChatGPT, Section 5.1 notes the knowledge cutoff 'predates the public release of AVATAR-TC on GitHub' to minimize contamination risk."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section 5.1 explicitly addresses this: ChatGPT's knowledge cutoff predates AVATAR-TC's public release, 'minimizing the risk of AVATAR-TC (Test) pairs being included in ChatGPT's training data.'"
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or per-example cost reported for CoTran or the baselines."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is described (four V100 GPUs) and Appendix A.2 mentions '~75% reduction in training time' from LoRA, but no total training time, GPU hours, or API spend for ChatGPT experiments are stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoTran + CF + SF (RL+SFT interleaved) achieves 53.89% FEqAcc for J2P and 48.68% for P2J, outperforming all similar-size baselines.",
    286       "evidence": "Table 2 shows these numbers. Nearest competitor PPOCoder gets 44.27% J2P FEqAcc; PLBART-base gets 38.26% P2J FEqAcc.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Fine-grained compiler feedback is more effective than Boolean compiler feedback for RL-based fine-tuning.",
    291       "evidence": "Finding 2 and Figure 2 show that Boolean feedback (CompCoder) provides constant reward for many translations, while CoTran's CF varies continuously. CoTran + CF outperforms PPOCoder by +11.57% FEqAcc in P2J (Table 2).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Interleaving RL and SFT improves performance compared to RL-only fine-tuning.",
    296       "evidence": "Finding 3: RL+SFT interleaved gives +12.94% J2P FEqAcc vs. +9.50% for RL-only, relative to CodeT5-base (Table 2).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The keyword tokenizer (kw-Tok) improves code translation performance.",
    301       "evidence": "Finding 4: kw-Tok accounts for +3.57% J2P and +6.62% P2J FEqAcc improvement (CoTran baseline vs. CodeT5-base in Table 2).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CoTran outperforms ChatGPT on all metrics in P2J and all but FEqAcc in J2P.",
    306       "evidence": "Table 2 confirms this. However, ChatGPT is gpt-3.5-turbo, not GPT-4, so the comparison is with an older model.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "CoTran can be 'effortlessly adapted' for other language pairs.",
    311       "evidence": "Section 1 claims this but provides no evidence beyond noting the requirements (dataset, compilers, test generation tool). No experiments on other language pairs.",
    312       "supported": "unsupported"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "CoTran fine-tunes CodeT5-base using RL with fine-grained compiler feedback (CF) and symbolic execution feedback (SF), achieving state-of-the-art results on Java-Python code translation. On the AVATAR-TC benchmark of 57,000+ code pairs, CoTran + CF + SF with interleaved RL+SFT training achieves 53.89% functional equivalence accuracy for J2P and 48.68% for P2J, outperforming 14 competing tools including ChatGPT (gpt-3.5-turbo) on P2J. The ablation study demonstrates that each component (kw-Tok, CF, SF, interleaved training) contributes incrementally, and that non-RL approaches to incorporating feedback are ineffective.",
    317   "red_flags": [
    318     {
    319       "flag": "No limitations section",
    320       "detail": "The paper has no limitations, threats-to-validity, or scope-bounding discussion. For a paper tested on a single language pair from competitive programming, this is a significant omission."
    321     },
    322     {
    323       "flag": "No variance or significance testing",
    324       "detail": "All results appear to be single-run numbers with no error bars, standard deviations across seeds, or significance tests. Given the small margins between some methods (e.g., CoTran baseline 44.52% vs. PPOCoder 44.27% on J2P FEqAcc), it is impossible to know if differences are meaningful."
    325     },
    326     {
    327       "flag": "Unbounded generalization claims",
    328       "detail": "The paper claims the approach can be 'effortlessly adapted for other language pairs' with no supporting evidence. Results are limited to Java-Python competitive programming problems, but the title and framing suggest general code translation capability."
    329     },
    330     {
    331       "flag": "Commercial tool dependency undisclosed",
    332       "detail": "CoTran relies on Symflower, a commercial symbolic execution engine. No disclosure of any relationship with Symflower is provided, and no competing interests statement exists."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Unified Pre-training for Program Understanding and Generation",
    338       "authors": ["W. Ahmad", "S. Chakraborty", "B. Ray", "K.-W. Chang"],
    339       "year": 2021,
    340       "relevance": "PLBART model used as a key baseline for LLM-based code translation."
    341     },
    342     {
    343       "title": "AVATAR: A Parallel Corpus for Java-Python Program Translation",
    344       "authors": ["W. U. Ahmad", "M. G. R. Tushar", "S. Chakraborty", "K.-W. Chang"],
    345       "year": 2023,
    346       "relevance": "Source dataset that AVATAR-TC builds upon for code translation benchmarking."
    347     },
    348     {
    349       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    350       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    351       "year": 2020,
    352       "relevance": "Pre-trained code model used as baseline for code translation evaluation."
    353     },
    354     {
    355       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    356       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. Hoi"],
    357       "year": 2021,
    358       "relevance": "Base model architecture used by CoTran; key baseline for code generation and translation."
    359     },
    360     {
    361       "title": "Execution-based Code Generation using Deep Reinforcement Learning",
    362       "authors": ["P. Shojaee", "A. Jain", "S. Tipirneni", "C. K. Reddy"],
    363       "year": 2023,
    364       "relevance": "PPOCoder: RL-based code translation framework, primary competing method."
    365     },
    366     {
    367       "title": "Unsupervised Translation of Programming Languages",
    368       "authors": ["B. Roziere", "M.-A. Lachaux", "L. Chanussot", "G. Lample"],
    369       "year": 2020,
    370       "relevance": "TransCoder: foundational unsupervised code translation approach used as baseline."
    371     },
    372     {
    373       "title": "TransCoder-ST: Leveraging Automated Unit Tests for Unsupervised Code Translation",
    374       "authors": ["B. Roziere", "J. Zhang", "F. Charton"],
    375       "year": 2022,
    376       "relevance": "Uses unit tests for code translation self-training; key related work on test-guided translation."
    377     },
    378     {
    379       "title": "CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning",
    380       "authors": ["H. Le", "Y. Wang", "A. D. Gotmare"],
    381       "year": 2022,
    382       "relevance": "RL-based code generation method using unit test feedback, directly related to CoTran's approach."
    383     },
    384     {
    385       "title": "RLTF: Reinforcement Learning from Unit Test Feedback",
    386       "authors": ["J. Liu", "Y. Zhu", "K. Xiao"],
    387       "year": 2023,
    388       "relevance": "RL framework using test feedback for code generation, closely related methodology."
    389     },
    390     {
    391       "title": "Compilable Neural Code Generation with Compiler Feedback",
    392       "authors": ["X. Wang", "Y. Wang", "Y. Wan"],
    393       "year": 2022,
    394       "relevance": "CompCoder: uses Boolean compiler feedback for RL-based code generation, direct predecessor to CoTran's fine-grained CF."
    395     },
    396     {
    397       "title": "CodeTransOcean: A Comprehensive Multilingual Benchmark for Code Translation",
    398       "authors": ["W. Yan", "Y. Tian", "Y. Li"],
    399       "year": 2023,
    400       "relevance": "Multilingual code translation benchmark; standardized evaluation protocol used for ChatGPT prompting."
    401     },
    402     {
    403       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    404       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis"],
    405       "year": 2022,
    406       "relevance": "Parameter-efficient fine-tuning technique used in CoTran's training pipeline."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs