scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26125B)
      1 {
      2   "paper": {
      3     "title": "Collaboration is all you need: LLM Assisted Safe Code Translation",
      4     "authors": ["Rabimba Karanjai", "Sam Blackshear", "Lei Xu", "Weidong Shi"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2503.11237",
      8     "doi": "10.1145/3696630.3728521"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The framework is described architecturally but no implementation is released."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper uses publicly available benchmarks: CodeNet (Puri et al., 2021), Avatar (Ahmad et al., 2021), and XLCoST (Zhu et al., 2022). These are standard public datasets the authors did not modify. However, the NLI training dataset created from programming language documentation is not released."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper lists model names used in the garden (Section 3) but provides no information about library versions, hardware, or software dependencies."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the architecture conceptually but does not provide enough implementation detail to reproduce the experiments."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '61.6%', '79.38'). No confidence intervals, error bars, or uncertainty measures are provided for any results."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper claims UniTranslator 'rivals and surpasses' GPT-4 and outperforms SolMover based solely on comparing raw numbers. No statistical significance tests are performed."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Only raw success rates and CodeBLEU scores are reported. No effect sizes (Cohen's d, relative improvement with context) are provided. The reader must manually compute differences from the tables."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Sample sizes are stated (200 samples per language in Table 1, 734 tasks in Table 2, 249/250 in Avatar) but no justification is provided for why these sizes are sufficient for the claims being made. No power analysis is discussed."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No standard deviations, variance, or spread measures are reported across any experimental runs. It is unclear whether results are from a single run or averaged across multiple runs."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Table 1 compares against CodeGeeX, StarCoder, GPT-4, and Llama 2. Table 2 compares against SolMover. Table 3 compares against SteloCoder and XLCoST. Multiple baselines are included across experiments."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The baselines are mixed. GPT-4 is contemporary, but the paper says just 'GPT-4' without specifying which version. CodeGeeX and StarCoder are from 2023. SteloCoder is from 2023. No comparison against more recent code translation systems from 2024-2025. The paper was submitted in March 2025."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "UniTranslator has multiple components (Director LLM, concept agent, NLI checker, compiler feedback, agent quorum) but no ablation study is provided to show which components contribute to performance. The system is evaluated only as a whole."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Tables 1 and 2 use success rate (functional correctness), while Table 3 uses CodeBLEU scores. Two distinct metrics are used across experiments."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of translation quality is included. All evaluation is automated (compilation success rate and CodeBLEU). For a code translation system making claims about semantic soundness and idiomatic translations, human evaluation of output quality would be relevant."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No discussion of train/test/validation splits. The paper does not clarify whether the benchmarks used for evaluation overlap with any data used to fine-tune the NLI model or concept agent."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Tables 1 and 3 provide per-language-pair breakdowns. Table 1 shows results for C++, Go, Java, and Python source languages. Table 3 shows results for five different translation directions to Python."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No failure cases are shown or analyzed. The paper does not discuss examples where UniTranslator failed, what kinds of code it cannot translate, or qualitative error analysis."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No negative results are reported. Table 3 shows UniTranslator underperforming SteloCoder on C# to Python, Java to Python, and PHP to Python, but these are not discussed as negative results or analyzed. Every aspect of the system is presented positively."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The abstract claims UniTranslator 'achieves a level of accuracy and efficiency that rivals larger, monolithic models.' Table 3 shows UniTranslator scoring 54.31 on C# to Python vs SteloCoder's 74.83, and 53.22 on PHP to Python vs SteloCoder's 71.11. These substantial losses are not acknowledged. The abstract also claims the approach 'mitigates common issues such as code artifacts and hallucinations' but no evidence for this is presented."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper makes causal claims: 'the success of our concept agent underscores the transformative potential of incorporating explicit programming knowledge' (Section 4), and claims the feedback mechanism drives improvement. Without an ablation study isolating these components, these causal claims are not justified."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title says 'Safe Code Translation' broadly but testing is limited to specific benchmarks and language pairs. The abstract claims the approach works for 'complex code translation tasks' generally. The paper acknowledges results are 'preliminary' but the title and abstract make broad claims. The paper also claims to address 'low-resource languages' but only tests Solidity-to-Move as an example."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether performance gains might come from specific model capabilities rather than the multi-agent framework, whether the benchmark selection favors the approach, or other confounding factors."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Section 3 lists model names: 'granite-code, codegemma, deepseek-coder-v2, starcoder2, codegeex4, codestral, deepseek-coder, codellama, codeqwen, qwen2.5-coder, gemma2, llama3.2, opencoder, llama3.3.' No version numbers, parameter counts, or snapshot dates are specified. The baselines in Table 1 are listed only as 'CodeGeeX', 'StarCoder', 'GPT-4', 'Llama 2' without versions."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No prompts are provided. The paper describes concepts like 'few-shot prompting' and 'dynamic prompt generation system' (Section 1) but does not include any actual prompt text used in experiments."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters are reported. Temperature, top-p, max tokens, number of refinement iterations, convergence criteria, and other settings are not specified anywhere in the paper."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The multi-agent scaffolding is described at a high conceptual level (Section 2: Director LLM, concept agent, NLI checker, compiler feedback loop). However, critical implementation details are missing: how agents communicate, what the convergence criteria are, how many iterations are typical, how the Director selects agents in practice, and the actual workflow logic."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Section 3 mentions 'We sourced code from a diverse set of GitHub projects, prioritizing those with substantial community recognition (measured by repository stars)' but provides no details on selection criteria, filtering steps, or how the final evaluation set was constructed."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "A 'Limitations' section is present between Section 4 (Discussion) and Section 5 (Conclusion), acknowledging that smaller LLMs may not match larger ones, potential biases and hallucinations, and varying performance across languages."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "The Limitations section contains only generic statements: 'performance of smaller LLMs may not always match that of larger models', 'Potential biases and hallucinations in LLMs can lead to errors', 'framework's performance may vary across different programming languages and domains.' None of these are specific to this study's design or results."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No specific scope boundaries are stated. The paper does not explicitly say what it did NOT test, what populations/settings are excluded, or what claims it is NOT making. The Limitations section speaks generically about future work needed."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw experimental data (individual translation outputs, per-example pass/fail results, logs) is available. Only aggregated success rates and CodeBLEU scores in tables are provided."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper states code was sourced from GitHub projects with 'substantial community recognition' but does not specify which projects, how many, what selection criteria were applied, or the time period of collection. The evaluation datasets are named but the process of using them is not described in detail."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants. The paper uses standard code translation benchmarks, not human subjects."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The pipeline from raw code to evaluation results is not documented. It is unclear how code samples were selected from the benchmarks, how translations were executed (batch vs. sequential), or how success/failure was determined for each sample."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section states: 'This research was supported by the Sui Foundation Academic Grant (SARA)' and 'compute credits through the Google Developer Expert (GDE) and Google Cloud Research Innovator program.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are listed: Rabimba Karanjai and Weidong Shi at University of Houston, Sam Blackshear at Mysten Labs, Lei Xu at Kent State University."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The research is funded by the Sui Foundation, and co-author Sam Blackshear is from Mysten Labs (the company behind the Sui blockchain and the Move language). The paper evaluates Solidity-to-Move translation (Table 2), which directly relates to the Sui ecosystem. The funder has a clear interest in demonstrating that Move is a viable translation target."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper. Given that a co-author works at Mysten Labs (the company behind the Sui blockchain) and the paper is funded by the Sui Foundation while evaluating Move language translation, this is a significant omission."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper uses multiple LLMs (granite-code, codegemma, deepseek-coder-v2, etc.) to perform code translation on public benchmarks (CodeNet, Avatar, XLCoST). No training cutoff dates are stated for any of the models used."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The benchmarks used (CodeNet 2021, Avatar 2021, XLCoST 2022) predate many of the models used. No discussion of whether these benchmark solutions appeared in the training data of the LLMs."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "CodeNet (2021), Avatar (2021), and XLCoST (2022) were all published before the training cutoffs of the 2024-era models used. The paper does not discuss whether benchmark contamination could inflate results. This is a serious concern given that the models may have memorized benchmark solutions."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study. The evaluation is entirely automated using benchmarks."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper's stated motivation includes 'enabling deployment on commonly available hardware' and 'resource-constrained environments.' Despite this, no inference costs, latency, tokens consumed, or wall-clock time are reported for the multi-agent translation pipeline."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. The Acknowledgments mention Google Cloud compute credits but do not quantify the total compute used for experiments."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "UniTranslator, powered by compact open-source LLMs, achieves performance that rivals and sometimes surpasses GPT-4 in code translation.",
    287       "evidence": "Table 1 appears to show UniTranslator (labeled 'TransCode') matching or outperforming GPT-4 in some cases (e.g., Java translation: 80.6% vs 81.3%, which is close). However, the 'TransCode' column carries a citation to [Puri et al., 2021], so it may instead be a different system. UniTranslator's own results are not clearly separated from TransCode in Table 1.",
    288       "supported": "weak"
    289     },
    290     {
    291       "claim": "UniTranslator outperforms SolMover in Solidity-to-Move translation with 61.6% vs 47.5% success rate.",
    292       "evidence": "Table 2 shows UniTranslator achieving 61.6% success rate on 734 tasks vs SolMover's 47.5%. However, no statistical tests, confidence intervals, or details about what constitutes 'success' are provided.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "UniTranslator shows competitive CodeBLEU performance on XLCoST benchmark, surpassing existing models in several translation-to-Python tasks.",
    297       "evidence": "Table 3 shows UniTranslator outperforming on C++ to Python (79.38 vs 75.42) and JavaScript to Python (80.07 vs 73.05), but substantially underperforming on C# to Python (54.31 vs 74.83), Java to Python (64.27 vs 74.39), and PHP to Python (53.22 vs 71.11). The claim of 'competitive performance' is contradicted by 3 out of 5 cases showing significant underperformance.",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "The multi-agent collaborative approach with NLI grounding mitigates code artifacts and hallucinations.",
    302       "evidence": "No direct evidence is presented. The paper describes the NLI mechanism architecturally (Section 2.1) but provides no quantitative measurement of hallucination reduction or comparison with/without NLI grounding.",
    303       "supported": "unsupported"
    304     }
    305   ],
    306   "methodology_tags": ["benchmark-eval"],
    307   "key_findings": "UniTranslator proposes a multi-agent framework for code translation using compact LLMs orchestrated by a Director LLM, with NLI-based fact checking and compiler feedback loops. Preliminary evaluations on CodeNet, Avatar, and XLCoST benchmarks show mixed results: the system outperforms baselines in some language pairs (e.g., C++ to Python, JavaScript to Python, Solidity to Move) but substantially underperforms in others (C# to Python, Java to Python, PHP to Python). The paper itself explicitly describes its results as 'preliminary'.",
    308   "red_flags": [
    309     {
    310       "flag": "Undisclosed conflict of interest",
    311       "detail": "Co-author Sam Blackshear is from Mysten Labs (the company behind the Sui blockchain and Move language). The paper is funded by the Sui Foundation. A key evaluation (Table 2) tests Solidity-to-Move translation, directly relevant to the Sui ecosystem. This conflict is not acknowledged anywhere in the paper."
    312     },
    313     {
    314       "flag": "Selective presentation of results",
    315       "detail": "Table 3 shows UniTranslator scoring 54.31, 64.27, and 53.22 on three translation tasks where SteloCoder achieves 74.83, 74.39, and 71.11 respectively — losses of roughly 10 to 20 CodeBLEU points (20.52, 10.12, and 17.89). These substantial underperformances are not discussed. The abstract and conclusion focus exclusively on cases where UniTranslator performs well."
    316     },
    317     {
    318       "flag": "No uncertainty quantification",
    319       "detail": "All results are single point estimates with no error bars, confidence intervals, significance tests, or information about number of runs. It is impossible to assess whether observed differences are meaningful or due to random variation."
    320     },
    321     {
    322       "flag": "Benchmark contamination risk",
    323       "detail": "Benchmarks from 2021-2022 (CodeNet, Avatar, XLCoST) are used with 2024-era LLMs. No analysis of whether the models have seen these benchmarks during training. Solutions to these problems may be in the training data."
    324     },
    325     {
    326       "flag": "Missing ablation study for multi-component system",
    327       "detail": "UniTranslator has at least 5 major components (Director LLM, concept agent, NLI checker, compiler feedback, agent quorum) but no ablation study shows which components contribute to performance. It is unknown whether the multi-agent approach adds value over a single LLM with compiler feedback."
    328     },
    329     {
    330       "flag": "Ambiguous Table 1 attribution",
    331       "detail": "Table 1 includes a column labeled 'TransCode' that is cited to [Puri et al., 2021] (the CodeNet paper). It is unclear whether 'TransCode' refers to UniTranslator or to a baseline system. The paper states UniTranslator uses Gemma family models but does not clearly identify which column represents UniTranslator's results."
    332     },
    333     {
    334       "flag": "Claims significantly outrun evidence",
    335       "detail": "The abstract claims UniTranslator 'achieves a level of accuracy and efficiency that rivals larger, monolithic models' and 'mitigates common issues such as code artifacts and hallucinations.' The actual results show mixed performance and no evidence for hallucination mitigation."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Lost in translation: A study of bugs introduced by large language models while translating code",
    341       "authors": ["Rangeet Pan", "Ali Reza Ibrahimzada", "Rahul Krishna"],
    342       "year": 2024,
    343       "relevance": "Empirical study of bugs introduced by LLMs during code translation, directly relevant to understanding LLM code generation quality."
    344     },
    345     {
    346       "title": "Attention, Compilation, and Solver-based Symbolic Analysis are All You Need",
    347       "authors": ["Prithwish Jana", "Piyush Jha", "Haoyang Ju"],
    348       "year": 2023,
    349       "arxiv_id": "2306.06755",
    350       "relevance": "Combines LLMs with compilation and symbolic analysis for code tasks, relevant to LLM-assisted programming methodology."
    351     },
    352     {
    353       "title": "Teaching Machines to Code: Smart Contract Translation with LLMs",
    354       "authors": ["Rabimba Karanjai", "Lei Xu", "Weidong Shi"],
    355       "year": 2024,
    356       "arxiv_id": "2403.09740",
    357       "relevance": "Prior work on LLM-based smart contract translation (SolMover), directly relevant to agentic code generation evaluation."
    358     },
    359     {
    360       "title": "Unsupervised translation of programming languages",
    361       "authors": ["Baptiste Roziere", "Marie-Anne Lachaux", "Lowik Chanussot", "Guillaume Lample"],
    362       "year": 2020,
    363       "relevance": "Foundational work on neural code translation published at NeurIPS, relevant to code generation methodology."
    364     },
    365     {
    366       "title": "XLCoST: A Benchmark Dataset for Cross-lingual Code Intelligence",
    367       "authors": ["Ming Zhu", "Aneesh Jain", "Karthik Suresh"],
    368       "year": 2022,
    369       "arxiv_id": "2206.08474",
    370       "relevance": "Benchmark dataset for cross-lingual code evaluation, relevant to understanding evaluation methodology in code translation."
    371     },
    372     {
    373       "title": "Understanding the Performance and Estimating the Cost of LLM Fine-Tuning",
    374       "authors": ["Yuchen Xia", "Jiho Kim", "Yuhan Chen"],
    375       "year": 2024,
    376       "arxiv_id": "2408.04693",
    377       "relevance": "Study of LLM fine-tuning cost and performance, relevant to practical cost assessment of LLM-based tools."
    378     },
    379     {
    380       "title": "SteloCoder: a Decoder-Only LLM for Multi-Language to Python Code Translation",
    381       "authors": ["Jialing Pan", "Adrien Sadé", "Jin Kim"],
    382       "year": 2023,
    383       "arxiv_id": "2310.15539",
    384       "relevance": "LLM for multi-language code translation used as a baseline, relevant to code generation evaluation methodology."
    385     },
    386     {
    387       "title": "Project CodeNet: A large-scale AI for code dataset for learning a diversity of coding tasks",
    388       "authors": ["Ruchir Puri", "David S Kung", "Geert Janssen"],
    389       "year": 2021,
    390       "arxiv_id": "2105.12655",
    391       "relevance": "Large-scale code dataset used as evaluation benchmark, relevant to understanding benchmarks for code AI systems."
    392     },
    393     {
    394       "title": "TRUE: Re-evaluating factual consistency evaluation",
    395       "authors": ["Or Honovich", "Roee Aharoni", "Jonathan Herzig"],
    396       "year": 2022,
    397       "arxiv_id": "2204.04991",
    398       "relevance": "NLI-based factual consistency evaluation method adapted for code verification in this paper, relevant to LLM output verification methodology."
    399     },
    400     {
    401       "title": "CARGO: AI-guided dependency analysis for migrating monolithic applications to microservices architecture",
    402       "authors": ["Vikram Nitin", "Shubhi Asthana", "Baishakhi Ray", "Rahul Krishna"],
    403       "year": 2022,
    404       "relevance": "AI-guided code migration tool, relevant to LLM-assisted software engineering and code transformation."
    405     }
    406   ]
    407 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs