scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24867B)
      1 {
      2   "paper": {
      3     "title": "Align-Pro: A Principled Approach to Prompt Optimization for LLM Alignment",
      4     "authors": [
      5       "Prashant Trivedi",
      6       "Souradip Chakraborty",
      7       "Avinash Reddy",
      8       "Vaneet Aggarwal",
      9       "Amrit Singh Bedi",
     10       "George K. Atia"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2501.03486"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No code repository is provided. The paper links to the Hugging Face TRL library examples (https://github.com/huggingface/trl/blob/main/examples/notebooks/gpt2-sentiment.ipynb), but this is not code for Align-Pro itself — it is an external dependency. No Align-Pro-specific code is released."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper uses three publicly available datasets: UltraFeedback, HelpSteer, and Orca. These are standard public benchmarks that the authors did not modify, so they are accessible to others."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions 'Python 3.11' and 'INTEL(R) XEON(R) GOLD 6526Y processor with a Nvidia H100 GPU' (Section 7.1), and the TRL library, but does not provide a requirements.txt, Dockerfile, or detailed dependency list with versions sufficient to recreate the environment."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions are provided. The paper describes the experimental setup at a high level but does not include a README, scripts, or specific commands to replicate the experiments."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No confidence intervals or error bars are reported. Figures 2 and 3 show bar charts of mean rewards and variances, but no CI notation, ± notation, or error bars on the figures are present."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No statistical significance tests are used. The paper claims Align-Pro 'consistently outperforms' and 'significantly outperforms' the baseline based on comparing numbers in tables and figures without any formal statistical test (e.g., p-values, t-tests)."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No standardized effect sizes are reported. Win rates are given in Table 1 (e.g., 60 vs 24), but there is no Cohen's d, odds ratio, or percentage improvement stated relative to the baseline. Raw reward means appear only in bar charts; their numerical values are not tabulated."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The win-rate evaluation uses 100 samples per configuration (Table 1 caption: 'win rates (for 100 samples)'), but no justification for this sample size is provided. No power analysis is mentioned."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Figure 3 shows 'reward variance' across model configurations, which measures output diversity rather than experimental variance across repeated runs. There is no mention of multiple experimental runs, seeds, or standard deviation across replications. Single-run results appear to be presented."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper includes two baselines: (1) 'No Fine-Tuning' where only the frozen model generates responses without prompt optimization, and (2) 'RLHF' where the frozen model is fine-tuned, serving as an oracle upper bound (Section 7.1)."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper explicitly states in Remark 7.1: 'we did not compare our approach with other existing prompt optimization methods in the literature.' The only baselines are a trivial no-optimization baseline and RLHF fine-tuning. No contemporary prompt optimization methods (e.g., BDPL, PRewrite, PromptAgent, APOHF, all cited in the related work) are compared against."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No ablation study is performed. The framework has components (meta-prompt, prompter model choice, KL regularization parameter lambda) that could be ablated, but none are systematically varied to show their individual contribution."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Three evaluation metrics are used: mean reward, reward variance, and win rate (assessed by GPT-4 as judge), as described in Section 7.1."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No human evaluation is included. Win rates are determined by GPT-4 as judge (Section 7.2), which is an automated LLM-based evaluation, not human evaluation. The paper makes claims about alignment with 'human values' but relies entirely on automated reward models and LLM judges."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "There is no explicit mention of a held-out test set separate from training/validation. The paper mentions evaluating on 'an unseen test dataset' in Appendix C.3, but does not clearly describe the train/test split procedure."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down per dataset (UltraFeedback, HelpSteer, Orca) and per model architecture combination (four prompter-frozen model pairs) in Table 1, Figures 2 and 3."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No failure cases are discussed. The reported results show Align-Pro matching or outperforming the no-fine-tuning baseline in every configuration. The paper does not analyze where the approach fails or produces worse results."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "No negative results are reported. Align-Pro wins against the baseline in 11 of 12 win-rate configurations. The one case where win rates are tied (Phi-3.5/Qwen-2.5-7B on Orca: 46-46) is not discussed as a limitation."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims that prompt optimization can 'effectively align LLMs' and the paper provides theoretical bounds (Theorem 6.1) and experimental results (Figures 2-3, Table 1) showing improvement over no-fine-tuning baseline. The claims are hedged appropriately ('proof of concept')."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims through its framework design: 'prompt optimization can effectively align LLMs.' The theoretical framework provides formal justification, and the experimental design compares prompt-optimized vs. non-optimized settings in a controlled manner where the only variable changed is whether the prompter is used. This constitutes adequate controlled manipulation."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper's title and abstract claim results about 'LLM Alignment' broadly, but experiments use only two small prompter models (Phi-3.5-Instruct, Qwen-2.5-1.5B-Instruct) and two frozen models (the text names both as Llama-3.1-8B-Instruct, though Table 1 shows the second to be Qwen-2.5-7B-Instruct). Claims are not bounded to these specific models. There is no discussion of whether results generalize to larger models or different model families."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No alternative explanations for the results are discussed. For example, the improvement could be due to the meta-prompt adding detail/length rather than genuine alignment improvement, or the GPT-4 judge could be biased toward longer/more detailed responses. These confounds are not addressed."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper specifies model names with sufficient detail: 'Phi-3.5-Instruct', 'Qwen-2.5-1.5B-Instruct', and 'Llama-3.1-8B-Instruct' (Section 7.1), 'Qwen-2.5-7B-Instruct' (Table 1), and 'RM-Gemma-2B' for the reward model. While not API snapshot dates, these are specific versioned open-source models with fixed weights."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The meta-prompt used for the prompter is provided in Appendix C.1: 'Rephrase the given text in detail and precise so that it is fed to another language model. The given text is [PROMPT]'. The GPT-4 judge system prompt is also provided in Appendix C.2. The actual fill values are the dataset prompts, which are from public datasets."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 7.1 reports key hyperparameters: 'learning rate = 1.41e-5', 'temperature = 1.5, top P = 0.6 and top K = 20'. These are the main generation and training hyperparameters."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The prompt optimization pipeline is described: a prompter model rewrites the input prompt using a meta-prompt, then the rewritten prompt is fed to the frozen LLM (Figure 1, Section 7.1). The prompter is fine-tuned using PPO via the TRL library. The workflow is clear."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No data preprocessing steps are documented. The paper names three datasets (UltraFeedback, HelpSteer, Orca) but does not describe how data was sampled, filtered, or prepared for the experiments. It is unclear how many examples were used from each dataset or whether any filtering was applied."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 8 is titled 'Conclusion, Limitations and Future Work' and contains a dedicated 'Limitations and future work' subsection discussing the framework's inherent limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The limitations section mentions specific threats: 'Our framework is inherently limited by the capabilities of the frozen language model' and 'sensitivity of the prompt to the final response; a slight change in the prompt can lead to profound changes in the final responses.' These are specific to this study's approach."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what the results do NOT show. For instance, it does not bound its claims to the specific models and datasets tested, nor does it state that the approach has not been validated on larger models, safety-critical alignment tasks, or with human evaluators."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No raw experimental data (individual reward scores, model outputs, GPT-4 judge responses) is made available. Only aggregate statistics (mean rewards, win rates) are reported."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper names the three datasets but does not describe how specific examples were selected from them, how many examples were used in training vs. evaluation, or the data collection procedure for the experimental results."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants are involved. The data sources are standard public benchmarks."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The full data pipeline from dataset selection to final evaluation results is not documented. There is no description of how training data was split, how many training steps were run, or how the 100 evaluation samples for win-rate were selected."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: University of Central Florida, University of Maryland College Park, and Purdue University. All are academic institutions."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not mean the work is unfunded — it means the reader cannot evaluate potential conflicts."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper evaluates frozen LLMs (Llama-3.1-8B-Instruct, Qwen-2.5-7B-Instruct) on alignment datasets but does not state the training data cutoff dates for any of the models used."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No discussion of whether the evaluation datasets (UltraFeedback, HelpSteer, Orca) may have been included in the training data of the models used. These are publicly available datasets that could be in the training corpora."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The public datasets used (UltraFeedback, HelpSteer, Orca) were published before the training cutoffs of the models (Llama-3.1, Qwen-2.5), creating contamination risk. This is not discussed."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants are involved in this study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants are involved in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants are involved in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants are involved in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are involved in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are involved in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference cost, latency, or token consumption is reported. The approach requires running two models (prompter + frozen LLM) per query, adding the prompter's inference cost on top of the frozen model's, but this overhead is not quantified."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "The paper mentions using an H100 GPU (Section 7.1) but does not state total GPU hours, training time, or total computational budget for the experiments."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Align-Pro consistently outperforms the no-fine-tuning baseline in terms of mean reward across all datasets and model configurations.",
    293       "evidence": "Figure 2 shows bar charts of mean rewards for three datasets across four model configurations. Align-Pro bars are consistently higher than no-fine-tuning bars.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "Align-Pro has the lowest reward variance compared to both RLHF (oracle) and no-fine-tuning approaches.",
    298       "evidence": "Figure 3 shows reward variance comparisons. The paper states 'the variance in reward for Align-Pro is the lowest' (Section 7.2).",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Align-Pro significantly outperforms the no-fine-tuning approach in win rate across all model architectures and datasets.",
    303       "evidence": "Table 1 shows win rates for 100 samples. Align-Pro wins in 11 of 12 configurations, with one tie (46-46 on Orca with Phi-3.5/Qwen-2.5-7B). However, no statistical significance test is performed.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "The suboptimality gap between prompt optimization and RLHF fine-tuning is bounded and depends on the frozen model quality, prompter variation, and KL divergence between optimal and baseline prompters.",
    308       "evidence": "Theorem 6.1 (Section 6) provides a formal upper bound with three interpretable terms. Proof in Appendix B.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Prompt optimization can achieve performance comparable to fine-tuning when parameter modification is not feasible.",
    313       "evidence": "The theoretical bound (Theorem 6.1) and experimental results (Figures 2-3) show Align-Pro approaching RLHF performance. However, experimental evidence shows a gap remains — Align-Pro is closer to no-fine-tuning than to RLHF in some configurations.",
    314       "supported": "moderate"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "theoretical",
    319     "benchmark-eval"
    320   ],
    321   "key_findings": "The paper proposes Align-Pro, a theoretical framework for prompt optimization as an alternative to RLHF fine-tuning for LLM alignment. It derives a closed-form optimal prompter distribution and establishes suboptimality bounds showing how prompt optimization performance depends on the frozen model quality, prompter distribution variation, and KL divergence. Experimental validation on three datasets with multiple model configurations shows Align-Pro consistently outperforms no-fine-tuning baselines in mean reward and win rate, though the comparison lacks contemporary prompt optimization baselines and statistical rigor.",
    322   "red_flags": [
    323     {
    324       "flag": "No comparison with existing prompt optimization methods",
    325       "detail": "Remark 7.1 explicitly acknowledges the paper does not compare against other prompt optimization methods cited in the related work (BDPL, PRewrite, PromptAgent, APOHF). The only baselines are trivial (no optimization) and an upper bound (full RLHF). This makes it impossible to assess whether Align-Pro offers any advantage over existing methods."
    326     },
    327     {
    328       "flag": "No statistical significance tests despite 'significantly outperforms' claims",
    329       "detail": "The paper uses language like 'significantly outperforms' (Section 7.2) to describe results, but no statistical significance tests are performed. Win rates are based on 100 samples without confidence intervals."
    330     },
    331     {
    332       "flag": "GPT-4 as sole automated judge for win rates",
    333       "detail": "Win rates rely entirely on GPT-4 as judge with no human validation. The GPT-4 judge may be biased toward longer, more detailed responses — which is exactly what prompt rewriting tends to produce (as shown in the examples in Appendix C)."
    334     },
    335     {
    336       "flag": "Identical frozen models listed as different",
    337       "detail": "Section 7.1 lists both frozen models F1 and F2 as 'Llama-3.1-8B-Instruct', though Table 1 shows Qwen-2.5-7B-Instruct as a second frozen model. This inconsistency suggests a writing error and raises questions about the experimental rigor."
    338     },
    339     {
    340       "flag": "No error bars or uncertainty quantification",
    341       "detail": "None of the experimental results include error bars, confidence intervals, or multi-run statistics. It is unclear whether results are from a single run or averaged across multiple runs."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Training language models to follow instructions with human feedback",
    347       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    348       "year": 2022,
    349       "relevance": "Foundational RLHF paper for LLM alignment, directly relevant to the survey's coverage of alignment methods."
    350     },
    351     {
    352       "title": "Direct preference optimization: Your language model is secretly a reward model",
    353       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    354       "year": 2023,
    355       "arxiv_id": "2305.18290",
    356       "relevance": "Key alternative to RLHF for LLM alignment; directly relevant to methodology comparisons in the survey."
    357     },
    358     {
    359       "title": "AutoPrompt: Eliciting Knowledge from Language Models with Automatically Generated Prompts",
    360       "authors": ["Taylor Shin", "Yasaman Razeghi", "Robert L. Logan IV"],
    361       "year": 2020,
    362       "relevance": "Early prompt optimization method using gradient-based techniques, relevant to automated prompt engineering approaches."
    363     },
    364     {
    365       "title": "PRewrite: Prompt Rewriting with Reinforcement Learning",
    366       "authors": ["Weize Kong", "Spurthi Amba Hombaiah", "Mingyang Zhang"],
    367       "year": 2024,
    368       "arxiv_id": "2401.08189",
    369       "relevance": "RL-based prompt optimization method directly comparable to Align-Pro's approach."
    370     },
    371     {
    372       "title": "PromptAgent: Strategic planning with language models enables expert-level prompt optimization",
    373       "authors": ["Xinyuan Wang", "Chenxi Li", "Zhen Wang"],
    374       "year": 2023,
    375       "arxiv_id": "2310.16427",
    376       "relevance": "Planning-based prompt optimization approach, relevant to survey coverage of automated prompting methods."
    377     },
    378     {
    379       "title": "Prompt Optimization with Human Feedback",
    380       "authors": ["Xiaoqiang Lin", "Zhongxiang Dai", "Arun Verma"],
    381       "year": 2024,
    382       "arxiv_id": "2405.17346",
    383       "relevance": "Uses dueling bandits for prompt optimization with preference feedback, directly relevant to prompt optimization methodology."
    384     },
    385     {
    386       "title": "Curiosity-driven red-teaming for large language models",
    387       "authors": ["Zhi-Wei Hong", "Idan Shenfeld", "Tsun-Hsuan Wang"],
    388       "year": 2024,
    389       "arxiv_id": "2402.19464",
    390       "relevance": "Red-teaming approach using RL to train attacker models, sharing similar optimization formulation with prompt optimization for alignment."
    391     },
    392     {
    393       "title": "LIAR: Leveraging Alignment (Best-of-N) to Jailbreak LLMs in Seconds",
    394       "authors": ["James Beetham", "Souradip Chakraborty", "Mengdi Wang"],
    395       "year": 2024,
    396       "arxiv_id": "2412.05232",
    397       "relevance": "Uses alignment techniques for adversarial jailbreaking, relevant to AI safety and red-teaming coverage in the survey."
    398     },
    399     {
    400       "title": "Large Language Models Are Human-Level Prompt Engineers",
    401       "authors": ["Yongchao Zhou", "Andrei Ioan Muresanu", "Ziwen Han"],
    402       "year": 2023,
    403       "relevance": "Demonstrates LLMs as prompt optimizers, foundational work for automated prompt engineering."
    404     },
    405     {
    406       "title": "Safe RLHF: Safe reinforcement learning from human feedback",
    407       "authors": ["Josef Dai", "Xuehai Pan", "Ruiyang Sun"],
    408       "year": 2023,
    409       "arxiv_id": "2310.12773",
    410       "relevance": "Safety-aware RLHF method, relevant to the survey's coverage of safe AI alignment approaches."
    411     }
    412   ]
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs