scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29740B)
      1 {
      2   "paper": {
      3     "title": "RESCUE: Ranking LLM Responses with Partial Ordering to Improve Response Generation",
      4     "authors": [
      5       "Yikun Wang",
      6       "Rui Zheng",
      7       "Haoming Li",
      8       "Qi Zhang",
      9       "Tao Gui",
     10       "Fei Liu"
     11     ],
     12     "year": 2023,
     13     "venue": "Annual Meeting of the Association for Computational Linguistics (Student Research Workshop)",
     14     "arxiv_id": "2311.09136",
     15     "doi": "10.18653/v1/2024.acl-srw.32"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "RESCUE trains LLMs to prioritize high-quality responses using a partial ordering ranking metric combined with supervised fine-tuning. On e-SNLI, partial ordering strategies (label prioritization and human-label hybrid) outperform both full ordering and SFT alone, especially with limited training data (0.4%–3.6% of full set). On Multi-doc QA, label-prioritized ranking helps the model better identify relevant information across document positions. Human evaluation shows a 47% win rate for partial ordering over SFT on explanation quality.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states in footnote 1: 'Our code and models are available at: https://github.com/ekonwang/RRescue.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available datasets: e-SNLI (Camburu et al., 2018) and NaturalQuestions-Open (Kwiatkowski et al., 2019). Both are standard public benchmarks."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions 4xA100 GPUs, BF16 mixed precision, and fully sharded data parallelism, but does not provide a requirements.txt, Dockerfile, or detailed software dependency list with library versions."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "While code is released and training hyperparameters are described in §6, the paper itself does not contain step-by-step reproduction instructions. A reader would need to consult the GitHub repository."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 1 and 2 report only point estimates (e.g., accuracy percentages) with no confidence intervals, error bars, or ± notation."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims partial ordering 'outperforms' SFT and full ordering methods based solely on comparing accuracy numbers. No statistical significance tests (p-values, t-tests, etc.) are reported."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Results are reported as accuracy percentages with baseline context (e.g., Table 1 shows SFT at 77.45% vs PO Human-Label Hybrid at 82.86% at 0.4% data). Human evaluation reports win/tie/lose percentages (47%/18%/35%)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for why subsets of {2k, 5k, 10k, 20k} were chosen, why 100 samples were used for human evaluation, or why 1k training examples were used for Multi-doc QA."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "All results appear to be from single experimental runs. No standard deviations, variance across seeds, or spread measures are reported in any table or figure."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper compares against multiple baselines: SFT, FO Similarity, FO GPT-3.5-Turbo, and the base Llama-2-7b model (Table 1). For Multi-doc QA, comparison is against base Llama-2-7b (Table 2)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Llama-2-7b was a contemporary model at submission time (2023). The paper also compares with Hsieh et al. (2023) step-by-step distilling results and mentions other contemporary models (Falcon, Mistral, Vicuna, MPT)."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Multiple ablations are conducted: PO vs FO ordering strategies (Table 1), effect of α balancing coefficient (Figure 4 left), number of candidate responses (Figure 4 right), response flipping (Table 1 right columns), and length scaling factor λ (§7)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper uses automatic accuracy on e-SNLI, answer accuracy on Multi-doc QA, and human evaluation with win/tie/lose ratings on explanation quality (Figure 3)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 6.2 describes human evaluation: 'An annotator evaluated responses for 100 randomly selected samples from the e-SNLI test set, using win, tie and lose to rate each response pair.' Results in Figure 3."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are reported on the standard e-SNLI test set of 9,824 examples (§6.1) and a Multi-doc QA test set of 665 examples (§6.3), separate from training subsets."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 1 breaks down results by training data proportion (0.4%–3.6%). Table 2 breaks down by position of gold document (1st, 3rd/5th, 5th/10th). Figure 6 shows per-label confusion matrix."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper discusses central tendency bias in LLM predictions (§7, Figure 6), margin violations in training (§7), and model copying behavior in Multi-doc QA (§6.3)."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Response flipping does not consistently improve results beyond 2k training examples (Table 1). Performance saturates beyond 3-4 candidate responses (Figure 4 right). Full ordering can hurt at higher data proportions (Table 1, FO Similarity drops from 86.69 to 86.38)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims partial ordering is 'more robust, less sensitive to noise' and results in 'improved response generation.' Tables 1–2 and Figure 3 support improved accuracy and response quality. The abstract appropriately hedges with 'offers a promising avenue.'"
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims like 'training an LLM to rank responses can improve response generation' are supported by controlled ablation studies comparing PO, FO, and SFT variants (Table 1, Figures 3–4), holding other factors constant."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title and abstract frame the work broadly ('Ranking LLM Responses,' 'enhancing the response generation and task accuracy of LLMs') but experiments use only Llama-2-7b on two tasks (e-SNLI and Multi-doc QA). The authors acknowledge leaving 'extension to other models such as Llama-3 for future work' in a footnote but the framing is broader than the evidence."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper mentions SFT may rely on 'data artifacts for predictions (Gururangan et al., 2018)' but does not substantively discuss alternative explanations for why partial ordering improves over full ordering or SFT, such as data augmentation effects, regularization from multiple responses, or selection bias in the human evaluation."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures accuracy on e-SNLI and answer accuracy on Multi-doc QA, and claims improvement in 'task accuracy' and 'response generation.' These measurements align with the claims without proxy gaps. Human evaluation directly assesses explanation quality."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model versions are provided: 'Llama-2-7b,' 'GPT-3.5-Turbo-0613,' and 'GPT-4-0613' (§5.1, §6). These include API snapshot dates or size specifications."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Only the response flipping prompt is provided ('Rewrite the sentence to convey the opposite meaning: {Explanation}'). The main prompts used to elicit responses from Llama-2-7b, GPT-3.5-Turbo for response generation, and GPT-3.5-Turbo for full ordering are not provided."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 reports: AdamW with lr=2e-5, cosine scheduler with 0.03 warmup, BF16 mixed precision, batch size B=64 (g=16, b=1, D=4), single epoch training, temperature 0.8 for Llama-2-7b sampling, λ=0.85 length scaling."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. RESCUE is a fine-tuning method combining SFT with a ranking loss."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 describes candidate response collection: 3 responses from Llama-2-7b at temperature 0.8, 1 from GPT-3.5-Turbo, 1 human explanation per e-SNLI prompt. For Multi-doc QA, 5 candidates per question (1 from gold passage, 4 from distractors). Response flipping process is described in §5.1."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing domain-specific criteria needs, GPU resource constraints, and candidate response diversity."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The Limitations section discusses generic future directions ('organizing candidate responses can benefit from domain-specific criteria,' 'with additional GPU resources, we can improve the variety') rather than specific threats to the validity of the current results."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of which tasks, model sizes, or settings the approach may not generalize to, beyond a footnote about leaving Llama-3 extension to future work."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The underlying datasets (e-SNLI, NaturalQuestions-Open) are publicly available. Code and models are released at https://github.com/ekonwang/RRescue, enabling regeneration of candidate responses."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5 details candidate response collection: sources (Llama-2-7b, GPT-3.5-Turbo, human annotations), sampling parameters (temperature 0.8), number of responses per instance (5), and the response flipping procedure."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The human evaluator in §6.2 is described only as 'an annotator' with no information about who they are, their qualifications, how they were selected, or potential biases."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline from public datasets through candidate response generation, ordering (PO/FO strategies), to combined SFT+ranking training is described across §3–5. Training subsets are specified ({2k, 5k, 10k, 20k})."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgements state: 'HL and FL are supported in part by National Science Foundation grant IIS-2303678.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are listed: Fudan University and Emory University. They are not evaluating their own commercial product."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "NSF is an independent government funding agency with no financial stake in the experimental outcomes."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not state training data cutoff dates for Llama-2-7b, GPT-3.5-Turbo-0613, or GPT-4-0613, despite fine-tuning and evaluating Llama-2-7b on e-SNLI."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether e-SNLI test examples (publicly available since 2018) could appear in Llama-2's pre-training data. The base Llama-2-7b scores 33.31% on this 3-class NLI task, which is essentially chance level (about 33.3%) and so neither indicates nor rules out prior exposure."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "e-SNLI was published in 2018, well before Llama-2's training. NaturalQuestions was published in 2019. No discussion of contamination risk for either benchmark."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "The paper has no human subjects study. The single annotator in §6.2 evaluates model outputs but is not a research participant."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human subjects study requiring IRB approval. The evaluation annotator is not a research participant."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human subjects study. The single evaluator annotator is not described, but this is an output evaluation task, not a human subjects study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human subjects study with participant recruitment."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No experimental study with human participants requiring randomization."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human subjects study requiring blinding."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human subjects study with participant attrition."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, latency, or API costs are reported. The method uses GPT-3.5-Turbo and GPT-4 for candidate generation and ordering but the associated costs are not mentioned."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper mentions 4xA100 GPUs and single-epoch training but does not report total GPU hours, training wall-clock time, or API costs for GPT-3.5-Turbo/GPT-4 usage."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from single experimental runs."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they represent single runs or averages."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "While Figure 4 shows results for different α values (0, 0.01, 0.02, 0.05, 0.1, 0.2, 1), no search budget is reported (total configurations tried, compute spent on search)."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "For human evaluation, §6.2 states 'the highest performing model across all data proportions was chosen' — this is selection based on test set performance, not a held-out validation set."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many pairwise comparisons across ordering strategies and data proportions (Table 1, Figure 3) without any statistical testing, let alone multiple comparison correction."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement all baselines (SFT, FO Similarity, FO GPT-3.5-Turbo) and their own PO methods without acknowledging the bias of evaluating their own system against their own baseline implementations."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "FO methods require GPT-3.5-Turbo for ranking (additional API cost) while PO methods use heuristic orderings, but performance is not compared at matched compute budgets."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether e-SNLI accuracy is a valid measure of explanation quality or whether NaturalQuestions answer accuracy captures the claimed 'response generation' improvement."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved. RESCUE is a fine-tuning method, not an agentic system."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "e-SNLI (2018) and NaturalQuestions (2019) both predate Llama-2's training. No discussion of whether the model may have seen test examples or solutions during pre-training."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. For Multi-doc QA, training uses labels to determine correct/incorrect responses, but no analysis of whether this creates feature leakage at test time."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence between train and test splits beyond using the standard e-SNLI train/test split."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention methods (canary strings, membership inference, n-gram overlap analysis) are applied."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Partial ordering strategies outperform both full ordering and SFT for response ranking, especially with limited training data.",
    372       "evidence": "Table 1 shows PO Label Prioritization (86.34% avg) and PO Human-Label Hybrid (86.35% avg) exceed FO Similarity (85.15%), FO GPT-3.5-Turbo (85.14%), and SFT (84.57%) on e-SNLI. The gap is largest at 0.4% training data (82.86% vs 77.45% for SFT).",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "RESCUE with label prioritization achieves a 47% win rate over SFT in human evaluation of explanation quality.",
    377       "evidence": "Figure 3 (§6.2) shows 47% win, 18% tie, 35% lose based on a single annotator evaluating 100 randomly selected samples from the e-SNLI test set.",
    378       "supported": "weak"
    379     },
    380     {
    381       "claim": "Ranking responses allows the LLM to more effectively identify relevant information in long contexts, improving the U-shaped performance curve.",
    382       "evidence": "Table 2 shows PO Label Prioritization improves average accuracy from 40.96% to 46.92% (5 docs) and 38.84% to 41.42% (10 docs), with large gains in middle positions (34.19%→42.44% at 3rd position).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Optimal performance is achieved with α between 0.01 and 0.1 for balancing SFT and ranking metrics.",
    387       "evidence": "Figure 4 (left) shows accuracy across α values, with peak performance in the 0.01–0.1 range on e-SNLI. Single experimental configuration, no error bars.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Performance improvement can be achieved with 3-4 candidate responses, with saturation beyond that.",
    392       "evidence": "Figure 4 (right) shows performance by number of candidate responses, with PO performance plateauing at 3-4 candidates.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Response flipping increases answer variety but does not consistently improve performance.",
    397       "evidence": "Table 1 shows flipping helps only at 0.4% data (↑5.15 for PO Label Prioritization) but can hurt at 0.9% (↓0.26 for FO Similarity, ↓1.30 for PO Human Prioritization). §6.1 notes it 'might cause a shift in the distribution of ranked responses.'",
    398       "supported": "strong"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Single annotator for human evaluation",
    404       "detail": "Human evaluation (§6.2) relies on a single unnamed annotator evaluating 100 samples with no inter-annotator agreement reported. This makes the human evaluation results (Figure 3) unreliable — a second annotator might produce substantially different ratings."
    405     },
    406     {
    407       "flag": "No error bars or variance on any result",
    408       "detail": "All results in Tables 1–2 and Figure 4 are point estimates from apparently single experimental runs. Without variance information, it is impossible to assess whether the differences between methods are meaningful or within noise."
    409     },
    410     {
    411       "flag": "Test-set model selection for human evaluation",
    412       "detail": "Section 6.2 states 'the highest performing model across all data proportions was chosen for human evaluation,' meaning model selection was done on test set performance rather than a validation set, biasing the human evaluation comparison."
    413     },
    414     {
    415       "flag": "Benchmark contamination unaddressed",
    416       "detail": "e-SNLI (2018) and NaturalQuestions (2019) were both publicly available well before Llama-2's training period. The base Llama-2-7b achieves 33.31% on 3-class NLI, which is essentially at chance (33.3%) and so provides no evidence for or against exposure, yet no contamination analysis is performed."
    417     },
    418     {
    419       "flag": "No statistical significance testing",
    420       "detail": "Claims of one method outperforming another (e.g., PO vs FO, PO vs SFT) are based solely on comparing point estimates. Without significance tests, it is unclear whether observed differences are statistically meaningful, especially given small margins (e.g., 86.34% vs 85.15%)."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Direct preference optimization: Your language model is secretly a reward model",
    426       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Stefano Ermon", "Christopher D. Manning", "Chelsea Finn"],
    427       "year": 2023,
    428       "relevance": "Core method for aligning LLMs with human preferences without a reward model, directly compared against in this work's positioning."
    429     },
    430     {
    431       "title": "Training language models to follow instructions with human feedback",
    432       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    433       "year": 2022,
    434       "relevance": "InstructGPT/RLHF foundational work on aligning LLMs using human preference rankings."
    435     },
    436     {
    437       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    438       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    439       "year": 2023,
    440       "relevance": "Foundational prompting technique for LLM reasoning that RESCUE builds upon for explanation generation."
    441     },
    442     {
    443       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    444       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    445       "year": 2023,
    446       "arxiv_id": "2305.10601",
    447       "relevance": "Alternative reasoning approach for LLMs using tree-structured exploration, discussed as related work."
    448     },
    449     {
    450       "title": "Reflexion: Language agents with verbal reinforcement learning",
    451       "authors": ["Noah Shinn", "Federico Cassano", "Beck Labash"],
    452       "year": 2023,
    453       "relevance": "Self-reflection approach for LLM agents that represents an alternative to ranking-based improvement."
    454     },
    455     {
    456       "title": "Llama 2: Open foundation and fine-tuned chat models",
    457       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    458       "year": 2023,
    459       "arxiv_id": "2307.09288",
    460       "relevance": "Base model used in RESCUE experiments; discussion of labeler disagreement in preference data informs partial ordering motivation."
    461     },
    462     {
    463       "title": "Fine-tuning language models from human preferences",
    464       "authors": ["Daniel M. Ziegler", "Nisan Stiennon", "Jeffrey Wu"],
    465       "year": 2020,
    466       "relevance": "Foundational work on RLHF for language model fine-tuning, establishing the reward model training approach that RESCUE aims to simplify."
    467     },
    468     {
    469       "title": "BRIO: Bringing order to abstractive summarization",
    470       "authors": ["Yixin Liu", "Pengfei Liu", "Dragomir Radev", "Graham Neubig"],
    471       "year": 2022,
    472       "relevance": "Prior work on ranking-based training for text generation that directly inspires RESCUE's approach."
    473     },
    474     {
    475       "title": "RRHF: Rank responses to align language models with human feedback without tears",
    476       "authors": ["Zheng Yuan", "Hongyi Yuan", "Chuanqi Tan"],
    477       "year": 2023,
    478       "relevance": "Concurrent work on ranking-based LLM alignment without reinforcement learning, closely related to RESCUE's ranking metric."
    479     },
    480     {
    481       "title": "Distilling step-by-step! Outperforming larger language models with less training data and smaller model sizes",
    482       "authors": ["Cheng-Yu Hsieh", "Chun-Liang Li", "Chih-Kuan Yeh"],
    483       "year": 2023,
    484       "relevance": "Comparison point for state-of-the-art NLI performance (89.51% with 540B model), showing RESCUE achieves competitive results with 7B model."
    485     },
    486     {
    487       "title": "Self-RAG: Learning to retrieve, generate, and critique through self-reflection",
    488       "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang"],
    489       "year": 2024,
    490       "relevance": "Related approach for improving LLM generation through self-critique in retrieval-augmented settings."
    491     },
    492     {
    493       "title": "Lost in the middle: How language models use long contexts",
    494       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"],
    495       "year": 2023,
    496       "relevance": "Identifies the U-shaped performance curve in long-context LLMs that RESCUE's Multi-doc QA experiments directly address."
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 2,
    502       "justification": "The partial ordering approach offers a practical method for fine-tuning LLMs with limited annotated data, applicable to domain-specific tasks."
    503     },
    504     "surprise_contrarian": {
    505       "score": 1,
    506       "justification": "Partial ordering outperforming full ordering is a moderately interesting finding but not deeply contrarian."
    507     },
    508     "fear_safety": {
    509       "score": 0,
    510       "justification": "No safety or security concerns raised; the paper focuses on improving response quality."
    511     },
    512     "drama_conflict": {
    513       "score": 0,
    514       "justification": "No controversy or conflict; a straightforward methodological contribution."
    515     },
    516     "demo_ability": {
    517       "score": 2,
    518       "justification": "Code and models are released on GitHub (https://github.com/ekonwang/RRescue), enabling reproduction."
    519     },
    520     "brand_recognition": {
    521       "score": 1,
    522       "justification": "From Fudan University and Emory University; uses well-known models (Llama-2, GPT-3.5/4) but is not from a major AI lab."
    523     }
    524   }
    525 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs