scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20677B)
      1 {
      2   "paper": {
      3     "title": "Condor: Enhance LLM Alignment with Knowledge-Driven Data Synthesis and Refinement",
      4     "authors": ["Maosong Cao", "Taolin Zhang", "Mo Li", "Chuyu Zhang", "Yunxin Liu", "Haodong Duan", "Songyang Zhang", "Kai Chen"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2501.12273"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided: https://github.com/InternLM/Condor. Dataset also released on HuggingFace: https://hf.co/datasets/internlm/Condor-SFT-20K."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Dataset released on HuggingFace at https://hf.co/datasets/internlm/Condor-SFT-20K. However, only 20K samples are released, not the full 200K used in experiments."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification found. The paper mentions xTuner and OpenCompass as tools but does not provide environment setup details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the pipeline conceptually but does not provide commands or scripts to replicate experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims Condor 'significantly outperforms' other methods based solely on comparing point estimates. No statistical significance tests are reported."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context, e.g., 'Qwen2.5-7B shows an improvement more than 6% (56.9% to 63.3%)' and percentage improvements are given throughout."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 200K data points were chosen, or why 8 benchmarks were selected. No power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "All results appear to be single-run numbers. No standard deviation, variance, or multi-run results are reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 3 compares against Tulu v2, Evol Instruct, WildChat, Magpie, and the official Qwen2.5-7B-Instruct model."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include Magpie (2024), WildChat (2024), and official Qwen2.5-7B-Instruct, which are contemporary methods."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4.4 provides ablation studies on model type, model size, and task difficulty. Section 4.5 studies scaling of data amount, tags, and tasks."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Eight human-preference benchmarks and twelve knowledge-based benchmarks are used for evaluation."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "All evaluation is automated via LLM judges (GPT-4o, CompassJudger-1-32B). No actual human evaluation of outputs is conducted despite the paper claiming to improve 'human-preference' capabilities."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Evaluation is done on established public benchmarks (AlpacaEval, ArenaHard, WildBench, etc.) that are separate from the training data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Per-benchmark breakdowns in Tables 1-7, per-sub-capability analysis in Figure 6, and per-difficulty breakdowns in Table 6."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No failure cases or error analysis is presented. The paper does not discuss where Condor-generated data fails or produces poor results."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Some negative results are visible: Condor Void underperforms the official model on several benchmarks (e.g., FoFo, FollowBench in Table 1). The 7B self-iteration shows drops on some benchmarks (Table 7: CompassArena, FoFo)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'superior performance compared to counterparts' which is supported by Tables 1 and 3. Claims about self-improvement at various scales supported by Table 7."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims like 'Condor Refine improves performance' are supported by controlled ablations (Void vs Refine, different difficulty levels, scaling experiments) with single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Enhance LLM Alignment' broadly, but experiments are limited to Qwen, InternLM, and Llama model families on specific chat benchmarks. Claims of 'alignment' improvement are tested only via chat preference benchmarks, not safety or truthfulness alignment."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations. For instance, improvements could be due to the Qwen2.5-72B teacher model's quality rather than Condor's pipeline design. No confound analysis."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions are stated: Qwen2.5-7B, Qwen2.5-14B, Qwen2.5-32B, Qwen2.5-72B-Instruct, InternLM2.5-7B, LLaMA3.1-8B, GPT-4o-0806, CompassJudger-1-32B."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompt templates conceptually (Figure 2, task descriptions in Appendix) but does not provide the actual full prompt text used for data synthesis, critique generation, or refinement. Only template structures with placeholders are shown."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Learning rate (2e-5) and epochs (3) are stated for SFT training, but inference hyperparameters for data synthesis (temperature, top-p, max tokens for the generation model) are not reported. Only 'greedy inference' is stated for evaluation."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Condor is a data synthesis pipeline, not an agent system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper describes the pipeline stages but does not document filtering or quality control steps. It is unclear whether, or how, any generated data was filtered or cleaned before training."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 'Limitations' is present, discussing unexplored multi-round iteration, diversity improvements, and hallucination risks."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations section is generic: 'many experiments that require further exploration' and 'hallucinations produced by LLMs in synthetic data could also become a potential risk.' No specific threats to the validity of the reported results."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do NOT show. The paper does not bound claims to specific model families, languages, or domains tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A 20K subset of the Condor dataset is released on HuggingFace, allowing partial verification. The full 200K dataset used in experiments is not released."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes the World Knowledge Tree construction, tag expansion from platforms like Zhihu and Reddit, Q&A pair generation with task diversity and difficulty control, and the refinement process."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is synthetically generated by LLMs."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline stages are described (World Knowledge Tree → Q&A generation → Refinement), but intermediate filtering steps, rejection rates, and how many examples were discarded at each stage are not documented. The paper says ~200K pairs were generated but doesn't explain if any were filtered out."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Shanghai AI Laboratory and Tsinghua University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Authors are from Shanghai AI Laboratory which develops InternLM models. They evaluate InternLM models and use their own CompassJudger for evaluation, creating a potential conflict. No independence statement."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates stated for any of the models used (Qwen2.5, InternLM2.5, LLaMA3.1). Benchmarks like HumanEval (2021) could be in training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the evaluation benchmarks overlap with the base models' pre-training data or with the synthetically generated Condor data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Many benchmarks used (HumanEval 2021, GSM8K 2021, MATH 2021, BoolQ 2019) were published well before the models' likely training cutoffs. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, API cost, or wall-clock time reported for the data synthesis pipeline or training. The paper claims efficiency but provides no cost numbers."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No GPU hours, hardware specifications, or total computational budget stated for data synthesis or model training."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "A base model fine-tuned on only 20K Condor-generated samples achieves superior performance compared to counterparts on human-preference benchmarks.",
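    286       "evidence": "Table 3 shows Condor 72B (200K data from Qwen2.5-72B) achieves 60.03 average vs 58.02 for official Qwen2.5-7B-Instruct. Table 1 shows per-benchmark results. Note that the Table 3 figure cited here is for the 200K-sample setting, whereas the claim refers to 20K samples.",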
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Condor Refine enables iterative self-improvement for LLMs at various scales (up to 72B).",
    291       "evidence": "Table 7 shows 72B model improves from 70.14 (official IT) to 71.12 (Condor) on average across 8 benchmarks judged by GPT-4o. However, improvements are small and inconsistent across benchmarks.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Condor significantly outperforms other data synthesis methods.",
    296       "evidence": "Table 3 shows Condor 72B at 60.03 vs Magpie at 55.67, Evol Instruct at 34.33, etc. However, no significance tests are reported, results are single-run, and Magpie uses 1M samples vs Condor's 200K.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "SFT has minimal impact on the model's intrinsic knowledge capabilities.",
    301       "evidence": "Table 2 shows knowledge benchmark scores remain similar across Void, Refine, and Official models (average ~70 for all variants).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Condor Refine shows strong adaptability across different model types (Qwen, InternLM, Llama).",
    306       "evidence": "Table 4 shows improvements across all three model families, with InternLM and Llama showing ~10% improvement.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "Condor is a two-stage synthetic data pipeline (World Knowledge Tree for diverse question generation + Self-Reflection Refinement for response quality improvement) that produces SFT data improving LLM chat performance. Models fine-tuned on 200K Condor-generated samples outperform official RLHF models on human-preference benchmarks when judged by GPT-4o and CompassJudger. The approach works across model families (Qwen, InternLM, Llama) and scales (7B to 72B), with larger models showing greater benefit. Knowledge-based QA performance remains largely unaffected by the SFT process.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance or significance testing",
    315       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or significance tests, yet the paper makes claims of 'significant' improvement."
    316     },
    317     {
    318       "flag": "Partial self-evaluation conflict",
    319       "detail": "Authors from Shanghai AI Laboratory evaluate InternLM models (their own) and use their own CompassJudger-1-32B as one of two judge models, creating a potential circular evaluation."
    320     },
    321     {
    322       "flag": "LLM-as-judge without human validation",
    323       "detail": "The paper claims to improve 'human-preference' capabilities but never validates with actual humans. All evaluation relies on LLM judges (GPT-4o, CompassJudger), which may have systematic biases toward certain response styles."
    324     },
    325     {
    326       "flag": "Incomplete data release",
    327       "detail": "Only 20K of the 200K samples used in experiments are released, limiting reproducibility of the main results."
    328     },
    329     {
    330       "flag": "No contamination analysis",
    331       "detail": "Multiple evaluation benchmarks (HumanEval, GSM8K, MATH, BoolQ) predate the models used, and the synthetic data generation process could inadvertently reproduce benchmark content. No contamination analysis is performed."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
    337       "authors": ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra"],
    338       "year": 2023,
    339       "arxiv_id": "2212.10560",
    340       "relevance": "Foundational work on synthetic instruction data generation for LLM alignment, directly compared in this paper."
    341     },
    342     {
    343       "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions",
    344       "authors": ["Can Xu", "Qingfeng Sun", "Kai Zheng"],
    345       "year": 2023,
    346       "arxiv_id": "2304.12244",
    347       "relevance": "Evol-Instruct method for evolving instruction data, used as a baseline in Table 3."
    348     },
    349     {
    350       "title": "Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing",
    351       "authors": ["Zhangchen Xu", "Fengqing Jiang", "Luyao Niu"],
    352       "year": 2024,
    353       "arxiv_id": "2406.08464",
    354       "relevance": "Key baseline for synthetic data generation that leverages chat templates; compared in Table 3 and distribution analysis."
    355     },
    356     {
    357       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    358       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    359       "year": 2023,
    360       "arxiv_id": "2303.17651",
    361       "relevance": "Self-refinement approach for LLM outputs, related to Condor's refinement stage."
    362     },
    363     {
    364       "title": "Reinforced Self-Training (ReST) for Language Modeling",
    365       "authors": ["Caglar Gulcehre", "Tom Le Paine", "Srivatsan Srinivasan"],
    366       "year": 2023,
    367       "arxiv_id": "2308.08998",
    368       "relevance": "Self-training approach using reward models for LLM improvement, related to self-iteration experiments."
    369     },
    370     {
    371       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    372       "authors": ["Noah Shinn", "Federico Cassano", "Edward Berman"],
    373       "year": 2023,
    374       "arxiv_id": "2303.11366",
    375       "relevance": "Self-reflection mechanism for language agents, related to Condor's self-reflection refinement approach."
    376     },
    377     {
    378       "title": "A Survey on Self-Evolution of Large Language Models",
    379       "authors": ["Zhengwei Tao", "Ting-En Lin", "Xiancai Chen"],
    380       "year": 2024,
    381       "arxiv_id": "2404.14387",
    382       "relevance": "Survey categorizing LLM self-iteration methods, providing context for Condor's self-improvement claims."
    383     },
    384     {
    385       "title": "Synthesizing Post-Training Data for LLMs through Multi-Agent Simulation",
    386       "authors": ["Shuo Tang", "Xianghe Pang", "Zexi Liu"],
    387       "year": 2024,
    388       "arxiv_id": "2410.14251",
    389       "relevance": "Multi-agent approach to synthetic data generation (MATRIX-Gen), contrasted with Condor's single-model approach."
    390     },
    391     {
    392       "title": "Large Language Models Can Self-Improve",
    393       "authors": ["Jiaxin Huang", "Shixiang Shane Gu", "Le Hou"],
    394       "year": 2022,
    395       "arxiv_id": "2210.11610",
    396       "relevance": "Early work on LLM self-improvement without labeled data, foundational to Condor's self-iteration experiments."
    397     },
    398     {
    399       "title": "Evaluating Large Language Models Trained on Code",
    400       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    401       "year": 2021,
    402       "arxiv_id": "2107.03374",
    403       "relevance": "HumanEval benchmark used for code evaluation in this paper's knowledge benchmark suite."
    404     }
    405   ]
    406 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs