scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25239B)
      1 {
      2   "paper": {
      3     "title": "CHORUS: Zero-shot Hierarchical Retrieval and Orchestration for Generating Linear Programming Code",
      4     "authors": ["Tasnim Ahmed", "Salimur Choudhury"],
      5     "year": 2025,
      6     "venue": "LION 19 (19th Learning and Intelligent Optimization Conference)",
      7     "arxiv_id": "2505.01485"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive link is provided in the paper. The paper describes the framework but does not release the source code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The authors curated the NL4Opt-Code dataset (extending NL4Opt with Gurobi code annotations) but do not provide a download link or release it. The original NL4Opt dataset is publicly available, but the code annotations they created are not released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions hardware (Intel Xeon Platinum 8358, NVIDIA H100 NVL GPU, 251 GB RAM) in Section 3.3, but provides no software environment details such as library versions, requirements.txt, or dependency specifications."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a conceptual level but there are no scripts, commands, or README-style instructions to replicate the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 1 and 2 report only point estimates (e.g., accuracy 0.6125) with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CHORUS 'improves performance by a significant margin' and 'outperforms' baselines, but no statistical significance tests (p-values, t-tests, etc.) are provided. Comparisons are based solely on comparing point estimates."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context, e.g., 'accuracy of Llama3.3 (70B) increases from 0.2289 to 0.5675' (Section 3.4) and '386.95% accuracy improvement' (Section 3.5). Absolute before/after numbers are consistently provided."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper does not state the exact number of test instances in the NL4Opt-Code benchmark, nor justify the sample size. No power analysis or justification for the dataset size is provided."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or multi-run results are reported. All results appear to be single-run numbers with no indication of result stability across multiple executions."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 includes baseline comparisons: the same LLMs without CHORUS, plus GPT-3.5 and GPT-4. Table 2 includes Traditional RAG as another baseline."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include GPT-3.5, GPT-4, Llama3.3, Phi4, Deepseek-r1, and Qwen2.5-coder, which are all contemporary models. The paper also discusses OptiMUS, OptLLM, and E-OPT as related work, though direct numerical comparison with those frameworks is not provided."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 2 presents a comprehensive ablation study with 5 configurations: Baseline, Baseline + Expert Prompt, Traditional RAG, CHORUS without reasoning, and full CHORUS. This isolates the contributions of expert prompting, retrieval strategy, and structured reasoning."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 reports four metrics: Accuracy, Syntactic Validity, Semantic Similarity, and Edit Distance, each defined with formulas in Section 3.2."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the generated code is included. All evaluation is automated (accuracy based on matching objective function values, syntactic parsing, embedding similarity, edit distance). Given that the paper claims code quality improvements, human review of code quality would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper does not explicitly describe a train/dev/test split. The NL4Opt dataset has development and test sets, but the paper does not clearly state which split was used for evaluation. Section 3.2 mentions the NL4Opt competition's structure but does not specify a held-out test protocol."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The NL4Opt dataset spans six domains (sales, advertising, investment, production, transportation, sciences) but no per-domain breakdown of results is provided. Only aggregate accuracy across all problems is reported."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses failure modes: smaller models (Llama3.1 8B) suffer accuracy drops with expert prompting due to limited context windows (Section 3.5), and Traditional RAG causes 'fragmented or incomplete contents' that confuse models. The conclusion also acknowledges 'Code generation remains highly sensitive to prompt engineering.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that expert prompting reduces Llama3.1 (8B) accuracy by 61.3% in relative terms (from 0.0796 to 0.0308), and that Traditional RAG consistently hurts performance compared to CHORUS. Edit distance is acknowledged as 'not a particularly reliable metric' that shows no improvement."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims CHORUS 'improves the performance of open-source LLMs' by a 'significant margin' and allows them to 'outperform or match GPT3.5 and GPT4.' Tables 1 and 2 support these claims: Phi4 with CHORUS achieves 0.6125 accuracy vs GPT-4's 0.6367, and Llama3.3 with CHORUS achieves 0.5675 vs GPT-3.5's 0.5260."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about component contributions (e.g., expert prompting, hierarchical chunking, structured reasoning). The ablation study in Table 2 systematically removes components to measure their individual impact, which is an adequate controlled single-variable manipulation design for these claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Generating Linear Programming Code' broadly, but the experiments only test on one dataset (NL4Opt-Code) with one solver (Gurobi) in one language (Python). The abstract claims the framework is 'adaptable to any mathematical problem, given an available solver' but this is not tested. The conclusion acknowledges 'alignment of LLMs with other optimization topics...is left for future research' but the title and abstract do not bound the claims to the tested setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether the gains come simply from providing more context tokens rather than from the specific hierarchical structure, or whether prompt engineering alone (without RAG) could match performance for larger models."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are specified by family and size (e.g., 'Llama3.1 (8B)', 'Phi4 (14B)', 'GPT3.5', 'GPT4') but no specific version strings, snapshot dates, or API versions are provided. For GPT-3.5 and GPT-4, no model ID (e.g., 'gpt-4-0613') is given."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompts in natural language ('The system prompt directs the language model to assume the role of an expert...whereas the user prompt outlines strict requirements such as the function name, error handling strategy...' Section 2.4) but the actual prompt text is not provided. The output schema class definition (GurobiSolution) is shown but not the full system/user prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper mentions 'configuring the sampling temperature to minimize stochasticity' (Section 2.4) but does not report the actual temperature value or any other hyperparameters (top-p, max tokens, top-k retrieval count, embedding model details). The max chunk size of 400 tokens is mentioned but core generation hyperparameters are absent."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The CHORUS pipeline is described in detail in Section 2: hierarchical tree indexing (Section 2.1), two-stage retrieval with keyword extraction (Section 2.2), cross-encoder reranking selecting top-3 conceptual docs and top-2 code examples (Section 2.3), and structured output parsing (Section 2.4). Figure 1 provides a visual overview."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2.1 describes how Gurobi documentation is processed into hierarchical tree chunks and metadata-augmented code examples. Section 3.2 describes the dataset annotation process in two stages (LP formulation derivation, then Gurobi code generation) with expert verification."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The conclusion (Section 4) contains a substantial discussion of limitations: 'code generation remains highly sensitive to prompt engineering, and smaller models struggle to fully incorporate all contextual elements within limited context windows. Additionally, the alignment of LLMs with other optimization topics (e.g., integer linear, mixed, or non-linear problems) is left for future research.'"
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations mentioned are specific to some extent (smaller models, prompt sensitivity, scope limited to LP) but there is no dedicated threats-to-validity section and the discussion does not address key threats like single-dataset evaluation, single-run results, potential overfitting of prompts to the specific benchmark, or annotator bias in ground truth creation."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While the conclusion mentions that other optimization types are 'left for future research,' the paper does not explicitly state what the results do NOT show. It does not bound claims to the specific dataset, solver, or language tested, and the abstract's claim of adaptability to 'any mathematical problem' goes well beyond what was demonstrated."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Neither the NL4Opt-Code dataset (their curated extension) nor the raw experimental outputs are made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.2 describes the dataset annotation process: a domain expert (graduate student with optimization coursework) derived LP formulations from NL4Opt problem descriptions, verified by another expert, then generated corresponding Gurobi code."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited for the study. The annotators were domain experts involved in dataset creation, not study participants."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from NL4Opt dataset to NL4Opt-Code is documented in Section 3.2: original problem descriptions → LP formulation derivation by annotator → verification by second expert → Gurobi code generation. The documentation processing pipeline is described in Section 2.1."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding sources."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: both authors are from the School of Computing, Queen's University, Ontario, Canada. No evaluated product is affiliated with their institution."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure statement means this criterion is not satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates multiple LLMs (Llama3.1, Llama3.3, Phi4, Deepseek-r1, Qwen2.5-coder, GPT-3.5, GPT-4) on the NL4Opt-Code benchmark but does not state any training data cutoff dates for these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The NL4Opt dataset was published in 2022 (NeurIPS competition). The open-source models evaluated were released after 2022 and could have seen this data; the GPT-3.5 and GPT-4 snapshots are unspecified, so their exposure cannot be determined. No discussion of potential train/test overlap is provided."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The NL4Opt dataset has been publicly available since 2022, likely before the training cutoff of most tested models (the paper states no cutoffs or model snapshots, so this cannot be confirmed for every model). While the authors created new code annotations (NL4Opt-Code), the problem descriptions that the models must understand are from the original NL4Opt dataset, which could be in training data. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper claims CHORUS enables open-source LLMs to match GPT-4 'while requiring far fewer computational resources' (abstract) but does not report actual inference costs, API costs, tokens consumed, or wall-clock time per problem."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is specified (H100 NVL GPU, Section 3.3) but total compute budget (GPU hours, total inference time, or total cost) is not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CHORUS improves accuracy of open-source LLMs on LP code generation by a significant margin compared to baseline and conventional RAG.",
    286       "evidence": "Table 1 shows accuracy improvements: Llama3.3 from 0.2289 to 0.5675, Phi4 from 0.1938 to 0.6125, Deepseek-r1 from 0.1073 to 0.5848, Qwen2.5-coder from 0.4644 to 0.5986 (Section 3.4).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "CHORUS allows open-source LLMs to outperform or match GPT-3.5 and GPT-4 performance.",
    291       "evidence": "Table 1: Phi4+CHORUS achieves 0.6125 accuracy vs GPT-4's 0.6367, and Llama3.3+CHORUS achieves 0.5675 vs GPT-3.5's 0.5260 (Section 3.4). However, no model with CHORUS actually exceeds GPT-4's accuracy.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Traditional RAG underperforms CHORUS by 46.14-89.33%.",
    296       "evidence": "Table 2 shows Traditional RAG accuracy vs CHORUS accuracy across all 5 LLMs. The percentage reduction figures are stated in Section 3.5.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Expert prompting causes 386.95% accuracy improvement for Deepseek-r1 but 61.3% reduction for Llama3.1.",
    301       "evidence": "Table 2: Deepseek-r1 baseline 0.1073 vs baseline+expert prompt 0.5225; Llama3.1 baseline 0.0796 vs 0.0308 (Section 3.5).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Structured reasoning (reasoning_steps field) provides consistent accuracy gains across LLMs.",
    306       "evidence": "Table 2 shows CHORUS vs CHORUS (w/o reasoning): removing the reasoning field lowers accuracy by 1.71% to 92.29% (relative to full CHORUS) across models. The largest effect is for Llama3.1 (8B), which scores 0.0104 without reasoning vs 0.1349 with it (Section 3.5).",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CHORUS, a RAG framework using hierarchical documentation chunking, metadata-augmented code retrieval, and cross-encoder reranking, substantially improves LP code generation accuracy for open-source LLMs on the NL4Opt-Code benchmark. Phi4 (14B) with CHORUS achieves 0.6125 accuracy, approaching GPT-4's 0.6367, while baseline Phi4 achieves only 0.1938. Ablation studies show that traditional fixed-length RAG actually hurts performance compared to no RAG, while hierarchical chunking, expert prompting, and structured reasoning each contribute positively. Smaller models (8B) struggle to benefit from the framework due to limited context window capacity.",
    312   "red_flags": [
    313     {
    314       "flag": "No uncertainty quantification",
    315       "detail": "All results appear to be single-run numbers with no standard deviations, confidence intervals, or multi-run averaging. Given the acknowledged variability in LLM outputs ('model responses demonstrate significant variability across queries'), single-run results are insufficient to establish reliable performance differences."
    316     },
    317     {
    318       "flag": "No statistical significance tests",
    319       "detail": "Claims of 'significant' improvements are based on comparing point estimates without any statistical tests. The word 'significant' is used in a colloquial sense rather than a statistical one."
    320     },
    321     {
    322       "flag": "Benchmark contamination risk unaddressed",
    323       "detail": "NL4Opt was published in 2022 and is publicly available. Most tested models were likely trained after 2022 (training cutoffs and GPT snapshot versions are not stated in the paper) and may have memorized the problem descriptions. Baseline performance could be inflated by data contamination, and CHORUS's relative improvement may differ on uncontaminated data."
    324     },
    325     {
    326       "flag": "Prompts and hyperparameters not disclosed",
    327       "detail": "The expert prompts described as critical to performance are not provided in full text. Temperature and other generation hyperparameters are not specified. This prevents reproduction and makes it impossible to verify whether the prompts were overfit to the benchmark."
    328     },
    329     {
    330       "flag": "No per-domain breakdown",
    331       "detail": "The NL4Opt dataset covers 6 domains but results are only reported as aggregates. Performance could vary substantially across domains, and aggregate accuracy could mask poor performance on certain problem types."
    332     },
    333     {
    334       "flag": "Cost claims without evidence",
    335       "detail": "The abstract claims open-source LLMs with CHORUS require 'far fewer computational resources' than GPT-4, but no actual cost or latency measurements are reported to support this claim."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "The Llama 3 Herd of Models",
    341       "authors": ["Abhimanyu Dubey et al."],
    342       "year": 2024,
    343       "relevance": "Major open-source LLM family used as baseline and with CHORUS framework, relevant to LLM capability evaluation."
    344     },
    345     {
    346       "title": "Phi-4 Technical Report",
    347       "authors": ["Marah Abdin et al."],
    348       "year": 2024,
    349       "relevance": "Open-source LLM evaluated in the study, relevant to understanding smaller model code generation capabilities."
    350     },
    351     {
    352       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    353       "authors": ["DeepSeek-AI et al."],
    354       "year": 2025,
    355       "relevance": "Reasoning-focused LLM evaluated for code generation, relevant to LLM reasoning and code generation capabilities."
    356     },
    357     {
    358       "title": "Qwen2.5-Coder Technical Report",
    359       "authors": ["Binyuan Hui et al."],
    360       "year": 2024,
    361       "relevance": "Code-specialized LLM evaluated in the study, relevant to code generation with specialized models."
    362     },
    363     {
    364       "title": "GPT-4 Technical Report",
    365       "authors": ["OpenAI et al."],
    366       "year": 2024,
    367       "relevance": "Primary closed-source baseline for code generation performance comparison."
    368     },
    369     {
    370       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    371       "authors": ["Patrick Lewis et al."],
    372       "year": 2020,
    373       "relevance": "Foundational RAG paper that CHORUS builds upon, relevant to understanding retrieval-augmented LLM approaches."
    374     },
    375     {
    376       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    377       "authors": ["Jason Wei et al."],
    378       "year": 2022,
    379       "relevance": "Chain-of-thought prompting technique that CHORUS's structured reasoning extends, relevant to LLM reasoning methodology."
    380     },
    381     {
    382       "title": "OptiMUS: Optimization Modeling Using MIP Solvers and Large Language Models",
    383       "authors": ["Ali AhmadiTeshnizi", "Wenzhi Gao", "Madeleine Udell"],
    384       "year": 2023,
    385       "arxiv_id": "2310.06116",
    386       "relevance": "Multi-agent LLM framework for optimization problems, directly comparable approach to CHORUS."
    387     },
    388     {
    389       "title": "Let Me Speak Freely? A Study on the Impact of Format Restrictions on Large Language Model Performance",
    390       "authors": ["Zhi Rui Tam et al."],
    391       "year": 2024,
    392       "relevance": "Studies how structured output constraints affect LLM reasoning, directly motivates CHORUS's reasoning_steps design."
    393     },
    394     {
    395       "title": "Benchmarking LLMs for Optimization Modeling and Enhancing Reasoning via Reverse Socratic Synthesis",
    396       "authors": ["Zhicheng Yang et al."],
    397       "year": 2024,
    398       "relevance": "E-OPT benchmark for mathematical programming code generation, provides comparative evaluation framework."
    399     },
    400     {
    401       "title": "Solving General Natural-Language-Description Optimization Problems with Large Language Models",
    402       "authors": ["Jihai Zhang et al."],
    403       "year": 2024,
    404       "relevance": "OptLLM framework integrating LLMs with solvers for LP problems, directly comparable to CHORUS."
    405     }
    406   ]
    407 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs