scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29347B)
      1 {
      2   "paper": {
      3     "title": "OR-LLM-Agent: Automating Modeling and Solving of Operations Research Optimization Problems with Reasoning LLM",
      4     "authors": [
      5       "Bowen Zhang",
      6       "Pengcheng Luo",
      7       "Genke Yang",
      8       "Boon-Hee Soong",
      9       "Chau Yuen"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2503.10009",
     14     "doi": "10.48550/arXiv.2503.10009"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "OR-LLM-Agent, a three-stage framework (Math Agent, Code Agent, Debugging Agent) built on reasoning LLMs, achieves 82.93% accuracy on the authors' BWOR benchmark with DeepSeek-R1, outperforming standalone reasoning LLMs by at least 7%. The authors find that reasoning LLMs sometimes underperform non-reasoning counterparts on existing OR benchmarks (NL4OPT, MAMO, IndustryOR), but consistently outperform them on BWOR. An ablation study shows each stage (modeling, coding, debugging) contributes incrementally to performance, with the full pipeline reducing code error rates by 4.04% compared to standalone LLMs.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "GitHub repository provided in the paper: https://github.com/bwz96sco/or_llm_agent."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "BWOR dataset released on HuggingFace: https://huggingface.co/datasets/SJTU/BWOR. Other benchmarks (NL4OPT, MAMO, IndustryOR) are publicly available from the ORLM repository."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing is provided in the paper. Only mentions Python and Gurobi solver without version details."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. The methodology is described algorithmically (Algorithm 1) but there are no specific commands or setup instructions to replicate experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results are reported as point estimates (e.g., '82.93% accuracy') with no confidence intervals or error bars in any table or figure."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Claims like 'outperforms... by at least 7% in accuracy' are based solely on comparing raw numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used anywhere."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Percentage improvements are reported with baseline context throughout, e.g., 'improves average accuracy by 4.06%, from 62.20% to 66.26%' (ablation study) and 'reduces the mean code error rate by 4.04%, dropping from 4.56% to 0.52%' (Table 2)."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "BWOR contains only 82 problems with no justification for this sample size. No power analysis or discussion of whether 82 problems is sufficient for the claims made."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Results appear to be single-run. No standard deviation, variance, or spread measures are reported across any experimental runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Extensive baselines included: SOTA methods (tag-BART, Chain-of-Experts, OptiMUS, ORLM), reasoning LLMs (GPT-o3, GPT-o4-mini, Gemini 2.5 Pro, DeepSeek-R1), non-reasoning LLMs (GPT-4o, Gemini 2.0 Flash, DeepSeek-V3), and open-source models (LLAMA3-8B, DeepSeek-R1-Distill-32B)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include very recent models: GPT-o3, GPT-o4-mini, Gemini 2.5 Pro, and DeepSeek-R1, all from 2024-2025."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Ablation study on BWOR compares three configurations: Direct Code Generation, Math Agent + Code Agent, and full OR-LLM-Agent (Math + Code + Debugging). Results in Figure 11 show each component's contribution."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Only accuracy (absolute error from ground truth below 0.1) is used as the evaluation metric. No other metrics (e.g., partial credit, model quality, solve time) are reported."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of the system's outputs is performed. All evaluation is automated via numerical comparison against ground-truth answers. Human evaluation of mathematical model quality would have strengthened the claims."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "All benchmarks (NL4OPT, MAMO, IndustryOR, BWOR) are used purely for evaluation. No fine-tuning or hyperparameter selection is performed on any of these datasets."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down across five datasets (NL4OPT, MAMO-Easy, MAMO-Complex, IndustryOR, BWOR) in Table 1, and separately by code error rate (Table 2) and mathematical model accuracy (Table 3)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Error analysis in Tables 2 and 3 separately analyzes code error rates and mathematical model accuracy. The paper discusses how the debugging agent reduces code errors and when math model self-repair is triggered."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that reasoning LLMs sometimes underperform non-reasoning counterparts on NL4OPT, MAMO, and IndustryOR (e.g., 'GPT-o4-mini achieves 5.00% lower accuracy than GPT-4o' on IndustryOR). Also reports missing results due to service interruption."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims 'outperforms advanced methods... by at least 7% in accuracy' on BWOR. Table 1 shows OR-LLM-Agent(DeepSeek-R1) at 82.93% vs GPT-o3 at 75.61%, a 7.32% gap. The claim is supported, though narrowly scoped to BWOR."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The causal claim that 'task decomposition improves performance' is supported by the ablation study (Figure 11) which uses controlled single-variable manipulation: adding Math Agent improves by 4.06%, adding Debugging Agent improves by another 5.49%."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims 'Automating Modeling and Solving of Operations Research Optimization Problems' broadly, but the primary results are on BWOR (82 textbook problems). The paper demotes established benchmarks where results are less favorable, making the generalization claim rest heavily on a small, author-constructed dataset."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations are discussed. The improvement could stem from using 3+ LLM calls per problem instead of 1 (more compute), but this confound is not considered. The paper attributes all gains to task decomposition without considering alternatives."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures accuracy (absolute error < 0.1 from ground truth) and claims to measure accuracy. The measurement directly matches the claim with no proxy gap."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are identified by marketing names only: 'GPT-o3', 'GPT-o4-mini', 'Gemini 2.5 Pro', 'DeepSeek-R1'. No snapshot dates, API versions, or specific model identifiers are provided."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figures 3, 4, 6, and 7 show the actual prompt text used for the Math Agent, Code Agent, Code Self-repair, and Math Model Self-repair respectively. The code repository also contains the prompts."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No API hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the LLMs used. These settings significantly affect output quality."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The three-agent scaffolding is described in detail: Algorithm 1 provides pseudocode, Figures 1-2 show workflow diagrams, and the debugging loop (5 attempts, code repair for attempts 1-3, math model repair on attempt 4) is fully specified."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "BWOR is described as '82 problems collected from standard OR textbooks (Hu 2010, 2012)' that were 'manually translated into English,' but no inclusion/exclusion criteria, selection rationale, or translation verification process is documented. For other benchmarks, they simply state they were 'obtained from the ORLM repository' with no further detail."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No dedicated limitations or threats-to-validity section exists. The paper moves directly from experimental results to the conclusion."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No threats to validity are discussed anywhere in the paper. Issues like small dataset size, scaffold confounding, lack of variance reporting, and benchmark selection bias are not acknowledged."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No explicit scope boundaries are stated. The paper does not specify what types of OR problems are excluded, what model families might not work, or what the approach cannot handle."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "BWOR dataset is released on HuggingFace (https://huggingface.co/datasets/SJTU/BWOR) and code on GitHub, enabling independent verification of results."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "BWOR is described as '82 problems collected from standard OR textbooks (Hu 2010, 2012)' but no selection criteria, chapter coverage, or systematic collection methodology is provided. It is unclear how these 82 problems were chosen from the textbooks."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data source is OR textbooks and standard public benchmarks."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The pipeline from textbook to final benchmark is undocumented. How problems were selected, how translation was validated, and how ground-truth answers were verified by 'domain experts' are not described with enough detail to reproduce."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information, acknowledgments section, or grant numbers are provided anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University (Ningbo AI Institute and Dept. of Automation) and Nanyang Technological University."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Cannot assess funder independence since no funding information is disclosed."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the models used (GPT-o3, GPT-o4-mini, Gemini 2.5 Pro, DeepSeek-R1, etc.). This is important since NL4OPT and MAMO are publicly available and may be in training data."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of potential train/test overlap. NL4OPT (2022), MAMO, and IndustryOR are public benchmarks that could be in the training data of the evaluated models. This is especially relevant given the unexpected finding that reasoning LLMs underperform non-reasoning ones on these benchmarks."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "NL4OPT was published in 2022 and MAMO in 2024; both are publicly available and could be in training data. The paper does not discuss contamination risk despite this being a plausible alternative explanation for the puzzling reasoning vs. non-reasoning performance inversion on these benchmarks."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "The framework makes 3+ LLM calls per problem (Math Agent + Code Agent + up to 5 debugging attempts), but no API costs, token counts, or latency measurements are reported."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No total computational budget (API spend, total tokens, wall-clock time) is stated despite using expensive reasoning LLMs (GPT-o3, DeepSeek-R1) across multiple datasets."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Results appear to be single-run. No seed sensitivity analysis or multi-seed results are reported."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged across multiple runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Design choices like the number of debugging attempts (5), when to trigger math model repair (attempt 4), and prompt wording appear tuned but no search budget or justification is provided."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The debugging thresholds (code repair for attempts 1-3, math model repair on attempt 4, give up after 5) are not justified. No explanation for why these specific values were chosen."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Many pairwise comparisons are made across models and datasets (Table 1 has 15+ model-dataset combinations) with no correction for multiple comparisons."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Authors evaluate their own framework against baselines without acknowledging author-evaluation bias. Open-source baseline results (LLAMA3-8B, DeepSeek-R1-Distill-32B all scoring 0.00%) were reproduced using ORLM prompts, potentially disadvantaging those baselines."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "OR-LLM-Agent uses 3+ reasoning LLM calls per problem while baselines use a single call. This substantial compute difference is never discussed or controlled for. The 7%+ improvement could partly or fully reflect more compute, not better architecture."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The paper analyzes whether existing OR benchmarks (NL4OPT, MAMO, IndustryOR) properly differentiate reasoning from non-reasoning models, comparing against known math/code benchmarks (AIME 2024, LiveCodeBench). They argue BWOR has better construct validity because reasoning models consistently outperform non-reasoning ones on it."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "OR-LLM-Agent uses a multi-agent scaffold with debugging loops while baselines are single-call LLMs. The improvement is attributed to 'task decomposition' and 'reasoning LLMs' without controlling for the scaffold effect. The ablation adds components (and LLM calls) simultaneously."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of whether benchmark problems existed before the models' training cutoffs. NL4OPT (2022) and MAMO (2024) solutions could be in training data."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information not available in real-world OR problem solving."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether benchmark problems share structural similarities or overlap across datasets."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention methods are applied. No canary strings, membership inference, decontamination, or temporal analysis."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "OR-LLM-Agent with DeepSeek-R1 outperforms all compared methods by at least 7% accuracy on BWOR.",
    371       "evidence": "Table 1 shows OR-LLM-Agent(DeepSeek-R1) achieves 82.93% on BWOR, compared to 75.61% for GPT-o3 (best standalone reasoning LLM) — a 7.32% gap.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Reasoning LLMs sometimes underperform non-reasoning counterparts on existing OR benchmarks (NL4OPT, MAMO, IndustryOR).",
    376       "evidence": "Figure 9 and surrounding analysis show GPT-o4-mini achieves 5.00% lower accuracy than GPT-4o on IndustryOR, Gemini 2.5 Pro underperforms Gemini 2.0 Flash by 7.11% on MAMO-Complex, and on MAMO-Easy all reasoning models score lower (1.84%-17.79% decline).",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "BWOR more effectively differentiates reasoning vs. non-reasoning LLMs than existing OR benchmarks.",
    381       "evidence": "On BWOR, reasoning LLMs consistently outperform non-reasoning counterparts (10.98%-35.37% improvement), aligning with their known math/code superiority, while other benchmarks show inconsistent patterns.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Task decomposition (Math + Code + Debug) improves performance over direct code generation by ~9.5%.",
    386       "evidence": "Ablation study (Figure 11) shows Direct Code Generation at 62.20% average, Math+Code at 66.26% (+4.06%), and full framework at 71.75% (+5.49% more) on BWOR.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "OR-LLM-Agent reduces the mean code error rate by 4.04 percentage points (from 4.56% to 0.52%) compared to standalone LLMs.",
    391       "evidence": "Table 2 shows mean code error rate of 0.52% for OR-LLM-Agent vs. 4.56% for reasoning and non-reasoning LLMs across all datasets.",
    392       "supported": "moderate"
    393     }
    394   ],
    395   "red_flags": [
    396     {
    397       "flag": "Author-constructed primary benchmark",
    398       "detail": "The primary benchmark BWOR (82 problems) is constructed by the authors. Established benchmarks (NL4OPT, MAMO, IndustryOR) where the framework shows less clear advantage are demoted to 'supplementary analysis' and 'excluded from core comparisons.' This creates a risk of benchmark selection bias."
    399     },
    400     {
    401       "flag": "No error bars on tiny dataset",
    402       "detail": "All results on BWOR (82 problems) are point estimates, apparently from a single run, with no variance, confidence intervals, or significance tests. With only 82 binary (correct/incorrect) outcomes, each problem is worth about 1.22 percentage points, so stochastic variation in LLM outputs could easily account for several percentage points of difference."
    403     },
    404     {
    405       "flag": "Scaffold confound unaddressed",
    406       "detail": "OR-LLM-Agent makes 3+ LLM calls per problem (modeling + coding + up to 5 debugging attempts) while all baselines use a single LLM call. The 7% improvement could partly or wholly reflect more compute rather than better architecture. No matched-compute comparison is provided."
    407     },
    408     {
    409       "flag": "Contamination as alternative explanation ignored",
    410       "detail": "The finding that reasoning LLMs underperform on NL4OPT/MAMO but not BWOR could be explained by differential contamination (public benchmarks in training data help non-reasoning models' memorization while BWOR is novel). This alternative explanation is not discussed."
    411     },
    412     {
    413       "flag": "Zero-scoring baselines suggest prompt/setup issues",
    414       "detail": "LLAMA3-8B-Base, LLAMA3-8B-Instruct, and DeepSeek-R1-Distill-32B all score exactly 0.00% across every dataset. This suggests the ORLM prompt or execution setup may be incompatible with these models rather than reflecting their actual capability."
    415     },
    416     {
    417       "flag": "No limitations section",
    418       "detail": "The paper contains no limitations, threats to validity, or scope boundary discussion despite multiple methodological concerns (tiny primary benchmark, missing variance, compute confound, potential contamination)."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Autogen: Enabling next-gen llm applications via multi-agent conversation",
    424       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    425       "year": 2023,
    426       "arxiv_id": "2308.08155",
    427       "relevance": "Multi-agent LLM framework for collaborative code generation and debugging, directly relevant to agentic AI workflows."
    428     },
    429     {
    430       "title": "Evaluating large language models trained on code",
    431       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    432       "year": 2021,
    433       "arxiv_id": "2107.03374",
    434       "relevance": "Codex and HumanEval benchmark — foundational work on LLM code generation capability evaluation."
    435     },
    436     {
    437       "title": "Competition-level code generation with alphacode",
    438       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    439       "year": 2022,
    440       "relevance": "Competitive programming code generation with large-scale sampling, relevant to LLM coding capability evaluation."
    441     },
    442     {
    443       "title": "Opencodeinterpreter: Integrating code generation with execution and refinement",
    444       "authors": ["Tianyu Zheng", "Ge Zhang", "Tianhao Shen"],
    445       "year": 2024,
    446       "arxiv_id": "2402.14658",
    447       "relevance": "Integrated code generation, execution, and optimization pipeline, relevant to automated programming agent design."
    448     },
    449     {
    450       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    451       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    452       "year": 2025,
    453       "arxiv_id": "2501.12948",
    454       "relevance": "Core reasoning LLM used in the framework, relevant to understanding reasoning vs non-reasoning model capabilities."
    455     },
    456     {
    457       "title": "Gpt-4 technical report",
    458       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    459       "year": 2023,
    460       "arxiv_id": "2303.08774",
    461       "relevance": "Technical report for GPT-4, a key baseline model in LLM capability evaluation."
    462     },
    463     {
    464       "title": "OptiMUS: Optimization Modeling Using mip Solvers and large language models",
    465       "authors": ["Ali AhmadiTeshnizi", "Wenzhi Gao", "Madeleine Udell"],
    466       "year": 2024,
    467       "relevance": "LLM-based modular agent for solving optimization problems from natural language, direct competitor and relevant baseline."
    468     },
    469     {
    470       "title": "Orlm: A customizable framework in training large models for automated optimization modeling",
    471       "authors": ["Chenghao Huang", "Zhitao Tang", "Shuang Hu"],
    472       "year": 2025,
    473       "relevance": "Fine-tuning framework for LLMs on OR tasks with synthetic data pipeline, key baseline and benchmark source."
    474     },
    475     {
    476       "title": "Chain-of-experts: When llms meet complex operations research problems",
    477       "authors": ["Ziyang Xiao", "Dongxiang Zhang", "Yangjun Wu"],
    478       "year": 2023,
    479       "relevance": "Multi-agent cooperative framework for linear programming, directly relevant to LLM-based OR problem solving."
    480     },
    481     {
    482       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    483       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    484       "year": 2024,
    485       "arxiv_id": "2403.07974",
    486       "relevance": "Contamination-free code benchmark used to validate model coding capability claims."
    487     },
    488     {
    489       "title": "Deepseek-v3 technical report",
    490       "authors": ["Aixin Liu", "Bei Feng", "Bing Xue"],
    491       "year": 2024,
    492       "arxiv_id": "2412.19437",
    493       "relevance": "Technical report for DeepSeek-V3, a non-reasoning baseline model in the evaluation."
    494     },
    495     {
    496       "title": "Phi-4 technical report",
    497       "authors": ["Marah Abdin", "Jyoti Aneja", "Harkirat Behl"],
    498       "year": 2024,
    499       "arxiv_id": "2412.08905",
    500       "relevance": "Small language model technical report, relevant to understanding LLM capabilities in mathematical reasoning."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "OR practitioners could use the released framework to automate optimization problem solving, though it requires Gurobi licenses and expensive API access to reasoning LLMs."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "The finding that reasoning LLMs underperform non-reasoning ones on some OR benchmarks is mildly surprising, though the explanation is underdeveloped."
    511     },
    512     "fear_safety": {
    513       "score": 0,
    514       "justification": "No AI safety or security concerns raised by this work."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversial claims or conflicts with other work."
    519     },
    520     "demo_ability": {
    521       "score": 2,
    522       "justification": "Code released on GitHub and dataset on HuggingFace, though requires Gurobi solver setup and API keys to run."
    523     },
    524     "brand_recognition": {
    525       "score": 1,
    526       "justification": "Uses well-known models (GPT-o3, DeepSeek-R1) but authors are from SJTU/NTU, not top-of-mind AI labs."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs